# lewiuberg/functions.py

## functions.py
def unpack_list(lst: list) -> str:
    """
    Join items into one comma-separated phrase ending in ", and <last>".

    Args:
        lst (list/str): Items to join. Non-string input is iterated and every
            item is converted with ``str``; a string is used as-is, so each
            character counts as one item.

    Returns:
        str: The joined text, e.g. "a, b, and c". A single item is returned
            unchanged and two items give "a, and b". Empty input returns
            None.
    """
    items = lst if isinstance(lst, str) else [str(entry) for entry in lst]
    count = len(items)

    if count == 0:
        return None
    if count <= 2:
        # One item is returned unchanged; two are joined with ", and ".
        separator = ", " if count == 1 else ", and "
        return separator.join(items)

    *leading, final = items
    return ", ".join(leading) + ", and " + final


def word_search(df, *words) -> None:
    """
    Count exact, whole-cell matches of one or more words in a DataFrame.

    Only columns whose dtype compares equal to ``object`` or ``str`` are
    searched. Two lines are printed: the number of searched columns, and the
    total number of matching cells together with the words that were found
    (or all searched words when none was found).

    Args:
        df (DataFrame): The dataframe to be searched.
        *words (str/list): The word(s) to search for, given as separate
            string arguments and/or as lists of words. Words are matched
            literally, so characters such as "?" or "*" are safe to use.
    """
    import re

    if not words or len(words[0]) < 1:
        return

    # Flatten the arguments so plain strings and lists of words can be mixed.
    search_terms: list = []
    for entry in words:
        if isinstance(entry, str):
            search_terms.append(entry)
        else:
            search_terms.extend(entry)

    text_columns = [
        column for column in df
        if df[column].dtype == object or df[column].dtype == str
    ]

    sum_words = 0
    found_words: list = []
    for word in search_terms:
        # Escape the word so regex metacharacters are matched literally.
        pattern = f"^{re.escape(str(word))}$"
        for column in text_columns:
            # na=False counts missing / non-string cells as "no match".
            hits = df[column].str.contains(pattern, na=False).sum()
            sum_words += hits
            if hits and word not in found_words:
                found_words.append(word)

    if not found_words:
        found_words = search_terms
    print("Columns of dtype str or object:", len(text_columns))
    print(
        f"Instances of {unpack_list(found_words)} in the dataframe: {sum_words}"
    )


def find_missing_values(df) -> None:
    """
    Print a summary of the missing values in a pandas DataFrame.

    Three lines are printed: the total number of missing cells, the number
    of columns holding at least one missing cell, and the names of those
    columns.

    Args:
        df (DataFrame): The dataframe being searched for missing values.
    """
    missing_mask = df.isna()
    columns_with_missing = missing_mask.any()
    # Index with the boolean Series itself: `<Series> is True` is an identity
    # test that is always False, so it never selected the affected columns.
    column_names = df.columns[columns_with_missing].tolist()
    miss_columns = columns_with_missing.sum()
    miss_values = missing_mask.sum().sum()
    print(f"Instances of missing data: {miss_values}")
    print(f"Columns with missing data: {miss_columns}")
    print(f"Column names with missing data: {unpack_list(column_names)}")


def path_checker(path: str) -> bool:
    """
    Report whether a path exists and, if so, whether it is a directory or
    a file.

    A message describing the result is printed.

    Args:
        path (str): The path to check.

    Returns:
        bool: True when the path exists, False otherwise.
    """
    from pathlib import Path

    target = Path(path)

    # Guard clause: nothing more to report for a missing path.
    if not target.exists():
        print(f"'{target}' does not exist.")
        return False

    if target.is_dir():
        print(f"'{target}' is directory")
    elif target.is_file():
        print(f"'{target}' is a file")
    return True


def df_location_data(df, search_col, min_delay_seconds=.1):
    """
    Add columns with geographical location data based on location name or address.

    The lookup is done with geopy's Nominatim geocoder. The passed DataFrame
    is modified in place: the columns 'location', 'point', 'latitude',
    'longitude' and 'altitude' are added (or overwritten). Rows that could
    not be geocoded get None in 'point' and missing values in the three
    coordinate columns.

    Args:
        df (DataFrame): The dataframe holding the name/address column.
        search_col (str): Label of the column holding the name/address.
        min_delay_seconds (float, optional): Minimum pause between two
            geocoding requests. Defaults to 0.1.

    Returns:
        DataFrame: The original DataFrame with location data columns included.
    """
    import pandas as pd
    from geopy.geocoders import Nominatim
    from geopy.extra.rate_limiter import RateLimiter

    geolocator = Nominatim(user_agent="my_geocoder")
    # NOTE(review): the public Nominatim service reportedly allows at most
    # one request per second - confirm before relying on the short default.
    geocode = RateLimiter(geolocator.geocode, min_delay_seconds=min_delay_seconds)

    # Find the location.
    df['location'] = df[search_col].apply(geocode)
    # Extract point to its own column; failed lookups stay None.
    df['point'] = df['location'].apply(lambda loc: tuple(loc.point) if loc else None)

    # Split point column into latitude, longitude and altitude columns.
    # Failed lookups are padded with an all-None row so the split frame is
    # always three columns wide, even when some (or all) rows are missing.
    coordinate_columns = ['latitude', 'longitude', 'altitude']
    blank = (None,) * len(coordinate_columns)
    coordinates = [
        point if isinstance(point, tuple) else blank for point in df['point']
    ]
    df[coordinate_columns] = pd.DataFrame(
        coordinates, index=df.index, columns=coordinate_columns
    )

    return df


def missing_location(df):
    """
    Isolate the columns, the rows, and the column/row intersection that
    contain missing values.

    Args:
        df (DataFrame): The dataframe to inspect.

    Returns:
        tuple: Three DataFrames, in this order:
            - every row, restricted to the columns with a missing value,
            - every column, restricted to the rows with a missing value,
            - only the columns and rows with a missing value.
    """
    null_mask = df.isnull()
    column_has_gap = null_mask.any(axis=0)
    row_has_gap = null_mask.any(axis=1)

    # Select by label: keep only the columns flagged as holding a gap.
    miss_col = df[column_has_gap[column_has_gap].index]
    miss_row = df[row_has_gap]
    # A row has a gap somewhere exactly when it has one in a gap column.
    miss_only = miss_col[row_has_gap]

    return miss_col, miss_row, miss_only


def replace_df_ax_name(df, find, replace_with="", axis=0):
    """
    Rename rows or columns. May also be used to reformat columns/rows to datetime.

    Every string label that contains ``find`` is changed; all other labels
    (including non-string ones such as integers) are kept as they are. The
    input dataframe is not modified and the column dtypes are preserved.

    Args:
        df (DataFrame): The base dataframe.
        find (str): The (sub)string one wants to change.
        replace_with (str, optional): The (sub)string one wants to change to.
            The special value "d_to_datetime" converts each matching label
            with ``pandas.to_datetime`` instead. Defaults to "".
        axis (int, optional): Rows(0), columns(1). Defaults to 0.

    Returns:
        DataFrame: Renamed/reformatted copy of the dataframe.
    """
    import pandas as pd

    dff = df.copy()
    # Work on the axis labels directly; transposing the frame back and forth
    # would collapse mixed column dtypes to object.
    labels = dff.columns if axis == 1 else dff.index

    def _rename(label):
        # Non-string labels cannot contain the search string; keep them.
        if not isinstance(label, str) or find not in label:
            return label
        if replace_with == "d_to_datetime":
            return pd.to_datetime(label)
        return label.replace(find, replace_with)

    new_labels = [_rename(label) for label in labels]

    if axis == 1:  # <-- Columns
        dff.columns = new_labels
    else:
        dff.index = new_labels

    return dff
	def unpack_list(lst: list) -> str:
	    """
	    Join items into one comma-separated phrase ending in ", and <last>".

	    Args:
	        lst (list/str): Items to join. Non-string input is iterated and
	            every item is converted with ``str``; a string is used as-is,
	            so each character counts as one item.

	    Returns:
	        str: The joined text, e.g. "a, b, and c". A single item is
	            returned unchanged and two items give "a, and b". Empty input
	            returns None.
	    """
	    items = lst if isinstance(lst, str) else [str(entry) for entry in lst]
	    count = len(items)

	    if count == 0:
	        return None
	    if count <= 2:
	        # One item is returned unchanged; two are joined with ", and ".
	        separator = ", " if count == 1 else ", and "
	        return separator.join(items)

	    *leading, final = items
	    return ", ".join(leading) + ", and " + final


	def word_search(df, *words) -> None:
	    """
	    Count exact, whole-cell matches of one or more words in a DataFrame.

	    Only columns whose dtype compares equal to ``object`` or ``str`` are
	    searched. Two lines are printed: the number of searched columns, and
	    the total number of matching cells together with the words that were
	    found (or all searched words when none was found).

	    Args:
	        df (DataFrame): The dataframe to be searched.
	        *words (str/list): The word(s) to search for, given as separate
	            string arguments and/or as lists of words. Words are matched
	            literally, so characters such as "?" or "*" are safe to use.
	    """
	    import re

	    if not words or len(words[0]) < 1:
	        return

	    # Flatten the arguments so plain strings and lists of words can be mixed.
	    search_terms: list = []
	    for entry in words:
	        if isinstance(entry, str):
	            search_terms.append(entry)
	        else:
	            search_terms.extend(entry)

	    text_columns = [
	        column for column in df
	        if df[column].dtype == object or df[column].dtype == str
	    ]

	    sum_words = 0
	    found_words: list = []
	    for word in search_terms:
	        # Escape the word so regex metacharacters are matched literally.
	        pattern = f"^{re.escape(str(word))}$"
	        for column in text_columns:
	            # na=False counts missing / non-string cells as "no match".
	            hits = df[column].str.contains(pattern, na=False).sum()
	            sum_words += hits
	            if hits and word not in found_words:
	                found_words.append(word)

	    if not found_words:
	        found_words = search_terms
	    print("Columns of dtype str or object:", len(text_columns))
	    print(
	        f"Instances of {unpack_list(found_words)} in the dataframe: {sum_words}"
	    )


	def find_missing_values(df) -> None:
	    """
	    Print a summary of the missing values in a pandas DataFrame.

	    Three lines are printed: the total number of missing cells, the number
	    of columns holding at least one missing cell, and the names of those
	    columns.

	    Args:
	        df (DataFrame): The dataframe being searched for missing values.
	    """
	    missing_mask = df.isna()
	    columns_with_missing = missing_mask.any()
	    # Index with the boolean Series itself: `<Series> is True` is an
	    # identity test that is always False, so it never selected the
	    # affected columns.
	    column_names = df.columns[columns_with_missing].tolist()
	    miss_columns = columns_with_missing.sum()
	    miss_values = missing_mask.sum().sum()
	    print(f"Instances of missing data: {miss_values}")
	    print(f"Columns with missing data: {miss_columns}")
	    print(f"Column names with missing data: {unpack_list(column_names)}")


	def path_checker(path: str) -> bool:
	    """
	    Report whether a path exists and, if so, whether it is a directory or
	    a file.

	    A message describing the result is printed.

	    Args:
	        path (str): The path to check.

	    Returns:
	        bool: True when the path exists, False otherwise.
	    """
	    from pathlib import Path

	    target = Path(path)

	    # Guard clause: nothing more to report for a missing path.
	    if not target.exists():
	        print(f"'{target}' does not exist.")
	        return False

	    if target.is_dir():
	        print(f"'{target}' is directory")
	    elif target.is_file():
	        print(f"'{target}' is a file")
	    return True


	def df_location_data(df, search_col, min_delay_seconds=.1):
	    """
	    Add columns with geographical location data based on location name or address.

	    The lookup is done with geopy's Nominatim geocoder. The passed
	    DataFrame is modified in place: the columns 'location', 'point',
	    'latitude', 'longitude' and 'altitude' are added (or overwritten).
	    Rows that could not be geocoded get None in 'point' and missing values
	    in the three coordinate columns.

	    Args:
	        df (DataFrame): The dataframe holding the name/address column.
	        search_col (str): Label of the column holding the name/address.
	        min_delay_seconds (float, optional): Minimum pause between two
	            geocoding requests. Defaults to 0.1.

	    Returns:
	        DataFrame: The original DataFrame with location data columns included.
	    """
	    import pandas as pd
	    from geopy.geocoders import Nominatim
	    from geopy.extra.rate_limiter import RateLimiter

	    geolocator = Nominatim(user_agent="my_geocoder")
	    # NOTE(review): the public Nominatim service reportedly allows at most
	    # one request per second - confirm before relying on the short default.
	    geocode = RateLimiter(geolocator.geocode, min_delay_seconds=min_delay_seconds)

	    # Find the location.
	    df['location'] = df[search_col].apply(geocode)
	    # Extract point to its own column; failed lookups stay None.
	    df['point'] = df['location'].apply(lambda loc: tuple(loc.point) if loc else None)

	    # Split point column into latitude, longitude and altitude columns.
	    # Failed lookups are padded with an all-None row so the split frame is
	    # always three columns wide, even when some (or all) rows are missing.
	    coordinate_columns = ['latitude', 'longitude', 'altitude']
	    blank = (None,) * len(coordinate_columns)
	    coordinates = [
	        point if isinstance(point, tuple) else blank for point in df['point']
	    ]
	    df[coordinate_columns] = pd.DataFrame(
	        coordinates, index=df.index, columns=coordinate_columns
	    )

	    return df


	def missing_location(df):
	    """
	    Isolate the columns, the rows, and the column/row intersection that
	    contain missing values.

	    Args:
	        df (DataFrame): The dataframe to inspect.

	    Returns:
	        tuple: Three DataFrames, in this order:
	            - every row, restricted to the columns with a missing value,
	            - every column, restricted to the rows with a missing value,
	            - only the columns and rows with a missing value.
	    """
	    null_mask = df.isnull()
	    column_has_gap = null_mask.any(axis=0)
	    row_has_gap = null_mask.any(axis=1)

	    # Select by label: keep only the columns flagged as holding a gap.
	    miss_col = df[column_has_gap[column_has_gap].index]
	    miss_row = df[row_has_gap]
	    # A row has a gap somewhere exactly when it has one in a gap column.
	    miss_only = miss_col[row_has_gap]

	    return miss_col, miss_row, miss_only


	def replace_df_ax_name(df, find, replace_with="", axis=0):
	    """
	    Rename rows or columns. May also be used to reformat columns/rows to datetime.

	    Every string label that contains ``find`` is changed; all other labels
	    (including non-string ones such as integers) are kept as they are. The
	    input dataframe is not modified and the column dtypes are preserved.

	    Args:
	        df (DataFrame): The base dataframe.
	        find (str): The (sub)string one wants to change.
	        replace_with (str, optional): The (sub)string one wants to change
	            to. The special value "d_to_datetime" converts each matching
	            label with ``pandas.to_datetime`` instead. Defaults to "".
	        axis (int, optional): Rows(0), columns(1). Defaults to 0.

	    Returns:
	        DataFrame: Renamed/reformatted copy of the dataframe.
	    """
	    import pandas as pd

	    dff = df.copy()
	    # Work on the axis labels directly; transposing the frame back and
	    # forth would collapse mixed column dtypes to object.
	    labels = dff.columns if axis == 1 else dff.index

	    def _rename(label):
	        # Non-string labels cannot contain the search string; keep them.
	        if not isinstance(label, str) or find not in label:
	            return label
	        if replace_with == "d_to_datetime":
	            return pd.to_datetime(label)
	        return label.replace(find, replace_with)

	    new_labels = [_rename(label) for label in labels]

	    if axis == 1:  # <-- Columns
	        dff.columns = new_labels
	    else:
	        dff.index = new_labels

	    return dff