catboost regression
import datetime
import os
import statistics
import tempfile

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import metrics
from sklearn.metrics import confusion_matrix
from sklearn.utils.multiclass import unique_labels
class Utils:
    @staticmethod
    def load_data(path):
        """
        Read a CSV file from a given path and return a Pandas DataFrame
        :param path: path to csv file
        :return: returns Pandas DataFrame
        """
        df = pd.read_csv(path)
        return df
    @staticmethod
    def plot_graphs(x_data, y_data, x_label, y_label, title):
        """
        Use matplotlib to plot the data points provided, with the respective x-axis and y-axis labels
        :param x_data: data for x-axis
        :param y_data: data for y-axis
        :param x_label: label for x-axis
        :param y_label: label for y-axis
        :param title: title for the plot
        :return: tuple (fig, ax)
        """
        plt.clf()
        fig, ax = plt.subplots()
        ax.plot(x_data, y_data)
        ax.set_xlabel(x_label)
        ax.set_ylabel(y_label)
        ax.set_title(title)
        return (fig, ax)
    @staticmethod
    def plot_residual_graphs(predictions, y_test, x_label, y_label, title):
        """
        Create a residual plot using the seaborn plotting library
        https://seaborn.pydata.org/tutorial/regression.html
        :param predictions: predictions from the run
        :param y_test: actual labels
        :param x_label: name for the x-axis
        :param y_label: name for the y-axis
        :param title: title for the plot
        :return: tuple of (plt, fig, ax)
        """
        fig, ax = plt.subplots()
        # newer seaborn versions require keyword arguments for x and y
        sns.residplot(x=predictions, y=y_test, lowess=True, ax=ax)
        plt.xlabel(x_label)
        plt.ylabel(y_label)
        plt.title(title)
        return (plt, fig, ax)
    @staticmethod
    def get_mlflow_directory_path(*paths, create_dir=True):
        """
        Get the current running path where mlruns is created. This is the directory from which
        the python file containing MLflow code is executed. This method is used for artifacts, such
        as images, where we want to store plots.
        :param paths: list of directories below mlruns, experimentID, mlflow_run_id
        :param create_dir: default is True
        :return: path to directory
        """
        cwd = os.getcwd()
        dir = os.path.join(cwd, "mlruns", *paths)
        if create_dir:
            if not os.path.exists(dir):
                # makedirs creates intermediate directories too, e.g. mlruns/<experiment_id>/<run_id>
                os.makedirs(dir, mode=0o755)
        return dir
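    # Example usage (hypothetical experiment and run IDs, for illustration only):
    #   image_dir = Utils.get_mlflow_directory_path("0", "<run_id>", "images")
    #   fig.savefig(os.path.join(image_dir, "residuals.png"))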
    @staticmethod
    def get_temporary_directory_path(prefix, suffix):
        """
        Get a temporary directory and files for artifacts
        :param prefix: name of the file
        :param suffix: .csv, .txt, .png etc
        :return: object to tempfile
        """
        temp = tempfile.NamedTemporaryFile(prefix=prefix, suffix=suffix)
        return temp
    @staticmethod
    def print_pandas_dataset(d):
        """
        Given a Pandas DataFrame, show its dimensions and first few rows
        :param d: Pandas DataFrame
        :return: None
        """
        print("rows = %d; columns = %d" % (d.shape[0], d.shape[1]))
        print(d.head())
    @staticmethod
    def plot_confusion_matrix(y_true, y_pred, classes,
                              normalize=False,
                              title=None,
                              cmap=plt.cm.Blues):
        """
        This function prints and plots the confusion matrix.
        Normalization can be applied by setting `normalize=True`.
        Borrowed from the scikit-learn library documentation
        :param y_true: the actual values of y
        :param y_pred: the predicted values of y
        :param classes: list of label classes to be predicted
        :param normalize: normalize the data
        :param title: title of the plot for the confusion matrix
        :param cmap: color map for the plot
        :return: returns a tuple of (plt, fig, ax)
        """
        if not title:
            if normalize:
                title = 'Normalized confusion matrix'
            else:
                title = 'Confusion matrix, without normalization'
        # Compute confusion matrix
        cm = confusion_matrix(y_true, y_pred)
        # Only use the labels that appear in the data; classes must be an array for fancy indexing
        classes = np.asarray(classes)[unique_labels(y_true, y_pred)]
        if normalize:
            cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
            print("Normalized confusion matrix")
        else:
            print('Confusion matrix, without normalization')
        print(cm)
        fig, ax = plt.subplots()
        im = ax.imshow(cm, interpolation='nearest', cmap=cmap)
        ax.figure.colorbar(im, ax=ax)
        # We want to show all ticks...
        ax.set(xticks=np.arange(cm.shape[1]),
               yticks=np.arange(cm.shape[0]),
               # ... and label them with the respective list entries
               xticklabels=classes, yticklabels=classes,
               title=title,
               ylabel='True label',
               xlabel='Predicted label')
        # Rotate the tick labels and set their alignment.
        plt.setp(ax.get_xticklabels(), rotation=45, ha="right",
                 rotation_mode="anchor")
        # Loop over data dimensions and create text annotations.
        fmt = '.2f' if normalize else 'd'
        thresh = cm.max() / 2.
        for i in range(cm.shape[0]):
            for j in range(cm.shape[1]):
                ax.text(j, i, format(cm[i, j], fmt),
                        ha="center", va="center",
                        color="white" if cm[i, j] > thresh else "black")
        fig.tight_layout()
        return (plt, fig, ax)
    @staticmethod
    def rmse(y_true, y_pred):
        """
        Use only for Keras models
        """
        from keras import backend
        return backend.sqrt(backend.mean(backend.square(y_pred - y_true), axis=-1))
    @staticmethod
    def df_attributes(data_ins):
        '''
        Use pandas' built-in methods to give a gist of the data:
        1. info
        2. describe
        3. dtypes
        4. columns
        5. null check
        '''
        print("Dataframe Info")
        print(data_ins.info())
        print("===============================================================")
        print("Dataframe Describe Method")
        print(data_ins.describe())
        print("===============================================================")
        print("Dataframe Dtype")
        print(data_ins.dtypes)
        print("===============================================================")
        print("Dataframe Columns")
        print(data_ins.columns)
        print("===============================================================")
        print("Dataframe Check if null")
        print(data_ins.isnull().sum())
        print("===============================================================")
    @staticmethod
    def hist_flt_int(dataset):
        '''
        From df.dtypes (a pandas Series), build a dict and use a list comprehension to get the
        names of the int and float columns.
        input - dataframe
        output - histograms of the int and float columns
        '''
        ## TODO : Image size config
        df = dict(dataset.dtypes)
        hist_cols = [key for key in df.keys() if (df[key] == "int64" or df[key] == "float64")]
        fig = plt.figure(figsize=(15, 20))
        ax = fig.gca()
        return dataset[hist_cols].hist(ax=ax)
    # @staticmethod
    # def datetime_from_Y_M_D(dataset, ymd_dict):
    #     '''
    #     Given a dataset and 3 columns, build a datetime column.
    #     You cannot concatenate multiple string columns like this, so this version does not work.
    #     Do an apply(lambda x: "-".join(x)) instead -- see the active version below.
    #     '''
    #     dataset["str_year"] = dataset[ymd_dict["year_col"]].astype(str) + '-' + dataset[ymd_dict["month_col"]].astype(str)\
    #         + '-' + dataset[ymd_dict["day_col"]].astype(str)
    #     dataset["date"] = datetime.datetime.strptime(dataset.str_year, '%Y-%m-%d')
    #     dataset.drop(["str_year"], axis=1, inplace=True)
    #     return dataset
    @staticmethod
    def datetime_from_Y_M_D(df, ymd_dict):
        '''
        Given a dataframe and a dict mapping "year_col", "month_col" and "day_col" to column names,
        build a datetime "date" column. String columns cannot simply be added together, so the
        year/month/day columns are joined with apply(lambda x: "-".join(x)) instead.
        '''
        year_col = ymd_dict["year_col"]
        month_col = ymd_dict["month_col"]
        day_col = ymd_dict["day_col"]
        df[year_col] = df[year_col].astype(str)
        df[month_col] = df[month_col].astype(str)
        df[day_col] = df[day_col].astype(str)
        df['date'] = df[[year_col, month_col, day_col]].apply(lambda x: '-'.join(x), axis=1)
        df["date"] = df['date'].apply(lambda x: datetime.datetime.strptime(x, '%Y-%m-%d'))
        return df
    @staticmethod
    def reg_evaluation(y_test, y_pred):
        '''
        Compute common regression metrics (MAE, MSE, RMSE, R2) for the given
        true and predicted values and return them as a dict.
        '''
        mae = metrics.mean_absolute_error(y_test, y_pred)
        mse = metrics.mean_squared_error(y_test, y_pred)
        rmse = np.sqrt(mse)
        r2 = metrics.r2_score(y_test, y_pred)
        reg_eval_dict = {}
        reg_eval_dict["mae"] = mae
        reg_eval_dict["mse"] = mse
        reg_eval_dict["rmse"] = rmse
        reg_eval_dict["r2_test"] = r2
        return reg_eval_dict
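    # Example (hypothetical values, for illustration only):
    #   Utils.reg_evaluation([3.0, 2.5, 4.1], [2.8, 2.6, 4.0])
    #   -> {"mae": ..., "mse": ..., "rmse": ..., "r2_test": ...}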
    @staticmethod
    def get_dummbies(features):
        '''
        One-hot encode the categorical columns of the given dataframe with pd.get_dummies
        and return the encoded dataframe.
        '''
        features = pd.get_dummies(features)
        print(features.head(5))
        return features
    @staticmethod
    def plot_corr(df_insurance_train):
        '''
        Given a dataframe, plot its correlation matrix
        df_insurance_train - pandas dataframe
        '''
        ## filter float and int columns
        df = dict(df_insurance_train.dtypes)
        hist_cols = [key for key in df.keys() if (df[key] == "int64" or df[key] == "float64")]
        ## plot seaborn heatmap
        f, ax = plt.subplots(figsize=(10, 8))
        corr = df_insurance_train[hist_cols].corr()
        # np.bool is removed in newer NumPy versions; use the builtin bool instead
        sns.heatmap(corr, mask=np.zeros_like(corr, dtype=bool), cmap=sns.diverging_palette(240, 10, as_cmap=True),
                    square=True, ax=ax)
        ## return pandas Styler version as well
        corr1 = df_insurance_train[hist_cols].corr()
        return corr1.style.background_gradient(cmap='coolwarm').set_precision(2)
    @staticmethod
    def plot_corr_pandas(df_insurance_train):
        '''
        Given a dataframe, build a styled correlation matrix
        df_insurance_train - pandas dataframe
        The pandas Styler needs to be returned for it to render (e.g. in a notebook)
        '''
        ## filter float and int columns
        df = dict(df_insurance_train.dtypes)
        hist_cols = [key for key in df.keys() if (df[key] == "int64" or df[key] == "float64")]
        ## pandas Styler needs to be returned or else it won't show
        corr1 = df_insurance_train[hist_cols].corr()
        # on pandas >= 2.0, replace set_precision(2) with format(precision=2)
        return corr1.style.background_gradient(cmap='coolwarm').set_precision(2)
    @staticmethod
    def plot_categorical_bar(df_insurance_train):
        '''
        Input: dataframe
        Bar plot for all columns that are not float or int.
        Keep the top ten categories, sorted high to low - this could be made a parameter.
        '''
        df = dict(df_insurance_train.dtypes)
        hist_cols = [key for key in df.keys() if (df[key] == "int64" or df[key] == "float64")]
        a = list(df_insurance_train.columns)
        b = hist_cols
        categorical_columns = list(set(a) - set(b))
        for col in categorical_columns:
            print(col)
            ## the output of value_counts is a pandas Series, so we can pass it directly to pd.DataFrame to get a df
            ## value_counts gives a multi-index, so reset it
            v_cont = pd.DataFrame(df_insurance_train[[col]].value_counts().reset_index())
            ## assign common column names
            v_cont.columns = ["feature", "count"]
            ## sort descending and limit to the top 10
            v_cont.sort_values("count", axis=0, ascending=False, inplace=True)
            v_cont = v_cont[0:10]
            ax = sns.barplot(x="feature", y="count", data=v_cont)
            ## reset index as iterrows() will iterate over the index
            v_cont.reset_index(inplace=True)
            for index, row in v_cont.iterrows():
                ax.text(row.name, row["count"], round(row["count"], 2), color='black', ha="center")
            ## dropped the pandas bar plot in favour of seaborn
            # ax = v_cont.plot.bar()
            plt.xticks(rotation=45)  ## rotate x labels by 45 degrees
            plt.title(col)
            plt.show()
            # v_cont.index = v_cont.feature
            # for index, value in enumerate(v_cont.count):
            #     plt.text(index, value, str(value))
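
# A minimal end-to-end sketch (not part of the original gist) showing how these helpers
# might be combined with CatBoost for regression. The CSV path, target column name and
# hyperparameters below are hypothetical placeholders.
if __name__ == "__main__":
    from catboost import CatBoostRegressor
    from sklearn.model_selection import train_test_split

    df = Utils.load_data("insurance.csv")        # hypothetical dataset path
    Utils.df_attributes(df)                      # quick look at the data
    df = Utils.get_dummbies(df)                  # one-hot encode categorical columns
    X = df.drop("charges", axis=1)               # hypothetical target column
    y = df["charges"]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    model = CatBoostRegressor(iterations=500, learning_rate=0.1, depth=6,
                              loss_function="RMSE", verbose=False)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print(Utils.reg_evaluation(y_test, y_pred))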