@chrisdmell · Created December 8, 2021 15:17
catboost regression
import datetime
import os
import tempfile

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import metrics
from sklearn.metrics import confusion_matrix
from sklearn.utils.multiclass import unique_labels

class Utils:

    @staticmethod
    def load_data(path):
        """
        Read a CSV file from a given path and return a Pandas DataFrame.
        :param path: path to the CSV file
        :return: Pandas DataFrame
        """
        df = pd.read_csv(path)
        return df

    @staticmethod
    def plot_graphs(x_data, y_data, x_label, y_label, title):
        """
        Use Matplotlib to plot the provided data points with the given x-axis and y-axis labels.
        :param x_data: data for the x-axis
        :param y_data: data for the y-axis
        :param x_label: label for the x-axis
        :param y_label: label for the y-axis
        :param title: title for the plot
        :return: tuple (fig, ax)
        """
        plt.clf()
        fig, ax = plt.subplots()
        ax.plot(x_data, y_data)
        ax.set_xlabel(x_label)
        ax.set_ylabel(y_label)
        ax.set_title(title)
        return (fig, ax)

    @staticmethod
    def plot_residual_graphs(predictions, y_test, x_label, y_label, title):
        """
        Create a residual plot using the seaborn plotting library.
        https://seaborn.pydata.org/tutorial/regression.html
        :param predictions: predictions from the run
        :param y_test: actual labels
        :param x_label: name for the x-axis
        :param y_label: name for the y-axis
        :param title: title for the plot
        :return: tuple of (plt, fig, ax)
        """
        fig, ax = plt.subplots()
        # keyword arguments are required by newer seaborn versions
        sns.residplot(x=predictions, y=y_test, lowess=True, ax=ax)
        plt.xlabel(x_label)
        plt.ylabel(y_label)
        plt.title(title)
        return (plt, fig, ax)

    @staticmethod
    def get_mlflow_directory_path(*paths, create_dir=True):
        """
        Get the current running path where mlruns is created. This is the directory from which
        the python file containing MLflow code is executed. This method is used for artifacts,
        such as images, where we want to store plots.
        :param paths: list of directories below mlruns, e.g. experiment_id, mlflow_run_id
        :param create_dir: default is True
        :return: path to the directory
        """
        cwd = os.getcwd()
        dir_path = os.path.join(cwd, "mlruns", *paths)
        if create_dir and not os.path.exists(dir_path):
            # makedirs also creates any missing intermediate directories
            os.makedirs(dir_path, mode=0o755)
        return dir_path

    @staticmethod
    def get_temporary_directory_path(prefix, suffix):
        """
        Get a temporary file for artifacts.
        :param prefix: name prefix for the file
        :param suffix: .csv, .txt, .png etc.
        :return: tempfile.NamedTemporaryFile object
        """
        temp = tempfile.NamedTemporaryFile(prefix=prefix, suffix=suffix)
        return temp

    @staticmethod
    def print_pandas_dataset(d):
        """
        Given a Pandas DataFrame, print its dimensions and head.
        :param d: Pandas DataFrame
        :return: None
        """
        print("rows = %d; columns = %d" % (d.shape[0], d.shape[1]))
        print(d.head())

    @staticmethod
    def plot_confusion_matrix(y_true, y_pred, classes,
                              normalize=False,
                              title=None,
                              cmap=plt.cm.Blues):
        """
        This function prints and plots the confusion matrix.
        Normalization can be applied by setting `normalize=True`.
        Borrowed from the scikit-learn library documentation.
        :param y_true: the actual values of y
        :param y_pred: the predicted values of y
        :param classes: list of label classes to be predicted
        :param normalize: normalize the data
        :param title: title of the confusion matrix plot
        :param cmap: colormap for the plot
        :return: tuple of (plt, fig, ax)
        """
        if not title:
            if normalize:
                title = 'Normalized confusion matrix'
            else:
                title = 'Confusion matrix, without normalization'

        # Compute confusion matrix
        cm = confusion_matrix(y_true, y_pred)
        # Only use the labels that appear in the data
        classes = np.asarray(classes)[unique_labels(y_true, y_pred)]
        if normalize:
            cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
            print("Normalized confusion matrix")
        else:
            print('Confusion matrix, without normalization')
        print(cm)

        fig, ax = plt.subplots()
        im = ax.imshow(cm, interpolation='nearest', cmap=cmap)
        ax.figure.colorbar(im, ax=ax)
        # We want to show all ticks...
        ax.set(xticks=np.arange(cm.shape[1]),
               yticks=np.arange(cm.shape[0]),
               # ... and label them with the respective list entries
               xticklabels=classes, yticklabels=classes,
               title=title,
               ylabel='True label',
               xlabel='Predicted label')
        # Rotate the tick labels and set their alignment.
        plt.setp(ax.get_xticklabels(), rotation=45, ha="right",
                 rotation_mode="anchor")
        # Loop over data dimensions and create text annotations.
        fmt = '.2f' if normalize else 'd'
        thresh = cm.max() / 2.
        for i in range(cm.shape[0]):
            for j in range(cm.shape[1]):
                ax.text(j, i, format(cm[i, j], fmt),
                        ha="center", va="center",
                        color="white" if cm[i, j] > thresh else "black")
        fig.tight_layout()
        return (plt, fig, ax)

    @staticmethod
    def rmse(y_true, y_pred):
        """
        Use only for Keras models.
        """
        from keras import backend
        return backend.sqrt(backend.mean(backend.square(y_pred - y_true), axis=-1))

    @staticmethod
    def df_attributes(data_ins):
        '''
        Use pandas' built-in methods to give a quick overview of the data:
        1. info
        2. describe
        3. dtypes
        4. columns
        5. null check
        '''
        print("Dataframe Info")
        print(data_ins.info())
        print("===============================================================")
        print("Dataframe Describe Method")
        print(data_ins.describe())
        print("===============================================================")
        print("Dataframe Dtypes")
        print(data_ins.dtypes)
        print("===============================================================")
        print("Dataframe Columns")
        print(data_ins.columns)
        print("===============================================================")
        print("Dataframe Null Check")
        print(data_ins.isnull().sum())
        print("===============================================================")

    @staticmethod
    def hist_flt_int(dataset):
        '''
        Convert df.dtypes (a pandas Series) to a dict, then use a list comprehension
        to pick out the int and float columns.
        input  - dataframe
        output - histograms of the int and float columns
        '''
        ## TODO : Image size config
        df = dict(dataset.dtypes)
        hist_cols = [key for key in df.keys() if (df[key] == "int64" or df[key] == "float64")]
        fig = plt.figure(figsize=(15, 20))
        ax = fig.gca()
        return dataset[hist_cols].hist(ax=ax)

    # Earlier version, kept for reference: string columns cannot simply be added
    # together like this, so use apply(lambda x: "-".join(x)) instead (see below).
    # @staticmethod
    # def datetime_from_Y_M_D(dataset, ymd_dict):
    #     dataset["str_year"] = dataset[ymd_dict["year_col"]].astype(str) + '-' + dataset[ymd_dict["month_col"]].astype(str) \
    #                           + '-' + dataset[ymd_dict["day_col"]].astype(str)
    #     dataset["date"] = datetime.datetime.strptime(dataset.str_year, '%Y-%m-%d')
    #     dataset.drop(["str_year"], axis=1, inplace=True)
    #     return dataset

    @staticmethod
    def datetime_from_Y_M_D(df, ymd_dict):
        '''
        Given a dataframe and a dict naming its year, month and day columns,
        build a single datetime column. Concatenating the columns directly does
        not work, so join them with apply(lambda x: "-".join(x)) and parse the result.
        :param df: pandas dataframe
        :param ymd_dict: dict with keys "year_col", "month_col", "day_col"
        :return: dataframe with an added "date" column
        '''
        year_col = ymd_dict["year_col"]
        month_col = ymd_dict["month_col"]
        day_col = ymd_dict["day_col"]
        df[year_col] = df[year_col].astype(str)
        df[month_col] = df[month_col].astype(str)
        df[day_col] = df[day_col].astype(str)
        df['date'] = df[[year_col, month_col, day_col]].apply(lambda x: '-'.join(x), axis=1)
        df['date'] = df['date'].apply(lambda x: datetime.datetime.strptime(x, '%Y-%m-%d'))
        return df

    @staticmethod
    def reg_evaluation(y_test, y_pred):
        '''
        Compute regression evaluation metrics for predictions against actuals.
        :param y_test: actual labels
        :param y_pred: predicted labels
        :return: dict with mae, mse, rmse and r2_test
        '''
        mae = metrics.mean_absolute_error(y_test, y_pred)
        mse = metrics.mean_squared_error(y_test, y_pred)
        rmse = np.sqrt(mse)
        r2 = metrics.r2_score(y_test, y_pred)
        reg_eval_dict = {}
        reg_eval_dict["mae"] = mae
        reg_eval_dict["mse"] = mse
        reg_eval_dict["rmse"] = rmse
        reg_eval_dict["r2_test"] = r2
        return reg_eval_dict

    @staticmethod
    def get_dummies(features):
        '''
        One-hot encode the categorical columns of a dataframe with pd.get_dummies.
        :param features: pandas dataframe
        :return: dataframe with dummy columns
        '''
        features = pd.get_dummies(features)
        print(features.head(5))
        return features

    @staticmethod
    def plot_corr(df_insurance_train):
        '''
        Given a dataframe, plot a correlation heatmap of its numeric (int/float) columns.
        :param df_insurance_train: pandas dataframe
        :return: styled correlation matrix
        '''
        ## filter float and int columns
        df = dict(df_insurance_train.dtypes)
        hist_cols = [key for key in df.keys() if (df[key] == "int64" or df[key] == "float64")]
        ## plot seaborn heatmap
        f, ax = plt.subplots(figsize=(10, 8))
        corr = df_insurance_train[hist_cols].corr()
        sns.heatmap(corr, mask=np.zeros_like(corr, dtype=bool),
                    cmap=sns.diverging_palette(240, 10, as_cmap=True),
                    square=True, ax=ax)
        ## styled pandas correlation table (Styler.format replaces the deprecated set_precision)
        corr1 = df_insurance_train[hist_cols].corr()
        return corr1.style.background_gradient(cmap='coolwarm').format(precision=2)

    @staticmethod
    def plot_corr_pandas(df_insurance_train):
        '''
        Given a dataframe, build a styled correlation table of its numeric columns.
        The pandas Styler object must be returned for it to render in a notebook.
        :param df_insurance_train: pandas dataframe
        :return: styled correlation matrix
        '''
        ## filter float and int columns
        df = dict(df_insurance_train.dtypes)
        hist_cols = [key for key in df.keys() if (df[key] == "int64" or df[key] == "float64")]
        ## styled pandas correlation table (Styler.format replaces the deprecated set_precision)
        corr1 = df_insurance_train[hist_cols].corr()
        return corr1.style.background_gradient(cmap='coolwarm').format(precision=2)

    @staticmethod
    def plot_categorical_bar(df_insurance_train):
        '''
        Given a dataframe, draw a bar plot for every column that is not float or int.
        Each plot keeps the top ten categories, sorted high to low (this could be made a variable).
        :param df_insurance_train: pandas dataframe
        '''
        df = dict(df_insurance_train.dtypes)
        hist_cols = [key for key in df.keys() if (df[key] == "int64" or df[key] == "float64")]
        a = list(df_insurance_train.columns)
        b = hist_cols
        categorical_columns = list(set(a) - set(b))
        for col in categorical_columns:
            print(col)
            ## value_counts returns a pandas Series, which we pass to pd.DataFrame to get a df
            ## value_counts gives a multi index, so reset it
            v_cont = pd.DataFrame(df_insurance_train[[col]].value_counts().reset_index())
            ## assign common column names
            v_cont.columns = ["feature", "count"]
            ## sort descending and limit to top 10
            v_cont.sort_values("count", axis=0, ascending=False, inplace=True)
            v_cont = v_cont[0:10]
            ax = sns.barplot(x="feature", y="count", data=v_cont)
            ## reset index as iterrows() will iterate over the index
            v_cont.reset_index(inplace=True)
            for index, row in v_cont.iterrows():
                ax.text(row.name, row["count"], round(row["count"], 2), color='black', ha="center")
            ## dropping pandas bar plot
            # ax = v_cont.plot.bar()
            plt.xticks(rotation=45)  ## rotate x labels by 45 degrees
            plt.title(col)
            plt.show()
            # v_cont.index = v_cont.feature
            # for index, value in enumerate(v_cont.count):
            #     plt.text(index, value, str(value))
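

# Example usage: a minimal sketch (not part of the original gist) of how this Utils class
# could support a CatBoost regression run, matching the gist title. The file path
# ("insurance.csv"), the target column ("charges"), and the CatBoost hyperparameters
# below are assumptions chosen purely for illustration.
if __name__ == "__main__":
    from catboost import CatBoostRegressor  # assumes catboost is installed
    from sklearn.model_selection import train_test_split

    # hypothetical dataset and target column
    data = Utils.load_data("insurance.csv")
    Utils.df_attributes(data)

    features = pd.get_dummies(data.drop("charges", axis=1))  # one-hot encode categoricals
    target = data["charges"]
    X_train, X_test, y_train, y_test = train_test_split(
        features, target, test_size=0.2, random_state=42)

    # small, fast configuration purely for illustration
    model = CatBoostRegressor(iterations=200, learning_rate=0.1, depth=6, verbose=0)
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)

    # evaluate and plot residuals with the helpers above
    print(Utils.reg_evaluation(y_test, predictions))
    Utils.plot_residual_graphs(predictions, y_test,
                               "predicted charges", "residuals", "CatBoost residual plot")
    plt.show()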