catboost regression
import datetime
import os
import statistics
import tempfile

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import metrics
from sklearn.metrics import confusion_matrix
from sklearn.utils.multiclass import unique_labels
class Utils:
    @staticmethod
    def load_data(path):
        """
        Read a CSV file from a given path and return a Pandas DataFrame
        :param path: path to csv file
        :return: returns Pandas DataFrame
        """
        df = pd.read_csv(path)
        return df
    @staticmethod
    def plot_graphs(x_data, y_data, x_label, y_label, title):
        """
        Use matplotlib to plot the data points provided, with the respective x-axis and y-axis labels
        :param x_data: data for x-axis
        :param y_data: data for y-axis
        :param x_label: label for x-axis
        :param y_label: label for y-axis
        :param title: title for the plot
        :return: tuple (fig, ax)
        """
        plt.clf()
        fig, ax = plt.subplots()
        ax.plot(x_data, y_data)
        ax.set_xlabel(x_label)
        ax.set_ylabel(y_label)
        ax.set_title(title)
        return (fig, ax)
    @staticmethod
    def plot_residual_graphs(predictions, y_test, x_label, y_label, title):
        """
        Create a residual plot using the seaborn plotting library
        https://seaborn.pydata.org/tutorial/regression.html
        :param predictions: predictions from the run
        :param y_test: actual labels
        :param x_label: name for the x-axis
        :param y_label: name for the y-axis
        :param title: title for the plot
        :return: tuple of (plt, fig, ax)
        """
        fig, ax = plt.subplots()
        # newer seaborn versions require keyword arguments for x and y
        sns.residplot(x=predictions, y=y_test, lowess=True, ax=ax)
        plt.xlabel(x_label)
        plt.ylabel(y_label)
        plt.title(title)
        return (plt, fig, ax)
    @staticmethod
    def get_mlflow_directory_path(*paths, create_dir=True):
        """
        Get the current running path where mlruns is created. This is the directory from which
        the python file containing MLflow code is executed. This method is used for artifacts, such
        as images, where we want to store plots.
        :param paths: list of directories below mlruns, experimentID, mlflow_run_id
        :param create_dir: default is True
        :return: path to directory
        """
        cwd = os.getcwd()
        dir = os.path.join(cwd, "mlruns", *paths)
        if create_dir:
            if not os.path.exists(dir):
                # makedirs creates intermediate directories too, e.g. mlruns/<experiment_id>/<run_id>
                os.makedirs(dir, mode=0o755)
        return dir
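    # Example usage (hypothetical experiment and run IDs, for illustration only):
    #   image_dir = Utils.get_mlflow_directory_path("0", "<run_id>", "images")
    #   fig.savefig(os.path.join(image_dir, "residuals.png"))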
    @staticmethod
    def get_temporary_directory_path(prefix, suffix):
        """
        Get a temporary directory and files for artifacts
        :param prefix: name of the file
        :param suffix: .csv, .txt, .png etc
        :return: object to tempfile
        """
        temp = tempfile.NamedTemporaryFile(prefix=prefix, suffix=suffix)
        return temp
    @staticmethod
    def print_pandas_dataset(d):
        """
        Given a Pandas DataFrame, show its dimensions and first few rows
        :param d: Pandas DataFrame
        :return: None
        """
        print("rows = %d; columns = %d" % (d.shape[0], d.shape[1]))
        print(d.head())
    @staticmethod
    def plot_confusion_matrix(y_true, y_pred, classes,
                              normalize=False,
                              title=None,
                              cmap=plt.cm.Blues):
        """
        This function prints and plots the confusion matrix.
        Normalization can be applied by setting `normalize=True`.
        Borrowed from the scikit-learn library documentation
        :param y_true: the actual values of y
        :param y_pred: the predicted values of y
        :param classes: list of label classes to be predicted
        :param normalize: normalize the data
        :param title: title of the plot for the confusion matrix
        :param cmap: color map for the plot
        :return: returns a tuple of (plt, fig, ax)
        """
        if not title:
            if normalize:
                title = 'Normalized confusion matrix'
            else:
                title = 'Confusion matrix, without normalization'
        # Compute confusion matrix
        cm = confusion_matrix(y_true, y_pred)
        # Only use the labels that appear in the data; classes must be an array for fancy indexing
        classes = np.asarray(classes)[unique_labels(y_true, y_pred)]
        if normalize:
            cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
            print("Normalized confusion matrix")
        else:
            print('Confusion matrix, without normalization')
        print(cm)
        fig, ax = plt.subplots()
        im = ax.imshow(cm, interpolation='nearest', cmap=cmap)
        ax.figure.colorbar(im, ax=ax)
        # We want to show all ticks...
        ax.set(xticks=np.arange(cm.shape[1]),
               yticks=np.arange(cm.shape[0]),
               # ... and label them with the respective list entries
               xticklabels=classes, yticklabels=classes,
               title=title,
               ylabel='True label',
               xlabel='Predicted label')
        # Rotate the tick labels and set their alignment.
        plt.setp(ax.get_xticklabels(), rotation=45, ha="right",
                 rotation_mode="anchor")
        # Loop over data dimensions and create text annotations.
        fmt = '.2f' if normalize else 'd'
        thresh = cm.max() / 2.
        for i in range(cm.shape[0]):
            for j in range(cm.shape[1]):
                ax.text(j, i, format(cm[i, j], fmt),
                        ha="center", va="center",
                        color="white" if cm[i, j] > thresh else "black")
        fig.tight_layout()
        return (plt, fig, ax)
    @staticmethod
    def rmse(y_true, y_pred):
        """
        Use only for Keras models
        """
        from keras import backend
        return backend.sqrt(backend.mean(backend.square(y_pred - y_true), axis=-1))
    @staticmethod
    def df_attributes(data_ins):
        '''
        Use pandas' built-in methods to give a gist of the data:
        1. info
        2. describe
        3. dtypes
        4. columns
        5. null check
        '''
        print("Dataframe Info")
        print(data_ins.info())
        print("===============================================================")
        print("Dataframe Describe Method")
        print(data_ins.describe())
        print("===============================================================")
        print("Dataframe Dtype")
        print(data_ins.dtypes)
        print("===============================================================")
        print("Dataframe Columns")
        print(data_ins.columns)
        print("===============================================================")
        print("Dataframe Check if null")
        print(data_ins.isnull().sum())
        print("===============================================================")
    @staticmethod
    def hist_flt_int(dataset):
        '''
        From df.dtypes (a pandas Series), build a dict and use a list comprehension to get the
        names of the int and float columns.
        input - dataframe
        output - histograms of the int and float columns
        '''
        ## TODO : Image size config
        df = dict(dataset.dtypes)
        hist_cols = [key for key in df.keys() if (df[key] == "int64" or df[key] == "float64")]
        fig = plt.figure(figsize=(15, 20))
        ax = fig.gca()
        return dataset[hist_cols].hist(ax=ax)
    # @staticmethod
    # def datetime_from_Y_M_D(dataset, ymd_dict):
    #     '''
    #     Given a dataset and 3 columns, build a datetime column.
    #     You cannot concatenate multiple string columns like this, so this version does not work.
    #     Do an apply(lambda x: "-".join(x)) instead -- see the active version below.
    #     '''
    #     dataset["str_year"] = dataset[ymd_dict["year_col"]].astype(str) + '-' + dataset[ymd_dict["month_col"]].astype(str)\
    #         + '-' + dataset[ymd_dict["day_col"]].astype(str)
    #     dataset["date"] = datetime.datetime.strptime(dataset.str_year, '%Y-%m-%d')
    #     dataset.drop(["str_year"], axis=1, inplace=True)
    #     return dataset
    @staticmethod
    def datetime_from_Y_M_D(df, ymd_dict):
        '''
        Given a dataframe and a dict mapping "year_col", "month_col" and "day_col" to column names,
        build a datetime "date" column. String columns cannot simply be added together, so the
        year/month/day columns are joined with apply(lambda x: "-".join(x)) instead.
        '''
        year_col = ymd_dict["year_col"]
        month_col = ymd_dict["month_col"]
        day_col = ymd_dict["day_col"]
        df[year_col] = df[year_col].astype(str)
        df[month_col] = df[month_col].astype(str)
        df[day_col] = df[day_col].astype(str)
        df['date'] = df[[year_col, month_col, day_col]].apply(lambda x: '-'.join(x), axis=1)
        df["date"] = df['date'].apply(lambda x: datetime.datetime.strptime(x, '%Y-%m-%d'))
        return df
    @staticmethod
    def reg_evaluation(y_test, y_pred):
        '''
        Compute common regression metrics (MAE, MSE, RMSE, R2) for the given
        true and predicted values and return them as a dict.
        '''
        mae = metrics.mean_absolute_error(y_test, y_pred)
        mse = metrics.mean_squared_error(y_test, y_pred)
        rmse = np.sqrt(mse)
        r2 = metrics.r2_score(y_test, y_pred)
        reg_eval_dict = {}
        reg_eval_dict["mae"] = mae
        reg_eval_dict["mse"] = mse
        reg_eval_dict["rmse"] = rmse
        reg_eval_dict["r2_test"] = r2
        return reg_eval_dict
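    # Example (hypothetical values, for illustration only):
    #   Utils.reg_evaluation([3.0, 2.5, 4.1], [2.8, 2.6, 4.0])
    #   -> {"mae": ..., "mse": ..., "rmse": ..., "r2_test": ...}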
    @staticmethod
    def get_dummbies(features):
        '''
        One-hot encode the categorical columns of the given dataframe with pd.get_dummies
        and return the encoded dataframe.
        '''
        features = pd.get_dummies(features)
        print(features.head(5))
        return features
    @staticmethod
    def plot_corr(df_insurance_train):
        '''
        Given a dataframe, plot its correlation matrix
        df_insurance_train - pandas dataframe
        '''
        ## filter float and int columns
        df = dict(df_insurance_train.dtypes)
        hist_cols = [key for key in df.keys() if (df[key] == "int64" or df[key] == "float64")]
        ## plot seaborn heatmap
        f, ax = plt.subplots(figsize=(10, 8))
        corr = df_insurance_train[hist_cols].corr()
        # np.bool is removed in newer NumPy versions; use the builtin bool instead
        sns.heatmap(corr, mask=np.zeros_like(corr, dtype=bool), cmap=sns.diverging_palette(240, 10, as_cmap=True),
                    square=True, ax=ax)
        ## return pandas Styler version as well
        corr1 = df_insurance_train[hist_cols].corr()
        return corr1.style.background_gradient(cmap='coolwarm').set_precision(2)
    @staticmethod
    def plot_corr_pandas(df_insurance_train):
        '''
        Given a dataframe, build a styled correlation matrix
        df_insurance_train - pandas dataframe
        The pandas Styler needs to be returned for it to render (e.g. in a notebook)
        '''
        ## filter float and int columns
        df = dict(df_insurance_train.dtypes)
        hist_cols = [key for key in df.keys() if (df[key] == "int64" or df[key] == "float64")]
        ## pandas Styler needs to be returned or else it won't show
        corr1 = df_insurance_train[hist_cols].corr()
        # on pandas >= 2.0, replace set_precision(2) with format(precision=2)
        return corr1.style.background_gradient(cmap='coolwarm').set_precision(2)
    @staticmethod
    def plot_categorical_bar(df_insurance_train):
        '''
        Input: dataframe
        Bar plot for all columns that are not float or int.
        Keep the top ten categories, sorted high to low - this could be made a parameter.
        '''
        df = dict(df_insurance_train.dtypes)
        hist_cols = [key for key in df.keys() if (df[key] == "int64" or df[key] == "float64")]
        a = list(df_insurance_train.columns)
        b = hist_cols
        categorical_columns = list(set(a) - set(b))
        for col in categorical_columns:
            print(col)
            ## the output of value_counts is a pandas Series, so we can pass it directly to pd.DataFrame to get a df
            ## value_counts gives a multi-index, so reset it
            v_cont = pd.DataFrame(df_insurance_train[[col]].value_counts().reset_index())
            ## assign common column names
            v_cont.columns = ["feature", "count"]
            ## sort descending and limit to the top 10
            v_cont.sort_values("count", axis=0, ascending=False, inplace=True)
            v_cont = v_cont[0:10]
            ax = sns.barplot(x="feature", y="count", data=v_cont)
            ## reset index as iterrows() will iterate over the index
            v_cont.reset_index(inplace=True)
            for index, row in v_cont.iterrows():
                ax.text(row.name, row["count"], round(row["count"], 2), color='black', ha="center")
            ## dropped the pandas bar plot in favour of seaborn
            # ax = v_cont.plot.bar()
            plt.xticks(rotation=45)  ## rotate x labels by 45 degrees
            plt.title(col)
            plt.show()
            # v_cont.index = v_cont.feature
            # for index, value in enumerate(v_cont.count):
            #     plt.text(index, value, str(value))
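
# A minimal end-to-end sketch (not part of the original gist) showing how these helpers
# might be combined with CatBoost for regression. The CSV path, target column name and
# hyperparameters below are hypothetical placeholders.
if __name__ == "__main__":
    from catboost import CatBoostRegressor
    from sklearn.model_selection import train_test_split

    df = Utils.load_data("insurance.csv")        # hypothetical dataset path
    Utils.df_attributes(df)                      # quick look at the data
    df = Utils.get_dummbies(df)                  # one-hot encode categorical columns
    X = df.drop("charges", axis=1)               # hypothetical target column
    y = df["charges"]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    model = CatBoostRegressor(iterations=500, learning_rate=0.1, depth=6,
                              loss_function="RMSE", verbose=False)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print(Utils.reg_evaluation(y_test, y_pred))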