Skip to content

Instantly share code, notes, and snippets.

Avatar

Chris Aloysius Dmello chrisdmell

View GitHub Profile
@chrisdmell
chrisdmell / dataclean.py
Last active Dec 8, 2021
CatBoost Regression: EDA And Model Building Approach
View dataclean.py
class DataClean:
@staticmethod
def load_data(path):
"""
Read a CSV file from a given path and return a Pandas DataFrame
:param path: path to csv file
:return: returns Pandas DataFrame
"""
@chrisdmell
chrisdmell / catboost.py
Created Dec 8, 2021
catboost Regression Model Class
View catboost.py
class catboost_regressor():
'''
'''
def __init__(self, param = []):
'''
'''
self._rfr = CatBoostRegressor(**params) ## kwargs loss_function='RMSE', iterations = 100
self._param = param
View lab_utils_cls.py
import statistics
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.utils.multiclass import unique_labels
from sklearn.metrics import confusion_matrix
import os
import tempfile
import numpy as np
View bus_price_clean.py
def clean_seat(x):
'''
input is a string object and not a list
'''
# a = [float(sing_price) for price in x for sing_price in price.split(",")]
# a = [sing_price for price in x for sing_price in price.split(",")]
# return sum(a)/len(a)
a = [float(price) for price in x.split(",")]
return sum(a)/len(a)
View bus_price_clean_002.py
bus_fare_df = bus_fare_df.sort_values(by = ["Bus","Service_Date","RecordedAt" ])
# display(bus_fare_df.head())
test = bus_fare_df[["Bus","Service_Date","RecordedAt","Seat_Fare_Type_1_average" ]].sort_values(by = ["Bus","Service_Date","RecordedAt" ])
test = test[["Bus","Service_Date","Seat_Fare_Type_1_average" ]]
test["Seat_Fare_Type_1_average_impute"] = test.groupby(["Bus","Service_Date" ]).transform(lambda x: x.replace(to_replace=0, method='ffill'))
display(test.shape)
display(bus_fare_df.shape)
test2 = bus_fare_df[["Bus","Service_Date","RecordedAt","Seat_Fare_Type_2_average" ]].sort_values(by = ["Bus","Service_Date","RecordedAt" ])
test2 = test2[["Bus","Service_Date","Seat_Fare_Type_2_average" ]]
View bus_price_clean_1.py
one_df_002 = pd.merge(one_df,final_cleaned_df[["Bus","Service_Date","RecordedAt", "average_price_s1_s2_filled"]], how = "left" , left_on = ["Bus","Service_Date","min"], right_on =["Bus","Service_Date","RecordedAt"],
suffixes=('_left', '_right'))
one_df_003 = pd.merge(one_df_002,final_cleaned_df[["Bus","Service_Date","RecordedAt", "average_price_s1_s2_filled"]], how = "left" , left_on = ["Bus","Service_Date","max"], right_on =["Bus","Service_Date","RecordedAt"],
suffixes=('_left', '_right'))
one_df_003["price_diff_i_f"] = one_df_003.average_price_s1_s2_filled_right - one_df_003.average_price_s1_s2_filled_left
one_df_003["price_diff_i_f_perc"] = one_df_003.price_diff_i_f / one_df_003.average_price_s1_s2_filled_left
one_df_004 = one_df_003[["Bus","Service_Date", "price_diff_i_f"]].drop_duplicates()
one_df_005 = one_df_003[["Bus","Service_Date", "price_diff_i_f_perc"]].drop_duplicates()
one_df_005.boxplot(column = ["price_diff_i_f_perc"])
one_df_004.price_diff_i_f.hist(
View bus_price_clean_2.py
groups = final_cleaned_df.groupby(["Bus","Service_Date_new"]).RecordedAt_new
min_val = groups.transform(min)
one_df = final_cleaned_df[(final_cleaned_df.RecordedAt_new==min_val) ]
one_df["date_diff"] = one_df.Service_Date_new - one_df.RecordedAt_new
figure(figsize=(10, 6), dpi=80)
plt.subplot(1, 2, 1)
plt.title("Date Difference Boxplot")
plt.boxplot(one_df.date_diff.astype('timedelta64[D]'))
plt.subplot(1, 2, 2)
plt.title("Date Difference Histogram")
View bus_price_clean_3.py
plot_1 = bus_fare_df[bus_fare_df["average_price_s1_s2_filled"] !=0].groupby(["RecordedAt_date_only"]).agg({"average_price_s1_s2_filled":np.mean})
figure(figsize=(10, 6), dpi=80)
plt.plot(plot_1.index, plot_1.average_price_s1_s2_filled,label = "Platform")
plot_2 = bus_fare_df[(bus_fare_df["average_price_s1_s2_filled"] !=0)&(bus_fare_df["Bus"] =="060c6d5595f3d7cf53838b0b8c66673d")].groupby(["RecordedAt_date_only"]).agg({"average_price_s1_s2_filled":np.mean})
plt.plot(plot_2.index, plot_2.average_price_s1_s2_filled, label = "060c6d5595f3d7cf53838b0b8c66673d")
plt.show()
View bus_price_clean_4.py
figure(figsize=(10, 6), dpi=80)
price_f = final_cleaned_df[final_cleaned_df["average_price_s1_s2_filled"] != 0]
plt.subplot(1, 2, 1)
plt.title("Boxplot - Average Price")
price_f = final_cleaned_df[final_cleaned_df["average_price_s1_s2_filled"] != 0]
plt.boxplot(price_f.average_price_s1_s2_filled)
plt.subplot(1, 2, 2)
plt.title("Histogram - Average Price")
View bus_price_clean_5.py
f = final_cleaned_df.copy()
b1 = f[(f["Bus"] == "a6951a59b64579edcf822ab9ea4c0c83") & (f["Service_Date"] == "15-07-2020 00:00")]
b2 = f[(f["Bus"] == "ab479dab4a9e6bc3eaefe77a09f027ed") & (f["Service_Date"] == "15-07-2020 00:00")]
recorded_dates_df = pd.concat([b1[["RecordedAt_new"]], b2[["RecordedAt_new"]]], axis = 0).drop_duplicates().sort_values(by = "RecordedAt_new").reset_index().drop(columns = "index")
joined_1 = pd.merge(recorded_dates_df, b1, on=["RecordedAt_new"], how='left',suffixes=('_actuals', '_B1'))
joined_df = pd.merge(joined_1, b2, on=["RecordedAt_new"], how='left',suffixes=('_B1', '_B2'))
joined_df