Last active
December 8, 2021 15:16
Star
You must be signed in to star a gist
CatBoost Regression: EDA And Model Building Approach
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
class DataClean:
    """Static helpers for loading a CSV and handling missing values."""

    @staticmethod
    def load_data(path):
        """
        Read a CSV file from a given path and return a Pandas DataFrame.

        :param path: path to csv file
        :return: Pandas DataFrame with the file contents
        """
        return pd.read_csv(path)

    @staticmethod
    def missing_percentage(df_insurance_train, other_dict=None):
        """
        Print and return a per-column report of missing values.

        :param df_insurance_train: DataFrame to inspect (not modified)
        :param other_dict: unused; kept for interface compatibility
        :return: DataFrame with columns ``features``, ``null_count``,
            ``total`` and ``missing_percent``, sorted by ``missing_percent``
            in descending order
        """
        missing_df = df_insurance_train.isnull().sum().reset_index()
        missing_df["total"] = len(df_insurance_train)
        missing_df.columns = ["features", "null_count", "total"]
        missing_df["missing_percent"] = (
            missing_df["null_count"] / missing_df["total"] * 100
        ).round(2)
        missing_df.sort_values("missing_percent", ascending=False, inplace=True)
        try:
            report = missing_df.to_markdown()
        except ImportError:
            # to_markdown() needs the optional `tabulate` package; fall back
            # to the plain-text rendering instead of failing the whole call.
            report = missing_df.to_string()
        print(report)
        return missing_df

    @staticmethod
    def null_to_missing_cat(df_insurance_train, other_dict=None):
        """
        Replace nulls in the non-numeric columns with the class 'missing_value'.

        Numeric columns are passed through untouched (their nulls remain).
        The input frame is not modified. The result has the same shape as the
        input, with the non-numeric columns first and the numeric columns
        after them (each group in its original order) and a fresh 0..n-1
        index. A missing-value report for the result is printed.

        :param df_insurance_train: DataFrame that may contain NaN / None
        :param other_dict: unused; kept for interface compatibility
        :return: new DataFrame with nulls in non-numeric columns filled
        """
        numeric_columns = list(
            df_insurance_train.select_dtypes(include="number").columns
        )
        numeric_lookup = set(numeric_columns)
        # Keep the original column order; a set difference would make the
        # output column order depend on string hashing.
        categorical_columns = [
            col for col in df_insurance_train.columns if col not in numeric_lookup
        ]

        # fillna() handles both np.nan and None and returns a new frame.
        df_categorical = df_insurance_train[categorical_columns].fillna(
            "missing_value"
        )
        df_numeric = df_insurance_train[numeric_columns]

        # Reset the index on BOTH halves so concat aligns row-by-row even
        # when the input carries a non-default index.
        df_insurance_train = pd.concat(
            [
                df_categorical.reset_index(drop=True),
                df_numeric.reset_index(drop=True),
            ],
            axis=1,
        )
        DataClean.missing_percentage(df_insurance_train)
        return df_insurance_train

    @staticmethod
    def num_col_mean_impute(df_insurance_train, num_impute_dict, other_dict=None):
        """
        Fill nulls in the listed numeric columns with each column's mean.

        Example of the expected mapping::

            num_impute_dict = {
                "Property Age": ["Profession", "mean"],
                "Income (USD)": ["Profession", "mean"],
                "Dependents": ["", "mode"],
                "Credit Score": ["Has Active Credit Card", "mean"],
                "Loan Sanction Amount (USD)": ["", 0],
                "Current Loan Expenses (USD)": ["Profession", "mean"],
            }

        The intended meaning of ``{"Property Age": ["Profession", "mean"]}``
        is "impute property age with the mean property age of the same
        profession". NOTE: that is not implemented yet. Only the KEYS of
        ``num_impute_dict`` are used; the group column and strategy values
        are ignored and every listed column is filled with its overall mean.

        The passed frame is updated in place and also returned. A
        missing-value report is printed afterwards.

        :param df_insurance_train: DataFrame to impute (modified in place)
        :param num_impute_dict: mapping whose keys are the columns to impute
        :param other_dict: unused; kept for interface compatibility
        :return: the same DataFrame with the listed columns imputed
        :raises KeyError: if a key of ``num_impute_dict`` is not a column
        """
        print(num_impute_dict)
        for col in num_impute_dict:
            print(col)
            # Assign the filled column back explicitly: a chained
            # df[col].fillna(..., inplace=True) works on a temporary and
            # does not update the frame under pandas Copy-on-Write.
            df_insurance_train[col] = df_insurance_train[col].fillna(
                df_insurance_train[col].mean()
            )
        DataClean.missing_percentage(df_insurance_train)
        return df_insurance_train
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment