Skip to content

Instantly share code, notes, and snippets.

@chrisdmell
Last active December 8, 2021 15:16
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
Star You must be signed in to star a gist
Save chrisdmell/c805de55a68f069ba4ae8fc2861d6d74 to your computer and use it in GitHub Desktop.
CatBoost Regression: EDA And Model Building Approach
class DataClean:
    """Namespace of static helpers for loading a CSV and handling missing values.

    Every method is a ``@staticmethod``; the class holds no state. The
    helpers rely on ``pd`` (pandas) being imported at module level.
    """

    @staticmethod
    def load_data(path):
        """
        Read a CSV file from a given path and return a Pandas DataFrame
        :param path: path to csv file (anything ``pd.read_csv`` accepts)
        :return: returns Pandas DataFrame
        """
        return pd.read_csv(path)

    @staticmethod
    def missing_percentage(df_insurance_train, other_dict=None):
        """
        Print and return a per-column summary of missing values.

        :param df_insurance_train: DataFrame to inspect; it is not modified
        :param other_dict: unused, kept only for backward compatibility
        :return: DataFrame with one row per input column and the columns
            ``features``, ``null_count``, ``total`` and ``missing_percent``
            (rounded to 2 decimals), sorted by ``missing_percent`` descending
        """
        missing_df = df_insurance_train.isnull().sum().reset_index()
        missing_df.columns = ["features", "null_count"]
        missing_df["total"] = len(df_insurance_train)
        missing_df["missing_percent"] = (
            missing_df["null_count"] / missing_df["total"] * 100
        ).round(2)
        missing_df = missing_df.sort_values("missing_percent", ascending=False)

        # to_markdown() needs the optional ``tabulate`` package; fall back to
        # the plain-text rendering so the summary still works without it.
        try:
            table = missing_df.to_markdown()
        except ImportError:
            table = missing_df.to_string()
        print(table)
        return missing_df

    @staticmethod
    def null_to_missing_cat(df_insurance_train, other_dict=None):
        """
        Replace nulls in the non-numeric columns with the label 'missing_value'.

        Columns whose dtype is ``int64`` or ``float64`` are treated as numeric
        and copied through untouched (their nulls are kept). Every other
        column is treated as categorical and has its null values (``None`` /
        ``np.nan``) replaced by the string ``'missing_value'``.

        The input frame is not modified. The returned frame has the
        categorical columns first and the numeric columns after them, each
        group in its original order, on a fresh 0..n-1 index. The missing
        value summary of the result is printed via ``missing_percentage``.

        :param df_insurance_train: input DataFrame
        :param other_dict: unused, kept only for backward compatibility
        :return: new DataFrame with the same number of rows and columns
        """
        numeric_columns = [
            column
            for column, dtype in df_insurance_train.dtypes.items()
            if dtype == "int64" or dtype == "float64"
        ]
        numeric_lookup = set(numeric_columns)
        # Keep the original column order; a set difference would shuffle it.
        categorical_columns = [
            column
            for column in df_insurance_train.columns
            if column not in numeric_lookup
        ]

        df_categorical = df_insurance_train[categorical_columns].fillna("missing_value")
        df_numeric = df_insurance_train[numeric_columns]

        # Reset the index on BOTH halves: concat(axis=1) aligns on the index,
        # so resetting only one side would create extra all-NaN rows whenever
        # the input does not already carry a default RangeIndex.
        result = pd.concat(
            [df_categorical.reset_index(drop=True), df_numeric.reset_index(drop=True)],
            axis=1,
        )
        DataClean.missing_percentage(result)
        return result

    @staticmethod
    def num_col_mean_impute(df_insurance_train, num_impute_dict, other_dict=None):
        """
        Fill nulls in the listed numeric columns with each column's own mean.

        inputs:
        df_insurance_train - train dataframe; it is updated in place and
            also returned
        num_impute_dict - mapping whose KEYS name the columns to impute, e.g.
            num_impute_dict = {"Property Age": ["Profession", "mean"],
                               "Income (USD)": ["Profession", "mean"],
                               "Dependents": ["", "mode"],
                               "Credit Score": ["Has Active Credit Card", "mean"],
                               "Loan Sanction Amount (USD)": ["", 0],
                               "Current Loan Expenses (USD)": ["Profession", "mean"]}
        other_dict - unused, kept only for backward compatibility

        NOTE: only the keys are used right now. The values are meant to
        describe a richer strategy ([group-by column, statistic]), e.g.
        {"Property Age": ["Profession", "mean"]} = impute property age with
        the mean property age of the same profession, on the business idea
        that people in the same profession look for similarly aged property.
        That is not implemented yet: every listed column is filled with its
        overall column mean regardless of the value given.

        The missing value summary of the result is printed via
        ``missing_percentage``.
        """
        print(num_impute_dict)
        # Iterate the mapping directly; only its keys (column names) matter.
        for column in num_impute_dict:
            print(column)
            # Assign the filled column back instead of calling
            # fillna(inplace=True) on df[column], which acts on a temporary
            # object and is not guaranteed to update the frame.
            column_mean = df_insurance_train[column].mean()
            df_insurance_train[column] = df_insurance_train[column].fillna(column_mean)
        DataClean.missing_percentage(df_insurance_train)
        return df_insurance_train
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment