Created
June 3, 2019 16:13
-
-
Save ipreencekmr/e84edceafd2fd3b7c4f94267626097b5 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python
# coding: utf-8

# In[1]:
import numpy as np
import pandas as pd
import seaborn as sns

# In[2]:
# Load the 'mpg' example dataset through seaborn and show its dimensions.
mpg_df = sns.load_dataset('mpg')
mpg_df.shape

# In[3]:
# Drop these columns because the imputer doesn't work with categorical variables.
mpg_df.drop(['model_year', 'origin', 'name'], axis=1, inplace=True)
mpg_df.head()

# In[4]:
# Insert some null values so there is something to impute.
mpg_df.loc[3, 'cylinders'] = np.nan
mpg_df.loc[5, 'cylinders'] = np.nan
mpg_df.loc[30, 'displacement'] = np.nan
mpg_df.loc[31, 'displacement'] = np.nan
mpg_df.loc[40, 'displacement'] = np.nan

# In[5]:
# Check whether nulls exist (count of missing values per column).
mpg_df.isnull().sum()
# ### Rational Imputer
# - It works only with continuous independent variables that have strong linear relationships with one another.
# - For the target column (the one to be imputed), it looks for the other column that is most highly correlated with it.
# - It uses that column as the predictor, keeps only the rows where both the predictor and the target
#   are not null, and sorts those rows by the predictor.
# - It drops 15% of the rows from each end (top and bottom) of the sorted data to limit the influence of outliers.
# - It trains a linear model on the remaining rows, predicts the missing target values and imputes them.
# - It also keeps a log, so you can see which value was imputed at which index.
# - You can check the correlation between each predictor and its target using get_corr_info.
# In[6]: | |
from sklearn.linear_model import LinearRegression
class RationalImputer:
    """Impute missing numeric values with a one-predictor linear regression.

    For a target column, the imputer picks the other numeric column whose
    correlation with the target is strongest in absolute value, fits a
    ``LinearRegression`` on the rows where both columns are present (after
    trimming the extremes of the predictor), and replaces the target's
    missing entries with the model's predictions.

    The ``*_list`` attributes are logs that receive one entry per fitted
    model; ``get_corr_info`` combines them into a DataFrame. ``log_dict``
    maps each imputed column to a DataFrame of the values written, indexed
    by the row labels that were filled.
    """

    # Fractions of the predictor-sorted rows that bound the training slice:
    # rows below LOWER_TRIM and above UPPER_TRIM are left out to limit the
    # influence of outliers on the fitted line.
    LOWER_TRIM = 0.15
    UPPER_TRIM = 0.85

    def __init__(self):
        self.log_dict = dict()
        self.log_df = pd.DataFrame()
        self.corr_list = []
        self.predictor_list = []
        self.intercept_list = []
        self.coeff_list = []
        self.target_list = []

    def get_max_corr_column(self, for_corr_df, for_col):
        """Return the column most strongly correlated with ``for_col``.

        Parameters:
            for_corr_df: square correlation matrix (as from ``DataFrame.corr``).
            for_col: name of the target column.

        Returns:
            The label of the column, other than ``for_col``, with the largest
            absolute correlation to ``for_col``.

        Raises:
            ValueError: if no other column has a defined correlation.

        Side effect: appends the (signed) correlation to ``corr_list``.
        """
        # Remove the target itself by label instead of assuming it is the
        # single largest entry; a perfectly correlated column would tie at 1.0.
        candidates = for_corr_df[for_col].drop(labels=for_col, errors='ignore').dropna()
        if candidates.empty:
            raise ValueError(
                'No column with a defined correlation to %r was found' % (for_col,))
        # A strongly negative correlation predicts just as well as a positive
        # one in a linear model, so rank by magnitude.
        large_corr_col = candidates.abs().idxmax()
        self.corr_list.append(candidates[large_corr_col])  # To maintain log
        return large_corr_col

    def get_not_null_df(self, dataFrame, for_ind_col, for_dep_col):
        """Return the trimmed training rows for one predictor/target pair.

        Keeps only rows where both columns are non-null, sorts them by the
        predictor, resets the index (the old index becomes a column) and
        returns the slice between the LOWER_TRIM and UPPER_TRIM positions,
        inclusive of the upper position.
        """
        not_null_df = dataFrame.dropna(subset=[for_ind_col, for_dep_col])
        sorted_df = not_null_df.sort_values(by=for_ind_col).reset_index()
        total_len = len(sorted_df)
        # We do not want to lose more data than needed just to avoid outliers,
        # so the cut-offs sit at 15% and 85% rather than at the quartiles.
        start_index = int(total_len * self.LOWER_TRIM)
        end_index = int(total_len * self.UPPER_TRIM)
        return sorted_df.iloc[start_index:(end_index + 1)]  # +1: iloc end is exclusive

    def train_lin_reg_model(self, not_null_df, ind_col, dep_col):
        """Fit ``dep_col ~ ind_col`` and return the fitted LinearRegression.

        Side effect: appends the intercept and the single coefficient to
        ``intercept_list`` and ``coeff_list``.
        """
        ind_values_X = not_null_df[[ind_col]]  # 2-D input, as scikit-learn expects
        dep_values_y = not_null_df[dep_col]
        lin_model = LinearRegression()
        lin_model.fit(ind_values_X, dep_values_y)
        # To maintain log
        self.intercept_list.append(lin_model.intercept_)
        self.coeff_list.append(lin_model.coef_[0])
        return lin_model

    def impute_missing(self, dataFrame, for_col, inplace=False):
        """Fill the missing values of ``for_col`` with regression predictions.

        Parameters:
            dataFrame: the data to impute.
            for_col: name of the numeric column whose nulls should be filled.
            inplace: when True, modify ``dataFrame`` and return None.

        Returns:
            None when ``inplace`` is True. Otherwise a new, imputed DataFrame,
            or ``dataFrame`` itself when there was nothing that could be
            imputed.

        Rows where the chosen predictor is missing as well cannot be
        predicted and are left as null.
        """
        missing_mask = dataFrame[for_col].isnull()
        # Nothing to impute: return before fitting anything so that no
        # spurious entries end up in the logs.
        if not missing_mask.any():
            return None if inplace else dataFrame

        # Restrict to numeric columns; corr() raises on string columns in
        # recent pandas versions.
        correlation_df = dataFrame.select_dtypes(include=['number', 'bool']).corr()
        predictor_col = self.get_max_corr_column(for_corr_df=correlation_df, for_col=for_col)
        # To maintain log
        self.predictor_list.append(predictor_col)
        self.target_list.append(for_col)

        # Train on the rows where both predictor and target are present
        not_null_df = self.get_not_null_df(dataFrame, for_ind_col=predictor_col, for_dep_col=for_col)
        model = self.train_lin_reg_model(not_null_df=not_null_df, ind_col=predictor_col, dep_col=for_col)

        # Only rows that have a predictor value can be predicted; feeding
        # NaN to the model would raise.
        predictable_mask = missing_mask & dataFrame[predictor_col].notnull()
        x_values = dataFrame.loc[predictable_mask, [predictor_col]]
        if x_values.empty:
            return None if inplace else dataFrame

        predicted_values = model.predict(x_values)

        # Maintain log: imputed values keyed by the row labels they fill
        self.log_dict[for_col] = pd.Series(predicted_values, index=x_values.index).to_frame()

        # Assign through the boolean mask so the write is positional and
        # does not depend on index alignment.
        if inplace:
            dataFrame.loc[predictable_mask, for_col] = predicted_values
            return None
        imputed_df = dataFrame.copy()
        imputed_df.loc[predictable_mask, for_col] = predicted_values
        return imputed_df

    def impute_missing_in_dataset(self, dataset):
        """Impute, in place, every numeric column of ``dataset`` that has nulls.

        Non-numeric columns are skipped because they cannot take part in the
        correlation/regression step. Always returns None.
        """
        null_counts = dataset.select_dtypes(include=['number', 'bool']).isnull().sum()
        for col in null_counts[null_counts > 0].index:
            self.impute_missing(dataset, for_col=col, inplace=True)
        return None

    def print_log(self):
        """Print, per imputed column, the values written and their row labels."""
        for key, frame in self.log_dict.items():
            print('Column:', key)
            # The stored frame is relabelled in place and kept as log_df.
            frame.columns = ['Imputed']
            self.log_df = frame
            print(self.log_df, '\n')

    def get_corr_info(self):
        """Return a DataFrame with one row per fitted model.

        Columns: the correlation, predictor, intercept, coefficient and
        target column recorded for that fit.
        """
        corr_info_df = pd.DataFrame({'Correlation': self.corr_list,
                                     'Predictor': self.predictor_list,
                                     'Intercept': self.intercept_list,
                                     'Coefficient': self.coeff_list,
                                     'Target Column': self.target_list})
        return corr_info_df
# In[7]:
# Impute every column of mpg_df that has missing values, in place.
r_imputer = RationalImputer()
r_imputer.impute_missing_in_dataset(mpg_df)

# In[8]:
# Re-check the per-column null counts after imputation.
mpg_df.isnull().sum()

# In[9]:
# Show which value was imputed at which index, per column.
r_imputer.print_log()

# In[10]:
# Show the correlation, predictor and fitted line recorded for each target column.
r_imputer.get_corr_info()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment