Skip to content

Instantly share code, notes, and snippets.

@ipreencekmr
Created June 3, 2019 16:13
Show Gist options
  • Save ipreencekmr/e84edceafd2fd3b7c4f94267626097b5 to your computer and use it in GitHub Desktop.
Save ipreencekmr/e84edceafd2fd3b7c4f94267626097b5 to your computer and use it in GitHub Desktop.
#!/usr/bin/env python
# coding: utf-8
# In[1]:
import numpy as np
import pandas as pd
import seaborn as sns
# In[2]:
# Load the "mpg" example dataset through seaborn.
mpg_df = sns.load_dataset('mpg')
mpg_df.shape
# In[3]:
# Drop these columns up front: per the original author's note, the imputer
# below does not work with categorical variables.
mpg_df.drop(columns=['model_year', 'origin', 'name'], inplace=True)
mpg_df.head()
# In[4]:
# Blank out a handful of known cells so there is something to impute.
mpg_df.loc[[3, 5], 'cylinders'] = np.nan
mpg_df.loc[[30, 31, 40], 'displacement'] = np.nan
# In[5]:
# Per-column count of missing values (expect 2 and 3 for the columns above).
mpg_df.isna().sum()
# ### Rational Imputer
# - It works only with continuous independent variables that have strong relationships among themselves.
# - For the target (to-be-imputed) variable, it looks for the column with the highest correlation value.
# - It then uses that highly correlated column as the predictor, keeps only the rows that are non-null
# in both the predictor and the target column, and sorts them by the predictor.
# - It removes 15% of the data from each end (top and bottom) so that very few outliers remain in the training data.
# - It trains a linear model on this data, predicts the target, and imputes the missing entries with the predicted values.
# - It also maintains a log, so you can see which value was imputed at which index.
# - You can also check the correlation between predictor and target using get_corr_info.
# In[6]:
from sklearn.linear_model import LinearRegression
class RationalImputer():
def __init__(self):
self.log_dict = dict()
self.log_df = pd.DataFrame()
self.corr_list = []
self.predictor_list = []
self.intercept_list = []
self.coeff_list = []
self.target_list = []
#getting maximum correlation value column
def get_max_corr_column(self, for_corr_df, for_col):
#not max which indicates interacting with self
large_df = for_corr_df[for_col].nlargest(2)
large_corr_col = large_df[large_df == large_df.min()].index[0]
self.corr_list.append(large_df.min())#To maintain log
return large_corr_col
#getting only those values from dataFrame which do not have
#null values in both these independent and dependent column
def get_not_null_df(self, dataFrame, for_ind_col, for_dep_col):
not_null_df = dataFrame[~dataFrame[for_ind_col].isnull()
& (~dataFrame[for_dep_col].isnull())]
#getting not_null_df sorted as we need to pick up range of df between q1 and q3
#we are doing so to avoid outliers to effect our model training
sorted_df = not_null_df.sort_values(by=for_ind_col)
sorted_df.reset_index(inplace=True)
total_len = len(sorted_df)
#We do not want to loose more data just to avoid outliers so we are setting
#First quartile at 15% and 3rd Quartile at 85%
start_index = int(total_len * 0.15) #Get First Quartile Index
end_index = int(total_len * 0.85) #Get Third Quartile Index
inter_quartile_df = sorted_df.iloc[start_index:(end_index+1)] #+1 as exluded
return inter_quartile_df
#training model with predictor(max_correlation_col) and target variable(to predict)
def train_lin_reg_model(self, not_null_df, ind_col, dep_col):
ind_values_X = not_null_df[[ind_col]]
dep_values_y = not_null_df[dep_col]
lin_model = LinearRegression()
lin_model.fit(ind_values_X, dep_values_y)
#To maintain log
self.intercept_list.append(lin_model.intercept_)
self.coeff_list.append(lin_model.coef_[0])
return lin_model
def impute_missing(self, dataFrame, for_col, inplace=False):
correlation_df = dataFrame.corr()
predictor_col = self.get_max_corr_column(for_corr_df=correlation_df, for_col=for_col)
#To maintain log
self.predictor_list.append(predictor_col)
self.target_list.append(for_col)
#get dataFrame having not null values
not_null_df = self.get_not_null_df(dataFrame, for_ind_col=predictor_col, for_dep_col=for_col)
#get linear model trained
model = self.train_lin_reg_model(not_null_df=not_null_df, ind_col=predictor_col, dep_col=for_col)
#Get dataFrame view for for col having null values
null_df = dataFrame[dataFrame[for_col].isnull()]
#get predictor values from independent column where dependent col is null
x_values = dataFrame[dataFrame[for_col].isnull()][[predictor_col]] #In dataFrame
#Return None in case if predictors not found and inplace = True,
#Return Same DataFrame if predictors not found and inplace = False
if(len(x_values) == 0):
if (inplace == False):
return dataFrame
else:
return None
#get predicted values of target column by feeding independent variable values
predicted_values = model.predict(x_values)
#convert predicted values to Series having same index as null_df has
#so that it won't provide warning while replacing values in DataFrame
predicted_values = pd.Series(predicted_values, index=null_df.index)
#Fill Nan Values with new predicted values only in null_df
null_filled_df = null_df[for_col].fillna(predicted_values)
#maintain log
log_df = predicted_values.to_frame() #converting series to dataframe
self.log_dict[for_col] = log_df
#Return None in case inplace set to True
if(inplace==True):
dataFrame.update(null_filled_df)
return None
#Return new dataFrame with updated values
imputed_df = dataFrame.copy()
imputed_df.update(null_filled_df)
return imputed_df
def impute_missing_in_dataset(self, dataset):
#getting only those features having null values
null_feature_df = dataset.isnull().sum()[(dataset.isnull().sum() > 0)]
for col in null_feature_df.index:
self.impute_missing(dataset, for_col=col, inplace=True)
return None
def print_log(self):
logs = self.log_dict
for key in logs.keys():
print('Column:',key)
self.log_df = logs[key]
self.log_df.columns = ['Imputed']
print(self.log_df,'\n')
def get_corr_info(self):
corr_info_df = pd.DataFrame({'Correlation':self.corr_list,
'Predictor':self.predictor_list,
'Intercept':self.intercept_list,
'Coefficient':self.coeff_list,
'Target Column':self.target_list})
return corr_info_df
# In[7]:
# Build an imputer and fill every column of mpg_df that has nulls, in place.
r_imputer = RationalImputer()
r_imputer.impute_missing_in_dataset(dataset=mpg_df)
# In[8]:
# Re-count the missing values per column after imputation.
mpg_df.isna().sum()
# In[9]:
# Show which value was written at which index, per column.
r_imputer.print_log()
# In[10]:
# Summary of predictor, correlation and fitted line for each imputed column.
r_imputer.get_corr_info()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment