ipreencekmr/Rational_Imputer.py

## Rational_Imputer.py
#!/usr/bin/env python
# coding: utf-8

# In[1]:


import numpy as np
import pandas as pd
import seaborn as sns


# In[2]:


# Load seaborn's example "mpg" dataset; the bare expression shows its shape
# when this is run as a notebook cell.
mpg_df = sns.load_dataset('mpg')
mpg_df.shape


# In[3]:


# Drop these columns because the imputer does not work with categorical
# variables.
mpg_df.drop(columns=['model_year', 'origin', 'name'], inplace=True)
mpg_df.head()


# In[4]:


# Blank out a handful of cells so that there is something to impute.
for _column, _row_labels in (('cylinders', (3, 5)),
                             ('displacement', (30, 31, 40))):
    for _row_label in _row_labels:
        mpg_df.loc[_row_label, _column] = np.nan


# In[5]:


# Count the missing values per column.
mpg_df.isnull().sum()


# ### Rational Imputer
# - It works only with continuous variables that are strongly related to one another.
# - For the column to be imputed (the target), it looks up the other column with the highest correlation to it.
# - That column becomes the predictor; only rows where both the predictor and the target are non-null are kept,
#   and they are sorted by the predictor.
# - 15% of the sorted rows are dropped from each end (top and bottom) so that few outliers remain in the training data.
# - A linear model is trained on the remaining rows, and its predictions are used to fill the missing target values.
# - It keeps a log showing which value was imputed at which index (see print_log).
# - get_corr_info shows the correlation between each predictor and its target.

# In[6]:


from sklearn.linear_model import LinearRegression

class RationalImputer():
    """Fill missing values in a numeric column with a one-feature linear model.

    For a target column the imputer

    1. picks the other numeric column whose correlation with the target has
       the largest magnitude (the *predictor*),
    2. keeps the rows where both columns are present, sorts them by the
       predictor and trims ``trim_fraction`` of the rows from each end to
       limit the influence of outliers,
    3. fits ``LinearRegression`` on the remaining rows, and
    4. replaces the missing target values with the model's predictions.

    Log attributes filled by the methods below: ``corr_list``,
    ``predictor_list``, ``intercept_list``, ``coeff_list`` and
    ``target_list`` (summarised by :meth:`get_corr_info`), and ``log_dict``,
    which maps each imputed column to a DataFrame of the values written.

    ``LinearRegression`` is looked up as a global name, so it must be
    imported in the module that defines this class.
    """

    def __init__(self, trim_fraction=0.15):
        """Create an imputer with empty logs.

        Args:
            trim_fraction: Share of the sorted training rows discarded from
                each end before fitting. Must satisfy ``0 <= x < 0.5``.

        Raises:
            ValueError: If ``trim_fraction`` is outside ``[0, 0.5)``.
        """
        if not 0 <= trim_fraction < 0.5:
            raise ValueError('trim_fraction must be in the range [0, 0.5)')
        self.trim_fraction = trim_fraction

        self.log_dict = dict()
        self.log_df = pd.DataFrame()
        self.corr_list = []
        self.predictor_list = []
        self.intercept_list = []
        self.coeff_list = []
        self.target_list = []

    def get_max_corr_column(self, for_corr_df, for_col):
        """Return the column most strongly correlated with ``for_col``.

        The signed correlation of the chosen column is appended to
        ``corr_list``.

        Args:
            for_corr_df: Square correlation matrix (e.g. ``DataFrame.corr()``).
            for_col: Label of the target column inside ``for_corr_df``.

        Returns:
            The label of the selected predictor column.

        Raises:
            KeyError: If ``for_col`` is not a column of ``for_corr_df``.
            ValueError: If no other column has a defined correlation.
        """
        # Remove the target's own entry by label rather than assuming that the
        # self-correlation is the single largest value; undefined (NaN)
        # correlations cannot be ranked and are dropped as well.
        candidates = for_corr_df[for_col].drop(labels=for_col, errors='ignore').dropna()
        if candidates.empty:
            raise ValueError(
                'No other column with a defined correlation to %r' % (for_col,))

        # Rank by magnitude: a strong negative relationship is just as useful
        # to a linear model as a strong positive one.
        large_corr_col = candidates.abs().idxmax()

        self.corr_list.append(candidates.loc[large_corr_col])  # To maintain log
        return large_corr_col

    def get_not_null_df(self, dataFrame, for_ind_col, for_dep_col):
        """Return the trimmed training rows for one predictor/target pair.

        Rows with a null in either column are discarded, the rest are sorted
        by ``for_ind_col`` and ``trim_fraction`` of the rows is cut from each
        end. ``reset_index`` moves the former row labels into a regular
        column of the result.

        Args:
            dataFrame: Source data.
            for_ind_col: Predictor (independent) column label.
            for_dep_col: Target (dependent) column label.

        Returns:
            The central slice of the sorted, complete rows.
        """
        both_present = (dataFrame[for_ind_col].notnull()
                        & dataFrame[for_dep_col].notnull())
        sorted_df = dataFrame[both_present].sort_values(by=for_ind_col).reset_index()

        total_len = len(sorted_df)

        # Cut the same share from both ends; with the default of 0.15 the
        # positions from the 15% mark up to and including the 85% mark stay.
        start_index = int(total_len * self.trim_fraction)
        end_index = int(total_len * (1 - self.trim_fraction))

        return sorted_df.iloc[start_index:(end_index + 1)]  # +1: end is exclusive

    def train_lin_reg_model(self, not_null_df, ind_col, dep_col):
        """Fit ``dep_col ~ ind_col`` and log the intercept and coefficient.

        Args:
            not_null_df: Training rows without nulls in either column.
            ind_col: Predictor column label.
            dep_col: Target column label.

        Returns:
            The fitted ``LinearRegression`` instance.
        """
        ind_values_X = not_null_df[[ind_col]]
        dep_values_y = not_null_df[dep_col]
        lin_model = LinearRegression()
        lin_model.fit(ind_values_X, dep_values_y)

        # To maintain log
        self.intercept_list.append(lin_model.intercept_)
        self.coeff_list.append(lin_model.coef_[0])
        return lin_model

    def impute_missing(self, dataFrame, for_col, inplace=False):
        """Impute the missing values of ``for_col`` from its best predictor.

        Args:
            dataFrame: Data containing ``for_col`` and at least one other
                numeric column.
            for_col: Label of the numeric column to impute.
            inplace: When true, modify ``dataFrame`` and return ``None``.

        Returns:
            ``None`` when ``inplace`` is true. Otherwise a copy of
            ``dataFrame`` with the imputed values, or ``dataFrame`` itself
            when there was nothing that could be imputed.

        Raises:
            KeyError: If ``for_col`` is missing or not numeric.
            ValueError: If no usable predictor column exists.
        """
        missing_mask = dataFrame[for_col].isnull()

        # Nothing to impute: skip the model fit and leave the logs untouched.
        if not missing_mask.any():
            return None if inplace else dataFrame

        # Correlate numeric columns only; selecting them explicitly avoids
        # depending on how ``corr`` treats non-numeric columns, which differs
        # between pandas versions.
        correlation_df = dataFrame.select_dtypes(include='number').corr()

        predictor_col = self.get_max_corr_column(for_corr_df=correlation_df, for_col=for_col)

        # To maintain log
        self.predictor_list.append(predictor_col)
        self.target_list.append(for_col)

        # Training rows: complete in both columns, sorted and trimmed.
        not_null_df = self.get_not_null_df(dataFrame, for_ind_col=predictor_col, for_dep_col=for_col)

        model = self.train_lin_reg_model(not_null_df=not_null_df, ind_col=predictor_col, dep_col=for_col)

        # Only rows with a known predictor can be predicted; rows where the
        # predictor is missing too are left as NaN instead of being passed to
        # ``predict``, which rejects NaN input.
        predictable = (missing_mask & dataFrame[predictor_col].notnull()).values
        if not predictable.any():
            return None if inplace else dataFrame

        x_values = dataFrame.loc[predictable, [predictor_col]]
        predicted_values = pd.Series(model.predict(x_values), index=x_values.index)

        # Maintain log: one frame per target column, indexed like the filled rows.
        self.log_dict[for_col] = predicted_values.to_frame()

        imputed_df = dataFrame if inplace else dataFrame.copy()
        # A positional boolean mask plus raw values avoids index alignment.
        imputed_df.loc[predictable, for_col] = predicted_values.values

        return None if inplace else imputed_df

    def impute_missing_in_dataset(self, dataset):
        """Impute, in place, every numeric column of ``dataset`` with nulls.

        Columns are processed one after another, so a later column may be
        predicted from values imputed earlier in the same call. Non-numeric
        columns are skipped because the method relies on correlations.

        Returns:
            ``None``; ``dataset`` is modified in place.
        """
        null_counts = dataset.isnull().sum()
        numeric_cols = set(dataset.select_dtypes(include='number').columns)

        for col in null_counts[null_counts > 0].index:
            if col not in numeric_cols:
                continue
            self.impute_missing(dataset, for_col=col, inplace=True)
        return None

    def print_log(self):
        """Print, per imputed column, the values written and their index.

        Side effect: each stored log frame gets its single column renamed to
        ``'Imputed'`` and ``log_df`` ends up referring to the last one printed.
        """
        for key, frame in self.log_dict.items():
            print('Column:', key)
            self.log_df = frame
            self.log_df.columns = ['Imputed']
            print(self.log_df, '\n')

    def get_corr_info(self):
        """Return the logged correlation, predictor, intercept, coefficient
        and target column as a DataFrame with one row per log entry."""
        corr_info_df = pd.DataFrame({'Correlation': self.corr_list,
                                     'Predictor': self.predictor_list,
                                     'Intercept': self.intercept_list,
                                     'Coefficient': self.coeff_list,
                                     'Target Column': self.target_list})
        return corr_info_df


# In[7]:


# Impute every column of mpg_df that contains nulls; impute_missing_in_dataset
# works in place and returns None.
r_imputer = RationalImputer()
r_imputer.impute_missing_in_dataset(mpg_df)


# In[8]:


# Re-count the missing values per column after imputation.
mpg_df.isnull().sum()


# In[9]:


# Print, for each imputed column, the values written and their row index.
r_imputer.print_log()


# In[10]:


# Table of correlation, predictor, intercept, coefficient and target column
# for the logged imputations.
r_imputer.get_corr_info()
	#!/usr/bin/env python
	# coding: utf-8

	# In[1]:


	import numpy as np
	import pandas as pd
	import seaborn as sns


	# In[2]:


	# Load seaborn's example "mpg" dataset; the bare expression shows its shape
	# when this is run as a notebook cell.
	mpg_df = sns.load_dataset('mpg')
	mpg_df.shape


	# In[3]:


	# Drop these columns because the imputer does not work with categorical
	# variables.
	mpg_df.drop(columns=['model_year', 'origin', 'name'], inplace=True)
	mpg_df.head()


	# In[4]:


	# Blank out a handful of cells so that there is something to impute.
	for _column, _row_labels in (('cylinders', (3, 5)),
	                             ('displacement', (30, 31, 40))):
	    for _row_label in _row_labels:
	        mpg_df.loc[_row_label, _column] = np.nan


	# In[5]:


	# Count the missing values per column.
	mpg_df.isnull().sum()


	# ### Rational Imputer
	# - It works only with continuous variables that are strongly related to one another.
	# - For the column to be imputed (the target), it looks up the other column with the highest correlation to it.
	# - That column becomes the predictor; only rows where both the predictor and the target are non-null are kept,
	#   and they are sorted by the predictor.
	# - 15% of the sorted rows are dropped from each end (top and bottom) so that few outliers remain in the training data.
	# - A linear model is trained on the remaining rows, and its predictions are used to fill the missing target values.
	# - It keeps a log showing which value was imputed at which index (see print_log).
	# - get_corr_info shows the correlation between each predictor and its target.

	# In[6]:


	from sklearn.linear_model import LinearRegression

	# NOTE(review): this tab-indented section repeats the RationalImputer
	# definition found earlier in the file; its nested indentation had been
	# lost and is restored here.
	class RationalImputer():
	    """Fill missing values in a numeric column with a one-feature linear model.

	    For a target column the imputer

	    1. picks the other numeric column whose correlation with the target has
	       the largest magnitude (the *predictor*),
	    2. keeps the rows where both columns are present, sorts them by the
	       predictor and trims ``trim_fraction`` of the rows from each end to
	       limit the influence of outliers,
	    3. fits ``LinearRegression`` on the remaining rows, and
	    4. replaces the missing target values with the model's predictions.

	    Log attributes filled by the methods below: ``corr_list``,
	    ``predictor_list``, ``intercept_list``, ``coeff_list`` and
	    ``target_list`` (summarised by :meth:`get_corr_info`), and ``log_dict``,
	    which maps each imputed column to a DataFrame of the values written.

	    ``LinearRegression`` is looked up as a global name, so it must be
	    imported in the module that defines this class.
	    """

	    def __init__(self, trim_fraction=0.15):
	        """Create an imputer with empty logs.

	        Args:
	            trim_fraction: Share of the sorted training rows discarded from
	                each end before fitting. Must satisfy ``0 <= x < 0.5``.

	        Raises:
	            ValueError: If ``trim_fraction`` is outside ``[0, 0.5)``.
	        """
	        if not 0 <= trim_fraction < 0.5:
	            raise ValueError('trim_fraction must be in the range [0, 0.5)')
	        self.trim_fraction = trim_fraction

	        self.log_dict = dict()
	        self.log_df = pd.DataFrame()
	        self.corr_list = []
	        self.predictor_list = []
	        self.intercept_list = []
	        self.coeff_list = []
	        self.target_list = []

	    def get_max_corr_column(self, for_corr_df, for_col):
	        """Return the column most strongly correlated with ``for_col``.

	        The signed correlation of the chosen column is appended to
	        ``corr_list``.

	        Args:
	            for_corr_df: Square correlation matrix (e.g. ``DataFrame.corr()``).
	            for_col: Label of the target column inside ``for_corr_df``.

	        Returns:
	            The label of the selected predictor column.

	        Raises:
	            KeyError: If ``for_col`` is not a column of ``for_corr_df``.
	            ValueError: If no other column has a defined correlation.
	        """
	        # Remove the target's own entry by label rather than assuming that the
	        # self-correlation is the single largest value; undefined (NaN)
	        # correlations cannot be ranked and are dropped as well.
	        candidates = for_corr_df[for_col].drop(labels=for_col, errors='ignore').dropna()
	        if candidates.empty:
	            raise ValueError(
	                'No other column with a defined correlation to %r' % (for_col,))

	        # Rank by magnitude: a strong negative relationship is just as useful
	        # to a linear model as a strong positive one.
	        large_corr_col = candidates.abs().idxmax()

	        self.corr_list.append(candidates.loc[large_corr_col])  # To maintain log
	        return large_corr_col

	    def get_not_null_df(self, dataFrame, for_ind_col, for_dep_col):
	        """Return the trimmed training rows for one predictor/target pair.

	        Rows with a null in either column are discarded, the rest are sorted
	        by ``for_ind_col`` and ``trim_fraction`` of the rows is cut from each
	        end. ``reset_index`` moves the former row labels into a regular
	        column of the result.

	        Args:
	            dataFrame: Source data.
	            for_ind_col: Predictor (independent) column label.
	            for_dep_col: Target (dependent) column label.

	        Returns:
	            The central slice of the sorted, complete rows.
	        """
	        both_present = (dataFrame[for_ind_col].notnull()
	                        & dataFrame[for_dep_col].notnull())
	        sorted_df = dataFrame[both_present].sort_values(by=for_ind_col).reset_index()

	        total_len = len(sorted_df)

	        # Cut the same share from both ends; with the default of 0.15 the
	        # positions from the 15% mark up to and including the 85% mark stay.
	        start_index = int(total_len * self.trim_fraction)
	        end_index = int(total_len * (1 - self.trim_fraction))

	        return sorted_df.iloc[start_index:(end_index + 1)]  # +1: end is exclusive

	    def train_lin_reg_model(self, not_null_df, ind_col, dep_col):
	        """Fit ``dep_col ~ ind_col`` and log the intercept and coefficient.

	        Args:
	            not_null_df: Training rows without nulls in either column.
	            ind_col: Predictor column label.
	            dep_col: Target column label.

	        Returns:
	            The fitted ``LinearRegression`` instance.
	        """
	        ind_values_X = not_null_df[[ind_col]]
	        dep_values_y = not_null_df[dep_col]
	        lin_model = LinearRegression()
	        lin_model.fit(ind_values_X, dep_values_y)

	        # To maintain log
	        self.intercept_list.append(lin_model.intercept_)
	        self.coeff_list.append(lin_model.coef_[0])
	        return lin_model

	    def impute_missing(self, dataFrame, for_col, inplace=False):
	        """Impute the missing values of ``for_col`` from its best predictor.

	        Args:
	            dataFrame: Data containing ``for_col`` and at least one other
	                numeric column.
	            for_col: Label of the numeric column to impute.
	            inplace: When true, modify ``dataFrame`` and return ``None``.

	        Returns:
	            ``None`` when ``inplace`` is true. Otherwise a copy of
	            ``dataFrame`` with the imputed values, or ``dataFrame`` itself
	            when there was nothing that could be imputed.

	        Raises:
	            KeyError: If ``for_col`` is missing or not numeric.
	            ValueError: If no usable predictor column exists.
	        """
	        missing_mask = dataFrame[for_col].isnull()

	        # Nothing to impute: skip the model fit and leave the logs untouched.
	        if not missing_mask.any():
	            return None if inplace else dataFrame

	        # Correlate numeric columns only; selecting them explicitly avoids
	        # depending on how ``corr`` treats non-numeric columns, which differs
	        # between pandas versions.
	        correlation_df = dataFrame.select_dtypes(include='number').corr()

	        predictor_col = self.get_max_corr_column(for_corr_df=correlation_df, for_col=for_col)

	        # To maintain log
	        self.predictor_list.append(predictor_col)
	        self.target_list.append(for_col)

	        # Training rows: complete in both columns, sorted and trimmed.
	        not_null_df = self.get_not_null_df(dataFrame, for_ind_col=predictor_col, for_dep_col=for_col)

	        model = self.train_lin_reg_model(not_null_df=not_null_df, ind_col=predictor_col, dep_col=for_col)

	        # Only rows with a known predictor can be predicted; rows where the
	        # predictor is missing too are left as NaN instead of being passed to
	        # ``predict``, which rejects NaN input.
	        predictable = (missing_mask & dataFrame[predictor_col].notnull()).values
	        if not predictable.any():
	            return None if inplace else dataFrame

	        x_values = dataFrame.loc[predictable, [predictor_col]]
	        predicted_values = pd.Series(model.predict(x_values), index=x_values.index)

	        # Maintain log: one frame per target column, indexed like the filled rows.
	        self.log_dict[for_col] = predicted_values.to_frame()

	        imputed_df = dataFrame if inplace else dataFrame.copy()
	        # A positional boolean mask plus raw values avoids index alignment.
	        imputed_df.loc[predictable, for_col] = predicted_values.values

	        return None if inplace else imputed_df

	    def impute_missing_in_dataset(self, dataset):
	        """Impute, in place, every numeric column of ``dataset`` with nulls.

	        Columns are processed one after another, so a later column may be
	        predicted from values imputed earlier in the same call. Non-numeric
	        columns are skipped because the method relies on correlations.

	        Returns:
	            ``None``; ``dataset`` is modified in place.
	        """
	        null_counts = dataset.isnull().sum()
	        numeric_cols = set(dataset.select_dtypes(include='number').columns)

	        for col in null_counts[null_counts > 0].index:
	            if col not in numeric_cols:
	                continue
	            self.impute_missing(dataset, for_col=col, inplace=True)
	        return None

	    def print_log(self):
	        """Print, per imputed column, the values written and their index.

	        Side effect: each stored log frame gets its single column renamed to
	        ``'Imputed'`` and ``log_df`` ends up referring to the last one printed.
	        """
	        for key, frame in self.log_dict.items():
	            print('Column:', key)
	            self.log_df = frame
	            self.log_df.columns = ['Imputed']
	            print(self.log_df, '\n')

	    def get_corr_info(self):
	        """Return the logged correlation, predictor, intercept, coefficient
	        and target column as a DataFrame with one row per log entry."""
	        corr_info_df = pd.DataFrame({'Correlation': self.corr_list,
	                                     'Predictor': self.predictor_list,
	                                     'Intercept': self.intercept_list,
	                                     'Coefficient': self.coeff_list,
	                                     'Target Column': self.target_list})
	        return corr_info_df


	# In[7]:


	# NOTE(review): these tab-indented cells repeat the usage cells that appear
	# earlier in the file.
	# Impute every column of mpg_df that contains nulls; impute_missing_in_dataset
	# works in place and returns None.
	r_imputer = RationalImputer()
	r_imputer.impute_missing_in_dataset(mpg_df)


	# In[8]:


	# Re-count the missing values per column after imputation.
	mpg_df.isnull().sum()


	# In[9]:


	# Print, for each imputed column, the values written and their row index.
	r_imputer.print_log()


	# In[10]:


	# Table of correlation, predictor, intercept, coefficient and target column
	# for the logged imputations.
	r_imputer.get_corr_info()