# Skip to content
#
# Instantly share code, notes, and snippets.
#
# @amankharwal
# Created Dec 27, 2020
# Embed
# What would you like to do?
import numpy as np
import pandas as pd
# for vis
import matplotlib.pyplot as plt
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import seaborn as sns
sns.set_style("whitegrid")
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder
# anomaly and skewness detection
from scipy import stats
from scipy.stats import skew, norm
from scipy.special import boxcox1p
from numpy import mean, std
from IPython.display import HTML
from matplotlib import animation
from termcolor import colored
# Import the train and test sets and stack them into one frame so that
# every preprocessing step below is applied to both consistently.
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
train.name = 'train'
test.name = 'test'

# Keep the test ids for the submission file before the column is dropped.
test_id = test['Id']

for df in [train, test]:
    df.drop(columns=['Id'], inplace=True)

df_concat = pd.concat([train, test], axis=0).reset_index(drop=True)
df_concat.name = 'both dfs'

# Tag every row with its origin. `.loc` slices by label and includes BOTH
# endpoints, so `.loc[:n]` would also cover the first test row; default the
# column to 'train' and overwrite only the rows from label n onwards.
df_concat['which'] = 'train'
df_concat.loc[train.shape[0]:, 'which'] = 'test'

# Drop two columns the analysis does not use.
df_concat.drop(columns=['PoolQC', 'Utilities'], inplace=True)
# Filling missing values.
#
# Every fill below assigns the result back to the column instead of calling
# `df[col].fillna(..., inplace=True)`: the in-place form operates on a
# temporary Series and does not update the frame under pandas Copy-on-Write.


def _fill_with_mode(values):
    """Fill NaNs in *values* with its most frequent value, if it has one."""
    modes = values.mode()
    # A group made only of NaNs has no mode; leave it untouched rather
    # than failing on `modes[0]`.
    if modes.empty:
        return values
    return values.fillna(modes.iloc[0])


# Categorical fields (garage, basement, misc.) filled with the string 'None'.
for field in ['GarageType', 'GarageFinish', 'GarageQual', 'GarageCond',
              'BsmtFinType1', 'BsmtQual', 'BsmtCond', 'BsmtExposure',
              'BsmtFinType2', 'MiscFeature', 'Alley', 'Fence', 'FireplaceQu',
              'MasVnrType']:
    df_concat[field] = df_concat[field].fillna('None')

# Numeric fields filled with zero.
for field in ['MasVnrArea', 'BsmtFullBath', 'BsmtHalfBath',
              'BsmtFinSF1', 'GarageCars', 'GarageArea', 'TotalBsmtSF',
              'BsmtUnfSF', 'BsmtFinSF2', 'GarageYrBlt']:
    df_concat[field] = df_concat[field].fillna(0)

# Filling with values derived from the data.
# LotFrontage: mean of the rows sharing the same Neighborhood.
df_concat['LotFrontage'] = df_concat.groupby('Neighborhood')['LotFrontage']\
    .transform(lambda x: x.fillna(x.mean()))

# MSZoning / Electrical: most frequent value within the same Neighborhood.
for feature in ['MSZoning', 'Electrical']:
    df_concat[feature] = df_concat.groupby('Neighborhood')[feature]\
        .transform(_fill_with_mode)

# Remaining categoricals: most frequent value over the whole frame.
for field in ['SaleType', 'Exterior1st', 'Exterior2nd']:
    df_concat[field] = df_concat[field].fillna(df_concat[field].mode()[0])

# Fixed fill values for these two columns.
df_concat['Functional'] = df_concat['Functional'].fillna('Typ')
df_concat['KitchenQual'] = df_concat['KitchenQual'].fillna('TA')
# Converting categorical data into numerical.

### Ordinal fields encoded with LabelEncoder (sorted order of the values).
ordinal_fields_with_labelencoder = ['LandSlope', 'YearBuilt', 'YearRemodAdd',
                                    'CentralAir', 'GarageYrBlt', 'PavedDrive',
                                    'YrSold']
for field in ordinal_fields_with_labelencoder:
    le = LabelEncoder()
    df_concat[field] = le.fit_transform(df_concat[field].values)

# Listed for reference only; these columns are not transformed here.
features_that_are_already_ordinal = ['OverallQual', 'OverallCond', 'MoSold',
                                     'FullBath', 'KitchenAbvGr', 'TotRmsAbvGrd']

### Ordinal fields that need an explicit category order.
fields_that_need_to_be_ordered = [
    'MSSubClass', 'ExterQual', 'LotShape', 'BsmtQual', 'BsmtCond',
    'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'HeatingQC',
    'Functional', 'FireplaceQu', 'KitchenQual', 'GarageFinish',
    'GarageQual', 'GarageCond', 'Fence',
]

# The category lists below are strings, so compare against string values.
for field in fields_that_need_to_be_ordered:
    df_concat[field] = df_concat[field].astype(str)

# One category list per entry of fields_that_need_to_be_ordered, in the same
# order, each running from lowest to highest code.
orders = [
    # MSSubClass
    ['20', '30', '40', '45', '50', '60', '70', '75', '80', '85', '90',
     '120', '150', '160', '180', '190'],
    # ExterQual
    ['Po', 'Fa', 'TA', 'Gd', 'Ex'],
    # LotShape
    ['Reg', 'IR1', 'IR2', 'IR3'],
    # BsmtQual
    ['None', 'Fa', 'TA', 'Gd', 'Ex'],
    # BsmtCond
    ['None', 'Po', 'Fa', 'TA', 'Gd', 'Ex'],
    # BsmtExposure
    ['None', 'No', 'Mn', 'Av', 'Gd'],
    # BsmtFinType1
    ['None', 'Unf', 'LwQ', 'Rec', 'BLQ', 'ALQ', 'GLQ'],
    # BsmtFinType2
    ['None', 'Unf', 'LwQ', 'Rec', 'BLQ', 'ALQ', 'GLQ'],
    # HeatingQC
    ['Po', 'Fa', 'TA', 'Gd', 'Ex'],
    # Functional
    ['Sev', 'Maj2', 'Maj1', 'Mod', 'Min2', 'Min1', 'Typ'],
    # FireplaceQu
    ['None', 'Po', 'Fa', 'TA', 'Gd', 'Ex'],
    # KitchenQual
    ['Fa', 'TA', 'Gd', 'Ex'],
    # GarageFinish
    ['None', 'Unf', 'RFn', 'Fin'],
    # GarageQual
    ['None', 'Po', 'Fa', 'TA', 'Gd', 'Ex'],
    # GarageCond
    ['None', 'Po', 'Fa', 'TA', 'Gd', 'Ex'],
    # Fence  (PoolQC was dropped earlier, so it has no entry here)
    ['None', 'MnWw', 'GdWo', 'MnPrv', 'GdPrv'],
]

# The two lists are matched by position; fail loudly if they drift apart.
if len(orders) != len(fields_that_need_to_be_ordered):
    raise ValueError(
        'orders and fields_that_need_to_be_ordered must have the same length'
    )

### Encode each field with its specific order.
for field, order in zip(fields_that_need_to_be_ordered, orders):
    # `categories` takes a list with one array of categories per input
    # column; a single column therefore gets a one-element list.
    ord_en = OrdinalEncoder(categories=[order])
    encoded = ord_en.fit_transform(df_concat[[field]].to_numpy())
    # Replace the whole column (instead of writing through `.loc[:, field]`)
    # so it takes the numeric dtype of the codes and is not one-hot encoded
    # later as if it were still an object column.
    df_concat[field] = encoded.ravel()
# One-hot encode whatever categorical columns are still non-numeric. The
# helper column 'which' is removed first so it does not become a dummy.
n_train_rows = train.shape[0]
features_only = df_concat.drop(columns=['which'])
df_concat = pd.get_dummies(features_only)

# Split back by position: the first n_train_rows rows came from the train
# set, the rest from the test set (which carries no SalePrice target).
train = df_concat[:n_train_rows]
test = df_concat[n_train_rows:].drop(columns=['SalePrice'])
def finding_over_fitting_features(df, percentage=99.9):
    """Return the columns of *df* dominated by a single value.

    A column is reported when its most frequent non-null value accounts for
    strictly more than ``percentage`` percent of the rows of *df*.

    Args:
        df: The pandas DataFrame to inspect.
        percentage: Threshold, in percent of the total row count.

    Returns:
        A list of column labels, in the order they appear in ``df.columns``.
        An empty frame yields an empty list.
    """
    n_rows = len(df)
    if n_rows == 0:
        # No rows means no share can be computed.
        return []

    overfit = []
    for feature in df.columns:
        counts = df[feature].value_counts()
        # value_counts() skips NaN, so an all-NaN column has no counts.
        if counts.empty:
            continue
        # value_counts() sorts descending: the first entry is the mode count.
        if counts.iloc[0] / n_rows * 100 > percentage:
            overfit.append(feature)
    return overfit
# Collect the columns reported by finding_over_fitting_features (called with
# percentage = 99.0) and remove them from df_concat in place.
# NOTE(review): train and test were sliced from df_concat before this drop,
# so they appear to keep these columns - confirm whether that is intended.
overfitted = finding_over_fitting_features(df_concat, percentage = 99.0)
df_concat.drop(columns = overfitted, inplace = True)
# Load a separate CSV file into the module-level name `df`.
# NOTE(review): `sep` is an empty string here, which does not look like a
# valid delimiter - presumably the original character (e.g. a tab) was lost;
# confirm the intended separator against the data file.
df = pd.read_csv('mood_swings/mood swings.csv', sep = '')
# Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment