-
-
Save amankharwal/cd6617ff99783280798ec99d8273df8b to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import numpy as np | |
import pandas as pd | |
# for vis | |
import matplotlib.pyplot as plt | |
import plotly.express as px | |
from plotly.subplots import make_subplots | |
import plotly.graph_objects as go | |
import seaborn as sns | |
sns.set_style("whitegrid") | |
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder | |
# anomaly and skewness detection | |
from scipy import stats | |
from scipy.stats import skew, norm | |
from scipy.special import boxcox1p | |
from numpy import mean, std | |
from IPython.display import HTML | |
from matplotlib import animation | |
from termcolor import colored | |
# Load the train and test splits and stack them into one frame so that every
# preprocessing step below is applied to both at once.
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
train.name = 'train'
test.name = 'test'

# Keep the test ids for building the submission later.
test_id = test.Id

for df in [train, test]:
    df.drop(columns=['Id'], inplace=True)

df_concat = pd.concat([train, test], axis=0).reset_index(drop=True)
df_concat.name = 'both dfs'

# Tag every row with the split it came from.  The index was reset above, so the
# first `train.shape[0]` positions are the train rows.  (A label slice such as
# `.loc[:train.shape[0]]` includes its end point and would tag one test row as
# 'train', hence the positional comparison.)
df_concat['which'] = np.where(
    np.arange(len(df_concat)) < train.shape[0], 'train', 'test'
)
# Dropping two columns the analysis does not use.
df_concat.drop(columns=['PoolQC', 'Utilities'], inplace=True)

# Filling missing values
#
# Every fill below assigns the result back to the column.  Calling
# `df[col].fillna(..., inplace=True)` operates on a temporary Series: it is
# deprecated chained assignment and leaves the frame unchanged once pandas
# Copy-on-Write is active.

# Categorical fields: missing entries become the literal category 'None'.
for field in ['GarageType', 'GarageFinish', 'GarageQual', 'GarageCond',
              'BsmtFinType1', 'BsmtFinType2', 'BsmtQual', 'BsmtCond',
              'BsmtExposure', 'MiscFeature', 'Alley', 'Fence', 'FireplaceQu',
              'MasVnrType']:
    df_concat[field] = df_concat[field].fillna('None')

# Numeric fields: missing entries become 0.
for field in ['MasVnrArea', 'BsmtFullBath', 'BsmtHalfBath', 'BsmtFinSF1',
              'GarageCars', 'GarageArea', 'TotalBsmtSF', 'BsmtUnfSF',
              'BsmtFinSF2', 'GarageYrBlt']:
    df_concat[field] = df_concat[field].fillna(0)

# LotFrontage: fill with the mean of the same neighborhood.
df_concat['LotFrontage'] = (
    df_concat.groupby('Neighborhood')['LotFrontage']
    .transform(lambda x: x.fillna(x.mean()))
)

# MSZoning / Electrical: fill with the most frequent value of the same
# neighborhood.
for feature in ['MSZoning', 'Electrical']:
    df_concat[feature] = (
        df_concat.groupby('Neighborhood')[feature]
        .transform(lambda x: x.fillna(x.mode()[0]))
    )

# These fields are filled with their most frequent value over the whole frame.
for field in ['SaleType', 'Exterior1st', 'Exterior2nd']:
    df_concat[field] = df_concat[field].fillna(df_concat[field].mode()[0])

# Fixed fallback categories for the remaining two fields.
df_concat['Functional'] = df_concat['Functional'].fillna('Typ')
df_concat['KitchenQual'] = df_concat['KitchenQual'].fillna('TA')
# Converting categorical data into numerical

# Ordinal fields whose natural sort order is already the wanted order:
# LabelEncoder assigns codes following the sorted unique values.
ordinal_fields_with_labelencoder = ['LandSlope', 'YearBuilt', 'YearRemodAdd',
                                    'CentralAir', 'GarageYrBlt', 'PavedDrive',
                                    'YrSold']
for field in ordinal_fields_with_labelencoder:
    le = LabelEncoder()
    df_concat[field] = le.fit_transform(df_concat[field].values)

# Listed for reference only; these columns are not transformed here.
features_that_are_already_ordinal = ['OverallQual', 'OverallCond', 'MoSold',
                                     'FullBath', 'KitchenAbvGr', 'TotRmsAbvGrd']

# Ordinal fields that need an explicit category order (lowest code first).
# Field and order live in one mapping so they cannot drift out of step.
# OrdinalEncoder rejects, by default, any value that is not listed here.
ordinal_category_orders = {
    'MSSubClass': ['20', '30', '40', '45', '50', '60', '70', '75', '80', '85',
                   '90', '120', '150', '160', '180', '190'],
    'ExterQual': ['Po', 'Fa', 'TA', 'Gd', 'Ex'],
    'LotShape': ['Reg', 'IR1', 'IR2', 'IR3'],
    'BsmtQual': ['None', 'Fa', 'TA', 'Gd', 'Ex'],
    'BsmtCond': ['None', 'Po', 'Fa', 'TA', 'Gd', 'Ex'],
    'BsmtExposure': ['None', 'No', 'Mn', 'Av', 'Gd'],
    'BsmtFinType1': ['None', 'Unf', 'LwQ', 'Rec', 'BLQ', 'ALQ', 'GLQ'],
    'BsmtFinType2': ['None', 'Unf', 'LwQ', 'Rec', 'BLQ', 'ALQ', 'GLQ'],
    'HeatingQC': ['Po', 'Fa', 'TA', 'Gd', 'Ex'],
    'Functional': ['Sev', 'Maj2', 'Maj1', 'Mod', 'Min2', 'Min1', 'Typ'],
    'FireplaceQu': ['None', 'Po', 'Fa', 'TA', 'Gd', 'Ex'],
    'KitchenQual': ['Fa', 'TA', 'Gd', 'Ex'],
    'GarageFinish': ['None', 'Unf', 'RFn', 'Fin'],
    'GarageQual': ['None', 'Po', 'Fa', 'TA', 'Gd', 'Ex'],
    'GarageCond': ['None', 'Po', 'Fa', 'TA', 'Gd', 'Ex'],
    'Fence': ['None', 'MnWw', 'GdWo', 'MnPrv', 'GdPrv'],
}

# Kept under their original names for any code that still reads them.
fields_that_need_to_be_ordered = list(ordinal_category_orders)
orders = list(ordinal_category_orders.values())

for field, order in ordinal_category_orders.items():
    # Compare as text so numeric codes such as MSSubClass match the string
    # categories above.
    column_as_text = df_concat[field].astype(str).to_numpy().reshape(-1, 1)
    # `categories` expects one list of categories per input column.
    ord_en = OrdinalEncoder(categories=[order])
    # Replace the whole column (instead of writing through `.loc[:, field]`)
    # so it takes the numeric dtype of the codes; an in-place write keeps the
    # object dtype and the column would be one-hot encoded afterwards.
    df_concat[field] = ord_en.fit_transform(column_as_text).ravel()
# One-hot encode whatever categorical columns remain; the helper column
# 'which' is removed first so it does not produce dummy columns.
df_concat = pd.get_dummies(df_concat.drop(columns='which'))

# Cut the combined frame back into its two parts by row position.  Both
# boundaries are read from the old `train` before either name is rebound.
# The test part has its 'SalePrice' column dropped.
train, test = (
    df_concat.iloc[:train.shape[0]],
    df_concat.iloc[train.shape[0]:].drop(columns='SalePrice'),
)
def finding_over_fitting_features(df, percentage=99.9):
    """Return the columns of *df* that are dominated by a single value.

    A column is reported when its most frequent value accounts for strictly
    more than ``percentage`` percent of the rows of ``df``.

    Args:
        df: DataFrame whose columns are inspected.
        percentage: Threshold in percent (0-100); defaults to 99.9.

    Returns:
        List of column labels, in the column order of ``df``.  Empty when
        ``df`` has no rows.
    """
    n_rows = len(df)
    if n_rows == 0:
        return []

    overfit = []
    for feature in df.columns:
        counts = df[feature].value_counts()
        # value_counts() ignores NaN, so an all-NaN column has no counts.
        if counts.empty:
            continue
        # counts is sorted descending: the first entry is the most frequent.
        if counts.iloc[0] / n_rows * 100 > percentage:
            overfit.append(feature)
    return overfit
# Collect the columns dominated by a single value and remove them from the
# combined frame.
# NOTE(review): `train` and `test` were sliced out of df_concat before this
# point, so this drop presumably does not reach them — confirm that is intended.
overfitted = finding_over_fitting_features(df_concat, percentage=99.0)
df_concat.drop(overfitted, axis='columns', inplace=True)
# NOTE(review): the original call passed sep='' (an empty separator), which
# pandas cannot parse; the intended delimiter is not recoverable from this
# file.  With sep=None the Python engine detects the delimiter itself —
# replace it with the explicit separator once that is confirmed.
df = pd.read_csv('mood_swings/mood swings.csv', sep=None, engine='python')
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment