amankharwal/outliers.py Secret

## outliers.py
import numpy as np
import pandas as pd

# for vis
import matplotlib.pyplot as plt
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import seaborn as sns
sns.set_style("whitegrid")
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder
# anomaly and skewness detection
from scipy import stats
from scipy.stats import skew, norm
from scipy.special import boxcox1p
from numpy import mean, std

from IPython.display import HTML
from matplotlib import animation
from termcolor import colored

# Load the train and test sets and stack them into one frame so that the
# cleaning and encoding steps below are applied to both consistently.
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

# Plain attribute tags on the DataFrame objects (these are not columns).
train.name = 'train'
test.name = 'test'

# Keep the test ids for a future submission before the column is dropped.
test_id = test.Id
for df in [train, test]:
    df.drop(columns=['Id'], inplace=True)

df_concat = pd.concat([train, test], axis=0).reset_index(drop=True)
df_concat.name = 'both dfs'

# Tag every row with its origin.  `.loc` label slices include the end label,
# so `.loc[:train.shape[0]]` would also cover the first test row; comparing
# the freshly reset 0..N-1 index with the train length avoids that
# off-by-one and does not depend on statement order.
df_concat['which'] = np.where(
    df_concat.index < train.shape[0], 'train', 'test'
)

# Drop two columns that the rest of this script does not use.
df_concat.drop(columns=['PoolQC', 'Utilities'], inplace=True)

# Filling missing values

# Categorical fields: a missing value becomes the literal 'None' category.
# The filled Series is assigned back to the column because
# `df[col].fillna(..., inplace=True)` operates on an intermediate object
# (chained assignment) and is not guaranteed to update `df_concat`.
for field in ['GarageType', 'GarageFinish', 'GarageQual', 'GarageCond',
              'BsmtFinType1', 'BsmtQual', 'BsmtCond', 'BsmtExposure',
              'BsmtFinType2', 'MiscFeature', 'Alley', 'Fence', 'FireplaceQu',
              'MasVnrType']:
    df_concat[field] = df_concat[field].fillna('None')

# Numeric fields: a missing value becomes 0.
for field in ['MasVnrArea', 'BsmtFullBath', 'BsmtHalfBath',
              'BsmtFinSF1', 'GarageCars', 'GarageArea', 'TotalBsmtSF',
              'BsmtUnfSF', 'BsmtFinSF2', 'GarageYrBlt']:
    df_concat[field] = df_concat[field].fillna(0)


# LotFrontage: use the mean LotFrontage of the row's Neighborhood.
df_concat['LotFrontage'] = (
    df_concat.groupby('Neighborhood')['LotFrontage']
    .transform(lambda x: x.fillna(x.mean()))
)
# MSZoning / Electrical: use the most frequent value in the Neighborhood.
for feature in ['MSZoning', 'Electrical']:
    df_concat[feature] = (
        df_concat.groupby('Neighborhood')[feature]
        .transform(lambda x: x.fillna(x.mode()[0]))
    )

# These fields use the most frequent value of the whole column.
for field in ['SaleType', 'Exterior1st', 'Exterior2nd']:
    df_concat[field] = df_concat[field].fillna(df_concat[field].mode()[0])

# Fixed default categories for the remaining two fields.
df_concat['Functional'] = df_concat['Functional'].fillna('Typ')
df_concat['KitchenQual'] = df_concat['KitchenQual'].fillna('TA')

# Converting categorical data into numerical

# Columns that are left as they are (already numeric and ordered).
features_that_are_already_ordinal = [
    'OverallQual', 'OverallCond', 'MoSold',
    'FullBath', 'KitchenAbvGr', 'TotRmsAbvGrd',
]

# Columns encoded with LabelEncoder, i.e. each distinct value is replaced by
# its rank among the column's sorted unique values.
ordinal_fields_with_labelencoder = [
    'LandSlope', 'YearBuilt', 'YearRemodAdd',
    'CentralAir', 'GarageYrBlt', 'PavedDrive',
    'YrSold',
]

for field in ordinal_fields_with_labelencoder:
    # A fresh encoder per column: every column gets its own value -> code map.
    le = LabelEncoder()
    column_values = df_concat[field].values
    df_concat[field] = le.fit_transform(column_values)


### Ordinal features whose categories need an explicit order.
# Each column is mapped to its categories listed from lowest to highest code.
# Keeping column and order side by side in one mapping removes the risk of
# the two parallel lists drifting out of step.
# PoolQC has no entry: that column is dropped earlier in the script.
ordinal_category_orders = {
    'MSSubClass': ['20', '30', '40', '45', '50', '60', '70', '75', '80',
                   '85', '90', '120', '150', '160', '180', '190'],
    'ExterQual': ['Po', 'Fa', 'TA', 'Gd', 'Ex'],
    'LotShape': ['Reg', 'IR1', 'IR2', 'IR3'],
    'BsmtQual': ['None', 'Fa', 'TA', 'Gd', 'Ex'],
    'BsmtCond': ['None', 'Po', 'Fa', 'TA', 'Gd', 'Ex'],
    'BsmtExposure': ['None', 'No', 'Mn', 'Av', 'Gd'],
    'BsmtFinType1': ['None', 'Unf', 'LwQ', 'Rec', 'BLQ', 'ALQ', 'GLQ'],
    'BsmtFinType2': ['None', 'Unf', 'LwQ', 'Rec', 'BLQ', 'ALQ', 'GLQ'],
    'HeatingQC': ['Po', 'Fa', 'TA', 'Gd', 'Ex'],
    'Functional': ['Sev', 'Maj2', 'Maj1', 'Mod', 'Min2', 'Min1', 'Typ'],
    'FireplaceQu': ['None', 'Po', 'Fa', 'TA', 'Gd', 'Ex'],
    'KitchenQual': ['Fa', 'TA', 'Gd', 'Ex'],
    'GarageFinish': ['None', 'Unf', 'RFn', 'Fin'],
    'GarageQual': ['None', 'Po', 'Fa', 'TA', 'Gd', 'Ex'],
    'GarageCond': ['None', 'Po', 'Fa', 'TA', 'Gd', 'Ex'],
    'Fence': ['None', 'MnWw', 'GdWo', 'MnPrv', 'GdPrv'],
}

# The original parallel lists are kept as module-level names for any code
# that still refers to them.
fields_that_need_to_be_ordered = list(ordinal_category_orders)
orders = list(ordinal_category_orders.values())

# The category lists are strings, so the columns are cast to str first
# (MSSubClass, for instance, is listed as '20', '30', ...).
for field in fields_that_need_to_be_ordered:
    df_concat[field] = df_concat[field].astype(str)


### Encode each column with its specific order.
for field, order in ordinal_category_orders.items():
    # `categories` takes one list of categories per input column, so the
    # single column's order is wrapped in a list.
    ord_en = OrdinalEncoder(categories=[order])
    encoded = ord_en.fit_transform(df_concat[field].values.reshape(-1, 1))
    # Replace the whole column (rather than writing through `.loc[:, col]`)
    # so it takes the numeric dtype of the encoded values and is not picked
    # up as a categorical column by the later one-hot encoding.
    df_concat[field] = encoded.ravel()

# One-hot encode the remaining (non-ordinal) categorical columns; the helper
# column 'which' is removed first so it does not get encoded.
df_concat = pd.get_dummies(df_concat.drop(columns=['which']))

# Split back by position: the first len(train) rows are the train set, the
# rest is the test set, which carries no 'SalePrice' column afterwards.
# The right-hand side is evaluated completely before `train` is rebound.
train, test = (
    df_concat.iloc[:train.shape[0]],
    df_concat.iloc[train.shape[0]:].drop(columns=['SalePrice']),
)


def finding_over_fitting_features(df, percentage=99.9):
    """Return the columns of *df* that are dominated by a single value.

    A column is reported when its most frequent non-null value accounts for
    strictly more than ``percentage`` percent of all rows of *df*.

    Args:
        df: DataFrame whose columns are inspected.
        percentage: Threshold in percent (0-100) for the share of the most
            frequent value.

    Returns:
        List of column names, in the column order of *df*.  Empty when *df*
        has no rows.  Columns without any non-null value are not reported.
    """
    n_rows = len(df)
    if n_rows == 0:
        # No rows means no share can be computed (and avoids dividing by 0).
        return []

    overfit = []
    for feature in df.columns:
        counts = df[feature].value_counts()
        if counts.empty:
            # value_counts() skips NaN, so an all-NaN column has no entries.
            continue
        # value_counts() sorts descending: the first entry is the top count.
        if counts.iloc[0] / n_rows * 100 > percentage:
            overfit.append(feature)
    return overfit
# Columns of the combined frame flagged as dominated by a single value.
overfitted = finding_over_fitting_features(
    df_concat,
    percentage=99.0,
)
# NOTE(review): `train` and `test` were sliced from `df_concat` before this
# drop -- confirm whether they should be re-derived afterwards.
df_concat.drop(columns=overfitted, inplace=True)

# NOTE(review): the separator is an empty string here; it was presumably
# lost when the script was pasted -- confirm the intended delimiter.
df = pd.read_csv(
    'mood_swings/mood swings.csv',
    sep='',
)
	import numpy as np
	import pandas as pd

	# for vis
	import matplotlib.pyplot as plt
	import plotly.express as px
	from plotly.subplots import make_subplots
	import plotly.graph_objects as go
	import seaborn as sns
	sns.set_style("whitegrid")
	from sklearn.preprocessing import LabelEncoder, OrdinalEncoder
	# anomaly and skewness detection
	from scipy import stats
	from scipy.stats import skew, norm
	from scipy.special import boxcox1p
	from numpy import mean, std

	from IPython.display import HTML
	from matplotlib import animation
	from termcolor import colored

	# Load the train and test sets and stack them into one frame so that the
	# cleaning and encoding steps below are applied to both consistently.
	train = pd.read_csv('train.csv')
	test = pd.read_csv('test.csv')

	# Plain attribute tags on the DataFrame objects (these are not columns).
	train.name = 'train'
	test.name = 'test'

	# Keep the test ids for a future submission before the column is dropped.
	test_id = test.Id
	for df in [train, test]:
		df.drop(columns=['Id'], inplace=True)

	df_concat = pd.concat([train, test], axis=0).reset_index(drop=True)
	df_concat.name = 'both dfs'

	# Tag every row with its origin.  `.loc` label slices include the end
	# label, so `.loc[:train.shape[0]]` would also cover the first test row;
	# comparing the freshly reset 0..N-1 index with the train length avoids
	# that off-by-one and does not depend on statement order.
	df_concat['which'] = np.where(
		df_concat.index < train.shape[0], 'train', 'test'
	)

	# Drop two columns that the rest of this script does not use.
	df_concat.drop(columns=['PoolQC', 'Utilities'], inplace=True)

	# Filling missing values

	# Categorical fields: a missing value becomes the literal 'None' category.
	# The filled Series is assigned back to the column because
	# `df[col].fillna(..., inplace=True)` operates on an intermediate object
	# (chained assignment) and is not guaranteed to update `df_concat`.
	for field in ['GarageType', 'GarageFinish', 'GarageQual', 'GarageCond',
			'BsmtFinType1', 'BsmtQual', 'BsmtCond', 'BsmtExposure',
			'BsmtFinType2', 'MiscFeature', 'Alley', 'Fence', 'FireplaceQu',
			'MasVnrType']:
		df_concat[field] = df_concat[field].fillna('None')

	# Numeric fields: a missing value becomes 0.
	for field in ['MasVnrArea', 'BsmtFullBath', 'BsmtHalfBath',
			'BsmtFinSF1', 'GarageCars', 'GarageArea', 'TotalBsmtSF',
			'BsmtUnfSF', 'BsmtFinSF2', 'GarageYrBlt']:
		df_concat[field] = df_concat[field].fillna(0)


	# LotFrontage: use the mean LotFrontage of the row's Neighborhood.
	df_concat['LotFrontage'] = (
		df_concat.groupby('Neighborhood')['LotFrontage']
		.transform(lambda x: x.fillna(x.mean()))
	)
	# MSZoning / Electrical: use the most frequent value in the Neighborhood.
	for feature in ['MSZoning', 'Electrical']:
		df_concat[feature] = (
			df_concat.groupby('Neighborhood')[feature]
			.transform(lambda x: x.fillna(x.mode()[0]))
		)

	# These fields use the most frequent value of the whole column.
	for field in ['SaleType', 'Exterior1st', 'Exterior2nd']:
		df_concat[field] = df_concat[field].fillna(df_concat[field].mode()[0])

	# Fixed default categories for the remaining two fields.
	df_concat['Functional'] = df_concat['Functional'].fillna('Typ')
	df_concat['KitchenQual'] = df_concat['KitchenQual'].fillna('TA')

	# Converting categorical data into numerical

	# Columns encoded with LabelEncoder, i.e. each distinct value is replaced
	# by its rank among the column's sorted unique values.
	ordinal_fields_with_labelencoder = [
		'LandSlope', 'YearBuilt', 'YearRemodAdd',
		'CentralAir', 'GarageYrBlt', 'PavedDrive',
		'YrSold',
	]

	for field in ordinal_fields_with_labelencoder:
		# A fresh encoder per column: every column gets its own code map.
		le = LabelEncoder()
		df_concat[field] = le.fit_transform(df_concat[field].values)

	# Columns that are left as they are (already numeric and ordered).
	features_that_are_already_ordinal = [
		'OverallQual', 'OverallCond', 'MoSold',
		'FullBath', 'KitchenAbvGr', 'TotRmsAbvGrd',
	]


	### Ordinal features whose categories need an explicit order.
	# Each column is mapped to its categories listed from lowest to highest
	# code.  Keeping column and order side by side in one mapping removes the
	# risk of the two parallel lists drifting out of step.
	# PoolQC has no entry: that column is dropped earlier in the script.
	ordinal_category_orders = {
		'MSSubClass': ['20', '30', '40', '45', '50', '60', '70', '75', '80',
			'85', '90', '120', '150', '160', '180', '190'],
		'ExterQual': ['Po', 'Fa', 'TA', 'Gd', 'Ex'],
		'LotShape': ['Reg', 'IR1', 'IR2', 'IR3'],
		'BsmtQual': ['None', 'Fa', 'TA', 'Gd', 'Ex'],
		'BsmtCond': ['None', 'Po', 'Fa', 'TA', 'Gd', 'Ex'],
		'BsmtExposure': ['None', 'No', 'Mn', 'Av', 'Gd'],
		'BsmtFinType1': ['None', 'Unf', 'LwQ', 'Rec', 'BLQ', 'ALQ', 'GLQ'],
		'BsmtFinType2': ['None', 'Unf', 'LwQ', 'Rec', 'BLQ', 'ALQ', 'GLQ'],
		'HeatingQC': ['Po', 'Fa', 'TA', 'Gd', 'Ex'],
		'Functional': ['Sev', 'Maj2', 'Maj1', 'Mod', 'Min2', 'Min1', 'Typ'],
		'FireplaceQu': ['None', 'Po', 'Fa', 'TA', 'Gd', 'Ex'],
		'KitchenQual': ['Fa', 'TA', 'Gd', 'Ex'],
		'GarageFinish': ['None', 'Unf', 'RFn', 'Fin'],
		'GarageQual': ['None', 'Po', 'Fa', 'TA', 'Gd', 'Ex'],
		'GarageCond': ['None', 'Po', 'Fa', 'TA', 'Gd', 'Ex'],
		'Fence': ['None', 'MnWw', 'GdWo', 'MnPrv', 'GdPrv'],
	}

	# The original parallel lists are kept as names for any code that still
	# refers to them.
	fields_that_need_to_be_ordered = list(ordinal_category_orders)
	orders = list(ordinal_category_orders.values())

	# The category lists are strings, so the columns are cast to str first
	# (MSSubClass, for instance, is listed as '20', '30', ...).
	for field in fields_that_need_to_be_ordered:
		df_concat[field] = df_concat[field].astype(str)


	### Encode each column with its specific order.
	for field, order in ordinal_category_orders.items():
		# `categories` takes one list of categories per input column, so the
		# single column's order is wrapped in a list.
		ord_en = OrdinalEncoder(categories=[order])
		encoded = ord_en.fit_transform(df_concat[field].values.reshape(-1, 1))
		# Replace the whole column (rather than writing through
		# `.loc[:, col]`) so it takes the numeric dtype of the encoded values
		# and is not picked up as categorical by the later one-hot encoding.
		df_concat[field] = encoded.ravel()

	# One-hot encode the remaining (non-ordinal) categorical columns; the
	# helper column 'which' is removed first so it does not get encoded.
	df_concat = pd.get_dummies(df_concat.drop(columns=['which']))

	# Split back by position: the first len(train) rows are the train set,
	# the rest is the test set, which carries no 'SalePrice' column afterwards.
	# The right-hand side is evaluated completely before `train` is rebound.
	train, test = (
		df_concat.iloc[:train.shape[0]],
		df_concat.iloc[train.shape[0]:].drop(columns=['SalePrice']),
	)


	def finding_over_fitting_features(df, percentage=99.9):
		"""Return the columns of *df* that are dominated by a single value.

		A column is reported when its most frequent non-null value accounts
		for strictly more than ``percentage`` percent of all rows of *df*.

		Args:
			df: DataFrame whose columns are inspected.
			percentage: Threshold in percent (0-100) for the share of the
				most frequent value.

		Returns:
			List of column names, in the column order of *df*.  Empty when
			*df* has no rows.  Columns without any non-null value are not
			reported.
		"""
		n_rows = len(df)
		if n_rows == 0:
			# No rows means no share can be computed (avoids dividing by 0).
			return []

		overfit = []
		for feature in df.columns:
			counts = df[feature].value_counts()
			if counts.empty:
				# value_counts() skips NaN: an all-NaN column has no entries.
				continue
			# value_counts() sorts descending: the first entry is the top count.
			if counts.iloc[0] / n_rows * 100 > percentage:
				overfit.append(feature)
		return overfit
	# Columns of the combined frame flagged as dominated by a single value.
	overfitted = finding_over_fitting_features(
		df_concat,
		percentage=99.0,
	)
	# NOTE(review): `train` and `test` were sliced from `df_concat` before
	# this drop -- confirm whether they should be re-derived afterwards.
	df_concat.drop(columns=overfitted, inplace=True)

	# NOTE(review): the separator is an empty string here; it was presumably
	# lost when the script was pasted -- confirm the intended delimiter.
	df = pd.read_csv(
		'mood_swings/mood swings.csv',
		sep='',
	)