Skip to content

Instantly share code, notes, and snippets.

@AparaV
Created July 29, 2017 23:43
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save AparaV/f47e8054f44547f812788a6aa41233aa to your computer and use it in GitHub Desktop.
Save AparaV/f47e8054f44547f812788a6aa41233aa to your computer and use it in GitHub Desktop.
import csv
import random
import numpy as np
import pandas as pd
def cleanup(df):
    '''
    Cleans data and returns the result as a new DataFrame.

    1. Creates new features:
        - Bathrooms = FullBath + HalfBath
        - PorchSF = EnclosedPorch + OpenPorchSF
    2. Drops unwanted features
    3. Fills missing values in each column with that column's own mode
    4. Performs min-max feature scaling on every numeric column
       except 'SalePrice' and 'Id'

    Parameters:
        df: pandas DataFrame holding at least every column named in
            to_drop below.

    Returns:
        A new DataFrame. The frame passed in is left unmodified.

    Raises:
        KeyError: if a column that must be combined or dropped is absent.
    '''
    # Features to drop
    # NOTE(review): the original list named 'BsmtFinSF1' twice; the second
    # entry may have been meant as 'BsmtFinSF2' -- confirm before adding it.
    to_drop = ['MiscFeature', 'MiscVal', 'GarageArea', 'GarageYrBlt', 'Street', 'Alley',
               'LotShape', 'LandContour', 'LandSlope', 'RoofMatl', 'Exterior2nd', 'MasVnrType',
               'MasVnrArea', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',
               'BsmtFinSF1', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating', 'Electrical',
               'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
               'HalfBath', 'KitchenQual', 'TotRmsAbvGrd', 'Functional', 'FireplaceQu',
               'GarageType', 'GarageFinish', 'GarageQual', 'PavedDrive', 'WoodDeckSF', 'OpenPorchSF',
               'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolQC', 'MoSold']

    # assign() builds a new frame, so the caller's DataFrame is not mutated.
    df = df.assign(
        Bathrooms=df['FullBath'] + df['HalfBath'],
        PorchSF=df['EnclosedPorch'] + df['OpenPorchSF'],
    )
    df = df.drop(columns=to_drop)

    # Columns to ignore when normalizing features
    to_ignore = ('SalePrice', 'Id')
    is_numeric = pd.api.types.is_numeric_dtype
    is_bool = pd.api.types.is_bool_dtype

    for column in df.columns:
        values = df[column]

        if values.isna().any():
            # value_counts() skips NaN and sorts by frequency, so the first
            # index entry is the mode. It is empty for an all-NaN column,
            # which is then left as-is.
            counts = values.value_counts()
            if not counts.empty:
                # Fill only this column; filling the whole frame would push
                # one column's mode into every other column.
                values = values.fillna(counts.index[0])
                df[column] = values

        # Only genuinely numeric columns are scaled; booleans and text are
        # left unchanged.
        if column in to_ignore or not is_numeric(values) or is_bool(values):
            continue

        low = values.min()
        span = values.max() - low
        if span == 0:
            # A constant column has no spread; avoid 0/0 producing NaN.
            df[column] = 0.0
        else:
            df[column] = (values - low) / span
    return df
def encode_features(df_train, df_test):
    '''
    Replaces string-valued (object dtype) columns with integer codes.

    The columns to encode are the object-dtype columns of df_train. For
    each one, the distinct values seen across df_train and df_test together
    are numbered 1, 2, 3, ... in order of first appearance (training rows
    first), and both frames are rewritten with those numbers. Sharing one
    numbering keeps the two frames consistent with each other.

    Both frames are modified in place and also returned as a
    (df_train, df_test) tuple.
    '''
    object_columns = df_train.select_dtypes(include=['object']).columns.tolist()
    stacked = pd.concat([df_train[object_columns], df_test[object_columns]])

    for name in object_columns:
        # Codes start at 1, following the order in which values first occur.
        codes = {
            label: position
            for position, label in enumerate(stacked[name].unique(), start=1)
        }
        df_train[name] = df_train[name].map(codes)
        df_test[name] = df_test[name].map(codes)

    return df_train, df_test
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment