Created
July 29, 2017 23:43
-
-
Save AparaV/f47e8054f44547f812788a6aa41233aa to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import csv | |
import random | |
import numpy as np | |
import pandas as pd | |
def cleanup(df):
    '''
    Clean a housing data frame and return the cleaned copy.

    Steps, in order:
    1. Create new features:
       - Bathrooms = FullBath + HalfBath
       - PorchSF = EnclosedPorch + OpenPorchSF
    2. Drop unwanted features (see ``to_drop`` below).
    3. Fill missing values in each column with that column's own mode
       (most frequent non-null value).
    4. Min-max scale every numeric column to [0, 1], except the columns
       listed in ``to_ignore``.

    The input frame is not modified; a new DataFrame is returned.

    Parameters:
        df: pandas DataFrame containing every column named in ``to_drop``.

    Returns:
        The cleaned pandas DataFrame.

    Raises:
        KeyError: if a column required for the new features or listed in
            ``to_drop`` is missing from ``df``.
    '''
    # Features to drop
    # NOTE(review): the original list named 'BsmtFinSF1' twice; possibly
    # 'BsmtFinSF2' was intended for the second entry - confirm before adding.
    to_drop = ['MiscFeature', 'MiscVal', 'GarageArea', 'GarageYrBlt', 'Street', 'Alley',
               'LotShape', 'LandContour', 'LandSlope', 'RoofMatl', 'Exterior2nd', 'MasVnrType',
               'MasVnrArea', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',
               'BsmtFinSF1', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating', 'Electrical',
               'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
               'HalfBath', 'KitchenQual', 'TotRmsAbvGrd', 'Functional', 'FireplaceQu',
               'GarageType', 'GarageFinish', 'GarageQual', 'PavedDrive', 'WoodDeckSF', 'OpenPorchSF',
               'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolQC', 'MoSold']

    # assign() builds a new frame, so the caller's DataFrame is left untouched.
    df = df.assign(
        Bathrooms=df['FullBath'] + df['HalfBath'],
        PorchSF=df['EnclosedPorch'] + df['OpenPorchSF'],
    )
    df = df.drop(to_drop, axis=1)

    # Columns to ignore when normalizing features
    to_ignore = ['SalePrice', 'Id']

    for column in df.columns:
        # Fill only this column with its own mode. Filling the whole frame
        # here would let the first column's mode overwrite the missing
        # values of every other column.
        non_null = df[column].dropna()
        if not non_null.empty:
            mode = non_null.value_counts().index[0]
            df[column] = df[column].fillna(mode)

        is_scalable = (pd.api.types.is_numeric_dtype(df[column])
                       and not pd.api.types.is_bool_dtype(df[column]))
        if is_scalable and column not in to_ignore:
            lowest = df[column].min()
            highest = df[column].max()
            value_range = highest - lowest
            if value_range == 0:
                # A constant column carries no information; map it to 0.0
                # rather than producing NaN from a 0 / 0 division.
                df[column] = 0.0
            else:
                df[column] = (df[column] - lowest) / value_range
    return df
def encode_features(df_train, df_test):
    '''
    Encode string (object) columns as discrete integer codes.

    Every object-dtype column of ``df_train`` is replaced, in both frames,
    by integer codes starting at 1. Codes are assigned in order of first
    appearance over the training values followed by the test values, so a
    given category receives the same code in both frames. This makes it
    feasible to use regression on the categorical features.

    Both frames are modified in place and also returned.

    Parameters:
        df_train: training pandas DataFrame; its object columns define
            which features are encoded.
        df_test: test pandas DataFrame. An encoded column that is absent
            here is simply skipped for this frame.

    Returns:
        Tuple ``(df_train, df_test)`` with the encoded columns.
    '''
    features = list(df_train.select_dtypes(include=['object']).columns)
    for feature in features:
        in_test = feature in df_test.columns
        values = df_train[feature]
        if in_test:
            # Include test values so categories unseen in training still
            # receive a code instead of mapping to NaN.
            values = pd.concat([values, df_test[feature]])
        codes = {category: code
                 for code, category in enumerate(values.unique(), start=1)}
        df_train[feature] = df_train[feature].map(codes)
        if in_test:
            df_test[feature] = df_test[feature].map(codes)
    return df_train, df_test
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment