Last active
August 2, 2018 13:39
-
-
Save jay-trivedi/529cd4c2b0ffb5802be0268dc33356ec to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
## Code Courtesy: juliencs (https://www.kaggle.com/juliencs) | |
## Link to Original Script: https://www.kaggle.com/juliencs/a-study-on-regression-applied-to-the-ames-dataset | |
# Imports | |
import pandas as pd | |
import numpy as np | |
from scipy.stats import skew | |
# Replacement for missing values, per column. These are the features where
# imputing a median/mean or the most common value doesn't make sense.
_FILL_VALUES = {
    # Alley: data description says NA means "no alley access"
    "Alley": "None",
    # BedroomAbvGr: NA most likely means 0
    "BedroomAbvGr": 0,
    # BsmtQual etc: data description says NA for basement features is
    # "no basement"
    "BsmtQual": "No",
    "BsmtCond": "No",
    "BsmtExposure": "No",
    "BsmtFinType1": "No",
    "BsmtFinType2": "No",
    "BsmtFullBath": 0,
    "BsmtHalfBath": 0,
    "BsmtUnfSF": 0,
    # CentralAir: NA most likely means No
    "CentralAir": "N",
    # Condition: NA most likely means Normal
    "Condition1": "Norm",
    "Condition2": "Norm",
    # EnclosedPorch: NA most likely means no enclosed porch
    "EnclosedPorch": 0,
    # External stuff: NA most likely means average
    "ExterCond": "TA",
    "ExterQual": "TA",
    # Fence: data description says NA means "no fence"
    "Fence": "No",
    # FireplaceQu: data description says NA means "no fireplace"
    "FireplaceQu": "No",
    "Fireplaces": 0,
    # Functional: data description says NA means typical
    "Functional": "Typ",
    # GarageType etc: data description says NA for garage features is
    # "no garage"
    "GarageType": "No",
    "GarageFinish": "No",
    "GarageQual": "No",
    "GarageCond": "No",
    "GarageArea": 0,
    "GarageCars": 0,
    # HalfBath: NA most likely means no half baths above grade
    "HalfBath": 0,
    # HeatingQC: NA most likely means typical
    "HeatingQC": "TA",
    # KitchenAbvGr: NA most likely means 0
    "KitchenAbvGr": 0,
    # KitchenQual: NA most likely means typical
    "KitchenQual": "TA",
    # LotFrontage: NA most likely means no lot frontage
    "LotFrontage": 0,
    # LotShape: NA most likely means regular
    "LotShape": "Reg",
    # MasVnrType: NA most likely means no veneer
    "MasVnrType": "None",
    "MasVnrArea": 0,
    # MiscFeature: data description says NA means "no misc feature"
    "MiscFeature": "No",
    "MiscVal": 0,
    # OpenPorchSF: NA most likely means no open porch
    "OpenPorchSF": 0,
    # PavedDrive: NA most likely means not paved
    "PavedDrive": "N",
    # PoolQC: data description says NA means "no pool"
    "PoolQC": "No",
    "PoolArea": 0,
    # SaleCondition: NA most likely means normal sale
    "SaleCondition": "Normal",
    # ScreenPorch: NA most likely means no screen porch
    "ScreenPorch": 0,
    # TotRmsAbvGrd: NA most likely means 0
    "TotRmsAbvGrd": 0,
    # Utilities: NA most likely means all public utilities
    "Utilities": "AllPub",
    # WoodDeckSF: NA most likely means no wood deck
    "WoodDeckSF": 0,
}

# Some numerical features are actually categories; map their codes to labels.
_CATEGORY_LABELS = {
    "MSSubClass": {
        20: "SC20", 30: "SC30", 40: "SC40", 45: "SC45",
        50: "SC50", 60: "SC60", 70: "SC70", 75: "SC75",
        80: "SC80", 85: "SC85", 90: "SC90", 120: "SC120",
        150: "SC150", 160: "SC160", 180: "SC180", 190: "SC190",
    },
    "MoSold": {
        1: "Jan", 2: "Feb", 3: "Mar", 4: "Apr", 5: "May", 6: "Jun",
        7: "Jul", 8: "Aug", 9: "Sep", 10: "Oct", 11: "Nov", 12: "Dec",
    },
}


def housing(train):
    """Load a housing CSV and apply the fixed missing-value clean-up.

    Steps, in order:
      1. read the CSV with ``pd.read_csv``;
      2. drop the ``Id`` column;
      3. fill missing values in the columns listed in ``_FILL_VALUES``;
      4. replace the numeric codes of ``MSSubClass`` and ``MoSold`` with
         string labels (columns absent from the data are left alone by
         ``DataFrame.replace``).

    Args:
        train: Anything ``pd.read_csv`` accepts as its first argument
            (typically a path to the training CSV).

    Returns:
        The cleaned ``pandas.DataFrame``. Columns not mentioned above are
        returned unchanged.

    Raises:
        KeyError: if ``Id`` or any column listed in ``_FILL_VALUES`` is
            missing from the CSV.
    """
    frame = pd.read_csv(train)
    frame = frame.drop("Id", axis=1)

    # DataFrame.fillna(dict) silently skips unknown keys, so check up front
    # to keep failing loudly (KeyError) when an expected column is absent.
    absent = [name for name in _FILL_VALUES if name not in frame.columns]
    if absent:
        raise KeyError(
            "columns missing from input data: " + ", ".join(absent)
        )

    # A non-inplace fill replaces each column instead of writing into it, so
    # a column that was read as all-NaN (float) can take a string fill value
    # without tripping pandas' incompatible-dtype setitem deprecation.
    frame = frame.fillna(value=_FILL_VALUES)

    return frame.replace(_CATEGORY_LABELS)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment