Skip to content

Instantly share code, notes, and snippets.

@h5li
Created October 31, 2018 21:09
Show Gist options
  • Save h5li/ab5a18045c69812664941a5c58f97244 to your computer and use it in GitHub Desktop.
Save h5li/ab5a18045c69812664941a5c58f97244 to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"from sklearn.ensemble import RandomForestRegressor"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# Load the training and testing CSV files, one dataframe per file\n",
"train_data, test_data = (pd.read_csv(csv_path) for csv_path in ('train.csv', 'test.csv'))"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style>\n",
" .dataframe thead tr:only-child th {\n",
" text-align: right;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: left;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Id</th>\n",
" <th>MSSubClass</th>\n",
" <th>MSZoning</th>\n",
" <th>LotFrontage</th>\n",
" <th>LotArea</th>\n",
" <th>Street</th>\n",
" <th>Alley</th>\n",
" <th>LotShape</th>\n",
" <th>LandContour</th>\n",
" <th>Utilities</th>\n",
" <th>...</th>\n",
" <th>ScreenPorch</th>\n",
" <th>PoolArea</th>\n",
" <th>PoolQC</th>\n",
" <th>Fence</th>\n",
" <th>MiscFeature</th>\n",
" <th>MiscVal</th>\n",
" <th>MoSold</th>\n",
" <th>YrSold</th>\n",
" <th>SaleType</th>\n",
" <th>SaleCondition</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1461</td>\n",
" <td>20</td>\n",
" <td>RH</td>\n",
" <td>80.0</td>\n",
" <td>11622</td>\n",
" <td>Pave</td>\n",
" <td>NaN</td>\n",
" <td>Reg</td>\n",
" <td>Lvl</td>\n",
" <td>AllPub</td>\n",
" <td>...</td>\n",
" <td>120</td>\n",
" <td>0</td>\n",
" <td>NaN</td>\n",
" <td>MnPrv</td>\n",
" <td>NaN</td>\n",
" <td>0</td>\n",
" <td>6</td>\n",
" <td>2010</td>\n",
" <td>WD</td>\n",
" <td>Normal</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1462</td>\n",
" <td>20</td>\n",
" <td>RL</td>\n",
" <td>81.0</td>\n",
" <td>14267</td>\n",
" <td>Pave</td>\n",
" <td>NaN</td>\n",
" <td>IR1</td>\n",
" <td>Lvl</td>\n",
" <td>AllPub</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>Gar2</td>\n",
" <td>12500</td>\n",
" <td>6</td>\n",
" <td>2010</td>\n",
" <td>WD</td>\n",
" <td>Normal</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>1463</td>\n",
" <td>60</td>\n",
" <td>RL</td>\n",
" <td>74.0</td>\n",
" <td>13830</td>\n",
" <td>Pave</td>\n",
" <td>NaN</td>\n",
" <td>IR1</td>\n",
" <td>Lvl</td>\n",
" <td>AllPub</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>NaN</td>\n",
" <td>MnPrv</td>\n",
" <td>NaN</td>\n",
" <td>0</td>\n",
" <td>3</td>\n",
" <td>2010</td>\n",
" <td>WD</td>\n",
" <td>Normal</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>1464</td>\n",
" <td>60</td>\n",
" <td>RL</td>\n",
" <td>78.0</td>\n",
" <td>9978</td>\n",
" <td>Pave</td>\n",
" <td>NaN</td>\n",
" <td>IR1</td>\n",
" <td>Lvl</td>\n",
" <td>AllPub</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>0</td>\n",
" <td>6</td>\n",
" <td>2010</td>\n",
" <td>WD</td>\n",
" <td>Normal</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>1465</td>\n",
" <td>120</td>\n",
" <td>RL</td>\n",
" <td>43.0</td>\n",
" <td>5005</td>\n",
" <td>Pave</td>\n",
" <td>NaN</td>\n",
" <td>IR1</td>\n",
" <td>HLS</td>\n",
" <td>AllPub</td>\n",
" <td>...</td>\n",
" <td>144</td>\n",
" <td>0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>2010</td>\n",
" <td>WD</td>\n",
" <td>Normal</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>5 rows × 80 columns</p>\n",
"</div>"
],
"text/plain": [
" Id MSSubClass MSZoning LotFrontage LotArea Street Alley LotShape \\\n",
"0 1461 20 RH 80.0 11622 Pave NaN Reg \n",
"1 1462 20 RL 81.0 14267 Pave NaN IR1 \n",
"2 1463 60 RL 74.0 13830 Pave NaN IR1 \n",
"3 1464 60 RL 78.0 9978 Pave NaN IR1 \n",
"4 1465 120 RL 43.0 5005 Pave NaN IR1 \n",
"\n",
" LandContour Utilities ... ScreenPorch PoolArea PoolQC Fence \\\n",
"0 Lvl AllPub ... 120 0 NaN MnPrv \n",
"1 Lvl AllPub ... 0 0 NaN NaN \n",
"2 Lvl AllPub ... 0 0 NaN MnPrv \n",
"3 Lvl AllPub ... 0 0 NaN NaN \n",
"4 HLS AllPub ... 144 0 NaN NaN \n",
"\n",
" MiscFeature MiscVal MoSold YrSold SaleType SaleCondition \n",
"0 NaN 0 6 2010 WD Normal \n",
"1 Gar2 12500 6 2010 WD Normal \n",
"2 NaN 0 3 2010 WD Normal \n",
"3 NaN 0 6 2010 WD Normal \n",
"4 NaN 0 1 2010 WD Normal \n",
"\n",
"[5 rows x 80 columns]"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Look at the data we will be working with. Note how many missing (NaN) values and string-valued columns there are\n",
"test_data.head()"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# Train data processing: keep only the fully populated, non-object columns\n",
"train_data = (\n",
"    train_data\n",
"    .dropna(axis=1)                     # discard every column that contains at least one NaN\n",
"    .select_dtypes(exclude=['object'])  # discard object-dtype (e.g. string) columns\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 122,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# Test data processing: same clean-up as the training data\n",
"test_data = (\n",
"    test_data\n",
"    .dropna(axis=1)                     # discard every column that contains at least one NaN\n",
"    .select_dtypes(exclude=['object'])  # discard object-dtype (e.g. string) columns\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 123,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Index([u'Id', u'MSSubClass', u'LotArea', u'OverallQual', u'OverallCond',\n",
" u'YearBuilt', u'YearRemodAdd', u'BsmtFinSF1', u'BsmtFinSF2',\n",
" u'BsmtUnfSF', u'TotalBsmtSF', u'1stFlrSF', u'2ndFlrSF', u'LowQualFinSF',\n",
" u'GrLivArea', u'BsmtFullBath', u'BsmtHalfBath', u'FullBath',\n",
" u'HalfBath', u'BedroomAbvGr', u'KitchenAbvGr', u'TotRmsAbvGrd',\n",
" u'Fireplaces', u'GarageCars', u'GarageArea', u'WoodDeckSF',\n",
" u'OpenPorchSF', u'EnclosedPorch', u'3SsnPorch', u'ScreenPorch',\n",
" u'PoolArea', u'MiscVal', u'MoSold', u'YrSold', u'SalePrice'],\n",
" dtype='object')"
]
},
"execution_count": 123,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# The training dataframe has more columns than the test dataframe, so the two need to be brought to the same set of columns\n",
"train_data.columns"
]
},
{
"cell_type": "code",
"execution_count": 124,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# Drop all columns in the training dataframe that aren't in the test dataframe.\n",
"# The kept columns are derived from test_data instead of a hard-coded list, so this cell can be\n",
"# re-run without a KeyError and keeps working if the set of NaN-free test columns changes.\n",
"# SalePrice only exists in the training data and has to survive as the label.\n",
"shared_columns = [col for col in train_data.columns if col in test_data.columns or col == 'SalePrice']\n",
"train_data = train_data[shared_columns]"
]
},
{
"cell_type": "code",
"execution_count": 133,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# Separate test and train into feature and label dataframes\n",
"# Pick the features by name (every training column except the Id and the SalePrice label)\n",
"# instead of by position, so the split does not depend on where those two columns happen to sit\n",
"columns_minus_saleprice = train_data.columns.drop(['Id', 'SalePrice'])\n",
"X_train = train_data[columns_minus_saleprice] # All columns minus Id and saleprice will make up the X_train dataframe\n",
"y_train = train_data['SalePrice'] # The column with saleprice will make up the y_train dataframe\n",
"# Index the test data with the same column list so both feature matrices share the same columns in the\n",
"# same order; a feature missing from test_data raises a KeyError here instead of misaligning silently\n",
"X_test = test_data[columns_minus_saleprice]"
]
},
{
"cell_type": "code",
"execution_count": 135,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# Initialize a random forest regressor; the fixed seed makes repeated runs build the same forest\n",
"regr = RandomForestRegressor(random_state=0)"
]
},
{
"cell_type": "code",
"execution_count": 136,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,\n",
" max_features='auto', max_leaf_nodes=None,\n",
" min_impurity_decrease=0.0, min_impurity_split=None,\n",
" min_samples_leaf=1, min_samples_split=2,\n",
" min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,\n",
" oob_score=False, random_state=None, verbose=0, warm_start=False)"
]
},
"execution_count": 136,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Fit the regressor with X and y data\n",
"regr.fit(X_train, y_train)"
]
},
{
"cell_type": "code",
"execution_count": 138,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# Use the fitted regressor to predict the sale price of the test data\n",
"predictions = regr.predict(X_test)"
]
},
{
"cell_type": "code",
"execution_count": 140,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# Format the output so that it can be submitted to Kaggle as a csv\n",
"list_of_house_ids = test_data[\"Id\"]\n",
"# Build the frame column by column instead of from a zip object (which is only a list on Python 2).\n",
"# The price column is spelled \"SalePrice\", matching the label name used in the training data;\n",
"# `columns=` pins the Id-then-price order regardless of dict ordering.\n",
"submission = pd.DataFrame(\n",
"    {\"Id\": list_of_house_ids.values, \"SalePrice\": predictions.astype(int)},  # prices are cast to int, as before\n",
"    columns=[\"Id\", \"SalePrice\"]\n",
")\n",
"submission.to_csv('submission1.csv', header=True, index=False) # Output to csv and remove index column"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 2",
"language": "python",
"name": "python2"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.13"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment