Created
October 12, 2018 17:39
-
-
Save dienhoa/a5adf923bd8b24b3d0eadcd61aec8c2e to your computer and use it in GitHub Desktop.
Data preprocessing with the fast.ai Tabular module
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"metadata": { | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": "from fastai import * # Quick access to most common functionality\nfrom fastai.tabular import * # Quick access to tabular functionality\nfrom fastai.docs import * # Access to example data provided with fastai", | |
"execution_count": 1, | |
"outputs": [] | |
}, | |
{ | |
"metadata": { | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": "df = get_adult()\ntrain_df, valid_df = df[:-2000].copy(),df[-2000:].copy()\ntrain_df.head()", | |
"execution_count": 2, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"execution_count": 2, | |
"data": { | |
"text/plain": " age workclass fnlwgt education education-num \\\n0 49 Private 101320 Assoc-acdm 12.0 \n1 44 Private 236746 Masters 14.0 \n2 38 Private 96185 HS-grad NaN \n3 38 Self-emp-inc 112847 Prof-school 15.0 \n4 42 Self-emp-not-inc 82297 7th-8th NaN \n\n marital-status occupation relationship race \\\n0 Married-civ-spouse NaN Wife White \n1 Divorced Exec-managerial Not-in-family White \n2 Divorced NaN Unmarried Black \n3 Married-civ-spouse Prof-specialty Husband Asian-Pac-Islander \n4 Married-civ-spouse Other-service Wife Black \n\n sex capital-gain capital-loss hours-per-week native-country >=50k \n0 Female 0 1902 40 United-States 1 \n1 Male 10520 0 45 United-States 1 \n2 Female 0 0 32 United-States 0 \n3 Male 0 0 40 United-States 1 \n4 Female 0 0 50 United-States 0 ", | |
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>age</th>\n <th>workclass</th>\n <th>fnlwgt</th>\n <th>education</th>\n <th>education-num</th>\n <th>marital-status</th>\n <th>occupation</th>\n <th>relationship</th>\n <th>race</th>\n <th>sex</th>\n <th>capital-gain</th>\n <th>capital-loss</th>\n <th>hours-per-week</th>\n <th>native-country</th>\n <th>>=50k</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>49</td>\n <td>Private</td>\n <td>101320</td>\n <td>Assoc-acdm</td>\n <td>12.0</td>\n <td>Married-civ-spouse</td>\n <td>NaN</td>\n <td>Wife</td>\n <td>White</td>\n <td>Female</td>\n <td>0</td>\n <td>1902</td>\n <td>40</td>\n <td>United-States</td>\n <td>1</td>\n </tr>\n <tr>\n <th>1</th>\n <td>44</td>\n <td>Private</td>\n <td>236746</td>\n <td>Masters</td>\n <td>14.0</td>\n <td>Divorced</td>\n <td>Exec-managerial</td>\n <td>Not-in-family</td>\n <td>White</td>\n <td>Male</td>\n <td>10520</td>\n <td>0</td>\n <td>45</td>\n <td>United-States</td>\n <td>1</td>\n </tr>\n <tr>\n <th>2</th>\n <td>38</td>\n <td>Private</td>\n <td>96185</td>\n <td>HS-grad</td>\n <td>NaN</td>\n <td>Divorced</td>\n <td>NaN</td>\n <td>Unmarried</td>\n <td>Black</td>\n <td>Female</td>\n <td>0</td>\n <td>0</td>\n <td>32</td>\n <td>United-States</td>\n <td>0</td>\n </tr>\n <tr>\n <th>3</th>\n <td>38</td>\n <td>Self-emp-inc</td>\n <td>112847</td>\n <td>Prof-school</td>\n <td>15.0</td>\n <td>Married-civ-spouse</td>\n <td>Prof-specialty</td>\n <td>Husband</td>\n <td>Asian-Pac-Islander</td>\n <td>Male</td>\n <td>0</td>\n <td>0</td>\n <td>40</td>\n <td>United-States</td>\n <td>1</td>\n </tr>\n <tr>\n <th>4</th>\n <td>42</td>\n <td>Self-emp-not-inc</td>\n <td>82297</td>\n <td>7th-8th</td>\n <td>NaN</td>\n 
<td>Married-civ-spouse</td>\n <td>Other-service</td>\n <td>Wife</td>\n <td>Black</td>\n <td>Female</td>\n <td>0</td>\n <td>0</td>\n <td>50</td>\n <td>United-States</td>\n <td>0</td>\n </tr>\n </tbody>\n</table>\n</div>" | |
}, | |
"metadata": {} | |
} | |
] | |
}, | |
{ | |
"metadata": { | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": "dep_var = '>=50k'\ncat_names = ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'native-country']\ndata = tabular_data_from_df(ADULT_PATH, train_df, valid_df, dep_var, tfms=[FillMissing, Categorify], cat_names=cat_names)", | |
"execution_count": 3, | |
"outputs": [] | |
}, | |
{ | |
"metadata": { | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": "from pandas_summary import DataFrameSummary\nfrom sklearn.ensemble import RandomForestRegressor, RandomForestClassifier\nfrom sklearn import metrics", | |
"execution_count": 4, | |
"outputs": [] | |
}, | |
{ | |
"metadata": { | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": "data.train_ds.conts.shape, data.train_ds.cats.shape", | |
"execution_count": 5, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"execution_count": 5, | |
"data": { | |
"text/plain": "(torch.Size([30561, 6]), torch.Size([30561, 9]))" | |
}, | |
"metadata": {} | |
} | |
] | |
}, | |
{ | |
"metadata": { | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": "cats,conts = data.train_ds.cats.numpy(), data.train_ds.conts.numpy()", | |
"execution_count": 6, | |
"outputs": [] | |
}, | |
{ | |
"metadata": { | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": "df = np.concatenate((cats, conts), axis=1)", | |
"execution_count": 7, | |
"outputs": [] | |
}, | |
{ | |
"metadata": { | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": "df.shape", | |
"execution_count": 8, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"execution_count": 8, | |
"data": { | |
"text/plain": "(30561, 15)" | |
}, | |
"metadata": {} | |
} | |
] | |
}, | |
{ | |
"metadata": { | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": "y = data.train_ds.y.numpy()", | |
"execution_count": 9, | |
"outputs": [] | |
}, | |
{ | |
"metadata": { | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": "m = RandomForestClassifier(n_jobs=-1)", | |
"execution_count": 10, | |
"outputs": [] | |
}, | |
{ | |
"metadata": { | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": "m.fit(df, y)\nm.score(df,y)", | |
"execution_count": 11, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"execution_count": 11, | |
"data": { | |
"text/plain": "0.9880893949805307" | |
}, | |
"metadata": {} | |
} | |
] | |
}, | |
{ | |
"metadata": { | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": "", | |
"execution_count": null, | |
"outputs": [] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"name": "python3", | |
"display_name": "Python 3", | |
"language": "python" | |
}, | |
"language_info": { | |
"name": "python", | |
"version": "3.6.4", | |
"mimetype": "text/x-python", | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"pygments_lexer": "ipython3", | |
"nbconvert_exporter": "python", | |
"file_extension": ".py" | |
}, | |
"gist": { | |
"id": "", | |
"data": { | |
"description": "data preprocessing with Tabular Module fast.ai", | |
"public": true | |
} | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 2 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment