Skip to content

Instantly share code, notes, and snippets.

@dienhoa
Created October 12, 2018 17:39
Show Gist options
  • Save dienhoa/a5adf923bd8b24b3d0eadcd61aec8c2e to your computer and use it in GitHub Desktop.
Save dienhoa/a5adf923bd8b24b3d0eadcd61aec8c2e to your computer and use it in GitHub Desktop.
Data preprocessing with the fast.ai Tabular module
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "from fastai import * # Quick access to most common functionality\nfrom fastai.tabular import * # Quick access to tabular functionality\nfrom fastai.docs import * # Access to example data provided with fastai",
"execution_count": 1,
"outputs": []
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "# Load the frame returned by get_adult() and hold out its last 2000 rows for validation.\ndf = get_adult()\ntrain_df = df.iloc[:-2000].copy()\nvalid_df = df.iloc[-2000:].copy()\n\n# Preview the training portion.\ntrain_df.head()",
"execution_count": 2,
"outputs": [
{
"output_type": "execute_result",
"execution_count": 2,
"data": {
"text/plain": " age workclass fnlwgt education education-num \\\n0 49 Private 101320 Assoc-acdm 12.0 \n1 44 Private 236746 Masters 14.0 \n2 38 Private 96185 HS-grad NaN \n3 38 Self-emp-inc 112847 Prof-school 15.0 \n4 42 Self-emp-not-inc 82297 7th-8th NaN \n\n marital-status occupation relationship race \\\n0 Married-civ-spouse NaN Wife White \n1 Divorced Exec-managerial Not-in-family White \n2 Divorced NaN Unmarried Black \n3 Married-civ-spouse Prof-specialty Husband Asian-Pac-Islander \n4 Married-civ-spouse Other-service Wife Black \n\n sex capital-gain capital-loss hours-per-week native-country >=50k \n0 Female 0 1902 40 United-States 1 \n1 Male 10520 0 45 United-States 1 \n2 Female 0 0 32 United-States 0 \n3 Male 0 0 40 United-States 1 \n4 Female 0 0 50 United-States 0 ",
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>age</th>\n <th>workclass</th>\n <th>fnlwgt</th>\n <th>education</th>\n <th>education-num</th>\n <th>marital-status</th>\n <th>occupation</th>\n <th>relationship</th>\n <th>race</th>\n <th>sex</th>\n <th>capital-gain</th>\n <th>capital-loss</th>\n <th>hours-per-week</th>\n <th>native-country</th>\n <th>&gt;=50k</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>49</td>\n <td>Private</td>\n <td>101320</td>\n <td>Assoc-acdm</td>\n <td>12.0</td>\n <td>Married-civ-spouse</td>\n <td>NaN</td>\n <td>Wife</td>\n <td>White</td>\n <td>Female</td>\n <td>0</td>\n <td>1902</td>\n <td>40</td>\n <td>United-States</td>\n <td>1</td>\n </tr>\n <tr>\n <th>1</th>\n <td>44</td>\n <td>Private</td>\n <td>236746</td>\n <td>Masters</td>\n <td>14.0</td>\n <td>Divorced</td>\n <td>Exec-managerial</td>\n <td>Not-in-family</td>\n <td>White</td>\n <td>Male</td>\n <td>10520</td>\n <td>0</td>\n <td>45</td>\n <td>United-States</td>\n <td>1</td>\n </tr>\n <tr>\n <th>2</th>\n <td>38</td>\n <td>Private</td>\n <td>96185</td>\n <td>HS-grad</td>\n <td>NaN</td>\n <td>Divorced</td>\n <td>NaN</td>\n <td>Unmarried</td>\n <td>Black</td>\n <td>Female</td>\n <td>0</td>\n <td>0</td>\n <td>32</td>\n <td>United-States</td>\n <td>0</td>\n </tr>\n <tr>\n <th>3</th>\n <td>38</td>\n <td>Self-emp-inc</td>\n <td>112847</td>\n <td>Prof-school</td>\n <td>15.0</td>\n <td>Married-civ-spouse</td>\n <td>Prof-specialty</td>\n <td>Husband</td>\n <td>Asian-Pac-Islander</td>\n <td>Male</td>\n <td>0</td>\n <td>0</td>\n <td>40</td>\n <td>United-States</td>\n <td>1</td>\n </tr>\n <tr>\n <th>4</th>\n <td>42</td>\n <td>Self-emp-not-inc</td>\n <td>82297</td>\n <td>7th-8th</td>\n <td>NaN</td>\n 
<td>Married-civ-spouse</td>\n <td>Other-service</td>\n <td>Wife</td>\n <td>Black</td>\n <td>Female</td>\n <td>0</td>\n <td>0</td>\n <td>50</td>\n <td>United-States</td>\n <td>0</td>\n </tr>\n </tbody>\n</table>\n</div>"
},
"metadata": {}
}
]
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "# Target column, and the columns to be handled as categorical variables.\ndep_var = '>=50k'\ncat_names = [\n    'workclass', 'education', 'marital-status', 'occupation',\n    'relationship', 'race', 'sex', 'native-country',\n]\n\n# Build the data object from the train/validation frames; the transforms are\n# listed in the order FillMissing, then Categorify.\ndata = tabular_data_from_df(\n    ADULT_PATH, train_df, valid_df, dep_var,\n    tfms=[FillMissing, Categorify],\n    cat_names=cat_names,\n)",
"execution_count": 3,
"outputs": []
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "from pandas_summary import DataFrameSummary\nfrom sklearn.ensemble import RandomForestRegressor, RandomForestClassifier\nfrom sklearn import metrics",
"execution_count": 4,
"outputs": []
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "# Shapes of the continuous and categorical training tensors, in that order.\ntuple(t.shape for t in (data.train_ds.conts, data.train_ds.cats))",
"execution_count": 5,
"outputs": [
{
"output_type": "execute_result",
"execution_count": 5,
"data": {
"text/plain": "(torch.Size([30561, 6]), torch.Size([30561, 9]))"
},
"metadata": {}
}
]
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "# Pull the preprocessed training tensors out as NumPy arrays.\ncats = data.train_ds.cats.numpy()\nconts = data.train_ds.conts.numpy()",
"execution_count": 6,
"outputs": []
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "# Stack the categorical columns first and the continuous columns after them.\n# NOTE: this rebinds `df`, which held the DataFrame loaded earlier, to a NumPy array.\ndf = np.hstack((cats, conts))",
"execution_count": 7,
"outputs": []
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "# One row per training example; columns = categorical + continuous.\ndf.shape",
"execution_count": 8,
"outputs": [
{
"output_type": "execute_result",
"execution_count": 8,
"data": {
"text/plain": "(30561, 15)"
},
"metadata": {}
}
]
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "# Training targets as a NumPy array, aligned row-for-row with `df`.\ny = data.train_ds.y.numpy()",
"execution_count": 9,
"outputs": []
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "# Fix random_state so the forest (and its score) is reproducible across\n# Restart & Run All; n_jobs=-1 uses every available core.\nm = RandomForestClassifier(n_jobs=-1, random_state=42)",
"execution_count": 10,
"outputs": []
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "m.fit(df, y)\n\n# Accuracy on the rows the forest was fitted on is optimistic, so also score the\n# held-out validation split. Assumes `data.valid_ds` exposes cats/conts/y like\n# `data.train_ds` does -- TODO confirm against the installed fastai version.\nvalid_x = np.concatenate((data.valid_ds.cats.numpy(), data.valid_ds.conts.numpy()), axis=1)\nvalid_y = data.valid_ds.y.numpy()\n\n# (training accuracy, validation accuracy)\nm.score(df, y), m.score(valid_x, valid_y)",
"execution_count": null,
"outputs": []
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "",
"execution_count": null,
"outputs": []
}
],
"metadata": {
"kernelspec": {
"name": "python3",
"display_name": "Python 3",
"language": "python"
},
"language_info": {
"name": "python",
"version": "3.6.4",
"mimetype": "text/x-python",
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"pygments_lexer": "ipython3",
"nbconvert_exporter": "python",
"file_extension": ".py"
},
"gist": {
"id": "",
"data": {
"description": "data preprocessing with Tabular Module fast.ai",
"public": true
}
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment