Created
August 5, 2019 18:49
-
-
Save ankschoubey/37dff38bec18a34b1b0916e4f8243a10 to your computer and use it in GitHub Desktop.
A lazy programmer's way to quickly get the continuous and categorical variable names. Especially useful for FastAI Tabular.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"nbformat": 4, | |
"nbformat_minor": 0, | |
"metadata": { | |
"colab": { | |
"name": "20190705_pandas_profiling_separating_catvars_contvars.ipynb", | |
"version": "0.3.2", | |
"provenance": [], | |
"collapsed_sections": [] | |
}, | |
"kernelspec": { | |
"name": "python3", | |
"display_name": "Python 3" | |
} | |
}, | |
"cells": [ | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "xmzR6qE6Ft4G", | |
"colab_type": "text" | |
}, | |
"source": [ | |
"In this notebook, I demonstrate a way to use pandas-profiling to quickly get the categorical and continuous variable column names from a pandas DataFrame.\n", | |
"\n", | |
"\n", | |
"This can be useful when setting up FastAI Tabular, which takes the categorical and continuous column names as separate lists. #LessTyping" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "2Of9GkXPFaHj", | |
"colab_type": "text" | |
}, | |
"source": [ | |
"## Installation" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "Y81xXRP5D7vG", | |
"colab_type": "code", | |
"colab": {} | |
}, | |
"source": [ | |
"!pip install pandas-profiling" | |
], | |
"execution_count": 0, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "9c10xENSGNsR", | |
"colab_type": "text" | |
}, | |
"source": [ | |
"## Google Colab Specific" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "Mt5uyxogEC3X", | |
"colab_type": "code", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 54 | |
}, | |
"outputId": "969e9120-af6d-4cc5-8dff-e7787e6181c7" | |
}, | |
"source": [ | |
"from google.colab import drive\n", | |
"drive.mount('/content/drive')" | |
], | |
"execution_count": 2, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"text": [ | |
"Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount(\"/content/drive\", force_remount=True).\n" | |
], | |
"name": "stdout" | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "gMicj8X5EEN2", | |
"colab_type": "code", | |
"colab": {} | |
}, | |
"source": [ | |
"!cp /content/drive/My\\ Drive/kaggle.json ~/.kaggle" | |
], | |
"execution_count": 0, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "8zXZOrDbGKuQ", | |
"colab_type": "text" | |
}, | |
"source": [ | |
"## Download Dataset" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "0LxozShKEIMs", | |
"colab_type": "code", | |
"colab": {} | |
}, | |
"source": [ | |
"!kaggle competitions download -c house-prices-advanced-regression-techniques" | |
], | |
"execution_count": 0, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "yS0jB882GS98", | |
"colab_type": "text" | |
}, | |
"source": [ | |
"## Code" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "ZAdHZHMREKJ8", | |
"colab_type": "code", | |
"colab": {} | |
}, | |
"source": [ | |
"import pandas as pd\n", | |
"import pandas_profiling as pp" | |
], | |
"execution_count": 0, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "9hPUmzCMENoE", | |
"colab_type": "code", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 253 | |
}, | |
"outputId": "cd9493bc-e1a6-4c9e-892c-aeedb67a9224" | |
}, | |
"source": [ | |
"df = pd.read_csv('train.csv')\n", | |
"df.head()" | |
], | |
"execution_count": 8, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>Id</th>\n", | |
" <th>MSSubClass</th>\n", | |
" <th>MSZoning</th>\n", | |
" <th>LotFrontage</th>\n", | |
" <th>LotArea</th>\n", | |
" <th>Street</th>\n", | |
" <th>Alley</th>\n", | |
" <th>LotShape</th>\n", | |
" <th>LandContour</th>\n", | |
" <th>Utilities</th>\n", | |
" <th>LotConfig</th>\n", | |
" <th>LandSlope</th>\n", | |
" <th>Neighborhood</th>\n", | |
" <th>Condition1</th>\n", | |
" <th>Condition2</th>\n", | |
" <th>BldgType</th>\n", | |
" <th>HouseStyle</th>\n", | |
" <th>OverallQual</th>\n", | |
" <th>OverallCond</th>\n", | |
" <th>YearBuilt</th>\n", | |
" <th>YearRemodAdd</th>\n", | |
" <th>RoofStyle</th>\n", | |
" <th>RoofMatl</th>\n", | |
" <th>Exterior1st</th>\n", | |
" <th>Exterior2nd</th>\n", | |
" <th>MasVnrType</th>\n", | |
" <th>MasVnrArea</th>\n", | |
" <th>ExterQual</th>\n", | |
" <th>ExterCond</th>\n", | |
" <th>Foundation</th>\n", | |
" <th>BsmtQual</th>\n", | |
" <th>BsmtCond</th>\n", | |
" <th>BsmtExposure</th>\n", | |
" <th>BsmtFinType1</th>\n", | |
" <th>BsmtFinSF1</th>\n", | |
" <th>BsmtFinType2</th>\n", | |
" <th>BsmtFinSF2</th>\n", | |
" <th>BsmtUnfSF</th>\n", | |
" <th>TotalBsmtSF</th>\n", | |
" <th>Heating</th>\n", | |
" <th>...</th>\n", | |
" <th>CentralAir</th>\n", | |
" <th>Electrical</th>\n", | |
" <th>1stFlrSF</th>\n", | |
" <th>2ndFlrSF</th>\n", | |
" <th>LowQualFinSF</th>\n", | |
" <th>GrLivArea</th>\n", | |
" <th>BsmtFullBath</th>\n", | |
" <th>BsmtHalfBath</th>\n", | |
" <th>FullBath</th>\n", | |
" <th>HalfBath</th>\n", | |
" <th>BedroomAbvGr</th>\n", | |
" <th>KitchenAbvGr</th>\n", | |
" <th>KitchenQual</th>\n", | |
" <th>TotRmsAbvGrd</th>\n", | |
" <th>Functional</th>\n", | |
" <th>Fireplaces</th>\n", | |
" <th>FireplaceQu</th>\n", | |
" <th>GarageType</th>\n", | |
" <th>GarageYrBlt</th>\n", | |
" <th>GarageFinish</th>\n", | |
" <th>GarageCars</th>\n", | |
" <th>GarageArea</th>\n", | |
" <th>GarageQual</th>\n", | |
" <th>GarageCond</th>\n", | |
" <th>PavedDrive</th>\n", | |
" <th>WoodDeckSF</th>\n", | |
" <th>OpenPorchSF</th>\n", | |
" <th>EnclosedPorch</th>\n", | |
" <th>3SsnPorch</th>\n", | |
" <th>ScreenPorch</th>\n", | |
" <th>PoolArea</th>\n", | |
" <th>PoolQC</th>\n", | |
" <th>Fence</th>\n", | |
" <th>MiscFeature</th>\n", | |
" <th>MiscVal</th>\n", | |
" <th>MoSold</th>\n", | |
" <th>YrSold</th>\n", | |
" <th>SaleType</th>\n", | |
" <th>SaleCondition</th>\n", | |
" <th>SalePrice</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>1</td>\n", | |
" <td>60</td>\n", | |
" <td>RL</td>\n", | |
" <td>65.0</td>\n", | |
" <td>8450</td>\n", | |
" <td>Pave</td>\n", | |
" <td>NaN</td>\n", | |
" <td>Reg</td>\n", | |
" <td>Lvl</td>\n", | |
" <td>AllPub</td>\n", | |
" <td>Inside</td>\n", | |
" <td>Gtl</td>\n", | |
" <td>CollgCr</td>\n", | |
" <td>Norm</td>\n", | |
" <td>Norm</td>\n", | |
" <td>1Fam</td>\n", | |
" <td>2Story</td>\n", | |
" <td>7</td>\n", | |
" <td>5</td>\n", | |
" <td>2003</td>\n", | |
" <td>2003</td>\n", | |
" <td>Gable</td>\n", | |
" <td>CompShg</td>\n", | |
" <td>VinylSd</td>\n", | |
" <td>VinylSd</td>\n", | |
" <td>BrkFace</td>\n", | |
" <td>196.0</td>\n", | |
" <td>Gd</td>\n", | |
" <td>TA</td>\n", | |
" <td>PConc</td>\n", | |
" <td>Gd</td>\n", | |
" <td>TA</td>\n", | |
" <td>No</td>\n", | |
" <td>GLQ</td>\n", | |
" <td>706</td>\n", | |
" <td>Unf</td>\n", | |
" <td>0</td>\n", | |
" <td>150</td>\n", | |
" <td>856</td>\n", | |
" <td>GasA</td>\n", | |
" <td>...</td>\n", | |
" <td>Y</td>\n", | |
" <td>SBrkr</td>\n", | |
" <td>856</td>\n", | |
" <td>854</td>\n", | |
" <td>0</td>\n", | |
" <td>1710</td>\n", | |
" <td>1</td>\n", | |
" <td>0</td>\n", | |
" <td>2</td>\n", | |
" <td>1</td>\n", | |
" <td>3</td>\n", | |
" <td>1</td>\n", | |
" <td>Gd</td>\n", | |
" <td>8</td>\n", | |
" <td>Typ</td>\n", | |
" <td>0</td>\n", | |
" <td>NaN</td>\n", | |
" <td>Attchd</td>\n", | |
" <td>2003.0</td>\n", | |
" <td>RFn</td>\n", | |
" <td>2</td>\n", | |
" <td>548</td>\n", | |
" <td>TA</td>\n", | |
" <td>TA</td>\n", | |
" <td>Y</td>\n", | |
" <td>0</td>\n", | |
" <td>61</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" <td>0</td>\n", | |
" <td>2</td>\n", | |
" <td>2008</td>\n", | |
" <td>WD</td>\n", | |
" <td>Normal</td>\n", | |
" <td>208500</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>2</td>\n", | |
" <td>20</td>\n", | |
" <td>RL</td>\n", | |
" <td>80.0</td>\n", | |
" <td>9600</td>\n", | |
" <td>Pave</td>\n", | |
" <td>NaN</td>\n", | |
" <td>Reg</td>\n", | |
" <td>Lvl</td>\n", | |
" <td>AllPub</td>\n", | |
" <td>FR2</td>\n", | |
" <td>Gtl</td>\n", | |
" <td>Veenker</td>\n", | |
" <td>Feedr</td>\n", | |
" <td>Norm</td>\n", | |
" <td>1Fam</td>\n", | |
" <td>1Story</td>\n", | |
" <td>6</td>\n", | |
" <td>8</td>\n", | |
" <td>1976</td>\n", | |
" <td>1976</td>\n", | |
" <td>Gable</td>\n", | |
" <td>CompShg</td>\n", | |
" <td>MetalSd</td>\n", | |
" <td>MetalSd</td>\n", | |
" <td>None</td>\n", | |
" <td>0.0</td>\n", | |
" <td>TA</td>\n", | |
" <td>TA</td>\n", | |
" <td>CBlock</td>\n", | |
" <td>Gd</td>\n", | |
" <td>TA</td>\n", | |
" <td>Gd</td>\n", | |
" <td>ALQ</td>\n", | |
" <td>978</td>\n", | |
" <td>Unf</td>\n", | |
" <td>0</td>\n", | |
" <td>284</td>\n", | |
" <td>1262</td>\n", | |
" <td>GasA</td>\n", | |
" <td>...</td>\n", | |
" <td>Y</td>\n", | |
" <td>SBrkr</td>\n", | |
" <td>1262</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>1262</td>\n", | |
" <td>0</td>\n", | |
" <td>1</td>\n", | |
" <td>2</td>\n", | |
" <td>0</td>\n", | |
" <td>3</td>\n", | |
" <td>1</td>\n", | |
" <td>TA</td>\n", | |
" <td>6</td>\n", | |
" <td>Typ</td>\n", | |
" <td>1</td>\n", | |
" <td>TA</td>\n", | |
" <td>Attchd</td>\n", | |
" <td>1976.0</td>\n", | |
" <td>RFn</td>\n", | |
" <td>2</td>\n", | |
" <td>460</td>\n", | |
" <td>TA</td>\n", | |
" <td>TA</td>\n", | |
" <td>Y</td>\n", | |
" <td>298</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" <td>0</td>\n", | |
" <td>5</td>\n", | |
" <td>2007</td>\n", | |
" <td>WD</td>\n", | |
" <td>Normal</td>\n", | |
" <td>181500</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>3</td>\n", | |
" <td>60</td>\n", | |
" <td>RL</td>\n", | |
" <td>68.0</td>\n", | |
" <td>11250</td>\n", | |
" <td>Pave</td>\n", | |
" <td>NaN</td>\n", | |
" <td>IR1</td>\n", | |
" <td>Lvl</td>\n", | |
" <td>AllPub</td>\n", | |
" <td>Inside</td>\n", | |
" <td>Gtl</td>\n", | |
" <td>CollgCr</td>\n", | |
" <td>Norm</td>\n", | |
" <td>Norm</td>\n", | |
" <td>1Fam</td>\n", | |
" <td>2Story</td>\n", | |
" <td>7</td>\n", | |
" <td>5</td>\n", | |
" <td>2001</td>\n", | |
" <td>2002</td>\n", | |
" <td>Gable</td>\n", | |
" <td>CompShg</td>\n", | |
" <td>VinylSd</td>\n", | |
" <td>VinylSd</td>\n", | |
" <td>BrkFace</td>\n", | |
" <td>162.0</td>\n", | |
" <td>Gd</td>\n", | |
" <td>TA</td>\n", | |
" <td>PConc</td>\n", | |
" <td>Gd</td>\n", | |
" <td>TA</td>\n", | |
" <td>Mn</td>\n", | |
" <td>GLQ</td>\n", | |
" <td>486</td>\n", | |
" <td>Unf</td>\n", | |
" <td>0</td>\n", | |
" <td>434</td>\n", | |
" <td>920</td>\n", | |
" <td>GasA</td>\n", | |
" <td>...</td>\n", | |
" <td>Y</td>\n", | |
" <td>SBrkr</td>\n", | |
" <td>920</td>\n", | |
" <td>866</td>\n", | |
" <td>0</td>\n", | |
" <td>1786</td>\n", | |
" <td>1</td>\n", | |
" <td>0</td>\n", | |
" <td>2</td>\n", | |
" <td>1</td>\n", | |
" <td>3</td>\n", | |
" <td>1</td>\n", | |
" <td>Gd</td>\n", | |
" <td>6</td>\n", | |
" <td>Typ</td>\n", | |
" <td>1</td>\n", | |
" <td>TA</td>\n", | |
" <td>Attchd</td>\n", | |
" <td>2001.0</td>\n", | |
" <td>RFn</td>\n", | |
" <td>2</td>\n", | |
" <td>608</td>\n", | |
" <td>TA</td>\n", | |
" <td>TA</td>\n", | |
" <td>Y</td>\n", | |
" <td>0</td>\n", | |
" <td>42</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" <td>0</td>\n", | |
" <td>9</td>\n", | |
" <td>2008</td>\n", | |
" <td>WD</td>\n", | |
" <td>Normal</td>\n", | |
" <td>223500</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td>4</td>\n", | |
" <td>70</td>\n", | |
" <td>RL</td>\n", | |
" <td>60.0</td>\n", | |
" <td>9550</td>\n", | |
" <td>Pave</td>\n", | |
" <td>NaN</td>\n", | |
" <td>IR1</td>\n", | |
" <td>Lvl</td>\n", | |
" <td>AllPub</td>\n", | |
" <td>Corner</td>\n", | |
" <td>Gtl</td>\n", | |
" <td>Crawfor</td>\n", | |
" <td>Norm</td>\n", | |
" <td>Norm</td>\n", | |
" <td>1Fam</td>\n", | |
" <td>2Story</td>\n", | |
" <td>7</td>\n", | |
" <td>5</td>\n", | |
" <td>1915</td>\n", | |
" <td>1970</td>\n", | |
" <td>Gable</td>\n", | |
" <td>CompShg</td>\n", | |
" <td>Wd Sdng</td>\n", | |
" <td>Wd Shng</td>\n", | |
" <td>None</td>\n", | |
" <td>0.0</td>\n", | |
" <td>TA</td>\n", | |
" <td>TA</td>\n", | |
" <td>BrkTil</td>\n", | |
" <td>TA</td>\n", | |
" <td>Gd</td>\n", | |
" <td>No</td>\n", | |
" <td>ALQ</td>\n", | |
" <td>216</td>\n", | |
" <td>Unf</td>\n", | |
" <td>0</td>\n", | |
" <td>540</td>\n", | |
" <td>756</td>\n", | |
" <td>GasA</td>\n", | |
" <td>...</td>\n", | |
" <td>Y</td>\n", | |
" <td>SBrkr</td>\n", | |
" <td>961</td>\n", | |
" <td>756</td>\n", | |
" <td>0</td>\n", | |
" <td>1717</td>\n", | |
" <td>1</td>\n", | |
" <td>0</td>\n", | |
" <td>1</td>\n", | |
" <td>0</td>\n", | |
" <td>3</td>\n", | |
" <td>1</td>\n", | |
" <td>Gd</td>\n", | |
" <td>7</td>\n", | |
" <td>Typ</td>\n", | |
" <td>1</td>\n", | |
" <td>Gd</td>\n", | |
" <td>Detchd</td>\n", | |
" <td>1998.0</td>\n", | |
" <td>Unf</td>\n", | |
" <td>3</td>\n", | |
" <td>642</td>\n", | |
" <td>TA</td>\n", | |
" <td>TA</td>\n", | |
" <td>Y</td>\n", | |
" <td>0</td>\n", | |
" <td>35</td>\n", | |
" <td>272</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" <td>0</td>\n", | |
" <td>2</td>\n", | |
" <td>2006</td>\n", | |
" <td>WD</td>\n", | |
" <td>Abnorml</td>\n", | |
" <td>140000</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td>5</td>\n", | |
" <td>60</td>\n", | |
" <td>RL</td>\n", | |
" <td>84.0</td>\n", | |
" <td>14260</td>\n", | |
" <td>Pave</td>\n", | |
" <td>NaN</td>\n", | |
" <td>IR1</td>\n", | |
" <td>Lvl</td>\n", | |
" <td>AllPub</td>\n", | |
" <td>FR2</td>\n", | |
" <td>Gtl</td>\n", | |
" <td>NoRidge</td>\n", | |
" <td>Norm</td>\n", | |
" <td>Norm</td>\n", | |
" <td>1Fam</td>\n", | |
" <td>2Story</td>\n", | |
" <td>8</td>\n", | |
" <td>5</td>\n", | |
" <td>2000</td>\n", | |
" <td>2000</td>\n", | |
" <td>Gable</td>\n", | |
" <td>CompShg</td>\n", | |
" <td>VinylSd</td>\n", | |
" <td>VinylSd</td>\n", | |
" <td>BrkFace</td>\n", | |
" <td>350.0</td>\n", | |
" <td>Gd</td>\n", | |
" <td>TA</td>\n", | |
" <td>PConc</td>\n", | |
" <td>Gd</td>\n", | |
" <td>TA</td>\n", | |
" <td>Av</td>\n", | |
" <td>GLQ</td>\n", | |
" <td>655</td>\n", | |
" <td>Unf</td>\n", | |
" <td>0</td>\n", | |
" <td>490</td>\n", | |
" <td>1145</td>\n", | |
" <td>GasA</td>\n", | |
" <td>...</td>\n", | |
" <td>Y</td>\n", | |
" <td>SBrkr</td>\n", | |
" <td>1145</td>\n", | |
" <td>1053</td>\n", | |
" <td>0</td>\n", | |
" <td>2198</td>\n", | |
" <td>1</td>\n", | |
" <td>0</td>\n", | |
" <td>2</td>\n", | |
" <td>1</td>\n", | |
" <td>4</td>\n", | |
" <td>1</td>\n", | |
" <td>Gd</td>\n", | |
" <td>9</td>\n", | |
" <td>Typ</td>\n", | |
" <td>1</td>\n", | |
" <td>TA</td>\n", | |
" <td>Attchd</td>\n", | |
" <td>2000.0</td>\n", | |
" <td>RFn</td>\n", | |
" <td>3</td>\n", | |
" <td>836</td>\n", | |
" <td>TA</td>\n", | |
" <td>TA</td>\n", | |
" <td>Y</td>\n", | |
" <td>192</td>\n", | |
" <td>84</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" <td>0</td>\n", | |
" <td>12</td>\n", | |
" <td>2008</td>\n", | |
" <td>WD</td>\n", | |
" <td>Normal</td>\n", | |
" <td>250000</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"<p>5 rows × 81 columns</p>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" Id MSSubClass MSZoning ... SaleType SaleCondition SalePrice\n", | |
"0 1 60 RL ... WD Normal 208500\n", | |
"1 2 20 RL ... WD Normal 181500\n", | |
"2 3 60 RL ... WD Normal 223500\n", | |
"3 4 70 RL ... WD Abnorml 140000\n", | |
"4 5 60 RL ... WD Normal 250000\n", | |
"\n", | |
"[5 rows x 81 columns]" | |
] | |
}, | |
"metadata": { | |
"tags": [] | |
}, | |
"execution_count": 8 | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "7GUsa4Q7EaMT", | |
"colab_type": "code", | |
"colab": {} | |
}, | |
"source": [ | |
"report = pp.ProfileReport(df)" | |
], | |
"execution_count": 0, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "C823PEy6Egc8", | |
"colab_type": "code", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 34 | |
}, | |
"outputId": "ea8be962-de8f-44f8-e86b-78dca2f737bd" | |
}, | |
"source": [ | |
"eda = report.description_set\n", | |
"eda.keys()" | |
], | |
"execution_count": 12, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"data": { | |
"text/plain": [ | |
"dict_keys(['table', 'variables', 'freq', 'correlations'])" | |
] | |
}, | |
"metadata": { | |
"tags": [] | |
}, | |
"execution_count": 12 | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "g_Z6YO8sEwzz", | |
"colab_type": "code", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 224 | |
}, | |
"outputId": "463dd8e7-d9a5-439a-8086-5a3ca14d1196" | |
}, | |
"source": [ | |
"variable_info = eda['variables']\n", | |
"variable_info.head()" | |
], | |
"execution_count": 15, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>count</th>\n", | |
" <th>distinct_count</th>\n", | |
" <th>p_missing</th>\n", | |
" <th>n_missing</th>\n", | |
" <th>p_infinite</th>\n", | |
" <th>n_infinite</th>\n", | |
" <th>is_unique</th>\n", | |
" <th>mode</th>\n", | |
" <th>p_unique</th>\n", | |
" <th>memorysize</th>\n", | |
" <th>top</th>\n", | |
" <th>freq</th>\n", | |
" <th>type</th>\n", | |
" <th>mean</th>\n", | |
" <th>std</th>\n", | |
" <th>variance</th>\n", | |
" <th>min</th>\n", | |
" <th>max</th>\n", | |
" <th>range</th>\n", | |
" <th>5%</th>\n", | |
" <th>25%</th>\n", | |
" <th>50%</th>\n", | |
" <th>75%</th>\n", | |
" <th>95%</th>\n", | |
" <th>iqr</th>\n", | |
" <th>kurtosis</th>\n", | |
" <th>skewness</th>\n", | |
" <th>sum</th>\n", | |
" <th>mad</th>\n", | |
" <th>cv</th>\n", | |
" <th>n_zeros</th>\n", | |
" <th>p_zeros</th>\n", | |
" <th>histogram</th>\n", | |
" <th>mini_histogram</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>1stFlrSF</th>\n", | |
" <td>1460</td>\n", | |
" <td>753</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>False</td>\n", | |
" <td>864</td>\n", | |
" <td>0.515753</td>\n", | |
" <td>11760</td>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" <td>NUM</td>\n", | |
" <td>1162.63</td>\n", | |
" <td>386.588</td>\n", | |
" <td>149450</td>\n", | |
" <td>334</td>\n", | |
" <td>4692</td>\n", | |
" <td>4358</td>\n", | |
" <td>672.95</td>\n", | |
" <td>882</td>\n", | |
" <td>1087</td>\n", | |
" <td>1391.25</td>\n", | |
" <td>1831.25</td>\n", | |
" <td>509.25</td>\n", | |
" <td>5.74584</td>\n", | |
" <td>1.37676</td>\n", | |
" <td>1697435</td>\n", | |
" <td>300.576</td>\n", | |
" <td>0.332512</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAA...</td>\n", | |
" <td>data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAA...</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2ndFlrSF</th>\n", | |
" <td>1460</td>\n", | |
" <td>417</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>False</td>\n", | |
" <td>0</td>\n", | |
" <td>0.285616</td>\n", | |
" <td>11760</td>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" <td>NUM</td>\n", | |
" <td>346.992</td>\n", | |
" <td>436.528</td>\n", | |
" <td>190557</td>\n", | |
" <td>0</td>\n", | |
" <td>2065</td>\n", | |
" <td>2065</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>728</td>\n", | |
" <td>1141.05</td>\n", | |
" <td>728</td>\n", | |
" <td>-0.553464</td>\n", | |
" <td>0.81303</td>\n", | |
" <td>506609</td>\n", | |
" <td>396.478</td>\n", | |
" <td>1.25803</td>\n", | |
" <td>829</td>\n", | |
" <td>0.567808</td>\n", | |
" <td>data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAA...</td>\n", | |
" <td>data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAA...</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3SsnPorch</th>\n", | |
" <td>1460</td>\n", | |
" <td>20</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>False</td>\n", | |
" <td>0</td>\n", | |
" <td>0.0136986</td>\n", | |
" <td>11760</td>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" <td>NUM</td>\n", | |
" <td>3.40959</td>\n", | |
" <td>29.3173</td>\n", | |
" <td>859.506</td>\n", | |
" <td>0</td>\n", | |
" <td>508</td>\n", | |
" <td>508</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>123.662</td>\n", | |
" <td>10.3043</td>\n", | |
" <td>4978</td>\n", | |
" <td>6.70708</td>\n", | |
" <td>8.59849</td>\n", | |
" <td>1436</td>\n", | |
" <td>0.983562</td>\n", | |
" <td>data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAA...</td>\n", | |
" <td>data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAA...</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>Alley</th>\n", | |
" <td>91</td>\n", | |
" <td>3</td>\n", | |
" <td>0.937671</td>\n", | |
" <td>1369</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>False</td>\n", | |
" <td>Grvl</td>\n", | |
" <td>0.00205479</td>\n", | |
" <td>11760</td>\n", | |
" <td>Grvl</td>\n", | |
" <td>50</td>\n", | |
" <td>CAT</td>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>BedroomAbvGr</th>\n", | |
" <td>1460</td>\n", | |
" <td>8</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>False</td>\n", | |
" <td>3</td>\n", | |
" <td>0.00547945</td>\n", | |
" <td>11760</td>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" <td>NUM</td>\n", | |
" <td>2.86644</td>\n", | |
" <td>0.815778</td>\n", | |
" <td>0.665494</td>\n", | |
" <td>0</td>\n", | |
" <td>8</td>\n", | |
" <td>8</td>\n", | |
" <td>2</td>\n", | |
" <td>2</td>\n", | |
" <td>3</td>\n", | |
" <td>3</td>\n", | |
" <td>4</td>\n", | |
" <td>1</td>\n", | |
" <td>2.23087</td>\n", | |
" <td>0.21179</td>\n", | |
" <td>4185</td>\n", | |
" <td>0.576309</td>\n", | |
" <td>0.284596</td>\n", | |
" <td>6</td>\n", | |
" <td>0.00410959</td>\n", | |
" <td>data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAA...</td>\n", | |
" <td>data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAA...</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" count ... mini_histogram\n", | |
"1stFlrSF 1460 ... data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAA...\n", | |
"2ndFlrSF 1460 ... data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAA...\n", | |
"3SsnPorch 1460 ... data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAA...\n", | |
"Alley 91 ... NaN\n", | |
"BedroomAbvGr 1460 ... data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAA...\n", | |
"\n", | |
"[5 rows x 34 columns]" | |
] | |
}, | |
"metadata": { | |
"tags": [] | |
}, | |
"execution_count": 15 | |
} | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "WW68OsDHGVdZ", | |
"colab_type": "text" | |
}, | |
"source": [ | |
"## Get Categorical Variables" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "FUaJyUnsE3oQ", | |
"colab_type": "code", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 187 | |
}, | |
"outputId": "cfb6948b-ff72-4a18-d2ec-b53dbf62be81" | |
}, | |
"source": [ | |
"cat_vars = variable_info[variable_info['type'] == 'CAT'].index\n", | |
"cat_vars" | |
], | |
"execution_count": 26, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"data": { | |
"text/plain": [ | |
"Index(['Alley', 'BldgType', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1',\n", | |
" 'BsmtFinType2', 'BsmtQual', 'CentralAir', 'Condition1', 'Condition2',\n", | |
" 'Electrical', 'ExterCond', 'ExterQual', 'Exterior1st', 'Exterior2nd',\n", | |
" 'Fence', 'FireplaceQu', 'Foundation', 'Functional', 'GarageCond',\n", | |
" 'GarageFinish', 'GarageQual', 'GarageType', 'Heating', 'HeatingQC',\n", | |
" 'HouseStyle', 'KitchenQual', 'LandContour', 'LandSlope', 'LotConfig',\n", | |
" 'LotShape', 'MSZoning', 'MasVnrType', 'MiscFeature', 'Neighborhood',\n", | |
" 'PavedDrive', 'PoolQC', 'RoofMatl', 'RoofStyle', 'SaleCondition',\n", | |
" 'SaleType', 'Street', 'Utilities'],\n", | |
" dtype='object')" | |
] | |
}, | |
"metadata": { | |
"tags": [] | |
}, | |
"execution_count": 26 | |
} | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "qPQCIOI9GoqR", | |
"colab_type": "text" | |
}, | |
"source": [ | |
"To use it with FastAI Tabular, just convert it into a list." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "fYo1GihdGna_", | |
"colab_type": "code", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 136 | |
}, | |
"outputId": "e6ea4fe3-606f-4b6e-9b22-bdb8e5b75456" | |
}, | |
"source": [ | |
"cat_vars = list(cat_vars)\n", | |
"cat_vars[:7]" | |
], | |
"execution_count": 27, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"data": { | |
"text/plain": [ | |
"['Alley',\n", | |
" 'BldgType',\n", | |
" 'BsmtCond',\n", | |
" 'BsmtExposure',\n", | |
" 'BsmtFinType1',\n", | |
" 'BsmtFinType2',\n", | |
" 'BsmtQual']" | |
] | |
}, | |
"metadata": { | |
"tags": [] | |
}, | |
"execution_count": 27 | |
} | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "pm2Ex9_VGY7h", | |
"colab_type": "text" | |
}, | |
"source": [ | |
"## Get Continuous Variables" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "0E3pOukQE7v8", | |
"colab_type": "code", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 170 | |
}, | |
"outputId": "83b4ded0-e3bf-404f-bcae-b7f6cf45a6e7" | |
}, | |
"source": [ | |
"cont_vars = variable_info[variable_info['type'] == 'NUM'].index\n", | |
"cont_vars" | |
], | |
"execution_count": 28, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"data": { | |
"text/plain": [ | |
"Index(['1stFlrSF', '2ndFlrSF', '3SsnPorch', 'BedroomAbvGr', 'BsmtFinSF1',\n", | |
" 'BsmtFinSF2', 'BsmtFullBath', 'BsmtHalfBath', 'BsmtUnfSF',\n", | |
" 'EnclosedPorch', 'Fireplaces', 'FullBath', 'GarageArea', 'GarageCars',\n", | |
" 'GarageYrBlt', 'GrLivArea', 'HalfBath', 'Id', 'KitchenAbvGr', 'LotArea',\n", | |
" 'LotFrontage', 'LowQualFinSF', 'MSSubClass', 'MasVnrArea', 'MiscVal',\n", | |
" 'MoSold', 'OpenPorchSF', 'OverallCond', 'OverallQual', 'PoolArea',\n", | |
" 'SalePrice', 'ScreenPorch', 'TotRmsAbvGrd', 'TotalBsmtSF', 'WoodDeckSF',\n", | |
" 'YearBuilt', 'YearRemodAdd', 'YrSold'],\n", | |
" dtype='object')" | |
] | |
}, | |
"metadata": { | |
"tags": [] | |
}, | |
"execution_count": 28 | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "UmJhzS_AFVWl", | |
"colab_type": "code", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 136 | |
}, | |
"outputId": "bbb0385f-c8eb-446f-bdc0-92ed906fd658" | |
}, | |
"source": [ | |
"cont_vars = list(cont_vars)\n", | |
"cont_vars[:7]" | |
], | |
"execution_count": 29, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"data": { | |
"text/plain": [ | |
"['1stFlrSF',\n", | |
" '2ndFlrSF',\n", | |
" '3SsnPorch',\n", | |
" 'BedroomAbvGr',\n", | |
" 'BsmtFinSF1',\n", | |
" 'BsmtFinSF2',\n", | |
" 'BsmtFullBath']" | |
] | |
}, | |
"metadata": { | |
"tags": [] | |
}, | |
"execution_count": 29 | |
} | |
] | |
} | |
] | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment