Skip to content

Instantly share code, notes, and snippets.

@ericvenarusso
Created August 9, 2019 11:46
Show Gist options
  • Save ericvenarusso/92a37474e454abeb2ca215e3b1708636 to your computer and use it in GitHub Desktop.
Save ericvenarusso/92a37474e454abeb2ca215e3b1708636 to your computer and use it in GitHub Desktop.
Pipeline.ipynb
Display the source blob
Display the rendered blob
Raw
{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"name": "Pipeline.ipynb",
"version": "0.3.2",
"provenance": [],
"collapsed_sections": [],
"include_colab_link": true
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
}
},
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "view-in-github",
"colab_type": "text"
},
"source": [
"<a href=\"https://colab.research.google.com/gist/ericvenarusso/92a37474e454abeb2ca215e3b1708636/pipeline.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
]
},
{
"cell_type": "code",
"metadata": {
"id": "2zUF9kWoYS2X",
"colab_type": "code",
"outputId": "4e1c93b3-4e42-4e7d-b137-8f9fecfd0f4b",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 292
}
},
"source": [
"# Instalando o pacote category_encoders\n",
"! pip install category_encoders"
],
"execution_count": 1,
"outputs": [
{
"output_type": "stream",
"text": [
"Collecting category_encoders\n",
"\u001b[?25l Downloading https://files.pythonhosted.org/packages/6e/a1/f7a22f144f33be78afeb06bfa78478e8284a64263a3c09b1ef54e673841e/category_encoders-2.0.0-py2.py3-none-any.whl (87kB)\n",
"\u001b[K |████████████████████████████████| 92kB 3.4MB/s \n",
"\u001b[?25hRequirement already satisfied: numpy>=1.11.3 in /usr/local/lib/python3.6/dist-packages (from category_encoders) (1.16.4)\n",
"Requirement already satisfied: scipy>=0.19.0 in /usr/local/lib/python3.6/dist-packages (from category_encoders) (1.3.0)\n",
"Requirement already satisfied: statsmodels>=0.6.1 in /usr/local/lib/python3.6/dist-packages (from category_encoders) (0.10.1)\n",
"Requirement already satisfied: patsy>=0.4.1 in /usr/local/lib/python3.6/dist-packages (from category_encoders) (0.5.1)\n",
"Requirement already satisfied: scikit-learn>=0.20.0 in /usr/local/lib/python3.6/dist-packages (from category_encoders) (0.21.3)\n",
"Requirement already satisfied: pandas>=0.21.1 in /usr/local/lib/python3.6/dist-packages (from category_encoders) (0.24.2)\n",
"Requirement already satisfied: six in /usr/local/lib/python3.6/dist-packages (from patsy>=0.4.1->category_encoders) (1.12.0)\n",
"Requirement already satisfied: joblib>=0.11 in /usr/local/lib/python3.6/dist-packages (from scikit-learn>=0.20.0->category_encoders) (0.13.2)\n",
"Requirement already satisfied: python-dateutil>=2.5.0 in /usr/local/lib/python3.6/dist-packages (from pandas>=0.21.1->category_encoders) (2.5.3)\n",
"Requirement already satisfied: pytz>=2011k in /usr/local/lib/python3.6/dist-packages (from pandas>=0.21.1->category_encoders) (2018.9)\n",
"Installing collected packages: category-encoders\n",
"Successfully installed category-encoders-2.0.0\n"
],
"name": "stdout"
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "hnkOQNhQi2gX",
"colab_type": "code",
"outputId": "55a73c08-17e9-4e1b-e4ed-b02aaf8ce00b",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 156
}
},
"source": [
"import numpy as np\n",
"import pandas as pd\n",
"\n",
"from sklearn.pipeline import Pipeline\n",
"from sklearn.impute import SimpleImputer\n",
"from sklearn.ensemble import RandomForestRegressor\n",
"from category_encoders.target_encoder import TargetEncoder\n",
"\n",
"from sklearn.externals import joblib\n",
"\n",
"# Configurações do google drive\n",
"from google.colab import drive\n",
"drive.mount('/content/gdrive')\n",
"\n",
"import os\n",
"os.chdir('gdrive/My Drive/Colab Notebooks/Sklearn Pipelines')"
],
"execution_count": 2,
"outputs": [
{
"output_type": "stream",
"text": [
"/usr/local/lib/python3.6/dist-packages/sklearn/externals/joblib/__init__.py:15: DeprecationWarning: sklearn.externals.joblib is deprecated in 0.21 and will be removed in 0.23. Please import this functionality directly from joblib, which can be installed with: pip install joblib. If this warning is raised when loading pickled models, you may need to re-serialize those models with scikit-learn 0.21+.\n",
" warnings.warn(msg, category=DeprecationWarning)\n"
],
"name": "stderr"
},
{
"output_type": "stream",
"text": [
"Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=email%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdocs.test%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive.photos.readonly%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fpeopleapi.readonly&response_type=code\n",
"\n",
"Enter your authorization code:\n",
"··········\n",
"Mounted at /content/gdrive\n"
],
"name": "stdout"
}
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "hqp8QMkSm4ek",
"colab_type": "text"
},
"source": [
"## Fazendo o treinamento do Pipeline."
]
},
{
"cell_type": "code",
"metadata": {
"id": "JwNIHLAOi7He",
"colab_type": "code",
"outputId": "ee033104-cdb5-4663-961d-2427cc3c8362",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 270
}
},
"source": [
"# Carregamento dos dados\n",
"train = pd.read_csv('input/train.csv')\n",
"\n",
"print(train.shape)\n",
"\n",
"train.head()"
],
"execution_count": 3,
"outputs": [
{
"output_type": "stream",
"text": [
"(1460, 81)\n"
],
"name": "stdout"
},
{
"output_type": "execute_result",
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Id</th>\n",
" <th>MSSubClass</th>\n",
" <th>MSZoning</th>\n",
" <th>LotFrontage</th>\n",
" <th>LotArea</th>\n",
" <th>Street</th>\n",
" <th>Alley</th>\n",
" <th>LotShape</th>\n",
" <th>LandContour</th>\n",
" <th>Utilities</th>\n",
" <th>LotConfig</th>\n",
" <th>LandSlope</th>\n",
" <th>Neighborhood</th>\n",
" <th>Condition1</th>\n",
" <th>Condition2</th>\n",
" <th>BldgType</th>\n",
" <th>HouseStyle</th>\n",
" <th>OverallQual</th>\n",
" <th>OverallCond</th>\n",
" <th>YearBuilt</th>\n",
" <th>YearRemodAdd</th>\n",
" <th>RoofStyle</th>\n",
" <th>RoofMatl</th>\n",
" <th>Exterior1st</th>\n",
" <th>Exterior2nd</th>\n",
" <th>MasVnrType</th>\n",
" <th>MasVnrArea</th>\n",
" <th>ExterQual</th>\n",
" <th>ExterCond</th>\n",
" <th>Foundation</th>\n",
" <th>BsmtQual</th>\n",
" <th>BsmtCond</th>\n",
" <th>BsmtExposure</th>\n",
" <th>BsmtFinType1</th>\n",
" <th>BsmtFinSF1</th>\n",
" <th>BsmtFinType2</th>\n",
" <th>BsmtFinSF2</th>\n",
" <th>BsmtUnfSF</th>\n",
" <th>TotalBsmtSF</th>\n",
" <th>Heating</th>\n",
" <th>...</th>\n",
" <th>CentralAir</th>\n",
" <th>Electrical</th>\n",
" <th>1stFlrSF</th>\n",
" <th>2ndFlrSF</th>\n",
" <th>LowQualFinSF</th>\n",
" <th>GrLivArea</th>\n",
" <th>BsmtFullBath</th>\n",
" <th>BsmtHalfBath</th>\n",
" <th>FullBath</th>\n",
" <th>HalfBath</th>\n",
" <th>BedroomAbvGr</th>\n",
" <th>KitchenAbvGr</th>\n",
" <th>KitchenQual</th>\n",
" <th>TotRmsAbvGrd</th>\n",
" <th>Functional</th>\n",
" <th>Fireplaces</th>\n",
" <th>FireplaceQu</th>\n",
" <th>GarageType</th>\n",
" <th>GarageYrBlt</th>\n",
" <th>GarageFinish</th>\n",
" <th>GarageCars</th>\n",
" <th>GarageArea</th>\n",
" <th>GarageQual</th>\n",
" <th>GarageCond</th>\n",
" <th>PavedDrive</th>\n",
" <th>WoodDeckSF</th>\n",
" <th>OpenPorchSF</th>\n",
" <th>EnclosedPorch</th>\n",
" <th>3SsnPorch</th>\n",
" <th>ScreenPorch</th>\n",
" <th>PoolArea</th>\n",
" <th>PoolQC</th>\n",
" <th>Fence</th>\n",
" <th>MiscFeature</th>\n",
" <th>MiscVal</th>\n",
" <th>MoSold</th>\n",
" <th>YrSold</th>\n",
" <th>SaleType</th>\n",
" <th>SaleCondition</th>\n",
" <th>SalePrice</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>60</td>\n",
" <td>RL</td>\n",
" <td>65.0</td>\n",
" <td>8450</td>\n",
" <td>Pave</td>\n",
" <td>NaN</td>\n",
" <td>Reg</td>\n",
" <td>Lvl</td>\n",
" <td>AllPub</td>\n",
" <td>Inside</td>\n",
" <td>Gtl</td>\n",
" <td>CollgCr</td>\n",
" <td>Norm</td>\n",
" <td>Norm</td>\n",
" <td>1Fam</td>\n",
" <td>2Story</td>\n",
" <td>7</td>\n",
" <td>5</td>\n",
" <td>2003</td>\n",
" <td>2003</td>\n",
" <td>Gable</td>\n",
" <td>CompShg</td>\n",
" <td>VinylSd</td>\n",
" <td>VinylSd</td>\n",
" <td>BrkFace</td>\n",
" <td>196.0</td>\n",
" <td>Gd</td>\n",
" <td>TA</td>\n",
" <td>PConc</td>\n",
" <td>Gd</td>\n",
" <td>TA</td>\n",
" <td>No</td>\n",
" <td>GLQ</td>\n",
" <td>706</td>\n",
" <td>Unf</td>\n",
" <td>0</td>\n",
" <td>150</td>\n",
" <td>856</td>\n",
" <td>GasA</td>\n",
" <td>...</td>\n",
" <td>Y</td>\n",
" <td>SBrkr</td>\n",
" <td>856</td>\n",
" <td>854</td>\n",
" <td>0</td>\n",
" <td>1710</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>2</td>\n",
" <td>1</td>\n",
" <td>3</td>\n",
" <td>1</td>\n",
" <td>Gd</td>\n",
" <td>8</td>\n",
" <td>Typ</td>\n",
" <td>0</td>\n",
" <td>NaN</td>\n",
" <td>Attchd</td>\n",
" <td>2003.0</td>\n",
" <td>RFn</td>\n",
" <td>2</td>\n",
" <td>548</td>\n",
" <td>TA</td>\n",
" <td>TA</td>\n",
" <td>Y</td>\n",
" <td>0</td>\n",
" <td>61</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>0</td>\n",
" <td>2</td>\n",
" <td>2008</td>\n",
" <td>WD</td>\n",
" <td>Normal</td>\n",
" <td>208500</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>2</td>\n",
" <td>20</td>\n",
" <td>RL</td>\n",
" <td>80.0</td>\n",
" <td>9600</td>\n",
" <td>Pave</td>\n",
" <td>NaN</td>\n",
" <td>Reg</td>\n",
" <td>Lvl</td>\n",
" <td>AllPub</td>\n",
" <td>FR2</td>\n",
" <td>Gtl</td>\n",
" <td>Veenker</td>\n",
" <td>Feedr</td>\n",
" <td>Norm</td>\n",
" <td>1Fam</td>\n",
" <td>1Story</td>\n",
" <td>6</td>\n",
" <td>8</td>\n",
" <td>1976</td>\n",
" <td>1976</td>\n",
" <td>Gable</td>\n",
" <td>CompShg</td>\n",
" <td>MetalSd</td>\n",
" <td>MetalSd</td>\n",
" <td>None</td>\n",
" <td>0.0</td>\n",
" <td>TA</td>\n",
" <td>TA</td>\n",
" <td>CBlock</td>\n",
" <td>Gd</td>\n",
" <td>TA</td>\n",
" <td>Gd</td>\n",
" <td>ALQ</td>\n",
" <td>978</td>\n",
" <td>Unf</td>\n",
" <td>0</td>\n",
" <td>284</td>\n",
" <td>1262</td>\n",
" <td>GasA</td>\n",
" <td>...</td>\n",
" <td>Y</td>\n",
" <td>SBrkr</td>\n",
" <td>1262</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1262</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" <td>0</td>\n",
" <td>3</td>\n",
" <td>1</td>\n",
" <td>TA</td>\n",
" <td>6</td>\n",
" <td>Typ</td>\n",
" <td>1</td>\n",
" <td>TA</td>\n",
" <td>Attchd</td>\n",
" <td>1976.0</td>\n",
" <td>RFn</td>\n",
" <td>2</td>\n",
" <td>460</td>\n",
" <td>TA</td>\n",
" <td>TA</td>\n",
" <td>Y</td>\n",
" <td>298</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>0</td>\n",
" <td>5</td>\n",
" <td>2007</td>\n",
" <td>WD</td>\n",
" <td>Normal</td>\n",
" <td>181500</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>3</td>\n",
" <td>60</td>\n",
" <td>RL</td>\n",
" <td>68.0</td>\n",
" <td>11250</td>\n",
" <td>Pave</td>\n",
" <td>NaN</td>\n",
" <td>IR1</td>\n",
" <td>Lvl</td>\n",
" <td>AllPub</td>\n",
" <td>Inside</td>\n",
" <td>Gtl</td>\n",
" <td>CollgCr</td>\n",
" <td>Norm</td>\n",
" <td>Norm</td>\n",
" <td>1Fam</td>\n",
" <td>2Story</td>\n",
" <td>7</td>\n",
" <td>5</td>\n",
" <td>2001</td>\n",
" <td>2002</td>\n",
" <td>Gable</td>\n",
" <td>CompShg</td>\n",
" <td>VinylSd</td>\n",
" <td>VinylSd</td>\n",
" <td>BrkFace</td>\n",
" <td>162.0</td>\n",
" <td>Gd</td>\n",
" <td>TA</td>\n",
" <td>PConc</td>\n",
" <td>Gd</td>\n",
" <td>TA</td>\n",
" <td>Mn</td>\n",
" <td>GLQ</td>\n",
" <td>486</td>\n",
" <td>Unf</td>\n",
" <td>0</td>\n",
" <td>434</td>\n",
" <td>920</td>\n",
" <td>GasA</td>\n",
" <td>...</td>\n",
" <td>Y</td>\n",
" <td>SBrkr</td>\n",
" <td>920</td>\n",
" <td>866</td>\n",
" <td>0</td>\n",
" <td>1786</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>2</td>\n",
" <td>1</td>\n",
" <td>3</td>\n",
" <td>1</td>\n",
" <td>Gd</td>\n",
" <td>6</td>\n",
" <td>Typ</td>\n",
" <td>1</td>\n",
" <td>TA</td>\n",
" <td>Attchd</td>\n",
" <td>2001.0</td>\n",
" <td>RFn</td>\n",
" <td>2</td>\n",
" <td>608</td>\n",
" <td>TA</td>\n",
" <td>TA</td>\n",
" <td>Y</td>\n",
" <td>0</td>\n",
" <td>42</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>0</td>\n",
" <td>9</td>\n",
" <td>2008</td>\n",
" <td>WD</td>\n",
" <td>Normal</td>\n",
" <td>223500</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>4</td>\n",
" <td>70</td>\n",
" <td>RL</td>\n",
" <td>60.0</td>\n",
" <td>9550</td>\n",
" <td>Pave</td>\n",
" <td>NaN</td>\n",
" <td>IR1</td>\n",
" <td>Lvl</td>\n",
" <td>AllPub</td>\n",
" <td>Corner</td>\n",
" <td>Gtl</td>\n",
" <td>Crawfor</td>\n",
" <td>Norm</td>\n",
" <td>Norm</td>\n",
" <td>1Fam</td>\n",
" <td>2Story</td>\n",
" <td>7</td>\n",
" <td>5</td>\n",
" <td>1915</td>\n",
" <td>1970</td>\n",
" <td>Gable</td>\n",
" <td>CompShg</td>\n",
" <td>Wd Sdng</td>\n",
" <td>Wd Shng</td>\n",
" <td>None</td>\n",
" <td>0.0</td>\n",
" <td>TA</td>\n",
" <td>TA</td>\n",
" <td>BrkTil</td>\n",
" <td>TA</td>\n",
" <td>Gd</td>\n",
" <td>No</td>\n",
" <td>ALQ</td>\n",
" <td>216</td>\n",
" <td>Unf</td>\n",
" <td>0</td>\n",
" <td>540</td>\n",
" <td>756</td>\n",
" <td>GasA</td>\n",
" <td>...</td>\n",
" <td>Y</td>\n",
" <td>SBrkr</td>\n",
" <td>961</td>\n",
" <td>756</td>\n",
" <td>0</td>\n",
" <td>1717</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>3</td>\n",
" <td>1</td>\n",
" <td>Gd</td>\n",
" <td>7</td>\n",
" <td>Typ</td>\n",
" <td>1</td>\n",
" <td>Gd</td>\n",
" <td>Detchd</td>\n",
" <td>1998.0</td>\n",
" <td>Unf</td>\n",
" <td>3</td>\n",
" <td>642</td>\n",
" <td>TA</td>\n",
" <td>TA</td>\n",
" <td>Y</td>\n",
" <td>0</td>\n",
" <td>35</td>\n",
" <td>272</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>0</td>\n",
" <td>2</td>\n",
" <td>2006</td>\n",
" <td>WD</td>\n",
" <td>Abnorml</td>\n",
" <td>140000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>5</td>\n",
" <td>60</td>\n",
" <td>RL</td>\n",
" <td>84.0</td>\n",
" <td>14260</td>\n",
" <td>Pave</td>\n",
" <td>NaN</td>\n",
" <td>IR1</td>\n",
" <td>Lvl</td>\n",
" <td>AllPub</td>\n",
" <td>FR2</td>\n",
" <td>Gtl</td>\n",
" <td>NoRidge</td>\n",
" <td>Norm</td>\n",
" <td>Norm</td>\n",
" <td>1Fam</td>\n",
" <td>2Story</td>\n",
" <td>8</td>\n",
" <td>5</td>\n",
" <td>2000</td>\n",
" <td>2000</td>\n",
" <td>Gable</td>\n",
" <td>CompShg</td>\n",
" <td>VinylSd</td>\n",
" <td>VinylSd</td>\n",
" <td>BrkFace</td>\n",
" <td>350.0</td>\n",
" <td>Gd</td>\n",
" <td>TA</td>\n",
" <td>PConc</td>\n",
" <td>Gd</td>\n",
" <td>TA</td>\n",
" <td>Av</td>\n",
" <td>GLQ</td>\n",
" <td>655</td>\n",
" <td>Unf</td>\n",
" <td>0</td>\n",
" <td>490</td>\n",
" <td>1145</td>\n",
" <td>GasA</td>\n",
" <td>...</td>\n",
" <td>Y</td>\n",
" <td>SBrkr</td>\n",
" <td>1145</td>\n",
" <td>1053</td>\n",
" <td>0</td>\n",
" <td>2198</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>2</td>\n",
" <td>1</td>\n",
" <td>4</td>\n",
" <td>1</td>\n",
" <td>Gd</td>\n",
" <td>9</td>\n",
" <td>Typ</td>\n",
" <td>1</td>\n",
" <td>TA</td>\n",
" <td>Attchd</td>\n",
" <td>2000.0</td>\n",
" <td>RFn</td>\n",
" <td>3</td>\n",
" <td>836</td>\n",
" <td>TA</td>\n",
" <td>TA</td>\n",
" <td>Y</td>\n",
" <td>192</td>\n",
" <td>84</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>0</td>\n",
" <td>12</td>\n",
" <td>2008</td>\n",
" <td>WD</td>\n",
" <td>Normal</td>\n",
" <td>250000</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>5 rows × 81 columns</p>\n",
"</div>"
],
"text/plain": [
" Id MSSubClass MSZoning ... SaleType SaleCondition SalePrice\n",
"0 1 60 RL ... WD Normal 208500\n",
"1 2 20 RL ... WD Normal 181500\n",
"2 3 60 RL ... WD Normal 223500\n",
"3 4 70 RL ... WD Abnorml 140000\n",
"4 5 60 RL ... WD Normal 250000\n",
"\n",
"[5 rows x 81 columns]"
]
},
"metadata": {
"tags": []
},
"execution_count": 3
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "Hfn_P4ISkLFC",
"colab_type": "code",
"colab": {}
},
"source": [
"X_train = train.drop('SalePrice', axis = 1)\n",
"y_train = train['SalePrice']"
],
"execution_count": 0,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "zqPxT57ujDHH",
"colab_type": "code",
"outputId": "2c5157a5-19cb-4b92-b875-db1be7f88981",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 51
}
},
"source": [
"# Criacao do Pipeline\n",
"train_pipeline = Pipeline(steps=[\n",
" ('Target Encoder', TargetEncoder()),\n",
" ('Simple Imputer', SimpleImputer(strategy = 'median')),\n",
" ('Random Forest', RandomForestRegressor(n_estimators = 1000, random_state = 0))\n",
"])\n",
"\n",
"# Treinamento do Pipeline\n",
"train_pipeline.fit(X_train, y_train)\n",
"\n",
"print('Score do modelo:', train_pipeline.score(X_train, y_train))\n",
"\n",
"# Salvando o Pipeline ja treinado\n",
"joblib.dump(train_pipeline, 'saved_models/pipeline/pipeline.pkl')"
],
"execution_count": 5,
"outputs": [
{
"output_type": "stream",
"text": [
"Score do modelo: 0.9829672262950807\n"
],
"name": "stdout"
},
{
"output_type": "execute_result",
"data": {
"text/plain": [
"['saved_models/pipeline/pipeline.pkl']"
]
},
"metadata": {
"tags": []
},
"execution_count": 5
}
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "uyFLUl_0lssD",
"colab_type": "text"
},
"source": [
"## Fazendo predições para o conjunto de teste com o pipeline treinado."
]
},
{
"cell_type": "code",
"metadata": {
"id": "4q9awJhQmAnf",
"colab_type": "code",
"outputId": "0c3c3b71-ab70-4eb4-e1f5-e2fb019c94df",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 241
}
},
"source": [
"# Carregamento dos dados\n",
"test = pd.read_csv('input/test.csv')\n",
"\n",
"print(test.shape)\n",
"\n",
"test.head()"
],
"execution_count": 6,
"outputs": [
{
"output_type": "stream",
"text": [
"(1459, 80)\n"
],
"name": "stdout"
},
{
"output_type": "execute_result",
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Id</th>\n",
" <th>MSSubClass</th>\n",
" <th>MSZoning</th>\n",
" <th>LotFrontage</th>\n",
" <th>LotArea</th>\n",
" <th>Street</th>\n",
" <th>Alley</th>\n",
" <th>LotShape</th>\n",
" <th>LandContour</th>\n",
" <th>Utilities</th>\n",
" <th>LotConfig</th>\n",
" <th>LandSlope</th>\n",
" <th>Neighborhood</th>\n",
" <th>Condition1</th>\n",
" <th>Condition2</th>\n",
" <th>BldgType</th>\n",
" <th>HouseStyle</th>\n",
" <th>OverallQual</th>\n",
" <th>OverallCond</th>\n",
" <th>YearBuilt</th>\n",
" <th>YearRemodAdd</th>\n",
" <th>RoofStyle</th>\n",
" <th>RoofMatl</th>\n",
" <th>Exterior1st</th>\n",
" <th>Exterior2nd</th>\n",
" <th>MasVnrType</th>\n",
" <th>MasVnrArea</th>\n",
" <th>ExterQual</th>\n",
" <th>ExterCond</th>\n",
" <th>Foundation</th>\n",
" <th>BsmtQual</th>\n",
" <th>BsmtCond</th>\n",
" <th>BsmtExposure</th>\n",
" <th>BsmtFinType1</th>\n",
" <th>BsmtFinSF1</th>\n",
" <th>BsmtFinType2</th>\n",
" <th>BsmtFinSF2</th>\n",
" <th>BsmtUnfSF</th>\n",
" <th>TotalBsmtSF</th>\n",
" <th>Heating</th>\n",
" <th>HeatingQC</th>\n",
" <th>CentralAir</th>\n",
" <th>Electrical</th>\n",
" <th>1stFlrSF</th>\n",
" <th>2ndFlrSF</th>\n",
" <th>LowQualFinSF</th>\n",
" <th>GrLivArea</th>\n",
" <th>BsmtFullBath</th>\n",
" <th>BsmtHalfBath</th>\n",
" <th>FullBath</th>\n",
" <th>HalfBath</th>\n",
" <th>BedroomAbvGr</th>\n",
" <th>KitchenAbvGr</th>\n",
" <th>KitchenQual</th>\n",
" <th>TotRmsAbvGrd</th>\n",
" <th>Functional</th>\n",
" <th>Fireplaces</th>\n",
" <th>FireplaceQu</th>\n",
" <th>GarageType</th>\n",
" <th>GarageYrBlt</th>\n",
" <th>GarageFinish</th>\n",
" <th>GarageCars</th>\n",
" <th>GarageArea</th>\n",
" <th>GarageQual</th>\n",
" <th>GarageCond</th>\n",
" <th>PavedDrive</th>\n",
" <th>WoodDeckSF</th>\n",
" <th>OpenPorchSF</th>\n",
" <th>EnclosedPorch</th>\n",
" <th>3SsnPorch</th>\n",
" <th>ScreenPorch</th>\n",
" <th>PoolArea</th>\n",
" <th>PoolQC</th>\n",
" <th>Fence</th>\n",
" <th>MiscFeature</th>\n",
" <th>MiscVal</th>\n",
" <th>MoSold</th>\n",
" <th>YrSold</th>\n",
" <th>SaleType</th>\n",
" <th>SaleCondition</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1461</td>\n",
" <td>20</td>\n",
" <td>RH</td>\n",
" <td>80.0</td>\n",
" <td>11622</td>\n",
" <td>Pave</td>\n",
" <td>NaN</td>\n",
" <td>Reg</td>\n",
" <td>Lvl</td>\n",
" <td>AllPub</td>\n",
" <td>Inside</td>\n",
" <td>Gtl</td>\n",
" <td>NAmes</td>\n",
" <td>Feedr</td>\n",
" <td>Norm</td>\n",
" <td>1Fam</td>\n",
" <td>1Story</td>\n",
" <td>5</td>\n",
" <td>6</td>\n",
" <td>1961</td>\n",
" <td>1961</td>\n",
" <td>Gable</td>\n",
" <td>CompShg</td>\n",
" <td>VinylSd</td>\n",
" <td>VinylSd</td>\n",
" <td>None</td>\n",
" <td>0.0</td>\n",
" <td>TA</td>\n",
" <td>TA</td>\n",
" <td>CBlock</td>\n",
" <td>TA</td>\n",
" <td>TA</td>\n",
" <td>No</td>\n",
" <td>Rec</td>\n",
" <td>468.0</td>\n",
" <td>LwQ</td>\n",
" <td>144.0</td>\n",
" <td>270.0</td>\n",
" <td>882.0</td>\n",
" <td>GasA</td>\n",
" <td>TA</td>\n",
" <td>Y</td>\n",
" <td>SBrkr</td>\n",
" <td>896</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>896</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>2</td>\n",
" <td>1</td>\n",
" <td>TA</td>\n",
" <td>5</td>\n",
" <td>Typ</td>\n",
" <td>0</td>\n",
" <td>NaN</td>\n",
" <td>Attchd</td>\n",
" <td>1961.0</td>\n",
" <td>Unf</td>\n",
" <td>1.0</td>\n",
" <td>730.0</td>\n",
" <td>TA</td>\n",
" <td>TA</td>\n",
" <td>Y</td>\n",
" <td>140</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>120</td>\n",
" <td>0</td>\n",
" <td>NaN</td>\n",
" <td>MnPrv</td>\n",
" <td>NaN</td>\n",
" <td>0</td>\n",
" <td>6</td>\n",
" <td>2010</td>\n",
" <td>WD</td>\n",
" <td>Normal</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1462</td>\n",
" <td>20</td>\n",
" <td>RL</td>\n",
" <td>81.0</td>\n",
" <td>14267</td>\n",
" <td>Pave</td>\n",
" <td>NaN</td>\n",
" <td>IR1</td>\n",
" <td>Lvl</td>\n",
" <td>AllPub</td>\n",
" <td>Corner</td>\n",
" <td>Gtl</td>\n",
" <td>NAmes</td>\n",
" <td>Norm</td>\n",
" <td>Norm</td>\n",
" <td>1Fam</td>\n",
" <td>1Story</td>\n",
" <td>6</td>\n",
" <td>6</td>\n",
" <td>1958</td>\n",
" <td>1958</td>\n",
" <td>Hip</td>\n",
" <td>CompShg</td>\n",
" <td>Wd Sdng</td>\n",
" <td>Wd Sdng</td>\n",
" <td>BrkFace</td>\n",
" <td>108.0</td>\n",
" <td>TA</td>\n",
" <td>TA</td>\n",
" <td>CBlock</td>\n",
" <td>TA</td>\n",
" <td>TA</td>\n",
" <td>No</td>\n",
" <td>ALQ</td>\n",
" <td>923.0</td>\n",
" <td>Unf</td>\n",
" <td>0.0</td>\n",
" <td>406.0</td>\n",
" <td>1329.0</td>\n",
" <td>GasA</td>\n",
" <td>TA</td>\n",
" <td>Y</td>\n",
" <td>SBrkr</td>\n",
" <td>1329</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1329</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>3</td>\n",
" <td>1</td>\n",
" <td>Gd</td>\n",
" <td>6</td>\n",
" <td>Typ</td>\n",
" <td>0</td>\n",
" <td>NaN</td>\n",
" <td>Attchd</td>\n",
" <td>1958.0</td>\n",
" <td>Unf</td>\n",
" <td>1.0</td>\n",
" <td>312.0</td>\n",
" <td>TA</td>\n",
" <td>TA</td>\n",
" <td>Y</td>\n",
" <td>393</td>\n",
" <td>36</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>Gar2</td>\n",
" <td>12500</td>\n",
" <td>6</td>\n",
" <td>2010</td>\n",
" <td>WD</td>\n",
" <td>Normal</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>1463</td>\n",
" <td>60</td>\n",
" <td>RL</td>\n",
" <td>74.0</td>\n",
" <td>13830</td>\n",
" <td>Pave</td>\n",
" <td>NaN</td>\n",
" <td>IR1</td>\n",
" <td>Lvl</td>\n",
" <td>AllPub</td>\n",
" <td>Inside</td>\n",
" <td>Gtl</td>\n",
" <td>Gilbert</td>\n",
" <td>Norm</td>\n",
" <td>Norm</td>\n",
" <td>1Fam</td>\n",
" <td>2Story</td>\n",
" <td>5</td>\n",
" <td>5</td>\n",
" <td>1997</td>\n",
" <td>1998</td>\n",
" <td>Gable</td>\n",
" <td>CompShg</td>\n",
" <td>VinylSd</td>\n",
" <td>VinylSd</td>\n",
" <td>None</td>\n",
" <td>0.0</td>\n",
" <td>TA</td>\n",
" <td>TA</td>\n",
" <td>PConc</td>\n",
" <td>Gd</td>\n",
" <td>TA</td>\n",
" <td>No</td>\n",
" <td>GLQ</td>\n",
" <td>791.0</td>\n",
" <td>Unf</td>\n",
" <td>0.0</td>\n",
" <td>137.0</td>\n",
" <td>928.0</td>\n",
" <td>GasA</td>\n",
" <td>Gd</td>\n",
" <td>Y</td>\n",
" <td>SBrkr</td>\n",
" <td>928</td>\n",
" <td>701</td>\n",
" <td>0</td>\n",
" <td>1629</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>2</td>\n",
" <td>1</td>\n",
" <td>3</td>\n",
" <td>1</td>\n",
" <td>TA</td>\n",
" <td>6</td>\n",
" <td>Typ</td>\n",
" <td>1</td>\n",
" <td>TA</td>\n",
" <td>Attchd</td>\n",
" <td>1997.0</td>\n",
" <td>Fin</td>\n",
" <td>2.0</td>\n",
" <td>482.0</td>\n",
" <td>TA</td>\n",
" <td>TA</td>\n",
" <td>Y</td>\n",
" <td>212</td>\n",
" <td>34</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>NaN</td>\n",
" <td>MnPrv</td>\n",
" <td>NaN</td>\n",
" <td>0</td>\n",
" <td>3</td>\n",
" <td>2010</td>\n",
" <td>WD</td>\n",
" <td>Normal</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>1464</td>\n",
" <td>60</td>\n",
" <td>RL</td>\n",
" <td>78.0</td>\n",
" <td>9978</td>\n",
" <td>Pave</td>\n",
" <td>NaN</td>\n",
" <td>IR1</td>\n",
" <td>Lvl</td>\n",
" <td>AllPub</td>\n",
" <td>Inside</td>\n",
" <td>Gtl</td>\n",
" <td>Gilbert</td>\n",
" <td>Norm</td>\n",
" <td>Norm</td>\n",
" <td>1Fam</td>\n",
" <td>2Story</td>\n",
" <td>6</td>\n",
" <td>6</td>\n",
" <td>1998</td>\n",
" <td>1998</td>\n",
" <td>Gable</td>\n",
" <td>CompShg</td>\n",
" <td>VinylSd</td>\n",
" <td>VinylSd</td>\n",
" <td>BrkFace</td>\n",
" <td>20.0</td>\n",
" <td>TA</td>\n",
" <td>TA</td>\n",
" <td>PConc</td>\n",
" <td>TA</td>\n",
" <td>TA</td>\n",
" <td>No</td>\n",
" <td>GLQ</td>\n",
" <td>602.0</td>\n",
" <td>Unf</td>\n",
" <td>0.0</td>\n",
" <td>324.0</td>\n",
" <td>926.0</td>\n",
" <td>GasA</td>\n",
" <td>Ex</td>\n",
" <td>Y</td>\n",
" <td>SBrkr</td>\n",
" <td>926</td>\n",
" <td>678</td>\n",
" <td>0</td>\n",
" <td>1604</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>2</td>\n",
" <td>1</td>\n",
" <td>3</td>\n",
" <td>1</td>\n",
" <td>Gd</td>\n",
" <td>7</td>\n",
" <td>Typ</td>\n",
" <td>1</td>\n",
" <td>Gd</td>\n",
" <td>Attchd</td>\n",
" <td>1998.0</td>\n",
" <td>Fin</td>\n",
" <td>2.0</td>\n",
" <td>470.0</td>\n",
" <td>TA</td>\n",
" <td>TA</td>\n",
" <td>Y</td>\n",
" <td>360</td>\n",
" <td>36</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>0</td>\n",
" <td>6</td>\n",
" <td>2010</td>\n",
" <td>WD</td>\n",
" <td>Normal</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>1465</td>\n",
" <td>120</td>\n",
" <td>RL</td>\n",
" <td>43.0</td>\n",
" <td>5005</td>\n",
" <td>Pave</td>\n",
" <td>NaN</td>\n",
" <td>IR1</td>\n",
" <td>HLS</td>\n",
" <td>AllPub</td>\n",
" <td>Inside</td>\n",
" <td>Gtl</td>\n",
" <td>StoneBr</td>\n",
" <td>Norm</td>\n",
" <td>Norm</td>\n",
" <td>TwnhsE</td>\n",
" <td>1Story</td>\n",
" <td>8</td>\n",
" <td>5</td>\n",
" <td>1992</td>\n",
" <td>1992</td>\n",
" <td>Gable</td>\n",
" <td>CompShg</td>\n",
" <td>HdBoard</td>\n",
" <td>HdBoard</td>\n",
" <td>None</td>\n",
" <td>0.0</td>\n",
" <td>Gd</td>\n",
" <td>TA</td>\n",
" <td>PConc</td>\n",
" <td>Gd</td>\n",
" <td>TA</td>\n",
" <td>No</td>\n",
" <td>ALQ</td>\n",
" <td>263.0</td>\n",
" <td>Unf</td>\n",
" <td>0.0</td>\n",
" <td>1017.0</td>\n",
" <td>1280.0</td>\n",
" <td>GasA</td>\n",
" <td>Ex</td>\n",
" <td>Y</td>\n",
" <td>SBrkr</td>\n",
" <td>1280</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1280</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>2</td>\n",
" <td>0</td>\n",
" <td>2</td>\n",
" <td>1</td>\n",
" <td>Gd</td>\n",
" <td>5</td>\n",
" <td>Typ</td>\n",
" <td>0</td>\n",
" <td>NaN</td>\n",
" <td>Attchd</td>\n",
" <td>1992.0</td>\n",
" <td>RFn</td>\n",
" <td>2.0</td>\n",
" <td>506.0</td>\n",
" <td>TA</td>\n",
" <td>TA</td>\n",
" <td>Y</td>\n",
" <td>0</td>\n",
" <td>82</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>144</td>\n",
" <td>0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>2010</td>\n",
" <td>WD</td>\n",
" <td>Normal</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Id MSSubClass MSZoning ... YrSold SaleType SaleCondition\n",
"0 1461 20 RH ... 2010 WD Normal\n",
"1 1462 20 RL ... 2010 WD Normal\n",
"2 1463 60 RL ... 2010 WD Normal\n",
"3 1464 60 RL ... 2010 WD Normal\n",
"4 1465 120 RL ... 2010 WD Normal\n",
"\n",
"[5 rows x 80 columns]"
]
},
"metadata": {
"tags": []
},
"execution_count": 6
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "bANvNzIGj_WG",
"colab_type": "code",
"colab": {}
},
"source": [
"# Carregamento do pipeline salvo treinado.\n",
"saved_pipeline = joblib.load('saved_models/pipeline/pipeline.pkl')\n",
"\n",
"y_pred = saved_pipeline.predict(test)"
],
"execution_count": 0,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "lpoYdheJmN_i",
"colab_type": "code",
"outputId": "b059300c-398c-4265-f845-b9164bbf3557",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 51
}
},
"source": [
"y_pred"
],
"execution_count": 9,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"array([126004.991, 153460.822, 183824.115, ..., 153414.329, 112589.308,\n",
" 197563.65 ])"
]
},
"metadata": {
"tags": []
},
"execution_count": 9
}
]
}
]
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment