adamnovotnycom/airbnb_nyc_kaggle.ipynb

## airbnb_nyc_kaggle.ipynb
{
  "nbformat": 4,
  "nbformat_minor": 0,
  "metadata": {
    "colab": {
      "name": "airbnb_nyc_kaggle.ipynb",
      "provenance": [],
      "collapsed_sections": [],
      "mount_file_id": "11_t63audBgQMHVGIIfSgp_Dp1sCbKRHy",
      "authorship_tag": "ABX9TyNH+t3cZzg9AbV+jxQelsso",
      "include_colab_link": true
    },
    "kernelspec": {
      "display_name": "Python 3",
      "name": "python3"
    }
  },
  "cells": [
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "view-in-github",
        "colab_type": "text"
      },
      "source": [
        "<a href=\"https://colab.research.google.com/gist/excitedAtom/1df7ef10649d8241c389c96becb7fe37/airbnb_nyc_kaggle.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "AGVZmG3VQ2Ij"
      },
      "source": [
        "# Aibnb in NYC\n",
        "Project Description: [Kaggle](https://www.kaggle.com/dgomonov/new-york-city-airbnb-open-data)"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "MXByWp-mQKTp",
        "outputId": "1a5edac8-bede-4ced-d069-cee698e75ef7"
      },
      "source": [
        "import numpy as np\n",
        "import os\n",
        "import pandas as pd\n",
        "import plotly.graph_objects as go\n",
        "import random\n",
        "import sys\n",
        "from sklearn.base import BaseEstimator, TransformerMixin\n",
        "from sklearn.impute import SimpleImputer\n",
        "from sklearn.pipeline import FeatureUnion, Pipeline \n",
        "from sklearn.preprocessing import StandardScaler, OneHotEncoder\n",
        "print(sys.version)\n",
        "print(pd.__version__)"
      ],
      "execution_count": 1,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "3.6.9 (default, Oct  8 2020, 12:12:24) \n",
            "[GCC 8.4.0]\n",
            "1.1.4\n"
          ],
          "name": "stdout"
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "Gkl01MkHby2x"
      },
      "source": [
        "if pd.__version__[0] == \"0\":\n",
        "    !pip install pandas==1.1.2\n",
        "    import pandas as pd\n",
        "assert pd.__version__[0] == \"1\"    "
      ],
      "execution_count": 2,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "uazEb54Ucs6D",
        "outputId": "387a6428-5217-4559-fbd9-662599557253"
      },
      "source": [
        "!sudo apt-get install build-essential swig\n",
        "!pip install auto-sklearn==0.11.1"
      ],
      "execution_count": 3,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "Reading package lists... Done\n",
            "Building dependency tree       \n",
            "Reading state information... Done\n",
            "build-essential is already the newest version (12.4ubuntu1).\n",
            "swig is already the newest version (3.0.12-1).\n",
            "0 upgraded, 0 newly installed, 0 to remove and 14 not upgraded.\n",
            "Requirement already satisfied: auto-sklearn==0.11.1 in /usr/local/lib/python3.6/dist-packages (0.11.1)\n",
            "Requirement already satisfied: pandas>=1.0 in /usr/local/lib/python3.6/dist-packages (from auto-sklearn==0.11.1) (1.1.4)\n",
            "Requirement already satisfied: smac<0.14,>=0.13.1 in /usr/local/lib/python3.6/dist-packages (from auto-sklearn==0.11.1) (0.13.1)\n",
            "Requirement already satisfied: dask in /usr/local/lib/python3.6/dist-packages (from auto-sklearn==0.11.1) (2.12.0)\n",
            "Requirement already satisfied: ConfigSpace<0.5,>=0.4.14 in /usr/local/lib/python3.6/dist-packages (from auto-sklearn==0.11.1) (0.4.16)\n",
            "Requirement already satisfied: pynisher>=0.6.1 in /usr/local/lib/python3.6/dist-packages (from auto-sklearn==0.11.1) (0.6.3)\n",
            "Requirement already satisfied: pyrfr<0.9,>=0.7 in /usr/local/lib/python3.6/dist-packages (from auto-sklearn==0.11.1) (0.8.0)\n",
            "Requirement already satisfied: liac-arff in /usr/local/lib/python3.6/dist-packages (from auto-sklearn==0.11.1) (2.5.0)\n",
            "Requirement already satisfied: setuptools in /usr/local/lib/python3.6/dist-packages (from auto-sklearn==0.11.1) (50.3.2)\n",
            "Requirement already satisfied: lockfile in /usr/local/lib/python3.6/dist-packages (from auto-sklearn==0.11.1) (0.12.2)\n",
            "Requirement already satisfied: scikit-learn<0.23,>=0.22.0 in /usr/local/lib/python3.6/dist-packages (from auto-sklearn==0.11.1) (0.22.2.post1)\n",
            "Requirement already satisfied: distributed>=2.2.0 in /usr/local/lib/python3.6/dist-packages (from auto-sklearn==0.11.1) (2.30.1)\n",
            "Requirement already satisfied: scipy>=0.14.1 in /usr/local/lib/python3.6/dist-packages (from auto-sklearn==0.11.1) (1.4.1)\n",
            "Requirement already satisfied: joblib in /usr/local/lib/python3.6/dist-packages (from auto-sklearn==0.11.1) (0.17.0)\n",
            "Requirement already satisfied: pyyaml in /usr/local/lib/python3.6/dist-packages (from auto-sklearn==0.11.1) (3.13)\n",
            "Requirement already satisfied: numpy>=1.9.0 in /usr/local/lib/python3.6/dist-packages (from auto-sklearn==0.11.1) (1.18.5)\n",
            "Requirement already satisfied: python-dateutil>=2.7.3 in /usr/local/lib/python3.6/dist-packages (from pandas>=1.0->auto-sklearn==0.11.1) (2.8.1)\n",
            "Requirement already satisfied: pytz>=2017.2 in /usr/local/lib/python3.6/dist-packages (from pandas>=1.0->auto-sklearn==0.11.1) (2018.9)\n",
            "Requirement already satisfied: lazy-import in /usr/local/lib/python3.6/dist-packages (from smac<0.14,>=0.13.1->auto-sklearn==0.11.1) (0.2.2)\n",
            "Requirement already satisfied: psutil in /usr/local/lib/python3.6/dist-packages (from smac<0.14,>=0.13.1->auto-sklearn==0.11.1) (5.4.8)\n",
            "Requirement already satisfied: cython in /usr/local/lib/python3.6/dist-packages (from ConfigSpace<0.5,>=0.4.14->auto-sklearn==0.11.1) (0.29.21)\n",
            "Requirement already satisfied: pyparsing in /usr/local/lib/python3.6/dist-packages (from ConfigSpace<0.5,>=0.4.14->auto-sklearn==0.11.1) (2.4.7)\n",
            "Requirement already satisfied: docutils>=0.3 in /usr/local/lib/python3.6/dist-packages (from pynisher>=0.6.1->auto-sklearn==0.11.1) (0.16)\n",
            "Requirement already satisfied: zict>=0.1.3 in /usr/local/lib/python3.6/dist-packages (from distributed>=2.2.0->auto-sklearn==0.11.1) (2.0.0)\n",
            "Requirement already satisfied: toolz>=0.8.2 in /usr/local/lib/python3.6/dist-packages (from distributed>=2.2.0->auto-sklearn==0.11.1) (0.11.1)\n",
            "Requirement already satisfied: tornado>=5; python_version < \"3.8\" in /usr/local/lib/python3.6/dist-packages (from distributed>=2.2.0->auto-sklearn==0.11.1) (5.1.1)\n",
            "Requirement already satisfied: sortedcontainers!=2.0.0,!=2.0.1 in /usr/local/lib/python3.6/dist-packages (from distributed>=2.2.0->auto-sklearn==0.11.1) (2.3.0)\n",
            "Requirement already satisfied: click>=6.6 in /usr/local/lib/python3.6/dist-packages (from distributed>=2.2.0->auto-sklearn==0.11.1) (7.1.2)\n",
            "Requirement already satisfied: tblib>=1.6.0 in /usr/local/lib/python3.6/dist-packages (from distributed>=2.2.0->auto-sklearn==0.11.1) (1.7.0)\n",
            "Requirement already satisfied: contextvars; python_version < \"3.7\" in /usr/local/lib/python3.6/dist-packages (from distributed>=2.2.0->auto-sklearn==0.11.1) (2.4)\n",
            "Requirement already satisfied: cloudpickle>=1.5.0 in /usr/local/lib/python3.6/dist-packages (from distributed>=2.2.0->auto-sklearn==0.11.1) (1.6.0)\n",
            "Requirement already satisfied: msgpack>=0.6.0 in /usr/local/lib/python3.6/dist-packages (from distributed>=2.2.0->auto-sklearn==0.11.1) (1.0.0)\n",
            "Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.6/dist-packages (from python-dateutil>=2.7.3->pandas>=1.0->auto-sklearn==0.11.1) (1.15.0)\n",
            "Requirement already satisfied: heapdict in /usr/local/lib/python3.6/dist-packages (from zict>=0.1.3->distributed>=2.2.0->auto-sklearn==0.11.1) (1.0.1)\n",
            "Requirement already satisfied: immutables>=0.9 in /usr/local/lib/python3.6/dist-packages (from contextvars; python_version < \"3.7\"->distributed>=2.2.0->auto-sklearn==0.11.1) (0.14)\n"
          ],
          "name": "stdout"
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "bjlx7he7aQ_y"
      },
      "source": [
        "# Fails after the initial instalation of auto-sklearn -> Restart runtime and run all\n",
        "from autosklearn.classification import AutoSklearnClassifier"
      ],
      "execution_count": 4,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 411
        },
        "id": "9B4TMIsZQOIC",
        "outputId": "f0bc89ad-e6ae-4037-9f42-e0aeee39cc1c"
      },
      "source": [
        "df = pd.read_csv(\"/content/drive/My Drive/Colab Notebooks/colab-auto-sklearn-setup/data/airbnb_nyc_kaggle.csv\")\n",
        "df.head(5)"
      ],
      "execution_count": 5,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/html": [
              "<div>\n",
              "<style scoped>\n",
              "    .dataframe tbody tr th:only-of-type {\n",
              "        vertical-align: middle;\n",
              "    }\n",
              "\n",
              "    .dataframe tbody tr th {\n",
              "        vertical-align: top;\n",
              "    }\n",
              "\n",
              "    .dataframe thead th {\n",
              "        text-align: right;\n",
              "    }\n",
              "</style>\n",
              "<table border=\"1\" class=\"dataframe\">\n",
              "  <thead>\n",
              "    <tr style=\"text-align: right;\">\n",
              "      <th></th>\n",
              "      <th>id</th>\n",
              "      <th>name</th>\n",
              "      <th>host_id</th>\n",
              "      <th>host_name</th>\n",
              "      <th>neighbourhood_group</th>\n",
              "      <th>neighbourhood</th>\n",
              "      <th>latitude</th>\n",
              "      <th>longitude</th>\n",
              "      <th>room_type</th>\n",
              "      <th>price</th>\n",
              "      <th>minimum_nights</th>\n",
              "      <th>number_of_reviews</th>\n",
              "      <th>last_review</th>\n",
              "      <th>reviews_per_month</th>\n",
              "      <th>calculated_host_listings_count</th>\n",
              "      <th>availability_365</th>\n",
              "    </tr>\n",
              "  </thead>\n",
              "  <tbody>\n",
              "    <tr>\n",
              "      <th>0</th>\n",
              "      <td>2539</td>\n",
              "      <td>Clean &amp; quiet apt home by the park</td>\n",
              "      <td>2787</td>\n",
              "      <td>John</td>\n",
              "      <td>Brooklyn</td>\n",
              "      <td>Kensington</td>\n",
              "      <td>40.64749</td>\n",
              "      <td>-73.97237</td>\n",
              "      <td>Private room</td>\n",
              "      <td>149</td>\n",
              "      <td>1</td>\n",
              "      <td>9</td>\n",
              "      <td>2018-10-19</td>\n",
              "      <td>0.21</td>\n",
              "      <td>6</td>\n",
              "      <td>365</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>1</th>\n",
              "      <td>2595</td>\n",
              "      <td>Skylit Midtown Castle</td>\n",
              "      <td>2845</td>\n",
              "      <td>Jennifer</td>\n",
              "      <td>Manhattan</td>\n",
              "      <td>Midtown</td>\n",
              "      <td>40.75362</td>\n",
              "      <td>-73.98377</td>\n",
              "      <td>Entire home/apt</td>\n",
              "      <td>225</td>\n",
              "      <td>1</td>\n",
              "      <td>45</td>\n",
              "      <td>2019-05-21</td>\n",
              "      <td>0.38</td>\n",
              "      <td>2</td>\n",
              "      <td>355</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>2</th>\n",
              "      <td>3647</td>\n",
              "      <td>THE VILLAGE OF HARLEM....NEW YORK !</td>\n",
              "      <td>4632</td>\n",
              "      <td>Elisabeth</td>\n",
              "      <td>Manhattan</td>\n",
              "      <td>Harlem</td>\n",
              "      <td>40.80902</td>\n",
              "      <td>-73.94190</td>\n",
              "      <td>Private room</td>\n",
              "      <td>150</td>\n",
              "      <td>3</td>\n",
              "      <td>0</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>1</td>\n",
              "      <td>365</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>3</th>\n",
              "      <td>3831</td>\n",
              "      <td>Cozy Entire Floor of Brownstone</td>\n",
              "      <td>4869</td>\n",
              "      <td>LisaRoxanne</td>\n",
              "      <td>Brooklyn</td>\n",
              "      <td>Clinton Hill</td>\n",
              "      <td>40.68514</td>\n",
              "      <td>-73.95976</td>\n",
              "      <td>Entire home/apt</td>\n",
              "      <td>89</td>\n",
              "      <td>1</td>\n",
              "      <td>270</td>\n",
              "      <td>2019-07-05</td>\n",
              "      <td>4.64</td>\n",
              "      <td>1</td>\n",
              "      <td>194</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>4</th>\n",
              "      <td>5022</td>\n",
              "      <td>Entire Apt: Spacious Studio/Loft by central park</td>\n",
              "      <td>7192</td>\n",
              "      <td>Laura</td>\n",
              "      <td>Manhattan</td>\n",
              "      <td>East Harlem</td>\n",
              "      <td>40.79851</td>\n",
              "      <td>-73.94399</td>\n",
              "      <td>Entire home/apt</td>\n",
              "      <td>80</td>\n",
              "      <td>10</td>\n",
              "      <td>9</td>\n",
              "      <td>2018-11-19</td>\n",
              "      <td>0.10</td>\n",
              "      <td>1</td>\n",
              "      <td>0</td>\n",
              "    </tr>\n",
              "  </tbody>\n",
              "</table>\n",
              "</div>"
            ],
            "text/plain": [
              "     id  ... availability_365\n",
              "0  2539  ...              365\n",
              "1  2595  ...              355\n",
              "2  3647  ...              365\n",
              "3  3831  ...              194\n",
              "4  5022  ...                0\n",
              "\n",
              "[5 rows x 16 columns]"
            ]
          },
          "metadata": {
            "tags": []
          },
          "execution_count": 5
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 444
        },
        "id": "ULtdK-mhTp23",
        "outputId": "f36bc5db-64b3-411b-8b9b-a971c89f113f"
      },
      "source": [
        "df.describe(include=\"all\")"
      ],
      "execution_count": 6,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "INFO:numexpr.utils:NumExpr defaulting to 2 threads.\n"
          ],
          "name": "stderr"
        },
        {
          "output_type": "execute_result",
          "data": {
            "text/html": [
              "<div>\n",
              "<style scoped>\n",
              "    .dataframe tbody tr th:only-of-type {\n",
              "        vertical-align: middle;\n",
              "    }\n",
              "\n",
              "    .dataframe tbody tr th {\n",
              "        vertical-align: top;\n",
              "    }\n",
              "\n",
              "    .dataframe thead th {\n",
              "        text-align: right;\n",
              "    }\n",
              "</style>\n",
              "<table border=\"1\" class=\"dataframe\">\n",
              "  <thead>\n",
              "    <tr style=\"text-align: right;\">\n",
              "      <th></th>\n",
              "      <th>id</th>\n",
              "      <th>name</th>\n",
              "      <th>host_id</th>\n",
              "      <th>host_name</th>\n",
              "      <th>neighbourhood_group</th>\n",
              "      <th>neighbourhood</th>\n",
              "      <th>latitude</th>\n",
              "      <th>longitude</th>\n",
              "      <th>room_type</th>\n",
              "      <th>price</th>\n",
              "      <th>minimum_nights</th>\n",
              "      <th>number_of_reviews</th>\n",
              "      <th>last_review</th>\n",
              "      <th>reviews_per_month</th>\n",
              "      <th>calculated_host_listings_count</th>\n",
              "      <th>availability_365</th>\n",
              "    </tr>\n",
              "  </thead>\n",
              "  <tbody>\n",
              "    <tr>\n",
              "      <th>count</th>\n",
              "      <td>4.889500e+04</td>\n",
              "      <td>48879</td>\n",
              "      <td>4.889500e+04</td>\n",
              "      <td>48874</td>\n",
              "      <td>48895</td>\n",
              "      <td>48895</td>\n",
              "      <td>48895.000000</td>\n",
              "      <td>48895.000000</td>\n",
              "      <td>48895</td>\n",
              "      <td>48895.000000</td>\n",
              "      <td>48895.000000</td>\n",
              "      <td>48895.000000</td>\n",
              "      <td>38843</td>\n",
              "      <td>38843.000000</td>\n",
              "      <td>48895.000000</td>\n",
              "      <td>48895.000000</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>unique</th>\n",
              "      <td>NaN</td>\n",
              "      <td>47905</td>\n",
              "      <td>NaN</td>\n",
              "      <td>11452</td>\n",
              "      <td>5</td>\n",
              "      <td>221</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>3</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>1764</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>top</th>\n",
              "      <td>NaN</td>\n",
              "      <td>Hillside Hotel</td>\n",
              "      <td>NaN</td>\n",
              "      <td>Michael</td>\n",
              "      <td>Manhattan</td>\n",
              "      <td>Williamsburg</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>Entire home/apt</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>2019-06-23</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>freq</th>\n",
              "      <td>NaN</td>\n",
              "      <td>18</td>\n",
              "      <td>NaN</td>\n",
              "      <td>417</td>\n",
              "      <td>21661</td>\n",
              "      <td>3920</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>25409</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>1413</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>mean</th>\n",
              "      <td>1.901714e+07</td>\n",
              "      <td>NaN</td>\n",
              "      <td>6.762001e+07</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>40.728949</td>\n",
              "      <td>-73.952170</td>\n",
              "      <td>NaN</td>\n",
              "      <td>152.720687</td>\n",
              "      <td>7.029962</td>\n",
              "      <td>23.274466</td>\n",
              "      <td>NaN</td>\n",
              "      <td>1.373221</td>\n",
              "      <td>7.143982</td>\n",
              "      <td>112.781327</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>std</th>\n",
              "      <td>1.098311e+07</td>\n",
              "      <td>NaN</td>\n",
              "      <td>7.861097e+07</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>0.054530</td>\n",
              "      <td>0.046157</td>\n",
              "      <td>NaN</td>\n",
              "      <td>240.154170</td>\n",
              "      <td>20.510550</td>\n",
              "      <td>44.550582</td>\n",
              "      <td>NaN</td>\n",
              "      <td>1.680442</td>\n",
              "      <td>32.952519</td>\n",
              "      <td>131.622289</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>min</th>\n",
              "      <td>2.539000e+03</td>\n",
              "      <td>NaN</td>\n",
              "      <td>2.438000e+03</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>40.499790</td>\n",
              "      <td>-74.244420</td>\n",
              "      <td>NaN</td>\n",
              "      <td>0.000000</td>\n",
              "      <td>1.000000</td>\n",
              "      <td>0.000000</td>\n",
              "      <td>NaN</td>\n",
              "      <td>0.010000</td>\n",
              "      <td>1.000000</td>\n",
              "      <td>0.000000</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>25%</th>\n",
              "      <td>9.471945e+06</td>\n",
              "      <td>NaN</td>\n",
              "      <td>7.822033e+06</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>40.690100</td>\n",
              "      <td>-73.983070</td>\n",
              "      <td>NaN</td>\n",
              "      <td>69.000000</td>\n",
              "      <td>1.000000</td>\n",
              "      <td>1.000000</td>\n",
              "      <td>NaN</td>\n",
              "      <td>0.190000</td>\n",
              "      <td>1.000000</td>\n",
              "      <td>0.000000</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>50%</th>\n",
              "      <td>1.967728e+07</td>\n",
              "      <td>NaN</td>\n",
              "      <td>3.079382e+07</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>40.723070</td>\n",
              "      <td>-73.955680</td>\n",
              "      <td>NaN</td>\n",
              "      <td>106.000000</td>\n",
              "      <td>3.000000</td>\n",
              "      <td>5.000000</td>\n",
              "      <td>NaN</td>\n",
              "      <td>0.720000</td>\n",
              "      <td>1.000000</td>\n",
              "      <td>45.000000</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>75%</th>\n",
              "      <td>2.915218e+07</td>\n",
              "      <td>NaN</td>\n",
              "      <td>1.074344e+08</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>40.763115</td>\n",
              "      <td>-73.936275</td>\n",
              "      <td>NaN</td>\n",
              "      <td>175.000000</td>\n",
              "      <td>5.000000</td>\n",
              "      <td>24.000000</td>\n",
              "      <td>NaN</td>\n",
              "      <td>2.020000</td>\n",
              "      <td>2.000000</td>\n",
              "      <td>227.000000</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>max</th>\n",
              "      <td>3.648724e+07</td>\n",
              "      <td>NaN</td>\n",
              "      <td>2.743213e+08</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>40.913060</td>\n",
              "      <td>-73.712990</td>\n",
              "      <td>NaN</td>\n",
              "      <td>10000.000000</td>\n",
              "      <td>1250.000000</td>\n",
              "      <td>629.000000</td>\n",
              "      <td>NaN</td>\n",
              "      <td>58.500000</td>\n",
              "      <td>327.000000</td>\n",
              "      <td>365.000000</td>\n",
              "    </tr>\n",
              "  </tbody>\n",
              "</table>\n",
              "</div>"
            ],
            "text/plain": [
              "                  id  ... availability_365\n",
              "count   4.889500e+04  ...     48895.000000\n",
              "unique           NaN  ...              NaN\n",
              "top              NaN  ...              NaN\n",
              "freq             NaN  ...              NaN\n",
              "mean    1.901714e+07  ...       112.781327\n",
              "std     1.098311e+07  ...       131.622289\n",
              "min     2.539000e+03  ...         0.000000\n",
              "25%     9.471945e+06  ...         0.000000\n",
              "50%     1.967728e+07  ...        45.000000\n",
              "75%     2.915218e+07  ...       227.000000\n",
              "max     3.648724e+07  ...       365.000000\n",
              "\n",
              "[11 rows x 16 columns]"
            ]
          },
          "metadata": {
            "tags": []
          },
          "execution_count": 6
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "sx2xeS7XWVda"
      },
      "source": [
        "df[\"label\"] = df[\"price\"]"
      ],
      "execution_count": 7,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "1uvlOF52S28v"
      },
      "source": [
        "numerical_features = [\"minimum_nights\", \"number_of_reviews\", \"calculated_host_listings_count\"]\n",
        "categorical_features = [\"neighbourhood\", \"room_type\"]"
      ],
      "execution_count": 8,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 390
        },
        "id": "MWik3ZAgFBKX",
        "outputId": "23e6bb2c-7b25-45d2-fea9-c1895181a9a1"
      },
      "source": [
        "df.loc[:, numerical_features + categorical_features + [\"label\"]].describe(include=\"all\")"
      ],
      "execution_count": 9,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/html": [
              "<div>\n",
              "<style scoped>\n",
              "    .dataframe tbody tr th:only-of-type {\n",
              "        vertical-align: middle;\n",
              "    }\n",
              "\n",
              "    .dataframe tbody tr th {\n",
              "        vertical-align: top;\n",
              "    }\n",
              "\n",
              "    .dataframe thead th {\n",
              "        text-align: right;\n",
              "    }\n",
              "</style>\n",
              "<table border=\"1\" class=\"dataframe\">\n",
              "  <thead>\n",
              "    <tr style=\"text-align: right;\">\n",
              "      <th></th>\n",
              "      <th>minimum_nights</th>\n",
              "      <th>number_of_reviews</th>\n",
              "      <th>calculated_host_listings_count</th>\n",
              "      <th>neighbourhood</th>\n",
              "      <th>room_type</th>\n",
              "      <th>label</th>\n",
              "    </tr>\n",
              "  </thead>\n",
              "  <tbody>\n",
              "    <tr>\n",
              "      <th>count</th>\n",
              "      <td>48895.000000</td>\n",
              "      <td>48895.000000</td>\n",
              "      <td>48895.000000</td>\n",
              "      <td>48895</td>\n",
              "      <td>48895</td>\n",
              "      <td>48895.000000</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>unique</th>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>221</td>\n",
              "      <td>3</td>\n",
              "      <td>NaN</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>top</th>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>Williamsburg</td>\n",
              "      <td>Entire home/apt</td>\n",
              "      <td>NaN</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>freq</th>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>3920</td>\n",
              "      <td>25409</td>\n",
              "      <td>NaN</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>mean</th>\n",
              "      <td>7.029962</td>\n",
              "      <td>23.274466</td>\n",
              "      <td>7.143982</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>152.720687</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>std</th>\n",
              "      <td>20.510550</td>\n",
              "      <td>44.550582</td>\n",
              "      <td>32.952519</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>240.154170</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>min</th>\n",
              "      <td>1.000000</td>\n",
              "      <td>0.000000</td>\n",
              "      <td>1.000000</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>0.000000</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>25%</th>\n",
              "      <td>1.000000</td>\n",
              "      <td>1.000000</td>\n",
              "      <td>1.000000</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>69.000000</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>50%</th>\n",
              "      <td>3.000000</td>\n",
              "      <td>5.000000</td>\n",
              "      <td>1.000000</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>106.000000</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>75%</th>\n",
              "      <td>5.000000</td>\n",
              "      <td>24.000000</td>\n",
              "      <td>2.000000</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>175.000000</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>max</th>\n",
              "      <td>1250.000000</td>\n",
              "      <td>629.000000</td>\n",
              "      <td>327.000000</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>10000.000000</td>\n",
              "    </tr>\n",
              "  </tbody>\n",
              "</table>\n",
              "</div>"
            ],
            "text/plain": [
              "        minimum_nights  number_of_reviews  ...        room_type         label\n",
              "count     48895.000000       48895.000000  ...            48895  48895.000000\n",
              "unique             NaN                NaN  ...                3           NaN\n",
              "top                NaN                NaN  ...  Entire home/apt           NaN\n",
              "freq               NaN                NaN  ...            25409           NaN\n",
              "mean          7.029962          23.274466  ...              NaN    152.720687\n",
              "std          20.510550          44.550582  ...              NaN    240.154170\n",
              "min           1.000000           0.000000  ...              NaN      0.000000\n",
              "25%           1.000000           1.000000  ...              NaN     69.000000\n",
              "50%           3.000000           5.000000  ...              NaN    106.000000\n",
              "75%           5.000000          24.000000  ...              NaN    175.000000\n",
              "max        1250.000000         629.000000  ...              NaN  10000.000000\n",
              "\n",
              "[11 rows x 6 columns]"
            ]
          },
          "metadata": {
            "tags": []
          },
          "execution_count": 9
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "EHo-QfKRTayK",
        "outputId": "a93b0d2a-df49-43f5-c463-2a99c1491f9d"
      },
      "source": [
        "neighbourhood_counts = df[\"neighbourhood\"].value_counts(ascending=False)\n",
        "min_presence_threshold = 0.005 * len(df)\n",
        "print(min_presence_threshold)\n",
        "df[\"neighbourhood_clean\"] = df[\"neighbourhood\"].apply(\n",
        "    lambda x: x if min_presence_threshold < neighbourhood_counts[x] else \"neighbourhood_other\"\n",
        ")\n",
        "df[\"neighbourhood_clean\"].value_counts(ascending=False)"
      ],
      "execution_count": 10,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "244.475\n"
          ],
          "name": "stdout"
        },
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "neighbourhood_other          7773\n",
              "Williamsburg                 3920\n",
              "Bedford-Stuyvesant           3714\n",
              "Harlem                       2658\n",
              "Bushwick                     2465\n",
              "Upper West Side              1971\n",
              "Hell's Kitchen               1958\n",
              "East Village                 1853\n",
              "Upper East Side              1798\n",
              "Crown Heights                1564\n",
              "Midtown                      1545\n",
              "East Harlem                  1117\n",
              "Greenpoint                   1115\n",
              "Chelsea                      1113\n",
              "Lower East Side               911\n",
              "Astoria                       900\n",
              "Washington Heights            899\n",
              "West Village                  768\n",
              "Financial District            744\n",
              "Flatbush                      621\n",
              "Clinton Hill                  572\n",
              "Long Island City              537\n",
              "Prospect-Lefferts Gardens     535\n",
              "Park Slope                    506\n",
              "East Flatbush                 500\n",
              "Fort Greene                   489\n",
              "Murray Hill                   485\n",
              "Kips Bay                      470\n",
              "Flushing                      426\n",
              "Ridgewood                     423\n",
              "Greenwich Village             392\n",
              "Sunset Park                   390\n",
              "Chinatown                     368\n",
              "Sunnyside                     363\n",
              "SoHo                          358\n",
              "Prospect Heights              357\n",
              "Morningside Heights           346\n",
              "Gramercy                      338\n",
              "Ditmars Steinway              309\n",
              "Theater District              288\n",
              "South Slope                   284\n",
              "Nolita                        253\n",
              "Inwood                        252\n",
              "Gowanus                       247\n",
              "Name: neighbourhood_clean, dtype: int64"
            ]
          },
          "metadata": {
            "tags": []
          },
          "execution_count": 10
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "dYEVK8P_Vdn-",
        "outputId": "7439110e-eaf2-4537-9a43-932fe88e5353"
      },
      "source": [
        "room_type_counts = df[\"room_type\"].value_counts(ascending=False)\n",
        "min_presence_threshold = 0.005 * len(df)\n",
        "print(room_type_counts)\n",
        "print(min_presence_threshold)"
      ],
      "execution_count": 11,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "Entire home/apt    25409\n",
            "Private room       22326\n",
            "Shared room         1160\n",
            "Name: room_type, dtype: int64\n",
            "244.475\n"
          ],
          "name": "stdout"
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "Yr5C2t0vFvRL"
      },
      "source": [
        "categorical_features = [\"neighbourhood_clean\", \"room_type\"]"
      ],
      "execution_count": 12,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "2461lPbPS4Kf"
      },
      "source": [
        "## Pipeline to transform data to numerical"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "0DWzF8yNQjlQ"
      },
      "source": [
        "class FeatureSelector(BaseEstimator, TransformerMixin):\n",
        "    def __init__(self, feature_names):\n",
        "        self.feature_names = feature_names   \n",
        "    def fit(self, X, y = None):\n",
        "        return self\n",
        "    def transform(self, X, y=None):\n",
        "        return X[self.feature_names] "
      ],
      "execution_count": 13,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "QS6p5c7qSp7X"
      },
      "source": [
        "numerical_pipeline = Pipeline(steps = [ \n",
        "    (\"num_selector\", FeatureSelector(numerical_features)),\n",
        "    (\"impute_median\", SimpleImputer(strategy=\"median\")),\n",
        "    (\"std_scaler\", StandardScaler()) \n",
        "])"
      ],
      "execution_count": 14,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "qQY5PrNFSvyd"
      },
      "source": [
        "categorical_pipeline = Pipeline(steps = [ \n",
        "    (\"num_selector\", FeatureSelector(categorical_features)),\n",
        "    (\"ohe\", OneHotEncoder(\n",
        "        handle_unknown=\"ignore\", \n",
        "        sparse=False,\n",
        "        categories=[\n",
        "            list(df[\"neighbourhood_clean\"].unique()),\n",
        "            list(df[\"room_type\"].unique())\n",
        "        ])\n",
        "    ) \n",
        "])"
      ],
      "execution_count": 15,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "i9AZZFr9WHsA"
      },
      "source": [
        "feature_pipeline = FeatureUnion(n_jobs=1, transformer_list=[ \n",
        "    (\"numerical_pipeline\", numerical_pipeline),\n",
        "    (\"categorical_pipeline\", categorical_pipeline),\n",
        "])"
      ],
      "execution_count": 16,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "ZJpef2u4EEkr"
      },
      "source": [
        "## Split data and transform"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "pXHQfDGPEJQf"
      },
      "source": [
        "from sklearn.model_selection import train_test_split\n",
        "X_train, X_test, y_train, y_test = train_test_split(\n",
        "    df.loc[:, numerical_features + categorical_features], \n",
        "    df.loc[:, [\"label\"]], \n",
        "    test_size=0.2, random_state=42\n",
        ")"
      ],
      "execution_count": 17,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 204
        },
        "id": "rguB5LLbEQNx",
        "outputId": "3b5925f7-258a-413d-db61-d04502071f94"
      },
      "source": [
        "X_train_transformed = feature_pipeline.fit_transform(X_train, y_train)\n",
        "transformed_categories = [list(x) for x in feature_pipeline.transformer_list[1][1].steps[1][1].categories_] # list of lists\n",
        "transformed_categories = [val for lst in transformed_categories for val in lst] # flatten list of lists\n",
        "X_train_transformed = pd.DataFrame(\n",
        "    X_train_transformed,\n",
        "    columns = numerical_features + transformed_categories\n",
        ")\n",
        "X_train.head(5)"
      ],
      "execution_count": 18,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/html": [
              "<div>\n",
              "<style scoped>\n",
              "    .dataframe tbody tr th:only-of-type {\n",
              "        vertical-align: middle;\n",
              "    }\n",
              "\n",
              "    .dataframe tbody tr th {\n",
              "        vertical-align: top;\n",
              "    }\n",
              "\n",
              "    .dataframe thead th {\n",
              "        text-align: right;\n",
              "    }\n",
              "</style>\n",
              "<table border=\"1\" class=\"dataframe\">\n",
              "  <thead>\n",
              "    <tr style=\"text-align: right;\">\n",
              "      <th></th>\n",
              "      <th>minimum_nights</th>\n",
              "      <th>number_of_reviews</th>\n",
              "      <th>calculated_host_listings_count</th>\n",
              "      <th>neighbourhood_clean</th>\n",
              "      <th>room_type</th>\n",
              "    </tr>\n",
              "  </thead>\n",
              "  <tbody>\n",
              "    <tr>\n",
              "      <th>32645</th>\n",
              "      <td>3</td>\n",
              "      <td>11</td>\n",
              "      <td>1</td>\n",
              "      <td>Williamsburg</td>\n",
              "      <td>Entire home/apt</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>23615</th>\n",
              "      <td>2</td>\n",
              "      <td>2</td>\n",
              "      <td>1</td>\n",
              "      <td>Washington Heights</td>\n",
              "      <td>Private room</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>31183</th>\n",
              "      <td>2</td>\n",
              "      <td>0</td>\n",
              "      <td>2</td>\n",
              "      <td>Bedford-Stuyvesant</td>\n",
              "      <td>Private room</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>29260</th>\n",
              "      <td>3</td>\n",
              "      <td>87</td>\n",
              "      <td>1</td>\n",
              "      <td>Bedford-Stuyvesant</td>\n",
              "      <td>Entire home/apt</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>7275</th>\n",
              "      <td>5</td>\n",
              "      <td>13</td>\n",
              "      <td>1</td>\n",
              "      <td>neighbourhood_other</td>\n",
              "      <td>Private room</td>\n",
              "    </tr>\n",
              "  </tbody>\n",
              "</table>\n",
              "</div>"
            ],
            "text/plain": [
              "       minimum_nights  number_of_reviews  ...  neighbourhood_clean        room_type\n",
              "32645               3                 11  ...         Williamsburg  Entire home/apt\n",
              "23615               2                  2  ...   Washington Heights     Private room\n",
              "31183               2                  0  ...   Bedford-Stuyvesant     Private room\n",
              "29260               3                 87  ...   Bedford-Stuyvesant  Entire home/apt\n",
              "7275                5                 13  ...  neighbourhood_other     Private room\n",
              "\n",
              "[5 rows x 5 columns]"
            ]
          },
          "metadata": {
            "tags": []
          },
          "execution_count": 18
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 258
        },
        "id": "wLDREplwK_7g",
        "outputId": "1966a05f-8a9b-4a1d-ac02-7c5f6cdef9fc"
      },
      "source": [
        "X_train_transformed.head(5)"
      ],
      "execution_count": 19,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/html": [
              "<div>\n",
              "<style scoped>\n",
              "    .dataframe tbody tr th:only-of-type {\n",
              "        vertical-align: middle;\n",
              "    }\n",
              "\n",
              "    .dataframe tbody tr th {\n",
              "        vertical-align: top;\n",
              "    }\n",
              "\n",
              "    .dataframe thead th {\n",
              "        text-align: right;\n",
              "    }\n",
              "</style>\n",
              "<table border=\"1\" class=\"dataframe\">\n",
              "  <thead>\n",
              "    <tr style=\"text-align: right;\">\n",
              "      <th></th>\n",
              "      <th>minimum_nights</th>\n",
              "      <th>number_of_reviews</th>\n",
              "      <th>calculated_host_listings_count</th>\n",
              "      <th>neighbourhood_other</th>\n",
              "      <th>Midtown</th>\n",
              "      <th>Harlem</th>\n",
              "      <th>Clinton Hill</th>\n",
              "      <th>East Harlem</th>\n",
              "      <th>Murray Hill</th>\n",
              "      <th>Bedford-Stuyvesant</th>\n",
              "      <th>Hell's Kitchen</th>\n",
              "      <th>Upper West Side</th>\n",
              "      <th>Chinatown</th>\n",
              "      <th>South Slope</th>\n",
              "      <th>West Village</th>\n",
              "      <th>Williamsburg</th>\n",
              "      <th>Fort Greene</th>\n",
              "      <th>Chelsea</th>\n",
              "      <th>Crown Heights</th>\n",
              "      <th>Park Slope</th>\n",
              "      <th>Inwood</th>\n",
              "      <th>East Village</th>\n",
              "      <th>Greenpoint</th>\n",
              "      <th>Bushwick</th>\n",
              "      <th>Flatbush</th>\n",
              "      <th>Lower East Side</th>\n",
              "      <th>Prospect-Lefferts Gardens</th>\n",
              "      <th>Long Island City</th>\n",
              "      <th>Kips Bay</th>\n",
              "      <th>SoHo</th>\n",
              "      <th>Upper East Side</th>\n",
              "      <th>Prospect Heights</th>\n",
              "      <th>Washington Heights</th>\n",
              "      <th>Gowanus</th>\n",
              "      <th>Flushing</th>\n",
              "      <th>Sunnyside</th>\n",
              "      <th>Financial District</th>\n",
              "      <th>Ridgewood</th>\n",
              "      <th>Morningside Heights</th>\n",
              "      <th>Ditmars Steinway</th>\n",
              "      <th>Greenwich Village</th>\n",
              "      <th>East Flatbush</th>\n",
              "      <th>Astoria</th>\n",
              "      <th>Nolita</th>\n",
              "      <th>Gramercy</th>\n",
              "      <th>Theater District</th>\n",
              "      <th>Sunset Park</th>\n",
              "      <th>Private room</th>\n",
              "      <th>Entire home/apt</th>\n",
              "      <th>Shared room</th>\n",
              "    </tr>\n",
              "  </thead>\n",
              "  <tbody>\n",
              "    <tr>\n",
              "      <th>0</th>\n",
              "      <td>-0.193025</td>\n",
              "      <td>-0.277198</td>\n",
              "      <td>-0.186570</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>1.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>1.0</td>\n",
              "      <td>0.0</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>1</th>\n",
              "      <td>-0.239596</td>\n",
              "      <td>-0.479451</td>\n",
              "      <td>-0.186570</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>1.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>1.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>2</th>\n",
              "      <td>-0.239596</td>\n",
              "      <td>-0.524396</td>\n",
              "      <td>-0.156424</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>1.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>1.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>3</th>\n",
              "      <td>-0.193025</td>\n",
              "      <td>1.430714</td>\n",
              "      <td>-0.186570</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>1.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>1.0</td>\n",
              "      <td>0.0</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>4</th>\n",
              "      <td>-0.099884</td>\n",
              "      <td>-0.232253</td>\n",
              "      <td>-0.186570</td>\n",
              "      <td>1.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>1.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "    </tr>\n",
              "  </tbody>\n",
              "</table>\n",
              "</div>"
            ],
            "text/plain": [
              "   minimum_nights  number_of_reviews  ...  Entire home/apt  Shared room\n",
              "0       -0.193025          -0.277198  ...              1.0          0.0\n",
              "1       -0.239596          -0.479451  ...              0.0          0.0\n",
              "2       -0.239596          -0.524396  ...              0.0          0.0\n",
              "3       -0.193025           1.430714  ...              1.0          0.0\n",
              "4       -0.099884          -0.232253  ...              0.0          0.0\n",
              "\n",
              "[5 rows x 50 columns]"
            ]
          },
          "metadata": {
            "tags": []
          },
          "execution_count": 19
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 394
        },
        "id": "exwTu6aFF5-v",
        "outputId": "46973b05-0de3-4805-9e52-106d0da592da"
      },
      "source": [
        "X_test_transformed = feature_pipeline.transform(X_test)\n",
        "X_test_transformed = pd.DataFrame(\n",
        "    X_test_transformed,\n",
        "    columns = numerical_features + transformed_categories\n",
        ")\n",
        "print(X_test.head(5))\n",
        "X_test_transformed.head(5)"
      ],
      "execution_count": 20,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "       minimum_nights  number_of_reviews  ...  neighbourhood_clean        room_type\n",
            "879                 3                 62  ...  neighbourhood_other  Entire home/apt\n",
            "44383              21                  0  ...            Ridgewood     Private room\n",
            "15394               2                 17  ...       Hell's Kitchen     Private room\n",
            "43230               2                  5  ...   Financial District  Entire home/apt\n",
            "16332               2                 30  ...          East Harlem  Entire home/apt\n",
            "\n",
            "[5 rows x 5 columns]\n"
          ],
          "name": "stdout"
        },
        {
          "output_type": "execute_result",
          "data": {
            "text/html": [
              "<div>\n",
              "<style scoped>\n",
              "    .dataframe tbody tr th:only-of-type {\n",
              "        vertical-align: middle;\n",
              "    }\n",
              "\n",
              "    .dataframe tbody tr th {\n",
              "        vertical-align: top;\n",
              "    }\n",
              "\n",
              "    .dataframe thead th {\n",
              "        text-align: right;\n",
              "    }\n",
              "</style>\n",
              "<table border=\"1\" class=\"dataframe\">\n",
              "  <thead>\n",
              "    <tr style=\"text-align: right;\">\n",
              "      <th></th>\n",
              "      <th>minimum_nights</th>\n",
              "      <th>number_of_reviews</th>\n",
              "      <th>calculated_host_listings_count</th>\n",
              "      <th>neighbourhood_other</th>\n",
              "      <th>Midtown</th>\n",
              "      <th>Harlem</th>\n",
              "      <th>Clinton Hill</th>\n",
              "      <th>East Harlem</th>\n",
              "      <th>Murray Hill</th>\n",
              "      <th>Bedford-Stuyvesant</th>\n",
              "      <th>Hell's Kitchen</th>\n",
              "      <th>Upper West Side</th>\n",
              "      <th>Chinatown</th>\n",
              "      <th>South Slope</th>\n",
              "      <th>West Village</th>\n",
              "      <th>Williamsburg</th>\n",
              "      <th>Fort Greene</th>\n",
              "      <th>Chelsea</th>\n",
              "      <th>Crown Heights</th>\n",
              "      <th>Park Slope</th>\n",
              "      <th>Inwood</th>\n",
              "      <th>East Village</th>\n",
              "      <th>Greenpoint</th>\n",
              "      <th>Bushwick</th>\n",
              "      <th>Flatbush</th>\n",
              "      <th>Lower East Side</th>\n",
              "      <th>Prospect-Lefferts Gardens</th>\n",
              "      <th>Long Island City</th>\n",
              "      <th>Kips Bay</th>\n",
              "      <th>SoHo</th>\n",
              "      <th>Upper East Side</th>\n",
              "      <th>Prospect Heights</th>\n",
              "      <th>Washington Heights</th>\n",
              "      <th>Gowanus</th>\n",
              "      <th>Flushing</th>\n",
              "      <th>Sunnyside</th>\n",
              "      <th>Financial District</th>\n",
              "      <th>Ridgewood</th>\n",
              "      <th>Morningside Heights</th>\n",
              "      <th>Ditmars Steinway</th>\n",
              "      <th>Greenwich Village</th>\n",
              "      <th>East Flatbush</th>\n",
              "      <th>Astoria</th>\n",
              "      <th>Nolita</th>\n",
              "      <th>Gramercy</th>\n",
              "      <th>Theater District</th>\n",
              "      <th>Sunset Park</th>\n",
              "      <th>Private room</th>\n",
              "      <th>Entire home/apt</th>\n",
              "      <th>Shared room</th>\n",
              "    </tr>\n",
              "  </thead>\n",
              "  <tbody>\n",
              "    <tr>\n",
              "      <th>0</th>\n",
              "      <td>-0.193025</td>\n",
              "      <td>0.868901</td>\n",
              "      <td>-0.186570</td>\n",
              "      <td>1.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>1.0</td>\n",
              "      <td>0.0</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>1</th>\n",
              "      <td>0.645248</td>\n",
              "      <td>-0.524396</td>\n",
              "      <td>-0.186570</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>1.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>1.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>2</th>\n",
              "      <td>-0.239596</td>\n",
              "      <td>-0.142363</td>\n",
              "      <td>-0.186570</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>1.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>1.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>3</th>\n",
              "      <td>-0.239596</td>\n",
              "      <td>-0.412033</td>\n",
              "      <td>9.640935</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>1.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>1.0</td>\n",
              "      <td>0.0</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>4</th>\n",
              "      <td>-0.239596</td>\n",
              "      <td>0.149780</td>\n",
              "      <td>-0.186570</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>1.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>1.0</td>\n",
              "      <td>0.0</td>\n",
              "    </tr>\n",
              "  </tbody>\n",
              "</table>\n",
              "</div>"
            ],
            "text/plain": [
              "   minimum_nights  number_of_reviews  ...  Entire home/apt  Shared room\n",
              "0       -0.193025           0.868901  ...              1.0          0.0\n",
              "1        0.645248          -0.524396  ...              0.0          0.0\n",
              "2       -0.239596          -0.142363  ...              0.0          0.0\n",
              "3       -0.239596          -0.412033  ...              1.0          0.0\n",
              "4       -0.239596           0.149780  ...              1.0          0.0\n",
              "\n",
              "[5 rows x 50 columns]"
            ]
          },
          "metadata": {
            "tags": []
          },
          "execution_count": 20
        }
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "GxFji2ZnC40g"
      },
      "source": [
        "## Auto Sklearn"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "hXlJx7GcC-T5",
        "outputId": "97778277-49e5-403d-8e74-061d0564f0f8"
      },
      "source": [
        "import autosklearn.regression\n",
        "automl = autosklearn.regression.AutoSklearnRegressor(\n",
        "    time_left_for_this_task=120,\n",
        "    per_run_time_limit=30,\n",
        "    n_jobs=1\n",
        ")\n",
        "automl.fit(\n",
        "    X_train_transformed, \n",
        "    y_train\n",
        ")"
      ],
      "execution_count": 21,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "/usr/local/lib/python3.6/dist-packages/sklearn/base.py:197: FutureWarning:\n",
            "\n",
            "From version 0.24, get_params will raise an AttributeError if a parameter cannot be retrieved as an instance attribute. Previously it would return None.\n",
            "\n"
          ],
          "name": "stderr"
        },
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "AutoSklearnRegressor(dask_client=None,\n",
              "                     delete_output_folder_after_terminate=True,\n",
              "                     delete_tmp_folder_after_terminate=True,\n",
              "                     disable_evaluator_output=False, ensemble_nbest=50,\n",
              "                     ensemble_size=50, exclude_estimators=None,\n",
              "                     exclude_preprocessors=None, get_smac_object_callback=None,\n",
              "                     include_estimators=None, include_preprocessors=None,\n",
              "                     initial_configurations_via_metalearning=25,\n",
              "                     load_models=None, logging_config=None,\n",
              "                     max_models_on_disc=50, memory_limit=3072,\n",
              "                     metadata_directory=None, metric=None, n_jobs=1,\n",
              "                     output_folder=None, per_run_time_limit=30,\n",
              "                     resampling_strategy='holdout',\n",
              "                     resampling_strategy_arguments=None, seed=1,\n",
              "                     smac_scenario_args=None, time_left_for_this_task=120,\n",
              "                     tmp_folder=None)"
            ]
          },
          "metadata": {
            "tags": []
          },
          "execution_count": 21
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 69
        },
        "id": "iF8zQHX1F2Mf",
        "outputId": "2a86704d-1392-4d0a-c88c-e70c937c3fff"
      },
      "source": [
        "automl.sprint_statistics()"
      ],
      "execution_count": 22,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "application/vnd.google.colaboratory.intrinsic+json": {
              "type": "string"
            },
            "text/plain": [
              "'auto-sklearn results:\\n  Dataset name: 193c0133dfc3b6116268c126137b3a31\\n  Metric: r2\\n  Best validation score: 0.159246\\n  Number of target algorithm runs: 8\\n  Number of successful target algorithm runs: 5\\n  Number of crashed target algorithm runs: 0\\n  Number of target algorithms that exceeded the time limit: 3\\n  Number of target algorithms that exceeded the memory limit: 0\\n'"
            ]
          },
          "metadata": {
            "tags": []
          },
          "execution_count": 22
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 137
        },
        "id": "oDmw2UM5DaV-",
        "outputId": "71f2b243-9e9c-4cc9-b74a-33719e1bc70d"
      },
      "source": [
        "automl.show_models()"
      ],
      "execution_count": 23,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "application/vnd.google.colaboratory.intrinsic+json": {
              "type": "string"
            },
            "text/plain": [
              "\"[(0.440000, SimpleRegressionPipeline({'data_preprocessing:categorical_transformer:categorical_encoding:__choice__': 'one_hot_encoding', 'data_preprocessing:categorical_transformer:category_coalescence:__choice__': 'minority_coalescer', 'data_preprocessing:numerical_transformer:imputation:strategy': 'mean', 'data_preprocessing:numerical_transformer:rescaling:__choice__': 'quantile_transformer', 'feature_preprocessor:__choice__': 'select_rates_regression', 'regressor:__choice__': 'gradient_boosting', 'data_preprocessing:categorical_transformer:category_coalescence:minority_coalescer:minimum_fraction': 0.00924813449669181, 'data_preprocessing:numerical_transformer:rescaling:quantile_transformer:n_quantiles': 987, 'data_preprocessing:numerical_transformer:rescaling:quantile_transformer:output_distribution': 'uniform', 'feature_preprocessor:select_rates_regression:alpha': 0.1, 'feature_preprocessor:select_rates_regression:mode': 'fwe', 'feature_preprocessor:select_rates_regression:score_func': 'f_regression', 'regressor:gradient_boosting:early_stop': 'off', 'regressor:gradient_boosting:l2_regularization': 2.5833231171101403e-10, 'regressor:gradient_boosting:learning_rate': 0.10682575320034993, 'regressor:gradient_boosting:loss': 'least_squares', 'regressor:gradient_boosting:max_bins': 255, 'regressor:gradient_boosting:max_depth': 'None', 'regressor:gradient_boosting:max_leaf_nodes': 31, 'regressor:gradient_boosting:min_samples_leaf': 12, 'regressor:gradient_boosting:scoring': 'loss', 'regressor:gradient_boosting:tol': 1e-07},\\ndataset_properties={\\n  'task': 4,\\n  'sparse': False,\\n  'multioutput': False,\\n  'target_type': 'regression',\\n  'signed': False})),\\n(0.260000, SimpleRegressionPipeline({'data_preprocessing:categorical_transformer:categorical_encoding:__choice__': 'one_hot_encoding', 'data_preprocessing:categorical_transformer:category_coalescence:__choice__': 'no_coalescense', 'data_preprocessing:numerical_transformer:imputation:strategy': 'mean', 'data_preprocessing:numerical_transformer:rescaling:__choice__': 'standardize', 'feature_preprocessor:__choice__': 'select_rates_regression', 'regressor:__choice__': 'gradient_boosting', 'feature_preprocessor:select_rates_regression:alpha': 0.40612979303062824, 'feature_preprocessor:select_rates_regression:mode': 'fdr', 'feature_preprocessor:select_rates_regression:score_func': 'f_regression', 'regressor:gradient_boosting:early_stop': 'off', 'regressor:gradient_boosting:l2_regularization': 2.2742572602302616e-08, 'regressor:gradient_boosting:learning_rate': 0.02539518548794612, 'regressor:gradient_boosting:loss': 'least_squares', 'regressor:gradient_boosting:max_bins': 255, 'regressor:gradient_boosting:max_depth': 'None', 'regressor:gradient_boosting:max_leaf_nodes': 57, 'regressor:gradient_boosting:min_samples_leaf': 10, 'regressor:gradient_boosting:scoring': 'loss', 'regressor:gradient_boosting:tol': 1e-07},\\ndataset_properties={\\n  'task': 4,\\n  'sparse': False,\\n  'multioutput': False,\\n  'target_type': 'regression',\\n  'signed': False})),\\n(0.240000, SimpleRegressionPipeline({'data_preprocessing:categorical_transformer:categorical_encoding:__choice__': 'one_hot_encoding', 'data_preprocessing:categorical_transformer:category_coalescence:__choice__': 'no_coalescense', 'data_preprocessing:numerical_transformer:imputation:strategy': 'mean', 'data_preprocessing:numerical_transformer:rescaling:__choice__': 'quantile_transformer', 'feature_preprocessor:__choice__': 'select_rates_regression', 'regressor:__choice__': 'gradient_boosting', 'data_preprocessing:numerical_transformer:rescaling:quantile_transformer:n_quantiles': 148, 'data_preprocessing:numerical_transformer:rescaling:quantile_transformer:output_distribution': 'uniform', 'feature_preprocessor:select_rates_regression:alpha': 0.10789818795901182, 'feature_preprocessor:select_rates_regression:mode': 'fwe', 'feature_preprocessor:select_rates_regression:score_func': 'f_regression', 'regressor:gradient_boosting:early_stop': 'off', 'regressor:gradient_boosting:l2_regularization': 0.0002841741346637058, 'regressor:gradient_boosting:learning_rate': 0.024253735132716756, 'regressor:gradient_boosting:loss': 'least_squares', 'regressor:gradient_boosting:max_bins': 255, 'regressor:gradient_boosting:max_depth': 'None', 'regressor:gradient_boosting:max_leaf_nodes': 36, 'regressor:gradient_boosting:min_samples_leaf': 1, 'regressor:gradient_boosting:scoring': 'loss', 'regressor:gradient_boosting:tol': 1e-07},\\ndataset_properties={\\n  'task': 4,\\n  'sparse': False,\\n  'multioutput': False,\\n  'target_type': 'regression',\\n  'signed': False})),\\n(0.060000, SimpleRegressionPipeline({'data_preprocessing:categorical_transformer:categorical_encoding:__choice__': 'no_encoding', 'data_preprocessing:categorical_transformer:category_coalescence:__choice__': 'no_coalescense', 'data_preprocessing:numerical_transformer:imputation:strategy': 'most_frequent', 'data_preprocessing:numerical_transformer:rescaling:__choice__': 'none', 'feature_preprocessor:__choice__': 'nystroem_sampler', 'regressor:__choice__': 'liblinear_svr', 'feature_preprocessor:nystroem_sampler:kernel': 'rbf', 'feature_preprocessor:nystroem_sampler:n_components': 305, 'regressor:liblinear_svr:C': 18080.177016227895, 'regressor:liblinear_svr:dual': 'False', 'regressor:liblinear_svr:epsilon': 0.0017155220175954669, 'regressor:liblinear_svr:fit_intercept': 'True', 'regressor:liblinear_svr:intercept_scaling': 1, 'regressor:liblinear_svr:loss': 'squared_epsilon_insensitive', 'regressor:liblinear_svr:tol': 7.035440846539455e-05, 'feature_preprocessor:nystroem_sampler:gamma': 0.00025844019785412086},\\ndataset_properties={\\n  'task': 4,\\n  'sparse': False,\\n  'multioutput': False,\\n  'target_type': 'regression',\\n  'signed': False})),\\n]\""
            ]
          },
          "metadata": {
            "tags": []
          },
          "execution_count": 23
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "a0JnklSDDgfx",
        "outputId": "321e23d3-b2a9-48cc-ed2c-dfeda30ed3cc"
      },
      "source": [
        "import sklearn.metrics\n",
        "predictions = automl.predict(X_test_transformed)\n",
        "sklearn.metrics.r2_score(y_test, predictions)"
      ],
      "execution_count": 24,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "0.18626743087093045"
            ]
          },
          "metadata": {
            "tags": []
          },
          "execution_count": 24
        }
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "O7neO0jKHk5p"
      },
      "source": [
        "## Grid search - single model"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "yq4RcZ78ELR7",
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "outputId": "3a9e0304-0174-4964-877b-7d8de91b3c8d"
      },
      "source": [
        "from sklearn.ensemble import RandomForestRegressor\n",
        "from sklearn.model_selection import GridSearchCV\n",
        "model = RandomForestRegressor(max_depth=3, random_state=0)\n",
        "parameters = {\n",
        "    \"max_depth\": (2, 3, 5)\n",
        "}\n",
        "grid = GridSearchCV(model, parameters, cv=5, scoring=\"r2\")\n",
        "grid.fit(X_train_transformed, y_train.values.ravel())"
      ],
      "execution_count": 25,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "GridSearchCV(cv=5, error_score=nan,\n",
              "             estimator=RandomForestRegressor(bootstrap=True, ccp_alpha=0.0,\n",
              "                                             criterion='mse', max_depth=3,\n",
              "                                             max_features='auto',\n",
              "                                             max_leaf_nodes=None,\n",
              "                                             max_samples=None,\n",
              "                                             min_impurity_decrease=0.0,\n",
              "                                             min_impurity_split=None,\n",
              "                                             min_samples_leaf=1,\n",
              "                                             min_samples_split=2,\n",
              "                                             min_weight_fraction_leaf=0.0,\n",
              "                                             n_estimators=100, n_jobs=None,\n",
              "                                             oob_score=False, random_state=0,\n",
              "                                             verbose=0, warm_start=False),\n",
              "             iid='deprecated', n_jobs=None, param_grid={'max_depth': (2, 3, 5)},\n",
              "             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,\n",
              "             scoring='r2', verbose=0)"
            ]
          },
          "metadata": {
            "tags": []
          },
          "execution_count": 25
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "1xLArYFiI1JR",
        "outputId": "c64127b8-93df-44ba-a30d-3e19e38676a8"
      },
      "source": [
        "grid.best_estimator_"
      ],
      "execution_count": 26,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',\n",
              "                      max_depth=3, max_features='auto', max_leaf_nodes=None,\n",
              "                      max_samples=None, min_impurity_decrease=0.0,\n",
              "                      min_impurity_split=None, min_samples_leaf=1,\n",
              "                      min_samples_split=2, min_weight_fraction_leaf=0.0,\n",
              "                      n_estimators=100, n_jobs=None, oob_score=False,\n",
              "                      random_state=0, verbose=0, warm_start=False)"
            ]
          },
          "metadata": {
            "tags": []
          },
          "execution_count": 26
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "xgHZYwXGI-3t",
        "outputId": "9aae988d-c9a4-440e-d1b1-07f56736b34d"
      },
      "source": [
        "grid.best_params_"
      ],
      "execution_count": 27,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "{'max_depth': 3}"
            ]
          },
          "metadata": {
            "tags": []
          },
          "execution_count": 27
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "Xxlns6rjH9XF",
        "outputId": "baa8705f-58eb-4289-f67d-feb09644cd1a"
      },
      "source": [
        "predictions = grid.predict(X_test_transformed)\n",
        "sklearn.metrics.r2_score(y_test, predictions)"
      ],
      "execution_count": 28,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "0.09826359771273341"
            ]
          },
          "metadata": {
            "tags": []
          },
          "execution_count": 28
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "NPxkEzo7IzXy"
      },
      "source": [
        ""
      ],
      "execution_count": 28,
      "outputs": []
    }
  ]
}