Skip to content

Instantly share code, notes, and snippets.

@adamnovotnycom
Created December 4, 2020 19:40
Show Gist options
  • Save adamnovotnycom/1df7ef10649d8241c389c96becb7fe37 to your computer and use it in GitHub Desktop.
Save adamnovotnycom/1df7ef10649d8241c389c96becb7fe37 to your computer and use it in GitHub Desktop.
airbnb_nyc_kaggle.ipynb
Display the source blob
Display the rendered blob
Raw
{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"name": "airbnb_nyc_kaggle.ipynb",
"provenance": [],
"collapsed_sections": [],
"mount_file_id": "11_t63audBgQMHVGIIfSgp_Dp1sCbKRHy",
"authorship_tag": "ABX9TyNH+t3cZzg9AbV+jxQelsso",
"include_colab_link": true
},
"kernelspec": {
"display_name": "Python 3",
"name": "python3"
}
},
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "view-in-github",
"colab_type": "text"
},
"source": [
"<a href=\"https://colab.research.google.com/gist/excitedAtom/1df7ef10649d8241c389c96becb7fe37/airbnb_nyc_kaggle.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "AGVZmG3VQ2Ij"
},
"source": [
"# Airbnb in NYC\n",
"Project Description: [Kaggle](https://www.kaggle.com/dgomonov/new-york-city-airbnb-open-data)"
]
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "MXByWp-mQKTp",
"outputId": "1a5edac8-bede-4ced-d069-cee698e75ef7"
},
"source": [
"import numpy as np\n",
"import os\n",
"import pandas as pd\n",
"import plotly.graph_objects as go\n",
"import random\n",
"import sys\n",
"from sklearn.base import BaseEstimator, TransformerMixin\n",
"from sklearn.impute import SimpleImputer\n",
"from sklearn.pipeline import FeatureUnion, Pipeline \n",
"from sklearn.preprocessing import StandardScaler, OneHotEncoder\n",
"print(sys.version)\n",
"print(pd.__version__)"
],
"execution_count": 1,
"outputs": [
{
"output_type": "stream",
"text": [
"3.6.9 (default, Oct 8 2020, 12:12:24) \n",
"[GCC 8.4.0]\n",
"1.1.4\n"
],
"name": "stdout"
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "Gkl01MkHby2x"
},
"source": [
"# auto-sklearn needs pandas >= 1.0. Compare the numeric major version: looking at\n",
"# only the first character of the version string misreads releases such as 2.x or 10.x.\n",
"pandas_major = int(pd.__version__.split(\".\")[0])\n",
"if pandas_major < 1:\n",
"    !pip install pandas==1.1.2\n",
"    # Re-importing does not reload a module that is already loaded, so the check below\n",
"    # keeps failing until the runtime is restarted after this install.\n",
"    import pandas as pd\n",
"    pandas_major = int(pd.__version__.split(\".\")[0])\n",
"assert pandas_major >= 1, f\"pandas >= 1.0 is required, found {pd.__version__}\""
],
"execution_count": 2,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "uazEb54Ucs6D",
"outputId": "387a6428-5217-4559-fbd9-662599557253"
},
"source": [
"# -y answers apt's confirmation prompt so the cell can run unattended (e.g. \"Run all\")\n",
"# instead of waiting for, or aborting on, the prompt when a package has to be installed.\n",
"!sudo apt-get install -y build-essential swig\n",
"!pip install auto-sklearn==0.11.1"
],
"execution_count": 3,
"outputs": [
{
"output_type": "stream",
"text": [
"Reading package lists... Done\n",
"Building dependency tree \n",
"Reading state information... Done\n",
"build-essential is already the newest version (12.4ubuntu1).\n",
"swig is already the newest version (3.0.12-1).\n",
"0 upgraded, 0 newly installed, 0 to remove and 14 not upgraded.\n",
"Requirement already satisfied: auto-sklearn==0.11.1 in /usr/local/lib/python3.6/dist-packages (0.11.1)\n",
"Requirement already satisfied: pandas>=1.0 in /usr/local/lib/python3.6/dist-packages (from auto-sklearn==0.11.1) (1.1.4)\n",
"Requirement already satisfied: smac<0.14,>=0.13.1 in /usr/local/lib/python3.6/dist-packages (from auto-sklearn==0.11.1) (0.13.1)\n",
"Requirement already satisfied: dask in /usr/local/lib/python3.6/dist-packages (from auto-sklearn==0.11.1) (2.12.0)\n",
"Requirement already satisfied: ConfigSpace<0.5,>=0.4.14 in /usr/local/lib/python3.6/dist-packages (from auto-sklearn==0.11.1) (0.4.16)\n",
"Requirement already satisfied: pynisher>=0.6.1 in /usr/local/lib/python3.6/dist-packages (from auto-sklearn==0.11.1) (0.6.3)\n",
"Requirement already satisfied: pyrfr<0.9,>=0.7 in /usr/local/lib/python3.6/dist-packages (from auto-sklearn==0.11.1) (0.8.0)\n",
"Requirement already satisfied: liac-arff in /usr/local/lib/python3.6/dist-packages (from auto-sklearn==0.11.1) (2.5.0)\n",
"Requirement already satisfied: setuptools in /usr/local/lib/python3.6/dist-packages (from auto-sklearn==0.11.1) (50.3.2)\n",
"Requirement already satisfied: lockfile in /usr/local/lib/python3.6/dist-packages (from auto-sklearn==0.11.1) (0.12.2)\n",
"Requirement already satisfied: scikit-learn<0.23,>=0.22.0 in /usr/local/lib/python3.6/dist-packages (from auto-sklearn==0.11.1) (0.22.2.post1)\n",
"Requirement already satisfied: distributed>=2.2.0 in /usr/local/lib/python3.6/dist-packages (from auto-sklearn==0.11.1) (2.30.1)\n",
"Requirement already satisfied: scipy>=0.14.1 in /usr/local/lib/python3.6/dist-packages (from auto-sklearn==0.11.1) (1.4.1)\n",
"Requirement already satisfied: joblib in /usr/local/lib/python3.6/dist-packages (from auto-sklearn==0.11.1) (0.17.0)\n",
"Requirement already satisfied: pyyaml in /usr/local/lib/python3.6/dist-packages (from auto-sklearn==0.11.1) (3.13)\n",
"Requirement already satisfied: numpy>=1.9.0 in /usr/local/lib/python3.6/dist-packages (from auto-sklearn==0.11.1) (1.18.5)\n",
"Requirement already satisfied: python-dateutil>=2.7.3 in /usr/local/lib/python3.6/dist-packages (from pandas>=1.0->auto-sklearn==0.11.1) (2.8.1)\n",
"Requirement already satisfied: pytz>=2017.2 in /usr/local/lib/python3.6/dist-packages (from pandas>=1.0->auto-sklearn==0.11.1) (2018.9)\n",
"Requirement already satisfied: lazy-import in /usr/local/lib/python3.6/dist-packages (from smac<0.14,>=0.13.1->auto-sklearn==0.11.1) (0.2.2)\n",
"Requirement already satisfied: psutil in /usr/local/lib/python3.6/dist-packages (from smac<0.14,>=0.13.1->auto-sklearn==0.11.1) (5.4.8)\n",
"Requirement already satisfied: cython in /usr/local/lib/python3.6/dist-packages (from ConfigSpace<0.5,>=0.4.14->auto-sklearn==0.11.1) (0.29.21)\n",
"Requirement already satisfied: pyparsing in /usr/local/lib/python3.6/dist-packages (from ConfigSpace<0.5,>=0.4.14->auto-sklearn==0.11.1) (2.4.7)\n",
"Requirement already satisfied: docutils>=0.3 in /usr/local/lib/python3.6/dist-packages (from pynisher>=0.6.1->auto-sklearn==0.11.1) (0.16)\n",
"Requirement already satisfied: zict>=0.1.3 in /usr/local/lib/python3.6/dist-packages (from distributed>=2.2.0->auto-sklearn==0.11.1) (2.0.0)\n",
"Requirement already satisfied: toolz>=0.8.2 in /usr/local/lib/python3.6/dist-packages (from distributed>=2.2.0->auto-sklearn==0.11.1) (0.11.1)\n",
"Requirement already satisfied: tornado>=5; python_version < \"3.8\" in /usr/local/lib/python3.6/dist-packages (from distributed>=2.2.0->auto-sklearn==0.11.1) (5.1.1)\n",
"Requirement already satisfied: sortedcontainers!=2.0.0,!=2.0.1 in /usr/local/lib/python3.6/dist-packages (from distributed>=2.2.0->auto-sklearn==0.11.1) (2.3.0)\n",
"Requirement already satisfied: click>=6.6 in /usr/local/lib/python3.6/dist-packages (from distributed>=2.2.0->auto-sklearn==0.11.1) (7.1.2)\n",
"Requirement already satisfied: tblib>=1.6.0 in /usr/local/lib/python3.6/dist-packages (from distributed>=2.2.0->auto-sklearn==0.11.1) (1.7.0)\n",
"Requirement already satisfied: contextvars; python_version < \"3.7\" in /usr/local/lib/python3.6/dist-packages (from distributed>=2.2.0->auto-sklearn==0.11.1) (2.4)\n",
"Requirement already satisfied: cloudpickle>=1.5.0 in /usr/local/lib/python3.6/dist-packages (from distributed>=2.2.0->auto-sklearn==0.11.1) (1.6.0)\n",
"Requirement already satisfied: msgpack>=0.6.0 in /usr/local/lib/python3.6/dist-packages (from distributed>=2.2.0->auto-sklearn==0.11.1) (1.0.0)\n",
"Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.6/dist-packages (from python-dateutil>=2.7.3->pandas>=1.0->auto-sklearn==0.11.1) (1.15.0)\n",
"Requirement already satisfied: heapdict in /usr/local/lib/python3.6/dist-packages (from zict>=0.1.3->distributed>=2.2.0->auto-sklearn==0.11.1) (1.0.1)\n",
"Requirement already satisfied: immutables>=0.9 in /usr/local/lib/python3.6/dist-packages (from contextvars; python_version < \"3.7\"->distributed>=2.2.0->auto-sklearn==0.11.1) (0.14)\n"
],
"name": "stdout"
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "bjlx7he7aQ_y"
},
"source": [
"# Fails after the initial installation of auto-sklearn -> Restart runtime and run all\n",
"from autosklearn.classification import AutoSklearnClassifier"
],
"execution_count": 4,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 411
},
"id": "9B4TMIsZQOIC",
"outputId": "f0bc89ad-e6ae-4037-9f42-e0aeee39cc1c"
},
"source": [
"df = pd.read_csv(\"/content/drive/My Drive/Colab Notebooks/colab-auto-sklearn-setup/data/airbnb_nyc_kaggle.csv\")\n",
"df.head(5)"
],
"execution_count": 5,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>id</th>\n",
" <th>name</th>\n",
" <th>host_id</th>\n",
" <th>host_name</th>\n",
" <th>neighbourhood_group</th>\n",
" <th>neighbourhood</th>\n",
" <th>latitude</th>\n",
" <th>longitude</th>\n",
" <th>room_type</th>\n",
" <th>price</th>\n",
" <th>minimum_nights</th>\n",
" <th>number_of_reviews</th>\n",
" <th>last_review</th>\n",
" <th>reviews_per_month</th>\n",
" <th>calculated_host_listings_count</th>\n",
" <th>availability_365</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>2539</td>\n",
" <td>Clean &amp; quiet apt home by the park</td>\n",
" <td>2787</td>\n",
" <td>John</td>\n",
" <td>Brooklyn</td>\n",
" <td>Kensington</td>\n",
" <td>40.64749</td>\n",
" <td>-73.97237</td>\n",
" <td>Private room</td>\n",
" <td>149</td>\n",
" <td>1</td>\n",
" <td>9</td>\n",
" <td>2018-10-19</td>\n",
" <td>0.21</td>\n",
" <td>6</td>\n",
" <td>365</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>2595</td>\n",
" <td>Skylit Midtown Castle</td>\n",
" <td>2845</td>\n",
" <td>Jennifer</td>\n",
" <td>Manhattan</td>\n",
" <td>Midtown</td>\n",
" <td>40.75362</td>\n",
" <td>-73.98377</td>\n",
" <td>Entire home/apt</td>\n",
" <td>225</td>\n",
" <td>1</td>\n",
" <td>45</td>\n",
" <td>2019-05-21</td>\n",
" <td>0.38</td>\n",
" <td>2</td>\n",
" <td>355</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>3647</td>\n",
" <td>THE VILLAGE OF HARLEM....NEW YORK !</td>\n",
" <td>4632</td>\n",
" <td>Elisabeth</td>\n",
" <td>Manhattan</td>\n",
" <td>Harlem</td>\n",
" <td>40.80902</td>\n",
" <td>-73.94190</td>\n",
" <td>Private room</td>\n",
" <td>150</td>\n",
" <td>3</td>\n",
" <td>0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>1</td>\n",
" <td>365</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>3831</td>\n",
" <td>Cozy Entire Floor of Brownstone</td>\n",
" <td>4869</td>\n",
" <td>LisaRoxanne</td>\n",
" <td>Brooklyn</td>\n",
" <td>Clinton Hill</td>\n",
" <td>40.68514</td>\n",
" <td>-73.95976</td>\n",
" <td>Entire home/apt</td>\n",
" <td>89</td>\n",
" <td>1</td>\n",
" <td>270</td>\n",
" <td>2019-07-05</td>\n",
" <td>4.64</td>\n",
" <td>1</td>\n",
" <td>194</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>5022</td>\n",
" <td>Entire Apt: Spacious Studio/Loft by central park</td>\n",
" <td>7192</td>\n",
" <td>Laura</td>\n",
" <td>Manhattan</td>\n",
" <td>East Harlem</td>\n",
" <td>40.79851</td>\n",
" <td>-73.94399</td>\n",
" <td>Entire home/apt</td>\n",
" <td>80</td>\n",
" <td>10</td>\n",
" <td>9</td>\n",
" <td>2018-11-19</td>\n",
" <td>0.10</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" id ... availability_365\n",
"0 2539 ... 365\n",
"1 2595 ... 355\n",
"2 3647 ... 365\n",
"3 3831 ... 194\n",
"4 5022 ... 0\n",
"\n",
"[5 rows x 16 columns]"
]
},
"metadata": {
"tags": []
},
"execution_count": 5
}
]
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 444
},
"id": "ULtdK-mhTp23",
"outputId": "f36bc5db-64b3-411b-8b9b-a971c89f113f"
},
"source": [
"df.describe(include=\"all\")"
],
"execution_count": 6,
"outputs": [
{
"output_type": "stream",
"text": [
"INFO:numexpr.utils:NumExpr defaulting to 2 threads.\n"
],
"name": "stderr"
},
{
"output_type": "execute_result",
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>id</th>\n",
" <th>name</th>\n",
" <th>host_id</th>\n",
" <th>host_name</th>\n",
" <th>neighbourhood_group</th>\n",
" <th>neighbourhood</th>\n",
" <th>latitude</th>\n",
" <th>longitude</th>\n",
" <th>room_type</th>\n",
" <th>price</th>\n",
" <th>minimum_nights</th>\n",
" <th>number_of_reviews</th>\n",
" <th>last_review</th>\n",
" <th>reviews_per_month</th>\n",
" <th>calculated_host_listings_count</th>\n",
" <th>availability_365</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>count</th>\n",
" <td>4.889500e+04</td>\n",
" <td>48879</td>\n",
" <td>4.889500e+04</td>\n",
" <td>48874</td>\n",
" <td>48895</td>\n",
" <td>48895</td>\n",
" <td>48895.000000</td>\n",
" <td>48895.000000</td>\n",
" <td>48895</td>\n",
" <td>48895.000000</td>\n",
" <td>48895.000000</td>\n",
" <td>48895.000000</td>\n",
" <td>38843</td>\n",
" <td>38843.000000</td>\n",
" <td>48895.000000</td>\n",
" <td>48895.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>unique</th>\n",
" <td>NaN</td>\n",
" <td>47905</td>\n",
" <td>NaN</td>\n",
" <td>11452</td>\n",
" <td>5</td>\n",
" <td>221</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>3</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>1764</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>top</th>\n",
" <td>NaN</td>\n",
" <td>Hillside Hotel</td>\n",
" <td>NaN</td>\n",
" <td>Michael</td>\n",
" <td>Manhattan</td>\n",
" <td>Williamsburg</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>Entire home/apt</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>2019-06-23</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>freq</th>\n",
" <td>NaN</td>\n",
" <td>18</td>\n",
" <td>NaN</td>\n",
" <td>417</td>\n",
" <td>21661</td>\n",
" <td>3920</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>25409</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>1413</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>mean</th>\n",
" <td>1.901714e+07</td>\n",
" <td>NaN</td>\n",
" <td>6.762001e+07</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>40.728949</td>\n",
" <td>-73.952170</td>\n",
" <td>NaN</td>\n",
" <td>152.720687</td>\n",
" <td>7.029962</td>\n",
" <td>23.274466</td>\n",
" <td>NaN</td>\n",
" <td>1.373221</td>\n",
" <td>7.143982</td>\n",
" <td>112.781327</td>\n",
" </tr>\n",
" <tr>\n",
" <th>std</th>\n",
" <td>1.098311e+07</td>\n",
" <td>NaN</td>\n",
" <td>7.861097e+07</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>0.054530</td>\n",
" <td>0.046157</td>\n",
" <td>NaN</td>\n",
" <td>240.154170</td>\n",
" <td>20.510550</td>\n",
" <td>44.550582</td>\n",
" <td>NaN</td>\n",
" <td>1.680442</td>\n",
" <td>32.952519</td>\n",
" <td>131.622289</td>\n",
" </tr>\n",
" <tr>\n",
" <th>min</th>\n",
" <td>2.539000e+03</td>\n",
" <td>NaN</td>\n",
" <td>2.438000e+03</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>40.499790</td>\n",
" <td>-74.244420</td>\n",
" <td>NaN</td>\n",
" <td>0.000000</td>\n",
" <td>1.000000</td>\n",
" <td>0.000000</td>\n",
" <td>NaN</td>\n",
" <td>0.010000</td>\n",
" <td>1.000000</td>\n",
" <td>0.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>25%</th>\n",
" <td>9.471945e+06</td>\n",
" <td>NaN</td>\n",
" <td>7.822033e+06</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>40.690100</td>\n",
" <td>-73.983070</td>\n",
" <td>NaN</td>\n",
" <td>69.000000</td>\n",
" <td>1.000000</td>\n",
" <td>1.000000</td>\n",
" <td>NaN</td>\n",
" <td>0.190000</td>\n",
" <td>1.000000</td>\n",
" <td>0.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>50%</th>\n",
" <td>1.967728e+07</td>\n",
" <td>NaN</td>\n",
" <td>3.079382e+07</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>40.723070</td>\n",
" <td>-73.955680</td>\n",
" <td>NaN</td>\n",
" <td>106.000000</td>\n",
" <td>3.000000</td>\n",
" <td>5.000000</td>\n",
" <td>NaN</td>\n",
" <td>0.720000</td>\n",
" <td>1.000000</td>\n",
" <td>45.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>75%</th>\n",
" <td>2.915218e+07</td>\n",
" <td>NaN</td>\n",
" <td>1.074344e+08</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>40.763115</td>\n",
" <td>-73.936275</td>\n",
" <td>NaN</td>\n",
" <td>175.000000</td>\n",
" <td>5.000000</td>\n",
" <td>24.000000</td>\n",
" <td>NaN</td>\n",
" <td>2.020000</td>\n",
" <td>2.000000</td>\n",
" <td>227.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>max</th>\n",
" <td>3.648724e+07</td>\n",
" <td>NaN</td>\n",
" <td>2.743213e+08</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>40.913060</td>\n",
" <td>-73.712990</td>\n",
" <td>NaN</td>\n",
" <td>10000.000000</td>\n",
" <td>1250.000000</td>\n",
" <td>629.000000</td>\n",
" <td>NaN</td>\n",
" <td>58.500000</td>\n",
" <td>327.000000</td>\n",
" <td>365.000000</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" id ... availability_365\n",
"count 4.889500e+04 ... 48895.000000\n",
"unique NaN ... NaN\n",
"top NaN ... NaN\n",
"freq NaN ... NaN\n",
"mean 1.901714e+07 ... 112.781327\n",
"std 1.098311e+07 ... 131.622289\n",
"min 2.539000e+03 ... 0.000000\n",
"25% 9.471945e+06 ... 0.000000\n",
"50% 1.967728e+07 ... 45.000000\n",
"75% 2.915218e+07 ... 227.000000\n",
"max 3.648724e+07 ... 365.000000\n",
"\n",
"[11 rows x 16 columns]"
]
},
"metadata": {
"tags": []
},
"execution_count": 6
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "sx2xeS7XWVda"
},
"source": [
"df[\"label\"] = df[\"price\"]"
],
"execution_count": 7,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "1uvlOF52S28v"
},
"source": [
"numerical_features = [\"minimum_nights\", \"number_of_reviews\", \"calculated_host_listings_count\"]\n",
"categorical_features = [\"neighbourhood\", \"room_type\"]"
],
"execution_count": 8,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 390
},
"id": "MWik3ZAgFBKX",
"outputId": "23e6bb2c-7b25-45d2-fea9-c1895181a9a1"
},
"source": [
"df.loc[:, numerical_features + categorical_features + [\"label\"]].describe(include=\"all\")"
],
"execution_count": 9,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>minimum_nights</th>\n",
" <th>number_of_reviews</th>\n",
" <th>calculated_host_listings_count</th>\n",
" <th>neighbourhood</th>\n",
" <th>room_type</th>\n",
" <th>label</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>count</th>\n",
" <td>48895.000000</td>\n",
" <td>48895.000000</td>\n",
" <td>48895.000000</td>\n",
" <td>48895</td>\n",
" <td>48895</td>\n",
" <td>48895.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>unique</th>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>221</td>\n",
" <td>3</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>top</th>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>Williamsburg</td>\n",
" <td>Entire home/apt</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>freq</th>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>3920</td>\n",
" <td>25409</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>mean</th>\n",
" <td>7.029962</td>\n",
" <td>23.274466</td>\n",
" <td>7.143982</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>152.720687</td>\n",
" </tr>\n",
" <tr>\n",
" <th>std</th>\n",
" <td>20.510550</td>\n",
" <td>44.550582</td>\n",
" <td>32.952519</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>240.154170</td>\n",
" </tr>\n",
" <tr>\n",
" <th>min</th>\n",
" <td>1.000000</td>\n",
" <td>0.000000</td>\n",
" <td>1.000000</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>0.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>25%</th>\n",
" <td>1.000000</td>\n",
" <td>1.000000</td>\n",
" <td>1.000000</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>69.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>50%</th>\n",
" <td>3.000000</td>\n",
" <td>5.000000</td>\n",
" <td>1.000000</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>106.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>75%</th>\n",
" <td>5.000000</td>\n",
" <td>24.000000</td>\n",
" <td>2.000000</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>175.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>max</th>\n",
" <td>1250.000000</td>\n",
" <td>629.000000</td>\n",
" <td>327.000000</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>10000.000000</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" minimum_nights number_of_reviews ... room_type label\n",
"count 48895.000000 48895.000000 ... 48895 48895.000000\n",
"unique NaN NaN ... 3 NaN\n",
"top NaN NaN ... Entire home/apt NaN\n",
"freq NaN NaN ... 25409 NaN\n",
"mean 7.029962 23.274466 ... NaN 152.720687\n",
"std 20.510550 44.550582 ... NaN 240.154170\n",
"min 1.000000 0.000000 ... NaN 0.000000\n",
"25% 1.000000 1.000000 ... NaN 69.000000\n",
"50% 3.000000 5.000000 ... NaN 106.000000\n",
"75% 5.000000 24.000000 ... NaN 175.000000\n",
"max 1250.000000 629.000000 ... NaN 10000.000000\n",
"\n",
"[11 rows x 6 columns]"
]
},
"metadata": {
"tags": []
},
"execution_count": 9
}
]
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "EHo-QfKRTayK",
"outputId": "a93b0d2a-df49-43f5-c463-2a99c1491f9d"
},
"source": [
"# Collapse rare neighbourhoods into a single \"neighbourhood_other\" bucket.\n",
"neighbourhood_counts = df[\"neighbourhood\"].value_counts(ascending=False)\n",
"# A neighbourhood keeps its own category only if it holds more than 0.5% of all rows.\n",
"min_presence_threshold = 0.005 * len(df)\n",
"print(min_presence_threshold)\n",
"# Vectorised lookup of each row's neighbourhood frequency (replaces a per-row Python lambda).\n",
"# A missing neighbourhood maps to NaN, compares as False and so falls into the bucket\n",
"# instead of raising a KeyError.\n",
"is_frequent = df[\"neighbourhood\"].map(neighbourhood_counts) > min_presence_threshold\n",
"df[\"neighbourhood_clean\"] = df[\"neighbourhood\"].where(is_frequent, \"neighbourhood_other\")\n",
"df[\"neighbourhood_clean\"].value_counts(ascending=False)"
],
"execution_count": 10,
"outputs": [
{
"output_type": "stream",
"text": [
"244.475\n"
],
"name": "stdout"
},
{
"output_type": "execute_result",
"data": {
"text/plain": [
"neighbourhood_other 7773\n",
"Williamsburg 3920\n",
"Bedford-Stuyvesant 3714\n",
"Harlem 2658\n",
"Bushwick 2465\n",
"Upper West Side 1971\n",
"Hell's Kitchen 1958\n",
"East Village 1853\n",
"Upper East Side 1798\n",
"Crown Heights 1564\n",
"Midtown 1545\n",
"East Harlem 1117\n",
"Greenpoint 1115\n",
"Chelsea 1113\n",
"Lower East Side 911\n",
"Astoria 900\n",
"Washington Heights 899\n",
"West Village 768\n",
"Financial District 744\n",
"Flatbush 621\n",
"Clinton Hill 572\n",
"Long Island City 537\n",
"Prospect-Lefferts Gardens 535\n",
"Park Slope 506\n",
"East Flatbush 500\n",
"Fort Greene 489\n",
"Murray Hill 485\n",
"Kips Bay 470\n",
"Flushing 426\n",
"Ridgewood 423\n",
"Greenwich Village 392\n",
"Sunset Park 390\n",
"Chinatown 368\n",
"Sunnyside 363\n",
"SoHo 358\n",
"Prospect Heights 357\n",
"Morningside Heights 346\n",
"Gramercy 338\n",
"Ditmars Steinway 309\n",
"Theater District 288\n",
"South Slope 284\n",
"Nolita 253\n",
"Inwood 252\n",
"Gowanus 247\n",
"Name: neighbourhood_clean, dtype: int64"
]
},
"metadata": {
"tags": []
},
"execution_count": 10
}
]
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "dYEVK8P_Vdn-",
"outputId": "7439110e-eaf2-4537-9a43-932fe88e5353"
},
"source": [
"room_type_counts = df[\"room_type\"].value_counts(ascending=False)\n",
"min_presence_threshold = 0.005 * len(df)\n",
"print(room_type_counts)\n",
"print(min_presence_threshold)"
],
"execution_count": 11,
"outputs": [
{
"output_type": "stream",
"text": [
"Entire home/apt 25409\n",
"Private room 22326\n",
"Shared room 1160\n",
"Name: room_type, dtype: int64\n",
"244.475\n"
],
"name": "stdout"
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "Yr5C2t0vFvRL"
},
"source": [
"categorical_features = [\"neighbourhood_clean\", \"room_type\"]"
],
"execution_count": 12,
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {
"id": "2461lPbPS4Kf"
},
"source": [
"## Pipeline to transform data to numerical"
]
},
{
"cell_type": "code",
"metadata": {
"id": "0DWzF8yNQjlQ"
},
"source": [
"class FeatureSelector(BaseEstimator, TransformerMixin):\n",
"    \"\"\"Pipeline step that keeps only the configured columns of its input.\"\"\"\n",
"\n",
"    def __init__(self, feature_names):\n",
"        # Column label(s) that transform() selects from X.\n",
"        self.feature_names = feature_names\n",
"\n",
"    def fit(self, X, y=None):\n",
"        \"\"\"Nothing is learned from the data; return self.\"\"\"\n",
"        return self\n",
"\n",
"    def transform(self, X, y=None):\n",
"        \"\"\"Return X indexed by ``self.feature_names``.\"\"\"\n",
"        selected = X[self.feature_names]\n",
"        return selected"
],
"execution_count": 13,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "QS6p5c7qSp7X"
},
"source": [
"numerical_pipeline = Pipeline(steps = [ \n",
" (\"num_selector\", FeatureSelector(numerical_features)),\n",
" (\"impute_median\", SimpleImputer(strategy=\"median\")),\n",
" (\"std_scaler\", StandardScaler()) \n",
"])"
],
"execution_count": 14,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "qQY5PrNFSvyd"
},
"source": [
"categorical_pipeline = Pipeline(steps=[\n",
"    # Step renamed from the copy-pasted \"num_selector\": this selector picks the categorical columns.\n",
"    (\"cat_selector\", FeatureSelector(categorical_features)),\n",
"    (\"ohe\", OneHotEncoder(\n",
"        handle_unknown=\"ignore\",\n",
"        sparse=False,\n",
"        # One explicit category list per selected column, derived from categorical_features\n",
"        # so the lists cannot drift out of step with the columns the selector returns.\n",
"        categories=[list(df[column].unique()) for column in categorical_features],\n",
"    )),\n",
"])"
],
"execution_count": 15,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "i9AZZFr9WHsA"
},
"source": [
"feature_pipeline = FeatureUnion(n_jobs=1, transformer_list=[ \n",
" (\"numerical_pipeline\", numerical_pipeline),\n",
" (\"categorical_pipeline\", categorical_pipeline),\n",
"])"
],
"execution_count": 16,
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {
"id": "ZJpef2u4EEkr"
},
"source": [
"## Split data and transform"
]
},
{
"cell_type": "code",
"metadata": {
"id": "pXHQfDGPEJQf"
},
"source": [
"from sklearn.model_selection import train_test_split\n",
"X_train, X_test, y_train, y_test = train_test_split(\n",
" df.loc[:, numerical_features + categorical_features], \n",
" df.loc[:, [\"label\"]], \n",
" test_size=0.2, random_state=42\n",
")"
],
"execution_count": 17,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 204
},
"id": "rguB5LLbEQNx",
"outputId": "3b5925f7-258a-413d-db61-d04502071f94"
},
"source": [
"X_train_transformed = feature_pipeline.fit_transform(X_train, y_train)\n",
"# Fitted one-hot encoder, looked up by name instead of by position.\n",
"fitted_ohe = dict(feature_pipeline.transformer_list)[\"categorical_pipeline\"].named_steps[\"ohe\"]\n",
"# One output column per category, in encoder order (flattened across the categorical features).\n",
"transformed_categories = [category for categories in fitted_ohe.categories_ for category in categories]\n",
"X_train_transformed = pd.DataFrame(\n",
"    X_train_transformed,\n",
"    columns=numerical_features + transformed_categories,\n",
")\n",
"X_train.head(5)"
],
"execution_count": 18,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>minimum_nights</th>\n",
" <th>number_of_reviews</th>\n",
" <th>calculated_host_listings_count</th>\n",
" <th>neighbourhood_clean</th>\n",
" <th>room_type</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>32645</th>\n",
" <td>3</td>\n",
" <td>11</td>\n",
" <td>1</td>\n",
" <td>Williamsburg</td>\n",
" <td>Entire home/apt</td>\n",
" </tr>\n",
" <tr>\n",
" <th>23615</th>\n",
" <td>2</td>\n",
" <td>2</td>\n",
" <td>1</td>\n",
" <td>Washington Heights</td>\n",
" <td>Private room</td>\n",
" </tr>\n",
" <tr>\n",
" <th>31183</th>\n",
" <td>2</td>\n",
" <td>0</td>\n",
" <td>2</td>\n",
" <td>Bedford-Stuyvesant</td>\n",
" <td>Private room</td>\n",
" </tr>\n",
" <tr>\n",
" <th>29260</th>\n",
" <td>3</td>\n",
" <td>87</td>\n",
" <td>1</td>\n",
" <td>Bedford-Stuyvesant</td>\n",
" <td>Entire home/apt</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7275</th>\n",
" <td>5</td>\n",
" <td>13</td>\n",
" <td>1</td>\n",
" <td>neighbourhood_other</td>\n",
" <td>Private room</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" minimum_nights number_of_reviews ... neighbourhood_clean room_type\n",
"32645 3 11 ... Williamsburg Entire home/apt\n",
"23615 2 2 ... Washington Heights Private room\n",
"31183 2 0 ... Bedford-Stuyvesant Private room\n",
"29260 3 87 ... Bedford-Stuyvesant Entire home/apt\n",
"7275 5 13 ... neighbourhood_other Private room\n",
"\n",
"[5 rows x 5 columns]"
]
},
"metadata": {
"tags": []
},
"execution_count": 18
}
]
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 258
},
"id": "wLDREplwK_7g",
"outputId": "1966a05f-8a9b-4a1d-ac02-7c5f6cdef9fc"
},
"source": [
"X_train_transformed.head(5)"
],
"execution_count": 19,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>minimum_nights</th>\n",
" <th>number_of_reviews</th>\n",
" <th>calculated_host_listings_count</th>\n",
" <th>neighbourhood_other</th>\n",
" <th>Midtown</th>\n",
" <th>Harlem</th>\n",
" <th>Clinton Hill</th>\n",
" <th>East Harlem</th>\n",
" <th>Murray Hill</th>\n",
" <th>Bedford-Stuyvesant</th>\n",
" <th>Hell's Kitchen</th>\n",
" <th>Upper West Side</th>\n",
" <th>Chinatown</th>\n",
" <th>South Slope</th>\n",
" <th>West Village</th>\n",
" <th>Williamsburg</th>\n",
" <th>Fort Greene</th>\n",
" <th>Chelsea</th>\n",
" <th>Crown Heights</th>\n",
" <th>Park Slope</th>\n",
" <th>Inwood</th>\n",
" <th>East Village</th>\n",
" <th>Greenpoint</th>\n",
" <th>Bushwick</th>\n",
" <th>Flatbush</th>\n",
" <th>Lower East Side</th>\n",
" <th>Prospect-Lefferts Gardens</th>\n",
" <th>Long Island City</th>\n",
" <th>Kips Bay</th>\n",
" <th>SoHo</th>\n",
" <th>Upper East Side</th>\n",
" <th>Prospect Heights</th>\n",
" <th>Washington Heights</th>\n",
" <th>Gowanus</th>\n",
" <th>Flushing</th>\n",
" <th>Sunnyside</th>\n",
" <th>Financial District</th>\n",
" <th>Ridgewood</th>\n",
" <th>Morningside Heights</th>\n",
" <th>Ditmars Steinway</th>\n",
" <th>Greenwich Village</th>\n",
" <th>East Flatbush</th>\n",
" <th>Astoria</th>\n",
" <th>Nolita</th>\n",
" <th>Gramercy</th>\n",
" <th>Theater District</th>\n",
" <th>Sunset Park</th>\n",
" <th>Private room</th>\n",
" <th>Entire home/apt</th>\n",
" <th>Shared room</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>-0.193025</td>\n",
" <td>-0.277198</td>\n",
" <td>-0.186570</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>-0.239596</td>\n",
" <td>-0.479451</td>\n",
" <td>-0.186570</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>-0.239596</td>\n",
" <td>-0.524396</td>\n",
" <td>-0.156424</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>-0.193025</td>\n",
" <td>1.430714</td>\n",
" <td>-0.186570</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>-0.099884</td>\n",
" <td>-0.232253</td>\n",
" <td>-0.186570</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" minimum_nights number_of_reviews ... Entire home/apt Shared room\n",
"0 -0.193025 -0.277198 ... 1.0 0.0\n",
"1 -0.239596 -0.479451 ... 0.0 0.0\n",
"2 -0.239596 -0.524396 ... 0.0 0.0\n",
"3 -0.193025 1.430714 ... 1.0 0.0\n",
"4 -0.099884 -0.232253 ... 0.0 0.0\n",
"\n",
"[5 rows x 50 columns]"
]
},
"metadata": {
"tags": []
},
"execution_count": 19
}
]
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 394
},
"id": "exwTu6aFF5-v",
"outputId": "46973b05-0de3-4805-9e52-106d0da592da"
},
"source": [
"# Transform only: the feature pipeline is not refit on the test split in this cell.\n",
"X_test_transformed = pd.DataFrame(\n",
"    feature_pipeline.transform(X_test),\n",
"    columns=numerical_features + transformed_categories,\n",
")\n",
"# Raw test rows go to stdout; the transformed frame is the cell's displayed result.\n",
"print(X_test.head())\n",
"X_test_transformed.head()"
],
"execution_count": 20,
"outputs": [
{
"output_type": "stream",
"text": [
" minimum_nights number_of_reviews ... neighbourhood_clean room_type\n",
"879 3 62 ... neighbourhood_other Entire home/apt\n",
"44383 21 0 ... Ridgewood Private room\n",
"15394 2 17 ... Hell's Kitchen Private room\n",
"43230 2 5 ... Financial District Entire home/apt\n",
"16332 2 30 ... East Harlem Entire home/apt\n",
"\n",
"[5 rows x 5 columns]\n"
],
"name": "stdout"
},
{
"output_type": "execute_result",
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>minimum_nights</th>\n",
" <th>number_of_reviews</th>\n",
" <th>calculated_host_listings_count</th>\n",
" <th>neighbourhood_other</th>\n",
" <th>Midtown</th>\n",
" <th>Harlem</th>\n",
" <th>Clinton Hill</th>\n",
" <th>East Harlem</th>\n",
" <th>Murray Hill</th>\n",
" <th>Bedford-Stuyvesant</th>\n",
" <th>Hell's Kitchen</th>\n",
" <th>Upper West Side</th>\n",
" <th>Chinatown</th>\n",
" <th>South Slope</th>\n",
" <th>West Village</th>\n",
" <th>Williamsburg</th>\n",
" <th>Fort Greene</th>\n",
" <th>Chelsea</th>\n",
" <th>Crown Heights</th>\n",
" <th>Park Slope</th>\n",
" <th>Inwood</th>\n",
" <th>East Village</th>\n",
" <th>Greenpoint</th>\n",
" <th>Bushwick</th>\n",
" <th>Flatbush</th>\n",
" <th>Lower East Side</th>\n",
" <th>Prospect-Lefferts Gardens</th>\n",
" <th>Long Island City</th>\n",
" <th>Kips Bay</th>\n",
" <th>SoHo</th>\n",
" <th>Upper East Side</th>\n",
" <th>Prospect Heights</th>\n",
" <th>Washington Heights</th>\n",
" <th>Gowanus</th>\n",
" <th>Flushing</th>\n",
" <th>Sunnyside</th>\n",
" <th>Financial District</th>\n",
" <th>Ridgewood</th>\n",
" <th>Morningside Heights</th>\n",
" <th>Ditmars Steinway</th>\n",
" <th>Greenwich Village</th>\n",
" <th>East Flatbush</th>\n",
" <th>Astoria</th>\n",
" <th>Nolita</th>\n",
" <th>Gramercy</th>\n",
" <th>Theater District</th>\n",
" <th>Sunset Park</th>\n",
" <th>Private room</th>\n",
" <th>Entire home/apt</th>\n",
" <th>Shared room</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>-0.193025</td>\n",
" <td>0.868901</td>\n",
" <td>-0.186570</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>0.645248</td>\n",
" <td>-0.524396</td>\n",
" <td>-0.186570</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>-0.239596</td>\n",
" <td>-0.142363</td>\n",
" <td>-0.186570</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>-0.239596</td>\n",
" <td>-0.412033</td>\n",
" <td>9.640935</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>-0.239596</td>\n",
" <td>0.149780</td>\n",
" <td>-0.186570</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" minimum_nights number_of_reviews ... Entire home/apt Shared room\n",
"0 -0.193025 0.868901 ... 1.0 0.0\n",
"1 0.645248 -0.524396 ... 0.0 0.0\n",
"2 -0.239596 -0.142363 ... 0.0 0.0\n",
"3 -0.239596 -0.412033 ... 1.0 0.0\n",
"4 -0.239596 0.149780 ... 1.0 0.0\n",
"\n",
"[5 rows x 50 columns]"
]
},
"metadata": {
"tags": []
},
"execution_count": 20
}
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "GxFji2ZnC40g"
},
"source": [
"## Auto-sklearn"
]
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "hXlJx7GcC-T5",
"outputId": "97778277-49e5-403d-8e74-061d0564f0f8"
},
"source": [
"import autosklearn.regression\n",
"# Small search budget (seconds, per the auto-sklearn docs): 120 overall, at most 30 per candidate run.\n",
"automl = autosklearn.regression.AutoSklearnRegressor(\n",
"    n_jobs=1,\n",
"    per_run_time_limit=30,\n",
"    time_left_for_this_task=120,\n",
")\n",
"# fit() is the last expression, so the fitted regressor's repr is shown as the cell output.\n",
"automl.fit(X_train_transformed, y_train)"
],
"execution_count": 21,
"outputs": [
{
"output_type": "stream",
"text": [
"/usr/local/lib/python3.6/dist-packages/sklearn/base.py:197: FutureWarning:\n",
"\n",
"From version 0.24, get_params will raise an AttributeError if a parameter cannot be retrieved as an instance attribute. Previously it would return None.\n",
"\n"
],
"name": "stderr"
},
{
"output_type": "execute_result",
"data": {
"text/plain": [
"AutoSklearnRegressor(dask_client=None,\n",
" delete_output_folder_after_terminate=True,\n",
" delete_tmp_folder_after_terminate=True,\n",
" disable_evaluator_output=False, ensemble_nbest=50,\n",
" ensemble_size=50, exclude_estimators=None,\n",
" exclude_preprocessors=None, get_smac_object_callback=None,\n",
" include_estimators=None, include_preprocessors=None,\n",
" initial_configurations_via_metalearning=25,\n",
" load_models=None, logging_config=None,\n",
" max_models_on_disc=50, memory_limit=3072,\n",
" metadata_directory=None, metric=None, n_jobs=1,\n",
" output_folder=None, per_run_time_limit=30,\n",
" resampling_strategy='holdout',\n",
" resampling_strategy_arguments=None, seed=1,\n",
" smac_scenario_args=None, time_left_for_this_task=120,\n",
" tmp_folder=None)"
]
},
"metadata": {
"tags": []
},
"execution_count": 21
}
]
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 69
},
"id": "iF8zQHX1F2Mf",
"outputId": "2a86704d-1392-4d0a-c88c-e70c937c3fff"
},
"source": [
"# sprint_statistics() returns a multi-line string; print it so the newlines render\n",
"# instead of showing the escaped one-line repr.\n",
"print(automl.sprint_statistics())"
],
"execution_count": 22,
"outputs": [
{
"output_type": "execute_result",
"data": {
"application/vnd.google.colaboratory.intrinsic+json": {
"type": "string"
},
"text/plain": [
"'auto-sklearn results:\\n Dataset name: 193c0133dfc3b6116268c126137b3a31\\n Metric: r2\\n Best validation score: 0.159246\\n Number of target algorithm runs: 8\\n Number of successful target algorithm runs: 5\\n Number of crashed target algorithm runs: 0\\n Number of target algorithms that exceeded the time limit: 3\\n Number of target algorithms that exceeded the memory limit: 0\\n'"
]
},
"metadata": {
"tags": []
},
"execution_count": 22
}
]
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 137
},
"id": "oDmw2UM5DaV-",
"outputId": "71f2b243-9e9c-4cc9-b74a-33719e1bc70d"
},
"source": [
"# In this auto-sklearn version show_models() returns one long string describing the\n",
"# weighted ensemble members; print it so it is laid out over lines rather than shown as an escaped repr.\n",
"print(automl.show_models())"
],
"execution_count": 23,
"outputs": [
{
"output_type": "execute_result",
"data": {
"application/vnd.google.colaboratory.intrinsic+json": {
"type": "string"
},
"text/plain": [
"\"[(0.440000, SimpleRegressionPipeline({'data_preprocessing:categorical_transformer:categorical_encoding:__choice__': 'one_hot_encoding', 'data_preprocessing:categorical_transformer:category_coalescence:__choice__': 'minority_coalescer', 'data_preprocessing:numerical_transformer:imputation:strategy': 'mean', 'data_preprocessing:numerical_transformer:rescaling:__choice__': 'quantile_transformer', 'feature_preprocessor:__choice__': 'select_rates_regression', 'regressor:__choice__': 'gradient_boosting', 'data_preprocessing:categorical_transformer:category_coalescence:minority_coalescer:minimum_fraction': 0.00924813449669181, 'data_preprocessing:numerical_transformer:rescaling:quantile_transformer:n_quantiles': 987, 'data_preprocessing:numerical_transformer:rescaling:quantile_transformer:output_distribution': 'uniform', 'feature_preprocessor:select_rates_regression:alpha': 0.1, 'feature_preprocessor:select_rates_regression:mode': 'fwe', 'feature_preprocessor:select_rates_regression:score_func': 'f_regression', 'regressor:gradient_boosting:early_stop': 'off', 'regressor:gradient_boosting:l2_regularization': 2.5833231171101403e-10, 'regressor:gradient_boosting:learning_rate': 0.10682575320034993, 'regressor:gradient_boosting:loss': 'least_squares', 'regressor:gradient_boosting:max_bins': 255, 'regressor:gradient_boosting:max_depth': 'None', 'regressor:gradient_boosting:max_leaf_nodes': 31, 'regressor:gradient_boosting:min_samples_leaf': 12, 'regressor:gradient_boosting:scoring': 'loss', 'regressor:gradient_boosting:tol': 1e-07},\\ndataset_properties={\\n 'task': 4,\\n 'sparse': False,\\n 'multioutput': False,\\n 'target_type': 'regression',\\n 'signed': False})),\\n(0.260000, SimpleRegressionPipeline({'data_preprocessing:categorical_transformer:categorical_encoding:__choice__': 'one_hot_encoding', 'data_preprocessing:categorical_transformer:category_coalescence:__choice__': 'no_coalescense', 'data_preprocessing:numerical_transformer:imputation:strategy': 'mean', 
'data_preprocessing:numerical_transformer:rescaling:__choice__': 'standardize', 'feature_preprocessor:__choice__': 'select_rates_regression', 'regressor:__choice__': 'gradient_boosting', 'feature_preprocessor:select_rates_regression:alpha': 0.40612979303062824, 'feature_preprocessor:select_rates_regression:mode': 'fdr', 'feature_preprocessor:select_rates_regression:score_func': 'f_regression', 'regressor:gradient_boosting:early_stop': 'off', 'regressor:gradient_boosting:l2_regularization': 2.2742572602302616e-08, 'regressor:gradient_boosting:learning_rate': 0.02539518548794612, 'regressor:gradient_boosting:loss': 'least_squares', 'regressor:gradient_boosting:max_bins': 255, 'regressor:gradient_boosting:max_depth': 'None', 'regressor:gradient_boosting:max_leaf_nodes': 57, 'regressor:gradient_boosting:min_samples_leaf': 10, 'regressor:gradient_boosting:scoring': 'loss', 'regressor:gradient_boosting:tol': 1e-07},\\ndataset_properties={\\n 'task': 4,\\n 'sparse': False,\\n 'multioutput': False,\\n 'target_type': 'regression',\\n 'signed': False})),\\n(0.240000, SimpleRegressionPipeline({'data_preprocessing:categorical_transformer:categorical_encoding:__choice__': 'one_hot_encoding', 'data_preprocessing:categorical_transformer:category_coalescence:__choice__': 'no_coalescense', 'data_preprocessing:numerical_transformer:imputation:strategy': 'mean', 'data_preprocessing:numerical_transformer:rescaling:__choice__': 'quantile_transformer', 'feature_preprocessor:__choice__': 'select_rates_regression', 'regressor:__choice__': 'gradient_boosting', 'data_preprocessing:numerical_transformer:rescaling:quantile_transformer:n_quantiles': 148, 'data_preprocessing:numerical_transformer:rescaling:quantile_transformer:output_distribution': 'uniform', 'feature_preprocessor:select_rates_regression:alpha': 0.10789818795901182, 'feature_preprocessor:select_rates_regression:mode': 'fwe', 'feature_preprocessor:select_rates_regression:score_func': 'f_regression', 
'regressor:gradient_boosting:early_stop': 'off', 'regressor:gradient_boosting:l2_regularization': 0.0002841741346637058, 'regressor:gradient_boosting:learning_rate': 0.024253735132716756, 'regressor:gradient_boosting:loss': 'least_squares', 'regressor:gradient_boosting:max_bins': 255, 'regressor:gradient_boosting:max_depth': 'None', 'regressor:gradient_boosting:max_leaf_nodes': 36, 'regressor:gradient_boosting:min_samples_leaf': 1, 'regressor:gradient_boosting:scoring': 'loss', 'regressor:gradient_boosting:tol': 1e-07},\\ndataset_properties={\\n 'task': 4,\\n 'sparse': False,\\n 'multioutput': False,\\n 'target_type': 'regression',\\n 'signed': False})),\\n(0.060000, SimpleRegressionPipeline({'data_preprocessing:categorical_transformer:categorical_encoding:__choice__': 'no_encoding', 'data_preprocessing:categorical_transformer:category_coalescence:__choice__': 'no_coalescense', 'data_preprocessing:numerical_transformer:imputation:strategy': 'most_frequent', 'data_preprocessing:numerical_transformer:rescaling:__choice__': 'none', 'feature_preprocessor:__choice__': 'nystroem_sampler', 'regressor:__choice__': 'liblinear_svr', 'feature_preprocessor:nystroem_sampler:kernel': 'rbf', 'feature_preprocessor:nystroem_sampler:n_components': 305, 'regressor:liblinear_svr:C': 18080.177016227895, 'regressor:liblinear_svr:dual': 'False', 'regressor:liblinear_svr:epsilon': 0.0017155220175954669, 'regressor:liblinear_svr:fit_intercept': 'True', 'regressor:liblinear_svr:intercept_scaling': 1, 'regressor:liblinear_svr:loss': 'squared_epsilon_insensitive', 'regressor:liblinear_svr:tol': 7.035440846539455e-05, 'feature_preprocessor:nystroem_sampler:gamma': 0.00025844019785412086},\\ndataset_properties={\\n 'task': 4,\\n 'sparse': False,\\n 'multioutput': False,\\n 'target_type': 'regression',\\n 'signed': False})),\\n]\""
]
},
"metadata": {
"tags": []
},
"execution_count": 23
}
]
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "a0JnklSDDgfx",
"outputId": "321e23d3-b2a9-48cc-ed2c-dfeda30ed3cc"
},
"source": [
"import sklearn.metrics\n",
"# R^2 of the auto-sklearn ensemble on the transformed test features.\n",
"predictions = automl.predict(X_test_transformed)\n",
"sklearn.metrics.r2_score(y_true=y_test, y_pred=predictions)"
],
"execution_count": 24,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"0.18626743087093045"
]
},
"metadata": {
"tags": []
},
"execution_count": 24
}
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "O7neO0jKHk5p"
},
"source": [
"## Grid search - single model"
]
},
{
"cell_type": "code",
"metadata": {
"id": "yq4RcZ78ELR7",
"colab": {
"base_uri": "https://localhost:8080/"
},
"outputId": "3a9e0304-0174-4964-877b-7d8de91b3c8d"
},
"source": [
"from sklearn.ensemble import RandomForestRegressor\n",
"from sklearn.model_selection import GridSearchCV\n",
"# Candidate tree depths tried by the search; each candidate replaces the base estimator's max_depth.\n",
"parameters = {\"max_depth\": (2, 3, 5)}\n",
"model = RandomForestRegressor(max_depth=3, random_state=0)\n",
"# 5-fold cross-validated search scored by R^2.\n",
"grid = GridSearchCV(estimator=model, param_grid=parameters, scoring=\"r2\", cv=5)\n",
"# The target is flattened to a 1-D array before fitting.\n",
"grid.fit(X_train_transformed, y_train.values.ravel())"
],
"execution_count": 25,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"GridSearchCV(cv=5, error_score=nan,\n",
" estimator=RandomForestRegressor(bootstrap=True, ccp_alpha=0.0,\n",
" criterion='mse', max_depth=3,\n",
" max_features='auto',\n",
" max_leaf_nodes=None,\n",
" max_samples=None,\n",
" min_impurity_decrease=0.0,\n",
" min_impurity_split=None,\n",
" min_samples_leaf=1,\n",
" min_samples_split=2,\n",
" min_weight_fraction_leaf=0.0,\n",
" n_estimators=100, n_jobs=None,\n",
" oob_score=False, random_state=0,\n",
" verbose=0, warm_start=False),\n",
" iid='deprecated', n_jobs=None, param_grid={'max_depth': (2, 3, 5)},\n",
" pre_dispatch='2*n_jobs', refit=True, return_train_score=False,\n",
" scoring='r2', verbose=0)"
]
},
"metadata": {
"tags": []
},
"execution_count": 25
}
]
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "1xLArYFiI1JR",
"outputId": "c64127b8-93df-44ba-a30d-3e19e38676a8"
},
"source": [
"# Show the estimator that GridSearchCV refit (refit=True) with the best-scoring parameters.\n",
"grid.best_estimator_"
],
"execution_count": 26,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',\n",
" max_depth=3, max_features='auto', max_leaf_nodes=None,\n",
" max_samples=None, min_impurity_decrease=0.0,\n",
" min_impurity_split=None, min_samples_leaf=1,\n",
" min_samples_split=2, min_weight_fraction_leaf=0.0,\n",
" n_estimators=100, n_jobs=None, oob_score=False,\n",
" random_state=0, verbose=0, warm_start=False)"
]
},
"metadata": {
"tags": []
},
"execution_count": 26
}
]
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "xgHZYwXGI-3t",
"outputId": "9aae988d-c9a4-440e-d1b1-07f56736b34d"
},
"source": [
"# Show the parameter combination with the best mean cross-validated score.\n",
"grid.best_params_"
],
"execution_count": 27,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"{'max_depth': 3}"
]
},
"metadata": {
"tags": []
},
"execution_count": 27
}
]
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "Xxlns6rjH9XF",
"outputId": "baa8705f-58eb-4289-f67d-feb09644cd1a"
},
"source": [
"# R^2 of the grid search's refit best estimator on the transformed test features.\n",
"predictions = grid.predict(X_test_transformed)\n",
"sklearn.metrics.r2_score(y_true=y_test, y_pred=predictions)"
],
"execution_count": 28,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"0.09826359771273341"
]
},
"metadata": {
"tags": []
},
"execution_count": 28
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "NPxkEzo7IzXy"
},
"source": [
""
],
"execution_count": 28,
"outputs": []
}
]
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment