DeepakRavi/Expedia.ipynb

## Expedia.ipynb
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {
    "collapsed": true
   },
   "source": [
    "# Expedia Hotel Recommendations\n",
    "\n",
    "Which hotel type will an Expedia customer book? Currently Expedia uses search parameters to adjust hotel recommendations of customers but there aren't any customer specific data to personalize them for each user. In the below analysis, we contextualize millions of rows of customer data and predict the likelihood a user will stay at 100 different hotel groups."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Expedia Hotel Recommendation\n",
    "# Submission 1\n",
    "# 5/24/2016"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "#importing pandas for data frame operations\n",
    "#Reading csv files \n",
    "import pandas as pd\n",
    "\n",
    "destinations = pd.read_csv(\"destinations.csv\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>srch_destination_id</th>\n",
       "      <th>d1</th>\n",
       "      <th>d2</th>\n",
       "      <th>d3</th>\n",
       "      <th>d4</th>\n",
       "      <th>d5</th>\n",
       "      <th>d6</th>\n",
       "      <th>d7</th>\n",
       "      <th>d8</th>\n",
       "      <th>d9</th>\n",
       "      <th>...</th>\n",
       "      <th>d140</th>\n",
       "      <th>d141</th>\n",
       "      <th>d142</th>\n",
       "      <th>d143</th>\n",
       "      <th>d144</th>\n",
       "      <th>d145</th>\n",
       "      <th>d146</th>\n",
       "      <th>d147</th>\n",
       "      <th>d148</th>\n",
       "      <th>d149</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0</td>\n",
       "      <td>-2.198657</td>\n",
       "      <td>-2.198657</td>\n",
       "      <td>-2.198657</td>\n",
       "      <td>-2.198657</td>\n",
       "      <td>-2.198657</td>\n",
       "      <td>-1.897627</td>\n",
       "      <td>-2.198657</td>\n",
       "      <td>-2.198657</td>\n",
       "      <td>-1.897627</td>\n",
       "      <td>...</td>\n",
       "      <td>-2.198657</td>\n",
       "      <td>-2.198657</td>\n",
       "      <td>-2.198657</td>\n",
       "      <td>-2.198657</td>\n",
       "      <td>-2.198657</td>\n",
       "      <td>-2.198657</td>\n",
       "      <td>-2.198657</td>\n",
       "      <td>-2.198657</td>\n",
       "      <td>-2.198657</td>\n",
       "      <td>-2.198657</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1</td>\n",
       "      <td>-2.181690</td>\n",
       "      <td>-2.181690</td>\n",
       "      <td>-2.181690</td>\n",
       "      <td>-2.082564</td>\n",
       "      <td>-2.181690</td>\n",
       "      <td>-2.165028</td>\n",
       "      <td>-2.181690</td>\n",
       "      <td>-2.181690</td>\n",
       "      <td>-2.031597</td>\n",
       "      <td>...</td>\n",
       "      <td>-2.165028</td>\n",
       "      <td>-2.181690</td>\n",
       "      <td>-2.165028</td>\n",
       "      <td>-2.181690</td>\n",
       "      <td>-2.181690</td>\n",
       "      <td>-2.165028</td>\n",
       "      <td>-2.181690</td>\n",
       "      <td>-2.181690</td>\n",
       "      <td>-2.181690</td>\n",
       "      <td>-2.181690</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>2</td>\n",
       "      <td>-2.183490</td>\n",
       "      <td>-2.224164</td>\n",
       "      <td>-2.224164</td>\n",
       "      <td>-2.189562</td>\n",
       "      <td>-2.105819</td>\n",
       "      <td>-2.075407</td>\n",
       "      <td>-2.224164</td>\n",
       "      <td>-2.118483</td>\n",
       "      <td>-2.140393</td>\n",
       "      <td>...</td>\n",
       "      <td>-2.224164</td>\n",
       "      <td>-2.224164</td>\n",
       "      <td>-2.196379</td>\n",
       "      <td>-2.224164</td>\n",
       "      <td>-2.192009</td>\n",
       "      <td>-2.224164</td>\n",
       "      <td>-2.224164</td>\n",
       "      <td>-2.224164</td>\n",
       "      <td>-2.224164</td>\n",
       "      <td>-2.057548</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>3</td>\n",
       "      <td>-2.177409</td>\n",
       "      <td>-2.177409</td>\n",
       "      <td>-2.177409</td>\n",
       "      <td>-2.177409</td>\n",
       "      <td>-2.177409</td>\n",
       "      <td>-2.115485</td>\n",
       "      <td>-2.177409</td>\n",
       "      <td>-2.177409</td>\n",
       "      <td>-2.177409</td>\n",
       "      <td>...</td>\n",
       "      <td>-2.161081</td>\n",
       "      <td>-2.177409</td>\n",
       "      <td>-2.177409</td>\n",
       "      <td>-2.177409</td>\n",
       "      <td>-2.177409</td>\n",
       "      <td>-2.177409</td>\n",
       "      <td>-2.177409</td>\n",
       "      <td>-2.177409</td>\n",
       "      <td>-2.177409</td>\n",
       "      <td>-2.177409</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>4</td>\n",
       "      <td>-2.189562</td>\n",
       "      <td>-2.187783</td>\n",
       "      <td>-2.194008</td>\n",
       "      <td>-2.171153</td>\n",
       "      <td>-2.152303</td>\n",
       "      <td>-2.056618</td>\n",
       "      <td>-2.194008</td>\n",
       "      <td>-2.194008</td>\n",
       "      <td>-2.145911</td>\n",
       "      <td>...</td>\n",
       "      <td>-2.187356</td>\n",
       "      <td>-2.194008</td>\n",
       "      <td>-2.191779</td>\n",
       "      <td>-2.194008</td>\n",
       "      <td>-2.194008</td>\n",
       "      <td>-2.185161</td>\n",
       "      <td>-2.194008</td>\n",
       "      <td>-2.194008</td>\n",
       "      <td>-2.194008</td>\n",
       "      <td>-2.188037</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 150 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "   srch_destination_id        d1        d2        d3        d4        d5  \\\n",
       "0                    0 -2.198657 -2.198657 -2.198657 -2.198657 -2.198657   \n",
       "1                    1 -2.181690 -2.181690 -2.181690 -2.082564 -2.181690   \n",
       "2                    2 -2.183490 -2.224164 -2.224164 -2.189562 -2.105819   \n",
       "3                    3 -2.177409 -2.177409 -2.177409 -2.177409 -2.177409   \n",
       "4                    4 -2.189562 -2.187783 -2.194008 -2.171153 -2.152303   \n",
       "\n",
       "         d6        d7        d8        d9    ...         d140      d141  \\\n",
       "0 -1.897627 -2.198657 -2.198657 -1.897627    ...    -2.198657 -2.198657   \n",
       "1 -2.165028 -2.181690 -2.181690 -2.031597    ...    -2.165028 -2.181690   \n",
       "2 -2.075407 -2.224164 -2.118483 -2.140393    ...    -2.224164 -2.224164   \n",
       "3 -2.115485 -2.177409 -2.177409 -2.177409    ...    -2.161081 -2.177409   \n",
       "4 -2.056618 -2.194008 -2.194008 -2.145911    ...    -2.187356 -2.194008   \n",
       "\n",
       "       d142      d143      d144      d145      d146      d147      d148  \\\n",
       "0 -2.198657 -2.198657 -2.198657 -2.198657 -2.198657 -2.198657 -2.198657   \n",
       "1 -2.165028 -2.181690 -2.181690 -2.165028 -2.181690 -2.181690 -2.181690   \n",
       "2 -2.196379 -2.224164 -2.192009 -2.224164 -2.224164 -2.224164 -2.224164   \n",
       "3 -2.177409 -2.177409 -2.177409 -2.177409 -2.177409 -2.177409 -2.177409   \n",
       "4 -2.191779 -2.194008 -2.194008 -2.185161 -2.194008 -2.194008 -2.194008   \n",
       "\n",
       "       d149  \n",
       "0 -2.198657  \n",
       "1 -2.181690  \n",
       "2 -2.057548  \n",
       "3 -2.177409  \n",
       "4 -2.188037  \n",
       "\n",
       "[5 rows x 150 columns]"
      ]
     },
     "execution_count": 38,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "destinations.head(5)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Loading the entire training dataset into memory"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "\n",
    "train = pd.read_csv('train.csv',\n",
    "                    dtype={'is_booking':bool,'srch_destination_id':np.int32, 'hotel_cluster':np.int32,\n",
    "                           'user_location_country':np.int32, 'user_location_region':np.int32, 'user_location_city':np.int32,\n",
    "                          'orig_destination_distance':np.float64, 'hotel_market':np.int32},\n",
    "                    usecols=['srch_destination_id','is_booking','hotel_cluster','user_location_country','user_location_region',\n",
    "                            'user_location_city','orig_destination_distance','hotel_market'],\n",
    "                    chunksize=1000000)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "df=pd.DataFrame()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "df = pd.concat(chunk for chunk in train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(37670293, 8)"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.shape"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Viewing the training dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>user_location_country</th>\n",
       "      <th>user_location_region</th>\n",
       "      <th>user_location_city</th>\n",
       "      <th>orig_destination_distance</th>\n",
       "      <th>srch_destination_id</th>\n",
       "      <th>is_booking</th>\n",
       "      <th>hotel_market</th>\n",
       "      <th>hotel_cluster</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>66</td>\n",
       "      <td>348</td>\n",
       "      <td>48862</td>\n",
       "      <td>2234.2641</td>\n",
       "      <td>8250</td>\n",
       "      <td>False</td>\n",
       "      <td>628</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>66</td>\n",
       "      <td>348</td>\n",
       "      <td>48862</td>\n",
       "      <td>2234.2641</td>\n",
       "      <td>8250</td>\n",
       "      <td>True</td>\n",
       "      <td>628</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>66</td>\n",
       "      <td>348</td>\n",
       "      <td>48862</td>\n",
       "      <td>2234.2641</td>\n",
       "      <td>8250</td>\n",
       "      <td>False</td>\n",
       "      <td>628</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>66</td>\n",
       "      <td>442</td>\n",
       "      <td>35390</td>\n",
       "      <td>913.1932</td>\n",
       "      <td>14984</td>\n",
       "      <td>False</td>\n",
       "      <td>1457</td>\n",
       "      <td>80</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>66</td>\n",
       "      <td>442</td>\n",
       "      <td>35390</td>\n",
       "      <td>913.6259</td>\n",
       "      <td>14984</td>\n",
       "      <td>False</td>\n",
       "      <td>1457</td>\n",
       "      <td>21</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   user_location_country  user_location_region  user_location_city  \\\n",
       "0                     66                   348               48862   \n",
       "1                     66                   348               48862   \n",
       "2                     66                   348               48862   \n",
       "3                     66                   442               35390   \n",
       "4                     66                   442               35390   \n",
       "\n",
       "   orig_destination_distance  srch_destination_id is_booking  hotel_market  \\\n",
       "0                  2234.2641                 8250      False           628   \n",
       "1                  2234.2641                 8250       True           628   \n",
       "2                  2234.2641                 8250      False           628   \n",
       "3                   913.1932                14984      False          1457   \n",
       "4                   913.6259                14984      False          1457   \n",
       "\n",
       "   hotel_cluster  \n",
       "0              1  \n",
       "1              1  \n",
       "2              1  \n",
       "3             80  \n",
       "4             21  "
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.head(5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "user_location_country          int32\n",
       "user_location_region           int32\n",
       "user_location_city             int32\n",
       "orig_destination_distance    float64\n",
       "srch_destination_id            int32\n",
       "is_booking                      bool\n",
       "hotel_market                   int32\n",
       "hotel_cluster                  int32\n",
       "dtype: object"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "train = df\n",
    "train.dtypes"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Loading the testing dataset into memory"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "test = pd.read_csv(\"test.csv\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(2528243, 22)"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "test.shape"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Viewing the testing dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>id</th>\n",
       "      <th>date_time</th>\n",
       "      <th>site_name</th>\n",
       "      <th>posa_continent</th>\n",
       "      <th>user_location_country</th>\n",
       "      <th>user_location_region</th>\n",
       "      <th>user_location_city</th>\n",
       "      <th>orig_destination_distance</th>\n",
       "      <th>user_id</th>\n",
       "      <th>is_mobile</th>\n",
       "      <th>...</th>\n",
       "      <th>srch_ci</th>\n",
       "      <th>srch_co</th>\n",
       "      <th>srch_adults_cnt</th>\n",
       "      <th>srch_children_cnt</th>\n",
       "      <th>srch_rm_cnt</th>\n",
       "      <th>srch_destination_id</th>\n",
       "      <th>srch_destination_type_id</th>\n",
       "      <th>hotel_continent</th>\n",
       "      <th>hotel_country</th>\n",
       "      <th>hotel_market</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0</td>\n",
       "      <td>2015-09-03 17:09:54</td>\n",
       "      <td>2</td>\n",
       "      <td>3</td>\n",
       "      <td>66</td>\n",
       "      <td>174</td>\n",
       "      <td>37449</td>\n",
       "      <td>5539.0567</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>...</td>\n",
       "      <td>2016-05-19</td>\n",
       "      <td>2016-05-23</td>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>12243</td>\n",
       "      <td>6</td>\n",
       "      <td>6</td>\n",
       "      <td>204</td>\n",
       "      <td>27</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1</td>\n",
       "      <td>2015-09-24 17:38:35</td>\n",
       "      <td>2</td>\n",
       "      <td>3</td>\n",
       "      <td>66</td>\n",
       "      <td>174</td>\n",
       "      <td>37449</td>\n",
       "      <td>5873.2923</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>...</td>\n",
       "      <td>2016-05-12</td>\n",
       "      <td>2016-05-15</td>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>14474</td>\n",
       "      <td>7</td>\n",
       "      <td>6</td>\n",
       "      <td>204</td>\n",
       "      <td>1540</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>2</td>\n",
       "      <td>2015-06-07 15:53:02</td>\n",
       "      <td>2</td>\n",
       "      <td>3</td>\n",
       "      <td>66</td>\n",
       "      <td>142</td>\n",
       "      <td>17440</td>\n",
       "      <td>3975.9776</td>\n",
       "      <td>20</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>2015-07-26</td>\n",
       "      <td>2015-07-27</td>\n",
       "      <td>4</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>11353</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>50</td>\n",
       "      <td>699</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>3</td>\n",
       "      <td>2015-09-14 14:49:10</td>\n",
       "      <td>2</td>\n",
       "      <td>3</td>\n",
       "      <td>66</td>\n",
       "      <td>258</td>\n",
       "      <td>34156</td>\n",
       "      <td>1508.5975</td>\n",
       "      <td>28</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>2015-09-14</td>\n",
       "      <td>2015-09-16</td>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>8250</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>50</td>\n",
       "      <td>628</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>4</td>\n",
       "      <td>2015-07-17 09:32:04</td>\n",
       "      <td>2</td>\n",
       "      <td>3</td>\n",
       "      <td>66</td>\n",
       "      <td>467</td>\n",
       "      <td>36345</td>\n",
       "      <td>66.7913</td>\n",
       "      <td>50</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>2015-07-22</td>\n",
       "      <td>2015-07-23</td>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>11812</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>50</td>\n",
       "      <td>538</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 22 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "   id            date_time  site_name  posa_continent  user_location_country  \\\n",
       "0   0  2015-09-03 17:09:54          2               3                     66   \n",
       "1   1  2015-09-24 17:38:35          2               3                     66   \n",
       "2   2  2015-06-07 15:53:02          2               3                     66   \n",
       "3   3  2015-09-14 14:49:10          2               3                     66   \n",
       "4   4  2015-07-17 09:32:04          2               3                     66   \n",
       "\n",
       "   user_location_region  user_location_city  orig_destination_distance  \\\n",
       "0                   174               37449                  5539.0567   \n",
       "1                   174               37449                  5873.2923   \n",
       "2                   142               17440                  3975.9776   \n",
       "3                   258               34156                  1508.5975   \n",
       "4                   467               36345                    66.7913   \n",
       "\n",
       "   user_id  is_mobile      ...          srch_ci     srch_co srch_adults_cnt  \\\n",
       "0        1          1      ...       2016-05-19  2016-05-23               2   \n",
       "1        1          1      ...       2016-05-12  2016-05-15               2   \n",
       "2       20          0      ...       2015-07-26  2015-07-27               4   \n",
       "3       28          0      ...       2015-09-14  2015-09-16               2   \n",
       "4       50          0      ...       2015-07-22  2015-07-23               2   \n",
       "\n",
       "  srch_children_cnt  srch_rm_cnt  srch_destination_id  \\\n",
       "0                 0            1                12243   \n",
       "1                 0            1                14474   \n",
       "2                 0            1                11353   \n",
       "3                 0            1                 8250   \n",
       "4                 0            1                11812   \n",
       "\n",
       "   srch_destination_type_id  hotel_continent  hotel_country  hotel_market  \n",
       "0                         6                6            204            27  \n",
       "1                         7                6            204          1540  \n",
       "2                         1                2             50           699  \n",
       "3                         1                2             50           628  \n",
       "4                         1                2             50           538  \n",
       "\n",
       "[5 rows x 22 columns]"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "test.head(5)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Algorithm for finding suitable clusters"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "def make_key(items):\n",
    "    return \"_\".join([str(i) for i in items])\n",
    "\n",
    "match_cols = [\"srch_destination_id\"]\n",
    "cluster_cols = match_cols + ['hotel_cluster']\n",
    "groups = train.groupby(cluster_cols)\n",
    "top_clusters = {}\n",
    "for name, group in groups:\n",
    "    clicks = len(group.is_booking[group.is_booking == False])\n",
    "    bookings = len(group.is_booking[group.is_booking == True])\n",
    "    \n",
    "    score = bookings + .086 * clicks    \n",
    "    \n",
    "    clus_name = make_key(name[:len(match_cols)])\n",
    "    if clus_name not in top_clusters:\n",
    "        top_clusters[clus_name] = {}\n",
    "    top_clusters[clus_name][name[-1]] = score\n",
    "   "
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Finding top 5 clusters for each search destination"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import operator\n",
    "\n",
    "cluster_dict = {}\n",
    "for n in top_clusters:\n",
    "    tc = top_clusters[n]\n",
    "    top = [l[0] for l in sorted(tc.items(), key=operator.itemgetter(1), reverse=True)[:5]]\n",
    "    cluster_dict[n] = top"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Making predictions based on top 5 clusters for each destination"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "preds = []\n",
    "for index, row in test.iterrows():\n",
    "    key = make_key([row[m] for m in match_cols])\n",
    "    if key in cluster_dict:\n",
    "        preds.append(cluster_dict[key])\n",
    "    else:\n",
    "        preds.append([])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Error Calculation"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "#Calculate error\n",
    "#Implementation from ml_metrics package by Ben Hamner\n",
    "#Error calculation is useful only while building the model and not for the entire test dataset\n",
    "\n",
    "import numpy as np\n",
    "\n",
    "def apk(actual, predicted, k=10):\n",
    "    if len(predicted)>k:\n",
    "        predicted = predicted[:k]\n",
    "\n",
    "    score = 0.0\n",
    "    num_hits = 0.0\n",
    "    \n",
    "    for i,p in enumerate(predicted):\n",
    "        if p in actual and p not in predicted[:i]:\n",
    "            num_hits += 1.0\n",
    "            score += num_hits / (i+1.0)\n",
    "\n",
    "    if not actual:\n",
    "        return 0.0\n",
    "\n",
    "    return score / min(len(actual), k)\n",
    "\n",
    "\n",
    "def mapk(actual, predicted, k=10):    \n",
    "    return np.mean([apk(a,p,k) for a,p in zip(actual, predicted)])\n",
    "\n",
    "mapk([[l] for l in test[\"hotel_cluster\"]], preds, k=5)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Making an initial submission file"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "#Making a submission file\n",
    "\n",
    "write_p = [\" \".join([str(l) for l in p]) for p in preds]\n",
    "write_frame = [\"{0},{1}\".format(test[\"id\"][i], write_p[i]) for i in range(len(preds))]\n",
    "write_frame = [\"id,hotel_cluster\"] + write_frame\n",
    "with open(\"predictions.csv\", \"w+\") as f:\n",
    "    f.write(\"\\n\".join(write_frame))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "predictions = pd.read_csv('predictions.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(2528243, 2)"
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "predictions.shape"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Prediction file sample"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>id</th>\n",
       "      <th>hotel_cluster</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0</td>\n",
       "      <td>5 37 55 11 22</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1</td>\n",
       "      <td>5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>2</td>\n",
       "      <td>0 31 96 91 77</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>3</td>\n",
       "      <td>1 45 79 24 54</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>4</td>\n",
       "      <td>91 42 2 48 59</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   id  hotel_cluster\n",
       "0   0  5 37 55 11 22\n",
       "1   1              5\n",
       "2   2  0 31 96 91 77\n",
       "3   3  1 45 79 24 54\n",
       "4   4  91 42 2 48 59"
      ]
     },
     "execution_count": 19,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "predictions.head(5)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Improving Predictions"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "#Improving predictions\n",
    "#should be a new training dataset containing all columns\n",
    "#Finding common columns in training and testing datasets\n",
    "\n",
    "match_cols = ['user_location_country', 'user_location_region', 'user_location_city', 'hotel_market', 'orig_destination_distance']\n",
    "\n",
    "groups = train.groupby(match_cols)\n",
    "    \n",
    "def generate_exact_matches(row, match_cols):\n",
    "    index = tuple([row[t] for t in match_cols])\n",
    "    try:\n",
    "        group = groups.get_group(index)\n",
    "    except Exception:\n",
    "        return []\n",
    "    clus = list(set(group.hotel_cluster))\n",
    "    return clus\n",
    "\n",
    "exact_matches = []\n",
    "for i in range(test.shape[0]):\n",
    "    exact_matches.append(generate_exact_matches(test.iloc[i], match_cols))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "#Most common clusters based on value count\n",
    "\n",
    "most_common_clusters = list(train.hotel_cluster.value_counts().head().index)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "#Full predictions as a combination of all the above predictions\n",
    "\n",
    "def f5(seq, idfun=None): \n",
    "    if idfun is None:\n",
    "        def idfun(x): return x\n",
    "    seen = {}\n",
    "    result = []\n",
    "    for item in seq:\n",
    "        marker = idfun(item)\n",
    "        if marker in seen: continue\n",
    "        seen[marker] = 1\n",
    "        result.append(item)\n",
    "    return result\n",
    "    \n",
    "full_preds = [f5(exact_matches[p] + preds[p] + most_common_clusters)[:5] for p in range(len(preds))]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Making final submission file"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 43,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "#Making a submission file\n",
    "\n",
    "write_p = [\" \".join([str(l) for l in p]) for p in full_preds]\n",
    "write_frame = [\"{0},{1}\".format(test[\"id\"][i], write_p[i]) for i in range(len(full_preds))]\n",
    "write_frame = [\"id,hotel_cluster\"] + write_frame\n",
    "with open(\"predictions.csv\", \"w+\") as f:\n",
    "    f.write(\"\\n\".join(write_frame))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 44,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "predictions = pd.read_csv('predictions.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 45,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(2528243, 2)"
      ]
     },
     "execution_count": 45,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "predictions.shape"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Final Prediction of first 5 hotel id groups"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 46,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>id</th>\n",
       "      <th>hotel_cluster</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0</td>\n",
       "      <td>5 37 55 11 22</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1</td>\n",
       "      <td>5 91 41 48 64</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>2</td>\n",
       "      <td>91 0 31 96 77</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>3</td>\n",
       "      <td>1 45 79 24 54</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>4</td>\n",
       "      <td>50 51 91 42 2</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   id  hotel_cluster\n",
       "0   0  5 37 55 11 22\n",
       "1   1  5 91 41 48 64\n",
       "2   2  91 0 31 96 77\n",
       "3   3  1 45 79 24 54\n",
       "4   4  50 51 91 42 2"
      ]
     },
     "execution_count": 46,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "predictions.head(5)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Conclusions\n",
    "\n",
    "Hotel groups have been predicted above. For the full list of hotel groups, check out the predictions file generated from the above algorithm."
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 2",
   "language": "python",
   "name": "python2"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.11"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}