Complete solution to MachineHack's Flight Ticket Price Prediction Hackathon
{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"name": "Soln_Predict_Flight_Ticket.ipynb",
"version": "0.3.2",
"provenance": [],
"collapsed_sections": []
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
}
},
"cells": [
{
"metadata": {
"id": "QFdzQ7cSJeyb",
"colab_type": "text"
},
"cell_type": "markdown",
"source": [
"##Importing the Datasets\n",
"\n"
]
},
{
"metadata": {
"id": "W9AtctmgJ6q5",
"colab_type": "code",
"colab": {}
},
"cell_type": "code",
"source": [
"import pandas as pd\n",
"\n",
"training_set = pd.read_excel(\"Data_Train.xlsx\")\n",
"test_set = pd.read_excel(\"Test_set.xlsx\")\n"
],
"execution_count": 0,
"outputs": []
},
{
"metadata": {
"id": "laBIWbP-J9cR",
"colab_type": "text"
},
"cell_type": "markdown",
"source": [
"##Performing a simple Exploratory Data Analysis"
]
},
{
"metadata": {
"id": "iApzJbwnKD65",
"colab_type": "code",
"outputId": "9ed10b8a-6cde-493c-c87d-9ec3ae7b0d70",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 1598
}
},
"cell_type": "code",
"source": [
"# Checking the features in the Datasets\n",
"\n",
"#Training Set\n",
"\n",
"print(\"\\nEDA on Training Set\\n\")\n",
"print(\"#\"*30)\n",
"\n",
"print(\"\\nFeatures/Columns : \\n\", training_set.columns)\n",
"print(\"\\n\\nNumber of Features/Columns : \", len(training_set.columns))\n",
"print(\"\\nNumber of Rows : \",len(training_set))\n",
"print(\"\\n\\nData Types :\\n\", training_set.dtypes)\n",
"\n",
"print(\"\\n Contains NaN/Empty cells : \", training_set.isnull().values.any())\n",
"\n",
"print(\"\\n Total empty cells by column :\\n\", training_set.isnull().sum(), \"\\n\\n\")\n",
"\n",
"\n",
"# Test Set\n",
"print(\"#\"*30)\n",
"print(\"\\nEDA on Test Set\\n\")\n",
"print(\"#\"*30)\n",
"\n",
"\n",
"print(\"\\nFeatures/Columns : \\n\",test_set.columns)\n",
"print(\"\\n\\nNumber of Features/Columns : \",len(test_set.columns))\n",
"print(\"\\nNumber of Rows : \",len(test_set))\n",
"print(\"\\n\\nData Types :\\n\", test_set.dtypes)\n",
"print(\"\\n Contains NaN/Empty cells : \", test_set.isnull().values.any())\n",
"print(\"\\n Total empty cells by column :\\n\", test_set.isnull().sum())\n"
],
"execution_count": 0,
"outputs": [
{
"output_type": "stream",
"text": [
"\n",
"EDA on Training Set\n",
"\n",
"##############################\n",
"\n",
"Features/Columns : \n",
" Index(['Airline', 'Date_of_Journey', 'Source', 'Destination', 'Route',\n",
" 'Dep_Time', 'Arrival_Time', 'Duration', 'Total_Stops',\n",
" 'Additional_Info', 'Price'],\n",
" dtype='object')\n",
"\n",
"\n",
"Number of Features/Columns : 11\n",
"\n",
"Number of Rows : 10683\n",
"\n",
"\n",
"Data Types :\n",
" Airline object\n",
"Date_of_Journey object\n",
"Source object\n",
"Destination object\n",
"Route object\n",
"Dep_Time object\n",
"Arrival_Time object\n",
"Duration object\n",
"Total_Stops object\n",
"Additional_Info object\n",
"Price int64\n",
"dtype: object\n",
"\n",
" Contains NaN/Empty cells : True\n",
"\n",
" Total empty cells by column :\n",
" Airline 0\n",
"Date_of_Journey 0\n",
"Source 0\n",
"Destination 0\n",
"Route 1\n",
"Dep_Time 0\n",
"Arrival_Time 0\n",
"Duration 0\n",
"Total_Stops 1\n",
"Additional_Info 0\n",
"Price 0\n",
"dtype: int64 \n",
"\n",
"\n",
"##############################\n",
"\n",
"EDA on Test Set\n",
"\n",
"##############################\n",
"\n",
"Features/Columns : \n",
" Index(['Airline', 'Date_of_Journey', 'Source', 'Destination', 'Route',\n",
" 'Dep_Time', 'Arrival_Time', 'Duration', 'Total_Stops',\n",
" 'Additional_Info'],\n",
" dtype='object')\n",
"\n",
"\n",
"Number of Features/Columns : 10\n",
"\n",
"Number of Rows : 2671\n",
"\n",
"\n",
"Data Types :\n",
" Airline object\n",
"Date_of_Journey object\n",
"Source object\n",
"Destination object\n",
"Route object\n",
"Dep_Time object\n",
"Arrival_Time object\n",
"Duration object\n",
"Total_Stops object\n",
"Additional_Info object\n",
"dtype: object\n",
"\n",
" Contains NaN/Empty cells : False\n",
"\n",
" Total empty cells by column :\n",
" Airline 0\n",
"Date_of_Journey 0\n",
"Source 0\n",
"Destination 0\n",
"Route 0\n",
"Dep_Time 0\n",
"Arrival_Time 0\n",
"Duration 0\n",
"Total_Stops 0\n",
"Additional_Info 0\n",
"dtype: int64\n"
],
"name": "stdout"
}
]
},
{
"metadata": {
"id": "Yp5ybsWMKNQQ",
"colab_type": "text"
},
"cell_type": "markdown",
"source": [
"##Cleaning and Preprocessing the Datasets"
]
},
{
"metadata": {
"id": "weWTV59eXdnR",
"colab_type": "text"
},
"cell_type": "markdown",
"source": [
"###Missing Fields/ NaNs"
]
},
{
"metadata": {
"id": "bIYSEkpzKU2w",
"colab_type": "code",
"outputId": "89c9dd32-51ab-4aff-f2e1-aa6c7c69875e",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 51
}
},
"cell_type": "code",
"source": [
"# Dealing with the Missing Values\n",
"\n",
"print(\"Original Length of Training Set : \", len(training_set))\n",
"\n",
"training_set = training_set.dropna()\n",
"\n",
"print(\"Length of Training Set after dropping NaN: \", len(training_set))\n"
],
"execution_count": 0,
"outputs": [
{
"output_type": "stream",
"text": [
"Original Length of Training Set : 10683\n",
"Length of Training Set after dropping NaN: 10682\n"
],
"name": "stdout"
}
]
},
{
"metadata": {
"id": "Qtw8aC15Y1cz",
"colab_type": "text"
},
"cell_type": "markdown",
"source": [
"Since the EDA shows that only one row has empty fields, we simply remove that entire record/row.\n",
"\n",
"**Note**:\n",
"\n",
"If the test set contained any missing fields, dropping NaNs would not be an option, since every test row must receive a prediction. In general, dropping rows is only appropriate for large datasets with very few missing fields."
]
},
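{
"metadata": {
"id": "impute_note_md",
"colab_type": "text"
},
"cell_type": "markdown",
"source": [
"If dropping rows were not an option (for example, in a test set), one alternative is imputation. The cell below is an illustrative sketch, not part of the original solution: it fills the two affected categorical columns with their column mode."
]
},
{
"metadata": {
"id": "impute_note_code",
"colab_type": "code",
"colab": {}
},
"cell_type": "code",
"source": [
"# Sketch: fill missing categorical fields with the column mode instead of\n",
"# dropping the row. Route and Total_Stops are the columns with NaNs here.\n",
"for col in ['Route', 'Total_Stops']:\n",
"    training_set[col] = training_set[col].fillna(training_set[col].mode()[0])"
],
"execution_count": 0,
"outputs": []
},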
{
"metadata": {
"id": "ajkcXR_MYGSt",
"colab_type": "text"
},
"cell_type": "markdown",
"source": [
"##Cleaning Data"
]
},
{
"metadata": {
"id": "7kj2GMiBXj-X",
"colab_type": "text"
},
"cell_type": "markdown",
"source": [
"###Date_of_Journey"
]
},
{
"metadata": {
"id": "YrgBP6PbXre3",
"colab_type": "code",
"colab": {}
},
"cell_type": "code",
"source": [
"#Cleaning Journey Date \n",
"\n",
"#Training Set\n",
"\n",
"training_set['Journey_Day'] = pd.to_datetime(training_set.Date_of_Journey, format='%d/%m/%Y').dt.day\n",
"\n",
"training_set['Journey_Month'] = pd.to_datetime(training_set.Date_of_Journey, format='%d/%m/%Y').dt.month\n",
"\n",
"# Test Set\n",
"\n",
"test_set['Journey_Day'] = pd.to_datetime(test_set.Date_of_Journey, format='%d/%m/%Y').dt.day\n",
"\n",
"test_set['Journey_Month'] = pd.to_datetime(test_set.Date_of_Journey, format='%d/%m/%Y').dt.month\n",
"\n",
"# Drop the original date feature now that day and month are extracted\n",
"\n",
"training_set.drop(labels = 'Date_of_Journey', axis = 1, inplace = True)\n",
"\n",
"test_set.drop(labels = 'Date_of_Journey', axis = 1, inplace = True)\n",
"\n",
"\n"
],
"execution_count": 0,
"outputs": []
},
{
"metadata": {
"id": "8iyNYQtrb9UD",
"colab_type": "text"
},
"cell_type": "markdown",
"source": [
"The Date_of_Journey column contains dates as strings. We first convert the strings into pandas datetime objects.\n",
"\n",
"Date and time data can be handled in various ways. Here we split each date into day and month; the year is not considered as it is the same (2019) for all rows.\n",
"\n",
"\n",
"\n",
"1. pd.to_datetime(training_set.Date_of_Journey, format='%d/%m/%Y') : converts the string into a datetime object.\n",
"2. pd.to_datetime(training_set.Date_of_Journey, format='%d/%m/%Y').dt.day : returns the day of the date.\n",
"3. pd.to_datetime(training_set.Date_of_Journey, format='%d/%m/%Y').dt.month : returns the month of the date.\n",
"\n",
"\n",
"\n"
]
},
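{
"metadata": {
"id": "parse_once_md",
"colab_type": "text"
},
"cell_type": "markdown",
"source": [
"Parsing the same column twice is redundant. A slightly tidier variant parses once and reuses the result; the cell below is only a sketch and is not meant to be run here, since Date_of_Journey has already been dropped above."
]
},
{
"metadata": {
"id": "parse_once_code",
"colab_type": "code",
"colab": {}
},
"cell_type": "code",
"source": [
"# Sketch: parse Date_of_Journey once, then derive both features from it\n",
"dates = pd.to_datetime(training_set.Date_of_Journey, format='%d/%m/%Y')\n",
"training_set['Journey_Day'] = dates.dt.day\n",
"training_set['Journey_Month'] = dates.dt.month"
],
"execution_count": 0,
"outputs": []
},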
{
"metadata": {
"id": "GQ6WvGUlXzdn",
"colab_type": "text"
},
"cell_type": "markdown",
"source": [
"###Duration"
]
},
{
"metadata": {
"id": "4S7ga_N9Xyqu",
"colab_type": "code",
"colab": {}
},
"cell_type": "code",
"source": [
"# Cleaning Duration\n",
"\n",
"# Training Set\n",
"\n",
"duration = list(training_set['Duration'])\n",
"\n",
"for i in range(len(duration)) :\n",
" if len(duration[i].split()) != 2:\n",
" if 'h' in duration[i] :\n",
" duration[i] = duration[i].strip() + ' 0m'\n",
" elif 'm' in duration[i] :\n",
" duration[i] = '0h {}'.format(duration[i].strip())\n",
"\n",
"dur_hours = []\n",
"dur_minutes = [] \n",
"\n",
"for i in range(len(duration)) :\n",
" dur_hours.append(int(duration[i].split()[0][:-1]))\n",
" dur_minutes.append(int(duration[i].split()[1][:-1]))\n",
" \n",
"training_set['Duration_hours'] = dur_hours\n",
"training_set['Duration_minutes'] =dur_minutes\n",
"\n",
"training_set.drop(labels = 'Duration', axis = 1, inplace = True)\n",
"\n",
"\n",
"# Test Set\n",
"\n",
"durationT = list(test_set['Duration'])\n",
"\n",
"for i in range(len(durationT)) :\n",
" if len(durationT[i].split()) != 2:\n",
" if 'h' in durationT[i] :\n",
" durationT[i] = durationT[i].strip() + ' 0m'\n",
" elif 'm' in durationT[i] :\n",
" durationT[i] = '0h {}'.format(durationT[i].strip())\n",
" \n",
"dur_hours = []\n",
"dur_minutes = [] \n",
"\n",
"for i in range(len(durationT)) :\n",
" dur_hours.append(int(durationT[i].split()[0][:-1]))\n",
" dur_minutes.append(int(durationT[i].split()[1][:-1]))\n",
" \n",
" \n",
"test_set['Duration_hours'] = dur_hours\n",
"test_set['Duration_minutes'] = dur_minutes\n",
"\n",
"test_set.drop(labels = 'Duration', axis = 1, inplace = True)\n",
"\n",
"\n"
],
"execution_count": 0,
"outputs": []
},
{
"metadata": {
"id": "CfeXrXoOeXuL",
"colab_type": "text"
},
"cell_type": "markdown",
"source": [
"Here we split Duration into two features: Duration_hours and Duration_minutes.\n",
"\n",
"We first make all rows uniform by appending \"0m\" where minutes are not mentioned and prepending \"0h\" where hours are not mentioned. We then split each value into two integers.\n",
"\n",
"Keep in mind that there are multiple valid approaches to cleaning data; the choice depends on the programmer's logic."
]
},
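{
"metadata": {
"id": "duration_vec_md",
"colab_type": "text"
},
"cell_type": "markdown",
"source": [
"As an illustration of an alternative approach, the sketch below does the same extraction with vectorized pandas string methods instead of explicit loops. It is not meant to be run here, since the loop above has already consumed the Duration column."
]
},
{
"metadata": {
"id": "duration_vec_code",
"colab_type": "code",
"colab": {}
},
"cell_type": "code",
"source": [
"# Sketch: extract hours and minutes with a regex instead of explicit loops.\n",
"# Missing components default to 0, matching the padding logic above.\n",
"parts = training_set['Duration'].str.extract(r'(?:(\\d+)h)?\\s*(?:(\\d+)m)?')\n",
"training_set['Duration_hours'] = parts[0].fillna(0).astype(int)\n",
"training_set['Duration_minutes'] = parts[1].fillna(0).astype(int)"
],
"execution_count": 0,
"outputs": []
},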
{
"metadata": {
"id": "ST_Lqa9pX-Dl",
"colab_type": "text"
},
"cell_type": "markdown",
"source": [
"###Departure and Arrival Times"
]
},
{
"metadata": {
"id": "GZPmrUFVX9Dn",
"colab_type": "code",
"colab": {}
},
"cell_type": "code",
"source": [
"#Cleaning Departure and Arrival Times\n",
"\n",
"# Training Set\n",
"\n",
"\n",
"training_set['Depart_Time_Hour'] = pd.to_datetime(training_set.Dep_Time).dt.hour\n",
"training_set['Depart_Time_Minutes'] = pd.to_datetime(training_set.Dep_Time).dt.minute\n",
"\n",
"training_set.drop(labels = 'Dep_Time', axis = 1, inplace = True)\n",
"\n",
"\n",
"training_set['Arr_Time_Hour'] = pd.to_datetime(training_set.Arrival_Time).dt.hour\n",
"training_set['Arr_Time_Minutes'] = pd.to_datetime(training_set.Arrival_Time).dt.minute\n",
"\n",
"training_set.drop(labels = 'Arrival_Time', axis = 1, inplace = True)\n",
"\n",
"\n",
"# Test Set\n",
"\n",
"\n",
"test_set['Depart_Time_Hour'] = pd.to_datetime(test_set.Dep_Time).dt.hour\n",
"test_set['Depart_Time_Minutes'] = pd.to_datetime(test_set.Dep_Time).dt.minute\n",
"\n",
"\n",
"test_set.drop(labels = 'Dep_Time', axis = 1, inplace = True)\n",
"\n",
"test_set['Arr_Time_Hour'] = pd.to_datetime(test_set.Arrival_Time).dt.hour\n",
"test_set['Arr_Time_Minutes'] = pd.to_datetime(test_set.Arrival_Time).dt.minute\n",
"\n",
"test_set.drop(labels = 'Arrival_Time', axis = 1, inplace = True)"
],
"execution_count": 0,
"outputs": []
},
{
"metadata": {
"id": "fY7rB8pugLum",
"colab_type": "text"
},
"cell_type": "markdown",
"source": [
"We also split the Dep_Time and Arrival_Time features into two features each: hours and minutes."
]
},
{
"metadata": {
"id": "Tpml9l8wggm5",
"colab_type": "code",
"outputId": "f81e689d-2c9d-4040-857e-57d9e5d40225",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 374
}
},
"cell_type": "code",
"source": [
"training_set.head()"
],
"execution_count": 0,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Airline</th>\n",
" <th>Source</th>\n",
" <th>Destination</th>\n",
" <th>Route</th>\n",
" <th>Total_Stops</th>\n",
" <th>Additional_Info</th>\n",
" <th>Price</th>\n",
" <th>Journey_Day</th>\n",
" <th>Journey_Month</th>\n",
" <th>Duration_hours</th>\n",
" <th>Duration_minutes</th>\n",
" <th>Depart_Time_Hour</th>\n",
" <th>Depart_Time_Minutes</th>\n",
" <th>Arr_Time_Hour</th>\n",
" <th>Arr_Time_Minutes</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>IndiGo</td>\n",
" <td>Banglore</td>\n",
" <td>New Delhi</td>\n",
" <td>BLR → DEL</td>\n",
" <td>non-stop</td>\n",
" <td>No info</td>\n",
" <td>3897</td>\n",
" <td>24</td>\n",
" <td>3</td>\n",
" <td>2</td>\n",
" <td>50</td>\n",
" <td>22</td>\n",
" <td>20</td>\n",
" <td>1</td>\n",
" <td>10</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>Air India</td>\n",
" <td>Kolkata</td>\n",
" <td>Banglore</td>\n",
" <td>CCU → IXR → BBI → BLR</td>\n",
" <td>2 stops</td>\n",
" <td>No info</td>\n",
" <td>7662</td>\n",
" <td>1</td>\n",
" <td>5</td>\n",
" <td>7</td>\n",
" <td>25</td>\n",
" <td>5</td>\n",
" <td>50</td>\n",
" <td>13</td>\n",
" <td>15</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>Jet Airways</td>\n",
" <td>Delhi</td>\n",
" <td>Cochin</td>\n",
" <td>DEL → LKO → BOM → COK</td>\n",
" <td>2 stops</td>\n",
" <td>No info</td>\n",
" <td>13882</td>\n",
" <td>9</td>\n",
" <td>6</td>\n",
" <td>19</td>\n",
" <td>0</td>\n",
" <td>9</td>\n",
" <td>25</td>\n",
" <td>4</td>\n",
" <td>25</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>IndiGo</td>\n",
" <td>Kolkata</td>\n",
" <td>Banglore</td>\n",
" <td>CCU → NAG → BLR</td>\n",
" <td>1 stop</td>\n",
" <td>No info</td>\n",
" <td>6218</td>\n",
" <td>12</td>\n",
" <td>5</td>\n",
" <td>5</td>\n",
" <td>25</td>\n",
" <td>18</td>\n",
" <td>5</td>\n",
" <td>23</td>\n",
" <td>30</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>IndiGo</td>\n",
" <td>Banglore</td>\n",
" <td>New Delhi</td>\n",
" <td>BLR → NAG → DEL</td>\n",
" <td>1 stop</td>\n",
" <td>No info</td>\n",
" <td>13302</td>\n",
" <td>1</td>\n",
" <td>3</td>\n",
" <td>4</td>\n",
" <td>45</td>\n",
" <td>16</td>\n",
" <td>50</td>\n",
" <td>21</td>\n",
" <td>35</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Airline Source Destination Route Total_Stops \\\n",
"0 IndiGo Banglore New Delhi BLR → DEL non-stop \n",
"1 Air India Kolkata Banglore CCU → IXR → BBI → BLR 2 stops \n",
"2 Jet Airways Delhi Cochin DEL → LKO → BOM → COK 2 stops \n",
"3 IndiGo Kolkata Banglore CCU → NAG → BLR 1 stop \n",
"4 IndiGo Banglore New Delhi BLR → NAG → DEL 1 stop \n",
"\n",
" Additional_Info Price Journey_Day Journey_Month Duration_hours \\\n",
"0 No info 3897 24 3 2 \n",
"1 No info 7662 1 5 7 \n",
"2 No info 13882 9 6 19 \n",
"3 No info 6218 12 5 5 \n",
"4 No info 13302 1 3 4 \n",
"\n",
" Duration_minutes Depart_Time_Hour Depart_Time_Minutes Arr_Time_Hour \\\n",
"0 50 22 20 1 \n",
"1 25 5 50 13 \n",
"2 0 9 25 4 \n",
"3 25 18 5 23 \n",
"4 45 16 50 21 \n",
"\n",
" Arr_Time_Minutes \n",
"0 10 \n",
"1 15 \n",
"2 25 \n",
"3 30 \n",
"4 35 "
]
},
"metadata": {
"tags": []
},
"execution_count": 8
}
]
},
{
"metadata": {
"id": "zzomrtWqhLCc",
"colab_type": "code",
"outputId": "75cf614f-44a2-4728-f22e-5e78172d9424",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 272
}
},
"cell_type": "code",
"source": [
"test_set.head()"
],
"execution_count": 0,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Airline</th>\n",
" <th>Source</th>\n",
" <th>Destination</th>\n",
" <th>Route</th>\n",
" <th>Total_Stops</th>\n",
" <th>Additional_Info</th>\n",
" <th>Journey_Day</th>\n",
" <th>Journey_Month</th>\n",
" <th>Duration_hours</th>\n",
" <th>Duration_minutes</th>\n",
" <th>Depart_Time_Hour</th>\n",
" <th>Depart_Time_Minutes</th>\n",
" <th>Arr_Time_Hour</th>\n",
" <th>Arr_Time_Minutes</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>Jet Airways</td>\n",
" <td>Delhi</td>\n",
" <td>Cochin</td>\n",
" <td>DEL → BOM → COK</td>\n",
" <td>1 stop</td>\n",
" <td>No info</td>\n",
" <td>6</td>\n",
" <td>6</td>\n",
" <td>10</td>\n",
" <td>55</td>\n",
" <td>17</td>\n",
" <td>30</td>\n",
" <td>4</td>\n",
" <td>25</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>IndiGo</td>\n",
" <td>Kolkata</td>\n",
" <td>Banglore</td>\n",
" <td>CCU → MAA → BLR</td>\n",
" <td>1 stop</td>\n",
" <td>No info</td>\n",
" <td>12</td>\n",
" <td>5</td>\n",
" <td>4</td>\n",
" <td>0</td>\n",
" <td>6</td>\n",
" <td>20</td>\n",
" <td>10</td>\n",
" <td>20</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>Jet Airways</td>\n",
" <td>Delhi</td>\n",
" <td>Cochin</td>\n",
" <td>DEL → BOM → COK</td>\n",
" <td>1 stop</td>\n",
" <td>In-flight meal not included</td>\n",
" <td>21</td>\n",
" <td>5</td>\n",
" <td>23</td>\n",
" <td>45</td>\n",
" <td>19</td>\n",
" <td>15</td>\n",
" <td>19</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>Multiple carriers</td>\n",
" <td>Delhi</td>\n",
" <td>Cochin</td>\n",
" <td>DEL → BOM → COK</td>\n",
" <td>1 stop</td>\n",
" <td>No info</td>\n",
" <td>21</td>\n",
" <td>5</td>\n",
" <td>13</td>\n",
" <td>0</td>\n",
" <td>8</td>\n",
" <td>0</td>\n",
" <td>21</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>Air Asia</td>\n",
" <td>Banglore</td>\n",
" <td>Delhi</td>\n",
" <td>BLR → DEL</td>\n",
" <td>non-stop</td>\n",
" <td>No info</td>\n",
" <td>24</td>\n",
" <td>6</td>\n",
" <td>2</td>\n",
" <td>50</td>\n",
" <td>23</td>\n",
" <td>55</td>\n",
" <td>2</td>\n",
" <td>45</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Airline Source Destination Route Total_Stops \\\n",
"0 Jet Airways Delhi Cochin DEL → BOM → COK 1 stop \n",
"1 IndiGo Kolkata Banglore CCU → MAA → BLR 1 stop \n",
"2 Jet Airways Delhi Cochin DEL → BOM → COK 1 stop \n",
"3 Multiple carriers Delhi Cochin DEL → BOM → COK 1 stop \n",
"4 Air Asia Banglore Delhi BLR → DEL non-stop \n",
"\n",
" Additional_Info Journey_Day Journey_Month Duration_hours \\\n",
"0 No info 6 6 10 \n",
"1 No info 12 5 4 \n",
"2 In-flight meal not included 21 5 23 \n",
"3 No info 21 5 13 \n",
"4 No info 24 6 2 \n",
"\n",
" Duration_minutes Depart_Time_Hour Depart_Time_Minutes Arr_Time_Hour \\\n",
"0 55 17 30 4 \n",
"1 0 6 20 10 \n",
"2 45 19 15 19 \n",
"3 0 8 0 21 \n",
"4 50 23 55 2 \n",
"\n",
" Arr_Time_Minutes \n",
"0 25 \n",
"1 20 \n",
"2 0 \n",
"3 0 \n",
"4 45 "
]
},
"metadata": {
"tags": []
},
"execution_count": 9
}
]
},
{
"metadata": {
"id": "5L57iDANKaO4",
"colab_type": "text"
},
"cell_type": "markdown",
"source": [
"##Preprocessing the Datasets"
]
},
{
"metadata": {
"id": "Pg5zaIgzYOPV",
"colab_type": "text"
},
"cell_type": "markdown",
"source": [
"###Classifying the Dependent and Independent Variables"
]
},
{
"metadata": {
"id": "EVLJz0uGYVdO",
"colab_type": "code",
"colab": {}
},
"cell_type": "code",
"source": [
"# Dependent Variable\n",
"Y_train = training_set.iloc[:,6].values # 6 is the index of \"Price\" in the Training Set \n",
"\n",
"# Independent Variables\n",
"X_train = training_set.iloc[:,training_set.columns != 'Price'].values # selects all columns except \"Price\"\n",
"\n",
"# Independent Variables for Test Set\n",
"X_test = test_set.iloc[:,:].values"
],
"execution_count": 0,
"outputs": []
},
{
"metadata": {
"id": "NW8IWSglYbL8",
"colab_type": "text"
},
"cell_type": "markdown",
"source": [
"###Encoding Categorical Variables"
]
},
{
"metadata": {
"id": "ZB1jwM-UYaGV",
"colab_type": "code",
"colab": {}
},
"cell_type": "code",
"source": [
"from sklearn.preprocessing import LabelEncoder\n",
"\n",
"le1 = LabelEncoder()\n",
"le2 = LabelEncoder()\n",
"\n",
"# Training Set \n",
"\n",
"X_train[:,0] = le1.fit_transform(X_train[:,0])\n",
"\n",
"X_train[:,1] = le1.fit_transform(X_train[:,1])\n",
"\n",
"X_train[:,2] = le1.fit_transform(X_train[:,2])\n",
"\n",
"X_train[:,3] = le1.fit_transform(X_train[:,3])\n",
"\n",
"X_train[:,4] = le1.fit_transform(X_train[:,4])\n",
"\n",
"X_train[:,5] = le1.fit_transform(X_train[:,5])\n",
"\n",
"# Test Set\n",
"\n",
"\n",
"X_test[:,0] = le2.fit_transform(X_test[:,0])\n",
"\n",
"X_test[:,1] = le2.fit_transform(X_test[:,1])\n",
"\n",
"X_test[:,2] = le2.fit_transform(X_test[:,2])\n",
"\n",
"X_test[:,3] = le2.fit_transform(X_test[:,3])\n",
"\n",
"X_test[:,4] = le2.fit_transform(X_test[:,4])\n",
"\n",
"X_test[:,5] = le2.fit_transform(X_test[:,5])\n"
],
"execution_count": 0,
"outputs": []
},
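{
"metadata": {
"id": "encoder_caveat_md",
"colab_type": "text"
},
"cell_type": "markdown",
"source": [
"Caveat: refitting an encoder per column, with separate encoders for the training and test sets, means the same category (e.g. an airline) can map to different integers in the two sets. A more consistent alternative fits one encoder per column on the union of both sets. The cell below is an illustrative sketch assuming the original string values, not part of the executed notebook."
]
},
{
"metadata": {
"id": "encoder_caveat_code",
"colab_type": "code",
"colab": {}
},
"cell_type": "code",
"source": [
"# Sketch: one encoder per column, fit on train + test values together,\n",
"# so each category receives the same integer code in both sets.\n",
"for col in range(6):\n",
"    le = LabelEncoder()\n",
"    le.fit(list(X_train[:, col]) + list(X_test[:, col]))\n",
"    X_train[:, col] = le.transform(X_train[:, col])\n",
"    X_test[:, col] = le.transform(X_test[:, col])"
],
"execution_count": 0,
"outputs": []
},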
{
"metadata": {
"id": "gSss4r1niWTB",
"colab_type": "text"
},
"cell_type": "markdown",
"source": [
"###Data After Encoding"
]
},
{
"metadata": {
"id": "6HdXl9v1iIF4",
"colab_type": "code",
"outputId": "bd34a481-a3a8-49e1-fd75-1d2396f79f4d",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 119
}
},
"cell_type": "code",
"source": [
"print(pd.DataFrame(X_train).head())"
],
"execution_count": 0,
"outputs": [
{
"output_type": "stream",
"text": [
" 0 1 2 3 4 5 6 7 8 9 10 11 12 13\n",
"0 3 0 5 18 4 8 24 3 2 50 22 20 1 10\n",
"1 1 3 0 84 1 8 1 5 7 25 5 50 13 15\n",
"2 4 2 1 118 1 8 9 6 19 0 9 25 4 25\n",
"3 3 3 0 91 0 8 12 5 5 25 18 5 23 30\n",
"4 3 0 5 29 0 8 1 3 4 45 16 50 21 35\n"
],
"name": "stdout"
}
]
},
{
"metadata": {
"id": "kPDG1PAsYh0O",
"colab_type": "text"
},
"cell_type": "markdown",
"source": [
"###Feature Scaling"
]
},
{
"metadata": {
"id": "UPmGO6mAKiMa",
"colab_type": "code",
"outputId": "811a473a-db90-49a3-d105-fc0649449e95",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 187
}
},
"cell_type": "code",
"source": [
"\n",
"# Feature Scaling\n",
"\n",
"from sklearn.preprocessing import StandardScaler\n",
"\n",
"sc_X = StandardScaler()\n",
"\n",
"X_train = sc_X.fit_transform(X_train)\n",
"\n",
"X_test = sc_X.transform(X_test)\n",
"\n",
"#sc_y = StandardScaler()\n",
"\n",
"Y_train = Y_train.reshape((len(Y_train), 1)) \n",
"\n",
"Y_train = sc_X.fit_transform(Y_train)\n",
"\n",
"Y_train = Y_train.ravel()\n",
"\n"
],
"execution_count": 0,
"outputs": [
{
"output_type": "stream",
"text": [
"/usr/local/lib/python3.6/dist-packages/sklearn/utils/validation.py:595: DataConversionWarning: Data with input dtype object was converted to float64 by StandardScaler.\n",
" warnings.warn(msg, DataConversionWarning)\n",
"/usr/local/lib/python3.6/dist-packages/sklearn/utils/validation.py:595: DataConversionWarning: Data with input dtype object was converted to float64 by StandardScaler.\n",
" warnings.warn(msg, DataConversionWarning)\n",
"/usr/local/lib/python3.6/dist-packages/sklearn/utils/validation.py:595: DataConversionWarning: Data with input dtype object was converted to float64 by StandardScaler.\n",
" warnings.warn(msg, DataConversionWarning)\n",
"/usr/local/lib/python3.6/dist-packages/sklearn/utils/validation.py:595: DataConversionWarning: Data with input dtype int64 was converted to float64 by StandardScaler.\n",
" warnings.warn(msg, DataConversionWarning)\n",
"/usr/local/lib/python3.6/dist-packages/sklearn/utils/validation.py:595: DataConversionWarning: Data with input dtype int64 was converted to float64 by StandardScaler.\n",
" warnings.warn(msg, DataConversionWarning)\n"
],
"name": "stderr"
}
]
},
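{
"metadata": {
"id": "target_scaler_md",
"colab_type": "text"
},
"cell_type": "markdown",
"source": [
"Note: the cell above refits sc_X on the target, which happens to make the later inverse_transform correct (sc_X was last fit on Price) but is fragile. A clearer sketch, using the commented-out sc_y as a dedicated target scaler, could look like this (illustrative only):"
]
},
{
"metadata": {
"id": "target_scaler_code",
"colab_type": "code",
"colab": {}
},
"cell_type": "code",
"source": [
"# Sketch: dedicated scaler for the target, leaving sc_X fitted on features.\n",
"sc_y = StandardScaler()\n",
"Y_train = sc_y.fit_transform(Y_train.reshape(-1, 1)).ravel()\n",
"\n",
"# ...and later the predictions would be inverted with:\n",
"# Y_pred = sc_y.inverse_transform(svr.predict(X_test).reshape(-1, 1)).ravel()"
],
"execution_count": 0,
"outputs": []
},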
{
"metadata": {
"id": "NfxlQYQbikSw",
"colab_type": "text"
},
"cell_type": "markdown",
"source": [
"###Data After Scaling"
]
},
{
"metadata": {
"id": "-Qk3rowvindK",
"colab_type": "code",
"outputId": "6c1d9cbf-ea84-4c17-de26-82a728894ba1",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 238
}
},
"cell_type": "code",
"source": [
"print(pd.DataFrame(X_train).head()) # Xtrain after scaling"
],
"execution_count": 0,
"outputs": [
{
"output_type": "stream",
"text": [
" 0 1 2 3 4 5 6 \\\n",
"0 -0.410805 -1.658359 2.416534 -1.547082 1.407210 0.499921 1.237288 \n",
"1 -1.261152 0.890014 -0.973812 0.249946 -0.253703 0.499921 -1.475307 \n",
"2 0.014369 0.040556 -0.295743 1.175687 -0.253703 0.499921 -0.531796 \n",
"3 -0.410805 0.890014 -0.973812 0.440539 -0.807341 0.499921 -0.177979 \n",
"4 -0.410805 -1.658359 2.416534 -1.247577 -0.807341 0.499921 -1.475307 \n",
"\n",
" 7 8 9 10 11 12 13 \n",
"0 -1.467402 -0.970614 1.279041 1.654154 -0.234950 -1.800436 -0.890014 \n",
"1 0.250289 -0.381999 -0.196319 -1.303113 1.363607 -0.050909 -0.587094 \n",
"2 1.109135 1.030677 -1.671678 -0.607286 0.031476 -1.363054 0.018745 \n",
"3 0.250289 -0.617445 -0.196319 0.958326 -1.034229 1.407030 0.321664 \n",
"4 -1.467402 -0.735168 0.983969 0.610412 1.363607 1.115442 0.624584 \n"
],
"name": "stdout"
}
]
},
{
"metadata": {
"id": "o9fGdPS7i23I",
"colab_type": "code",
"outputId": "9836c70a-beb6-4eda-97db-b0b5d1474e5d",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 119
}
},
"cell_type": "code",
"source": [
"print(pd.DataFrame(Y_train).head())"
],
"execution_count": 0,
"outputs": [
{
"output_type": "stream",
"text": [
" 0\n",
"0 -1.125535\n",
"1 -0.309068\n",
"2 1.039783\n",
"3 -0.622209\n",
"4 0.914006\n"
],
"name": "stdout"
}
]
},
{
"metadata": {
"id": "1_giRsy4KjbZ",
"colab_type": "text"
},
"cell_type": "markdown",
"source": [
"## Modelling with Support Vector Regressor"
]
},
{
"metadata": {
"id": "naviX4YjJXTd",
"colab_type": "code",
"colab": {}
},
"cell_type": "code",
"source": [
"\n",
"from sklearn.svm import SVR\n",
"\n",
"svr = SVR(kernel = \"rbf\")\n",
"\n",
"svr.fit(X_train,Y_train)\n",
"\n",
"Y_pred = sc_X.inverse_transform(svr.predict(X_test))\n",
"\n",
"\n",
"pd.DataFrame(Y_pred, columns = ['Price']).to_excel(\"Final_Pred.xlsx\", index = False)\n"
],
"execution_count": 0,
"outputs": []
},
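{
"metadata": {
"id": "cv_check_md",
"colab_type": "text"
},
"cell_type": "markdown",
"source": [
"Since the hackathon test set has no Price labels, model quality cannot be checked directly on it. A rough estimate can be obtained with cross-validation on the training data; the cell below is a hypothetical sketch and not part of the original submission."
]
},
{
"metadata": {
"id": "cv_check_code",
"colab_type": "code",
"colab": {}
},
"cell_type": "code",
"source": [
"# Sketch: 5-fold cross-validation on the scaled training data to get a\n",
"# rough error estimate before submitting predictions.\n",
"from sklearn.model_selection import cross_val_score\n",
"\n",
"scores = cross_val_score(SVR(kernel='rbf'), X_train, Y_train, cv=5,\n",
"                         scoring='neg_mean_squared_error')\n",
"print(scores.mean())"
],
"execution_count": 0,
"outputs": []
},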
{
"metadata": {
"id": "DGtFHyIXjP6v",
"colab_type": "code",
"outputId": "5c608f0f-e4d5-43ea-d9c9-415f09ddfd3b",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 119
}
},
"cell_type": "code",
"source": [
"print(pd.DataFrame(Y_pred).head())"
],
"execution_count": 0,
"outputs": [
{
"output_type": "stream",
"text": [
" 0\n",
"0 10547.002077\n",
"1 6773.635780\n",
"2 11482.732670\n",
"3 11281.247128\n",
"4 4379.175423\n"
],
"name": "stdout"
}
]
}
]
}