analyticsindiamagazine/gist:35334c13b67136cf0665de0e878de772

## gistfile1.txt
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {
    "colab_type": "text",
    "id": "IEFmR5J_COHU"
   },
   "source": [
    "## Import Required Files"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "import os"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "colab": {},
    "colab_type": "code",
    "id": "YosGM4uOCOHZ"
   },
   "outputs": [],
   "source": [
    "## change it to the unzip path of the downloaded dataset..\n",
    "data_folder = r'/Users/anurag/Downloads/movie genre classification/Scripts'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "colab": {},
    "colab_type": "code",
    "id": "5MGrGh94COHh"
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Total Number of Files : 2827\n"
     ]
    }
   ],
   "source": [
    "# List every entry in the scripts folder and report how many there are.\n",
    "all_files = os.listdir(data_folder)\n",
    "print('Total Number of Files :', len(all_files))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Read Train Files"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>File_Name</th>\n",
       "      <th>Labels</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td>0</td>\n",
       "      <td>file_2180.txt</td>\n",
       "      <td>8</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>1</td>\n",
       "      <td>file_693.txt</td>\n",
       "      <td>4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>2</td>\n",
       "      <td>file_2469.txt</td>\n",
       "      <td>6</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>3</td>\n",
       "      <td>file_2542.txt</td>\n",
       "      <td>6</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>4</td>\n",
       "      <td>file_378.txt</td>\n",
       "      <td>16</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "       File_Name  Labels\n",
       "0  file_2180.txt       8\n",
       "1   file_693.txt       4\n",
       "2  file_2469.txt       6\n",
       "3  file_2542.txt       6\n",
       "4   file_378.txt      16"
      ]
     },
     "execution_count": 24,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Train.csv lists each script's file name (File_Name) with its integer class label (Labels).\n",
    "train_df = pd.read_csv('Train.csv')\n",
    "train_df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>File_Name</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td>0</td>\n",
       "      <td>file_2300.txt</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>1</td>\n",
       "      <td>file_809.txt</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>2</td>\n",
       "      <td>file_1383.txt</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>3</td>\n",
       "      <td>file_983.txt</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>4</td>\n",
       "      <td>file_1713.txt</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>844</td>\n",
       "      <td>file_2474.txt</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>845</td>\n",
       "      <td>file_863.txt</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>846</td>\n",
       "      <td>file_1547.txt</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>847</td>\n",
       "      <td>file_1292.txt</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>848</td>\n",
       "      <td>file_1910.txt</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>849 rows × 1 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "         File_Name\n",
       "0    file_2300.txt\n",
       "1     file_809.txt\n",
       "2    file_1383.txt\n",
       "3     file_983.txt\n",
       "4    file_1713.txt\n",
       "..             ...\n",
       "844  file_2474.txt\n",
       "845   file_863.txt\n",
       "846  file_1547.txt\n",
       "847  file_1292.txt\n",
       "848  file_1910.txt\n",
       "\n",
       "[849 rows x 1 columns]"
      ]
     },
     "execution_count": 28,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Passing `names` makes pandas treat every line of Test.csv as data and\n",
    "# label the single column File_Name.\n",
    "test_df = pd.read_csv('Test.csv', names=['File_Name'])\n",
    "test_df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "## let's read the text scripts in the train and test dataframes..\n",
    "\n",
    "def read_script(file_name):\n",
    "    \"\"\"Return the full text of one script file stored under `data_folder`.\n",
    "\n",
    "    :param file_name: Name of the file inside `data_folder`.\n",
    "    :return: The file content as a single string, decoded with the\n",
    "        platform's default text encoding.\n",
    "    \"\"\"\n",
    "    # A context manager closes each handle promptly instead of leaving\n",
    "    # thousands of open files for the garbage collector to clean up.\n",
    "    with open(os.path.join(data_folder, file_name), 'r') as script_file:\n",
    "        return script_file.read()\n",
    "\n",
    "\n",
    "# Attach the raw script text to every row of the train and test frames.\n",
    "train_df['Script'] = [read_script(file_name) for file_name in train_df['File_Name']]\n",
    "test_df['Script'] = [read_script(file_name) for file_name in test_df['File_Name']]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Let's look at a script file after reading it"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "scrolled": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<b><!--\n",
      "\n",
      "</b>if (window!= top)\n",
      "\n",
      "top.location.href=location.href\n",
      "\n",
      "<b>// -->\n",
      "\n",
      "</b>\n",
      "\n",
      "The Abyss - by James Cameron \n",
      "\n",
      "                                  THE ABYSS\n",
      "\n",
      "                            AN ORIGINAL SCREENPLAY\n",
      "\n",
      "                                      BY\n",
      "\n",
      "                                JAMES CAMERON\n",
      "\n",
      "                               August 2, 1988\n",
      "\n",
      "                             Director's Revision\n",
      "\n",
      "------------------------------------------------------------------------------\n",
      "\n",
      "                                  THE ABYSS\n",
      "\n",
      "OMITTED                                                                 1\n",
      "\n",
      "OMITTED                                                                 2\n",
      "\n",
      "TITLE: THE ABYSS -- ON BLACK, DISSOLVING TO COBALT BLUE\n",
      "\n",
      "EXT. OCEAN/UNDERWATER -- DAY                                            3\n",
      "\n",
      "Blue, deep and featureless, the twilight of five hundred feet down.\n",
      "\n",
      "PROPELLER SOUND.  Materializing out of the blue limbo is the enormous but\n",
      "\n",
      "sleek form of an Ohio-class SSBN ballistic missile submarine.\n",
      "\n",
      "INT. U.S.S. MONTANA -- DAY                                              4\n",
      "\n",
      "In the attack center, darkened to womb-red, the crew's faces shine with sweat\n",
      "\n",
      "in the glow of their instruments.  The SKIPPER and his EXEC crowd around\n",
      "\n",
      "BARNES, the sonarman.\n",
      "\n",
      "                                CAPTAIN\n",
      "\n",
      "                Sixty knots?  No way, Barnes... the reds don't\n",
      "\n",
      "                have anything that fast.\n",
      "\n",
      "                                BARNES\n",
      "\n",
      "                Checked it twice, skipper.  It's a real unique\n",
      "\n",
      "                signature.  No cavitation, no reactor noise...\n",
      "\n",
      "                doesn't even sound like screws.\n",
      "\n",
      "He puts the signal onto a speaker and everyone in the attack room listens to\n",
      "\n",
      "the intruder's acoustic signature, a strange THRUMMING.  The captain studies\n",
      "\n",
      "the electronic position board, a graphic representation of the contours of\n",
      "\n",
      "the steep-walled canyon, a symbol for the Montana, and converging with it, an\n",
      "\n",
      "amorphous trace, representing the bogey.\n",
      "\n",
      "                                CAPTAIN\n",
      "\n",
      "                What the hell is it?\n",
      "\n",
      "                                EXEC\n",
      "\n",
      "                I'll tell you what it's not, it's not one of\n",
      "\n",
      "                ours.\n",
      "\n",
      "                                BARNES\n",
      "\n",
      "                Sir!  Contact changing heading to two-one-four,\n",
      "\n",
      "                diving.  Speed eighty knots!  Eighty knots!\n",
      "\n",
      "                                EXEC\n",
      "\n",
      "                Eighty knots...\n",
      "\n",
      "                                BARNES\n",
      "\n",
      "                Still diving, depth nine hundred feet.  Port\n",
      "\n",
      "                clearance to cliff wall, one hundred fifty feet.\n",
      "\n",
      "                                FRANK\n",
      "\n",
      "                           (simultaneously)\n",
      "\n",
      "                Still diving, depth nine hundred feet.  Port\n",
      "\n",
      "                clearance to cliff wall, one hundred fifty feet.\n",
      "\n",
      "Tension builds in the attack room as the Montana surges to intercept the\n",
      "\n",
      "intruder.  The exec tensely watches the vector-graphic readout for the side-\n",
      "\n",
      "scan sonar array.  The sub is running uncomfortably\n"
     ]
    }
   ],
   "source": [
    "# Preview the first 3000 characters of the training script at row label 4.\n",
    "print(train_df.loc[4, 'Script'][:3000])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "                                          KING KONG\n",
      "\n",
      "                                          Written by\n",
      "\n",
      "                        Fran Walsh, Philippa Boyens and Peter Jackson\n",
      "\n",
      "                                     Based on a Story by\n",
      "\n",
      "                             Merian C. Cooper and Edgar Wallace\n",
      "\n",
      "                                                                   1.\n",
      "\n",
      "          EXT. CENTRAL PARK - DAY\n",
      "\n",
      "          CLOSE ON: A scrawny MONKEY scratches.\n",
      "\n",
      "          ANGLES ON: Defeated, listless ANIMALS, in the bleak environs of a\n",
      "\n",
      "          dilapidated ZOO.\n",
      "\n",
      "          WIDER: It is CENTRAL PARK ZOO in depression era NEW YORK. The PARK\n",
      "\n",
      "          itself is like a GARBAGE DUMP, dotted with squalid SHANTY TOWNS.\n",
      "\n",
      "          Against these BLEAK IMAGES, the SOUND of a BRIGHT, BRASSY SONG\n",
      "\n",
      "          fades up: Al Jolson, singing \"I'm Sitting on Top of the World\".\n",
      "\n",
      "          The sky line of MANHATTAN rises in the background, a grim steaming\n",
      "\n",
      "          jungle on this cold FALL day.\n",
      "\n",
      "                                                        I\n",
      "\n",
      "          EXT. NY STREETS - DAY\n",
      "\n",
      "          LONG continues over:\n",
      "\n",
      "          IMAGES: The CROWDED STREETS of NEW YORK ... beneath the bustle is\n",
      "\n",
      "          a sense of despair.\n",
      "\n",
      "          LONG SOUP LINES snake along the STREETS.\n",
      "\n",
      "          The HUNGRY search through RUBBISH BINS for FOOD. SKYSCRAPERS rise\n",
      "\n",
      "          steadily upwards as more people are evicted from their homes.\n",
      "\n",
      "          HOMELESS sleep amid steaming VENTS and GARBAGE STREWN GUTTERS.\n",
      "\n",
      "                                                             Intercut:\n",
      "\n",
      "          INT. VAUDEVILLE THEATRE - NIGHT\n",
      "\n",
      "          SONG continues over:\n",
      "\n",
      "          I\n",
      "\n",
      "          SANNY, an old-time VAUDEVILLIAN, hurriedly fixes a large DROOPY\n",
      "\n",
      "          MOUSTACHE on to a YOUNG WOMAN'S TOP LIP ... this is ANN DARROW.\n",
      "\n",
      "          IMAGES: Weird and wonderful snatches of VAUDEVILLE ACTS follow ...\n",
      "\n",
      "          singers, jugglers, boxing ladies.\n",
      "\n",
      "          E\n",
      "\n",
      "                                                        Intercut with:\n",
      "\n",
      "          EXT. NY STREETS - DAY\n",
      "\n",
      "          The COLOR and MUSIC contrast with the SOUP LINES and SLUMPED\n",
      "\n",
      "          SHOULDERS of the REAL WORLD.\n",
      "\n",
      "          INT. VAUDEVILLE THEATRE - NIGHT\n",
      "\n",
      "          ANGLE ON: ANN on STAGE ... dressed as an ELEGANT GENT, she\n",
      "\n",
      "          launches into `I'm Just Wild About Harry' with HARRY, a larger-\n",
      "\n",
      "          than-life PERFORMER dressed in a FRILLY DRESS, BRASSY RED WIG and\n",
      "\n",
      "          FALSIES.\n",
      "\n",
      "                                                                   2.\n",
      "\n",
      "          MANNY's CHARACTER joins in ... SNEEZING LOUDLY and causing ANN to\n",
      "\n",
      "          take a SUDDEN PRAT FALL.\n",
      "\n",
      "           nd so the ROUTINE BUILDS ... ANN and HARRY singing and dancing\n",
      "\n",
      "          ... MANNY SNEEZING ... ANN falling.\n",
      "\n",
      "          The AUDIENCE look on with bored expressions on their faces. All\n",
      "\n",
      "          except ONE MAN at the BACK, who is LAUGHING HYSTERICALLY.\n",
      "\n",
      "          CLOSE ON: ANN throwing everything into her ACT ... SWEAT rolls\n",
      "\n",
      "          down her face ... she tries\n"
     ]
    }
   ],
   "source": [
    "# Preview the first 3000 characters of the test script at row label 4.\n",
    "print(test_df.loc[4, 'Script'][:3000])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 408
    },
    "colab_type": "code",
    "id": "5Z6GyVjFD9aH",
    "outputId": "309eb172-19cc-40a4-e198-bc24dbf4d804"
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "6     405\n",
       "19    261\n",
       "4     243\n",
       "0     203\n",
       "5     141\n",
       "15    134\n",
       "1     116\n",
       "16    109\n",
       "11    104\n",
       "8      79\n",
       "14     75\n",
       "7      27\n",
       "2      25\n",
       "20     18\n",
       "13     15\n",
       "21      9\n",
       "12      4\n",
       "9       3\n",
       "3       2\n",
       "17      2\n",
       "10      2\n",
       "18      1\n",
       "Name: Labels, dtype: int64"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Number of training scripts per class label, most frequent first.\n",
    "train_df['Labels'].value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "# Class 18 has a single sample (see the value counts above), while a\n",
    "# stratified train/validation split needs at least two samples per class,\n",
    "# so its row is duplicated once.\n",
    "# - pd.concat is used because DataFrame.append was removed in pandas 2.0.\n",
    "# - The length guard keeps the cell idempotent: re-running it does not keep\n",
    "#   adding copies.\n",
    "# - ignore_index=True gives the result a clean 0..n-1 index instead of a\n",
    "#   duplicated label 0.\n",
    "rare_rows = train_df[train_df['Labels'] == 18]\n",
    "if len(rare_rows) < 2:\n",
    "    train_df = pd.concat([train_df, rare_rows], ignore_index=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "# train_df = train_df.drop(1859, axis=0)\n",
    "# train_df.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 103,
   "metadata": {},
   "outputs": [],
   "source": [
    "# test_df = test_df.drop(407, axis=0)\n",
    "# test_df.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {
    "colab": {},
    "colab_type": "code",
    "id": "LdCtl00oCOIZ"
   },
   "outputs": [],
   "source": [
    "# !pip install keras\n",
    "# !pip install nltk"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Import the Modeling Libraries "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {
    "_cell_guid": "d46ba3fd-26f1-4635-b2f9-fca916ff3066",
    "_uuid": "21f3ccd962d1556dc2346699d45a29e9ef791367",
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 34
    },
    "colab_type": "code",
    "id": "XvGfW6yECOIb",
    "outputId": "3f818c87-cc25-4cf7-cd09-2a530b458b7a"
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Using TensorFlow backend.\n",
      "/Users/anurag/opt/anaconda3/lib/python3.7/site-packages/tensorflow/python/framework/dtypes.py:526: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.\n",
      "  _np_qint8 = np.dtype([(\"qint8\", np.int8, 1)])\n",
      "/Users/anurag/opt/anaconda3/lib/python3.7/site-packages/tensorflow/python/framework/dtypes.py:527: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.\n",
      "  _np_quint8 = np.dtype([(\"quint8\", np.uint8, 1)])\n",
      "/Users/anurag/opt/anaconda3/lib/python3.7/site-packages/tensorflow/python/framework/dtypes.py:528: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.\n",
      "  _np_qint16 = np.dtype([(\"qint16\", np.int16, 1)])\n",
      "/Users/anurag/opt/anaconda3/lib/python3.7/site-packages/tensorflow/python/framework/dtypes.py:529: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.\n",
      "  _np_quint16 = np.dtype([(\"quint16\", np.uint16, 1)])\n",
      "/Users/anurag/opt/anaconda3/lib/python3.7/site-packages/tensorflow/python/framework/dtypes.py:530: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.\n",
      "  _np_qint32 = np.dtype([(\"qint32\", np.int32, 1)])\n",
      "/Users/anurag/opt/anaconda3/lib/python3.7/site-packages/tensorflow/python/framework/dtypes.py:535: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.\n",
      "  np_resource = np.dtype([(\"resource\", np.ubyte, 1)])\n"
     ]
    }
   ],
   "source": [
    "import numpy as np\n",
    "from tqdm import tqdm\n",
    "from sklearn.svm import SVC\n",
    "from keras.models import Sequential\n",
    "from keras.layers.recurrent import LSTM, GRU\n",
    "from keras.layers.core import Dense, Activation, Dropout\n",
    "from keras.layers.embeddings import Embedding\n",
    "from keras.layers.normalization import BatchNormalization\n",
    "from keras.utils import np_utils\n",
    "from sklearn import preprocessing, decomposition, model_selection, metrics, pipeline\n",
    "from sklearn.model_selection import GridSearchCV\n",
    "from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer\n",
    "from sklearn.decomposition import TruncatedSVD\n",
    "from sklearn.linear_model import LogisticRegression\n",
    "from sklearn.model_selection import train_test_split\n",
    "from sklearn.naive_bayes import MultinomialNB\n",
    "from keras.layers import GlobalMaxPooling1D, Conv1D, MaxPooling1D, Flatten, Bidirectional, SpatialDropout1D\n",
    "from keras.preprocessing import sequence, text\n",
    "from keras.callbacks import EarlyStopping\n",
    "import nltk\n",
    "from nltk import word_tokenize\n",
    "from nltk.corpus import stopwords\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "_cell_guid": "60326be1-82d1-4677-8ef8-da5b1eac475c",
    "_uuid": "adb496504ab8453ce2b4f91dd6e5f17cbdaf4f68",
    "colab_type": "text",
    "id": "erL7TVmYCOId"
   },
   "source": [
    "Let's load the English stop-word list from NLTK"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 51
    },
    "colab_type": "code",
    "id": "f-2g2ajRCOIe",
    "outputId": "06d8c71f-741e-4c9d-9d2b-ab90cfb07a2e",
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "[nltk_data] Downloading package stopwords to\n",
      "[nltk_data]     /Users/anurag/nltk_data...\n",
      "[nltk_data]   Package stopwords is already up-to-date!\n"
     ]
    }
   ],
   "source": [
    "# Download NLTK's stop-word corpus if needed and keep the English list as `stop_words`.\n",
    "nltk.download('stopwords')\n",
    "stop_words = stopwords.words('english')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Define the Scoring Metric"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {
    "_cell_guid": "deb46a3c-6170-4323-8fac-2710662ae0b9",
    "_uuid": "62cd92e75f858aa7c97234e8267a64b00c6d04d0",
    "colab": {},
    "colab_type": "code",
    "id": "bBd9n3z9COIo"
   },
   "outputs": [],
   "source": [
    "def multiclass_logloss(actual, predicted, eps=1e-15):\n",
    "    \n",
    "    \"\"\"Multi class version of Logarithmic Loss metric.\n",
    "    \n",
    "    :param actual: Array containing the actual target classes\n",
    "    :param predicted: Matrix with class predictions, one probability per class\n",
    "    \"\"\"\n",
    "    # Convert 'actual' to a binary array if it's not already:\n",
    "    if len(actual.shape) == 1:\n",
    "        actual2 = np.zeros((actual.shape[0], predicted.shape[1]))\n",
    "        for i, val in enumerate(actual):\n",
    "            actual2[i, val] = 1\n",
    "        actual = actual2\n",
    "\n",
    "    clip = np.clip(predicted, eps, 1 - eps)\n",
    "    rows = actual.shape[0]\n",
    "    print(clip)\n",
    "    vsota = np.sum(actual * np.log(clip.tolist()))\n",
    "    return -1.0 / rows * vsota"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "_cell_guid": "65403e74-091f-43c4-9523-3e15d8a75a1e",
    "_uuid": "4ffd04f40d9e921673d06ad64e01b9a7395d8e76",
    "colab_type": "text",
    "id": "8OtK6PLsCOIs"
   },
   "source": [
    "### Before going further, it is important that we split the data into training and validation sets. We can do it using `train_test_split` from the `model_selection` module of scikit-learn."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {
    "_cell_guid": "ba8e606d-8dee-495e-8c3f-62aa916e9927",
    "_uuid": "b45676b121e2b719d355619e24cfed13d0d33f74",
    "colab": {},
    "colab_type": "code",
    "id": "tqXblO-mCOIt"
   },
   "outputs": [],
   "source": [
    "# Hold out 10% of the scripts for validation, stratified on the class label.\n",
    "xtrain, xvalid, ytrain, yvalid = train_test_split(\n",
    "    train_df['Script'].values,\n",
    "    train_df['Labels'],\n",
    "    test_size=0.1,\n",
    "    shuffle=True,\n",
    "    stratify=train_df['Labels'],\n",
    "    random_state=42,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {
    "_cell_guid": "9e2fe6a9-8de0-4bbd-8264-f6b78e7993e2",
    "_uuid": "6c8659049537836fdf00d19d6d656630a306d217",
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 51
    },
    "colab_type": "code",
    "id": "Ce_Eu4CrCOIu",
    "outputId": "042067e3-a5ad-4f5a-c727-0581878fdd86"
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(1781,)\n",
      "(198,)\n"
     ]
    }
   ],
   "source": [
    "# Number of scripts in the training and validation parts.\n",
    "for split in (xtrain, xvalid):\n",
    "    print(split.shape)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "_cell_guid": "3db70c26-d684-478a-bcd4-980ed6c6d65b",
    "_uuid": "794fb768f4a8e42c4be4f1dbb27144aae4d00c79",
    "colab_type": "text",
    "id": "unBlwkZ1COIx"
   },
   "source": [
    "# Building Basic Models\n",
    "\n",
    "### Let's start building our very first model. \n",
    "\n",
    "### Our very first model is a simple TF-IDF (Term Frequency - Inverse Document Frequency) followed by a simple Logistic Regression."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {
    "_cell_guid": "b387f2af-11b1-455d-ad8d-320ed1005be3",
    "_uuid": "350d453dc982f494c3774dbdcf731d856546d611",
    "colab": {},
    "colab_type": "code",
    "id": "qnJ8aPthCOIx"
   },
   "outputs": [],
   "source": [
    "# Always start with these features. They work (almost) everytime!\n",
    "tfv = TfidfVectorizer(min_df=3,  max_features=None, \n",
    "            strip_accents='unicode', analyzer='word',token_pattern=r'\\w{1,}',\n",
    "            ngram_range=(1, 3), use_idf=1,smooth_idf=1,sublinear_tf=1,\n",
    "            stop_words = 'english')\n",
    "\n",
    "# Fitting TF-IDF to both training and test sets (semi-supervised learning)\n",
    "tfv.fit(list(xtrain) + list(xvalid))\n",
    "\n",
    "xtrain_tfv =  tfv.transform(xtrain) \n",
    "xvalid_tfv = tfv.transform(xvalid)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## TFIDF on test data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 115,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Reuse the already-fitted vectorizer so the test features share its vocabulary.\n",
    "x_test_tfv = tfv.transform(test_df['Script'].values)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 116,
   "metadata": {
    "_cell_guid": "4106bbd1-dc35-4dc2-bda0-3024d3c056d3",
    "_uuid": "3f5dd9ce043364fc61ba3a30298acd9cb72a2938",
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 34
    },
    "colab_type": "code",
    "id": "NhMszJiuCOIz",
    "outputId": "d4060a49-dae8-4d78-e909-f752f83cdf8e"
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/Users/anurag/opt/anaconda3/lib/python3.7/site-packages/sklearn/linear_model/logistic.py:432: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.\n",
      "  FutureWarning)\n",
      "/Users/anurag/opt/anaconda3/lib/python3.7/site-packages/sklearn/linear_model/logistic.py:469: FutureWarning: Default multi_class will be changed to 'auto' in 0.22. Specify the multi_class option to silence this warning.\n",
      "  \"this warning.\", FutureWarning)\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "logloss: 2.486 \n"
     ]
    }
   ],
   "source": [
    "# Fitting a simple Logistic Regression on TFIDF\n",
    "clf = LogisticRegression(C=1.0)\n",
    "clf.fit(xtrain_tfv, ytrain)\n",
    "predictions = clf.predict_proba(xvalid_tfv)\n",
    "\n",
    "print (\"logloss: %0.3f \" % multiclass_logloss(yvalid, predictions))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Submission "
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Let's predict on the entire test data..."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 117,
   "metadata": {},
   "outputs": [],
   "source": [
    "test_set_preds = pd.DataFrame(columns = train_df.Genre.unique().tolist())\n",
    "test_set_preds.insert(0, 'File_name', test_df.File_Name)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 118,
   "metadata": {},
   "outputs": [],
   "source": [
    "test_set_preds[train_df.Genre.unique().tolist()] = clf.predict_proba(x_test_tfv)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [
    {
     "ename": "NameError",
     "evalue": "name 'test_set_preds' is not defined",
     "output_type": "error",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mNameError\u001b[0m                                 Traceback (most recent call last)",
      "\u001b[0;32m<ipython-input-28-d91266b0dd82>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mtest_set_preds\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mshape\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
      "\u001b[0;31mNameError\u001b[0m: name 'test_set_preds' is not defined"
     ]
    }
   ],
   "source": [
    "# Sanity check: one row per test file is expected.\n",
    "test_set_preds.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 120,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>File_name</th>\n",
       "      <th>Fantasy</th>\n",
       "      <th>Comedy</th>\n",
       "      <th>Drama</th>\n",
       "      <th>Sci-Fi</th>\n",
       "      <th>Romance</th>\n",
       "      <th>Thriller</th>\n",
       "      <th>Adventure</th>\n",
       "      <th>Mystery</th>\n",
       "      <th>Action</th>\n",
       "      <th>...</th>\n",
       "      <th>Family</th>\n",
       "      <th>Biography</th>\n",
       "      <th>Musical</th>\n",
       "      <th>War</th>\n",
       "      <th>Western</th>\n",
       "      <th>Music</th>\n",
       "      <th>History</th>\n",
       "      <th>Short</th>\n",
       "      <th>Film-Noir</th>\n",
       "      <th>Sport</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td>0</td>\n",
       "      <td>file_2300.txt</td>\n",
       "      <td>0.0621472</td>\n",
       "      <td>0.0421466</td>\n",
       "      <td>0.0130443</td>\n",
       "      <td>0.0038829</td>\n",
       "      <td>0.127008</td>\n",
       "      <td>0.0613026</td>\n",
       "      <td>0.196798</td>\n",
       "      <td>0.014007</td>\n",
       "      <td>0.0353658</td>\n",
       "      <td>...</td>\n",
       "      <td>0.00476286</td>\n",
       "      <td>0.00886165</td>\n",
       "      <td>0.0364149</td>\n",
       "      <td>0.0726348</td>\n",
       "      <td>0.042123</td>\n",
       "      <td>0.00388578</td>\n",
       "      <td>0.00382821</td>\n",
       "      <td>0.201259</td>\n",
       "      <td>0.0101705</td>\n",
       "      <td>0.00659258</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>1</td>\n",
       "      <td>file_809.txt</td>\n",
       "      <td>0.0618728</td>\n",
       "      <td>0.0424302</td>\n",
       "      <td>0.0128616</td>\n",
       "      <td>0.00384745</td>\n",
       "      <td>0.162379</td>\n",
       "      <td>0.0644784</td>\n",
       "      <td>0.195431</td>\n",
       "      <td>0.0140639</td>\n",
       "      <td>0.0337543</td>\n",
       "      <td>...</td>\n",
       "      <td>0.00473483</td>\n",
       "      <td>0.00875095</td>\n",
       "      <td>0.0326453</td>\n",
       "      <td>0.155985</td>\n",
       "      <td>0.0393143</td>\n",
       "      <td>0.00379855</td>\n",
       "      <td>0.00377858</td>\n",
       "      <td>0.0948981</td>\n",
       "      <td>0.0099209</td>\n",
       "      <td>0.00636009</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>2</td>\n",
       "      <td>file_1383.txt</td>\n",
       "      <td>0.0975445</td>\n",
       "      <td>0.144602</td>\n",
       "      <td>0.0133359</td>\n",
       "      <td>0.00399558</td>\n",
       "      <td>0.0758506</td>\n",
       "      <td>0.0479403</td>\n",
       "      <td>0.136187</td>\n",
       "      <td>0.0142812</td>\n",
       "      <td>0.0355864</td>\n",
       "      <td>...</td>\n",
       "      <td>0.00493601</td>\n",
       "      <td>0.00917026</td>\n",
       "      <td>0.0342179</td>\n",
       "      <td>0.0529838</td>\n",
       "      <td>0.151729</td>\n",
       "      <td>0.00399736</td>\n",
       "      <td>0.00395552</td>\n",
       "      <td>0.0972641</td>\n",
       "      <td>0.0110146</td>\n",
       "      <td>0.00689023</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>3</td>\n",
       "      <td>file_983.txt</td>\n",
       "      <td>0.081371</td>\n",
       "      <td>0.049697</td>\n",
       "      <td>0.0135967</td>\n",
       "      <td>0.00402333</td>\n",
       "      <td>0.103302</td>\n",
       "      <td>0.0638348</td>\n",
       "      <td>0.163699</td>\n",
       "      <td>0.0146887</td>\n",
       "      <td>0.0363712</td>\n",
       "      <td>...</td>\n",
       "      <td>0.00499925</td>\n",
       "      <td>0.00934587</td>\n",
       "      <td>0.036615</td>\n",
       "      <td>0.0638244</td>\n",
       "      <td>0.0479881</td>\n",
       "      <td>0.00404956</td>\n",
       "      <td>0.00401123</td>\n",
       "      <td>0.225464</td>\n",
       "      <td>0.0109523</td>\n",
       "      <td>0.00689013</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>4</td>\n",
       "      <td>file_1713.txt</td>\n",
       "      <td>0.153704</td>\n",
       "      <td>0.110646</td>\n",
       "      <td>0.0141907</td>\n",
       "      <td>0.00410832</td>\n",
       "      <td>0.0723873</td>\n",
       "      <td>0.0499341</td>\n",
       "      <td>0.18619</td>\n",
       "      <td>0.0153917</td>\n",
       "      <td>0.0376808</td>\n",
       "      <td>...</td>\n",
       "      <td>0.00506564</td>\n",
       "      <td>0.00963215</td>\n",
       "      <td>0.0720046</td>\n",
       "      <td>0.0489596</td>\n",
       "      <td>0.0497823</td>\n",
       "      <td>0.00413493</td>\n",
       "      <td>0.00411517</td>\n",
       "      <td>0.0888852</td>\n",
       "      <td>0.0111348</td>\n",
       "      <td>0.00707706</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 23 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "       File_name    Fantasy     Comedy      Drama      Sci-Fi    Romance  \\\n",
       "0  file_2300.txt  0.0621472  0.0421466  0.0130443   0.0038829   0.127008   \n",
       "1   file_809.txt  0.0618728  0.0424302  0.0128616  0.00384745   0.162379   \n",
       "2  file_1383.txt  0.0975445   0.144602  0.0133359  0.00399558  0.0758506   \n",
       "3   file_983.txt   0.081371   0.049697  0.0135967  0.00402333   0.103302   \n",
       "4  file_1713.txt   0.153704   0.110646  0.0141907  0.00410832  0.0723873   \n",
       "\n",
       "    Thriller Adventure    Mystery     Action  ...      Family   Biography  \\\n",
       "0  0.0613026  0.196798   0.014007  0.0353658  ...  0.00476286  0.00886165   \n",
       "1  0.0644784  0.195431  0.0140639  0.0337543  ...  0.00473483  0.00875095   \n",
       "2  0.0479403  0.136187  0.0142812  0.0355864  ...  0.00493601  0.00917026   \n",
       "3  0.0638348  0.163699  0.0146887  0.0363712  ...  0.00499925  0.00934587   \n",
       "4  0.0499341   0.18619  0.0153917  0.0376808  ...  0.00506564  0.00963215   \n",
       "\n",
       "     Musical        War    Western       Music     History      Short  \\\n",
       "0  0.0364149  0.0726348   0.042123  0.00388578  0.00382821   0.201259   \n",
       "1  0.0326453   0.155985  0.0393143  0.00379855  0.00377858  0.0948981   \n",
       "2  0.0342179  0.0529838   0.151729  0.00399736  0.00395552  0.0972641   \n",
       "3   0.036615  0.0638244  0.0479881  0.00404956  0.00401123   0.225464   \n",
       "4  0.0720046  0.0489596  0.0497823  0.00413493  0.00411517  0.0888852   \n",
       "\n",
       "   Film-Noir       Sport  \n",
       "0  0.0101705  0.00659258  \n",
       "1  0.0099209  0.00636009  \n",
       "2  0.0110146  0.00689023  \n",
       "3  0.0109523  0.00689013  \n",
       "4  0.0111348  0.00707706  \n",
       "\n",
       "[5 rows x 23 columns]"
      ]
     },
     "execution_count": 120,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "test_set_preds.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 96,
   "metadata": {},
   "outputs": [],
   "source": [
    "# test_set_preds.to_excel('test_set_preds.xlsx', index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 84,
   "metadata": {},
   "outputs": [],
   "source": [
    "# pred_df = pd.read_excel(\"/Users/anurag/Downloads/Movie_Scripts_Sample_SubmissionL.xlsx\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 140,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[[0.06214723355175664 0.042146623610022714 0.013044332037354145 ...\n",
      "  0.20125856438571393 0.010170535073625206 0.006592576535592437]\n",
      " [0.06187280816682498 0.04243018124017416 0.012861594741520026 ...\n",
      "  0.0948981163189656 0.009920896808506535 0.006360092595007451]\n",
      " [0.09754452792196255 0.1446015828825774 0.013335926776471056 ...\n",
      "  0.09726405512888492 0.011014628830312737 0.006890231855105332]\n",
      " ...\n",
      " [0.08640182590027343 0.11063912938749708 0.013709650461187581 ...\n",
      "  0.09776645184552446 0.01073621514500798 0.006588622029522577]\n",
      " [0.11033200163939762 0.05623496173057861 0.01356362469949817 ...\n",
      "  0.12585100435340937 0.010992541569449196 0.006971135126053225]\n",
      " [0.1911758586509735 0.043270688228447846 0.012024185208458112 ...\n",
      "  0.13988146958682884 0.009851563910019954 0.006509688375101668]]\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "2.4597586095272357"
      ]
     },
     "execution_count": 140,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "multiclass_logloss(test_df['Labels'].values, test_set_preds[train_df.Genre.unique().tolist()].values)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [
    {
     "ename": "NameError",
     "evalue": "name 'test_set_preds' is not defined",
     "output_type": "error",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mNameError\u001b[0m                                 Traceback (most recent call last)",
      "\u001b[0;32m<ipython-input-27-152e5a3bf49c>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mtest_set_preds\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
      "\u001b[0;31mNameError\u001b[0m: name 'test_set_preds' is not defined"
     ]
    }
   ],
   "source": [
    "test_set_preds"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "accelerator": "GPU",
  "colab": {
   "collapsed_sections": [],
   "name": "Starter Notebook - Movie Genre Classification",
   "provenance": []
  },
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}