Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save analyticsindiamagazine/35334c13b67136cf0665de0e878de772 to your computer and use it in GitHub Desktop.
Save analyticsindiamagazine/35334c13b67136cf0665de0e878de772 to your computer and use it in GitHub Desktop.
{
"cells": [
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "IEFmR5J_COHU"
},
"source": [
"## Import Required Files"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"import os"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "YosGM4uOCOHZ"
},
"outputs": [],
"source": [
"## change it to the unzip path of the downloaded dataset..\n",
"data_folder = r'/Users/anurag/Downloads/movie genre classification/Scripts'"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "5MGrGh94COHh"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Total Number of Files : 2827\n"
]
}
],
"source": [
"all_files = os.listdir(data_folder)\n",
"print('Total Number of Files :', len(all_files))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Read Train Files"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>File_Name</th>\n",
" <th>Labels</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <td>0</td>\n",
" <td>file_2180.txt</td>\n",
" <td>8</td>\n",
" </tr>\n",
" <tr>\n",
" <td>1</td>\n",
" <td>file_693.txt</td>\n",
" <td>4</td>\n",
" </tr>\n",
" <tr>\n",
" <td>2</td>\n",
" <td>file_2469.txt</td>\n",
" <td>6</td>\n",
" </tr>\n",
" <tr>\n",
" <td>3</td>\n",
" <td>file_2542.txt</td>\n",
" <td>6</td>\n",
" </tr>\n",
" <tr>\n",
" <td>4</td>\n",
" <td>file_378.txt</td>\n",
" <td>16</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" File_Name Labels\n",
"0 file_2180.txt 8\n",
"1 file_693.txt 4\n",
"2 file_2469.txt 6\n",
"3 file_2542.txt 6\n",
"4 file_378.txt 16"
]
},
"execution_count": 24,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"train_df = pd.read_csv('Train.csv')\n",
"train_df.head()"
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>File_Name</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <td>0</td>\n",
" <td>file_2300.txt</td>\n",
" </tr>\n",
" <tr>\n",
" <td>1</td>\n",
" <td>file_809.txt</td>\n",
" </tr>\n",
" <tr>\n",
" <td>2</td>\n",
" <td>file_1383.txt</td>\n",
" </tr>\n",
" <tr>\n",
" <td>3</td>\n",
" <td>file_983.txt</td>\n",
" </tr>\n",
" <tr>\n",
" <td>4</td>\n",
" <td>file_1713.txt</td>\n",
" </tr>\n",
" <tr>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <td>844</td>\n",
" <td>file_2474.txt</td>\n",
" </tr>\n",
" <tr>\n",
" <td>845</td>\n",
" <td>file_863.txt</td>\n",
" </tr>\n",
" <tr>\n",
" <td>846</td>\n",
" <td>file_1547.txt</td>\n",
" </tr>\n",
" <tr>\n",
" <td>847</td>\n",
" <td>file_1292.txt</td>\n",
" </tr>\n",
" <tr>\n",
" <td>848</td>\n",
" <td>file_1910.txt</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>849 rows × 1 columns</p>\n",
"</div>"
],
"text/plain": [
" File_Name\n",
"0 file_2300.txt\n",
"1 file_809.txt\n",
"2 file_1383.txt\n",
"3 file_983.txt\n",
"4 file_1713.txt\n",
".. ...\n",
"844 file_2474.txt\n",
"845 file_863.txt\n",
"846 file_1547.txt\n",
"847 file_1292.txt\n",
"848 file_1910.txt\n",
"\n",
"[849 rows x 1 columns]"
]
},
"execution_count": 28,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"test_df = pd.read_csv('Test.csv', names=['File_Name'])\n",
"# test_df.columns = ['File_Name']\n",
"test_df"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"## let's read the text scripts in the train and test dataframes..\n",
"\n",
"train_df['Script'] = [open(data_folder + os.sep + file, \"r\").read() for file in train_df['File_Name']]\n",
"test_df['Script'] = [open(data_folder + os.sep + file, \"r\").read() for file in test_df['File_Name']]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Lets look at a script file after Reading.."
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {
"scrolled": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<b><!--\n",
"\n",
"</b>if (window!= top)\n",
"\n",
"top.location.href=location.href\n",
"\n",
"<b>// -->\n",
"\n",
"</b>\n",
"\n",
"The Abyss - by James Cameron \n",
"\n",
" THE ABYSS\n",
"\n",
" AN ORIGINAL SCREENPLAY\n",
"\n",
" BY\n",
"\n",
" JAMES CAMERON\n",
"\n",
" August 2, 1988\n",
"\n",
" Director's Revision\n",
"\n",
"------------------------------------------------------------------------------\n",
"\n",
" THE ABYSS\n",
"\n",
"OMITTED 1\n",
"\n",
"OMITTED 2\n",
"\n",
"TITLE: THE ABYSS -- ON BLACK, DISSOLVING TO COBALT BLUE\n",
"\n",
"EXT. OCEAN/UNDERWATER -- DAY 3\n",
"\n",
"Blue, deep and featureless, the twilight of five hundred feet down.\n",
"\n",
"PROPELLER SOUND. Materializing out of the blue limbo is the enormous but\n",
"\n",
"sleek form of an Ohio-class SSBN ballistic missile submarine.\n",
"\n",
"INT. U.S.S. MONTANA -- DAY 4\n",
"\n",
"In the attack center, darkened to womb-red, the crew's faces shine with sweat\n",
"\n",
"in the glow of their instruments. The SKIPPER and his EXEC crowd around\n",
"\n",
"BARNES, the sonarman.\n",
"\n",
" CAPTAIN\n",
"\n",
" Sixty knots? No way, Barnes... the reds don't\n",
"\n",
" have anything that fast.\n",
"\n",
" BARNES\n",
"\n",
" Checked it twice, skipper. It's a real unique\n",
"\n",
" signature. No cavitation, no reactor noise...\n",
"\n",
" doesn't even sound like screws.\n",
"\n",
"He puts the signal onto a speaker and everyone in the attack room listens to\n",
"\n",
"the intruder's acoustic signature, a strange THRUMMING. The captain studies\n",
"\n",
"the electronic position board, a graphic representation of the contours of\n",
"\n",
"the steep-walled canyon, a symbol for the Montana, and converging with it, an\n",
"\n",
"amorphous trace, representing the bogey.\n",
"\n",
" CAPTAIN\n",
"\n",
" What the hell is it?\n",
"\n",
" EXEC\n",
"\n",
" I'll tell you what it's not, it's not one of\n",
"\n",
" ours.\n",
"\n",
" BARNES\n",
"\n",
" Sir! Contact changing heading to two-one-four,\n",
"\n",
" diving. Speed eighty knots! Eighty knots!\n",
"\n",
" EXEC\n",
"\n",
" Eighty knots...\n",
"\n",
" BARNES\n",
"\n",
" Still diving, depth nine hundred feet. Port\n",
"\n",
" clearance to cliff wall, one hundred fifty feet.\n",
"\n",
" FRANK\n",
"\n",
" (simultaneously)\n",
"\n",
" Still diving, depth nine hundred feet. Port\n",
"\n",
" clearance to cliff wall, one hundred fifty feet.\n",
"\n",
"Tension builds in the attack room as the Montana surges to intercept the\n",
"\n",
"intruder. The exec tensely watches the vector-graphic readout for the side-\n",
"\n",
"scan sonar array. The sub is running uncomfortably\n"
]
}
],
"source": [
"#lets check one of the scripts..\n",
"print(train_df['Script'][4][:3000])"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" KING KONG\n",
"\n",
" Written by\n",
"\n",
" Fran Walsh, Philippa Boyens and Peter Jackson\n",
"\n",
" Based on a Story by\n",
"\n",
" Merian C. Cooper and Edgar Wallace\n",
"\n",
" 1.\n",
"\n",
" EXT. CENTRAL PARK - DAY\n",
"\n",
" CLOSE ON: A scrawny MONKEY scratches.\n",
"\n",
" ANGLES ON: Defeated, listless ANIMALS, in the bleak environs of a\n",
"\n",
" dilapidated ZOO.\n",
"\n",
" WIDER: It is CENTRAL PARK ZOO in depression era NEW YORK. The PARK\n",
"\n",
" itself is like a GARBAGE DUMP, dotted with squalid SHANTY TOWNS.\n",
"\n",
" Against these BLEAK IMAGES, the SOUND of a BRIGHT, BRASSY SONG\n",
"\n",
" fades up: Al Jolson, singing \"I'm Sitting on Top of the World\".\n",
"\n",
" The sky line of MANHATTAN rises in the background, a grim steaming\n",
"\n",
" jungle on this cold FALL day.\n",
"\n",
" I\n",
"\n",
" EXT. NY STREETS - DAY\n",
"\n",
" LONG continues over:\n",
"\n",
" IMAGES: The CROWDED STREETS of NEW YORK ... beneath the bustle is\n",
"\n",
" a sense of despair.\n",
"\n",
" LONG SOUP LINES snake along the STREETS.\n",
"\n",
" The HUNGRY search through RUBBISH BINS for FOOD. SKYSCRAPERS rise\n",
"\n",
" steadily upwards as more people are evicted from their homes.\n",
"\n",
" HOMELESS sleep amid steaming VENTS and GARBAGE STREWN GUTTERS.\n",
"\n",
" Intercut:\n",
"\n",
" INT. VAUDEVILLE THEATRE - NIGHT\n",
"\n",
" SONG continues over:\n",
"\n",
" I\n",
"\n",
" SANNY, an old-time VAUDEVILLIAN, hurriedly fixes a large DROOPY\n",
"\n",
" MOUSTACHE on to a YOUNG WOMAN'S TOP LIP ... this is ANN DARROW.\n",
"\n",
" IMAGES: Weird and wonderful snatches of VAUDEVILLE ACTS follow ...\n",
"\n",
" singers, jugglers, boxing ladies.\n",
"\n",
" E\n",
"\n",
" Intercut with:\n",
"\n",
" EXT. NY STREETS - DAY\n",
"\n",
" The COLOR and MUSIC contrast with the SOUP LINES and SLUMPED\n",
"\n",
" SHOULDERS of the REAL WORLD.\n",
"\n",
" INT. VAUDEVILLE THEATRE - NIGHT\n",
"\n",
" ANGLE ON: ANN on STAGE ... dressed as an ELEGANT GENT, she\n",
"\n",
" launches into `I'm Just Wild About Harry' with HARRY, a larger-\n",
"\n",
" than-life PERFORMER dressed in a FRILLY DRESS, BRASSY RED WIG and\n",
"\n",
" FALSIES.\n",
"\n",
" 2.\n",
"\n",
" MANNY's CHARACTER joins in ... SNEEZING LOUDLY and causing ANN to\n",
"\n",
" take a SUDDEN PRAT FALL.\n",
"\n",
" nd so the ROUTINE BUILDS ... ANN and HARRY singing and dancing\n",
"\n",
" ... MANNY SNEEZING ... ANN falling.\n",
"\n",
" The AUDIENCE look on with bored expressions on their faces. All\n",
"\n",
" except ONE MAN at the BACK, who is LAUGHING HYSTERICALLY.\n",
"\n",
" CLOSE ON: ANN throwing everything into her ACT ... SWEAT rolls\n",
"\n",
" down her face ... she tries\n"
]
}
],
"source": [
"print(test_df['Script'][4][:3000])"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 408
},
"colab_type": "code",
"id": "5Z6GyVjFD9aH",
"outputId": "309eb172-19cc-40a4-e198-bc24dbf4d804"
},
"outputs": [
{
"data": {
"text/plain": [
"6 405\n",
"19 261\n",
"4 243\n",
"0 203\n",
"5 141\n",
"15 134\n",
"1 116\n",
"16 109\n",
"11 104\n",
"8 79\n",
"14 75\n",
"7 27\n",
"2 25\n",
"20 18\n",
"13 15\n",
"21 9\n",
"12 4\n",
"9 3\n",
"3 2\n",
"17 2\n",
"10 2\n",
"18 1\n",
"Name: Labels, dtype: int64"
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"train_df.Labels.value_counts()"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"train_df = train_df.append(train_df[train_df['Labels'] == 18].reset_index(drop=True))"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
"# train_df = train_df.drop(1859, axis=0)\n",
"# train_df.shape"
]
},
{
"cell_type": "code",
"execution_count": 103,
"metadata": {},
"outputs": [],
"source": [
"# test_df = test_df.drop(407, axis=0)\n",
"# test_df.shape"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "LdCtl00oCOIZ"
},
"outputs": [],
"source": [
"# !pip install keras\n",
"# !pip install nltk"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Import the Modeling Libraries "
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {
"_cell_guid": "d46ba3fd-26f1-4635-b2f9-fca916ff3066",
"_uuid": "21f3ccd962d1556dc2346699d45a29e9ef791367",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 34
},
"colab_type": "code",
"id": "XvGfW6yECOIb",
"outputId": "3f818c87-cc25-4cf7-cd09-2a530b458b7a"
},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Using TensorFlow backend.\n",
"/Users/anurag/opt/anaconda3/lib/python3.7/site-packages/tensorflow/python/framework/dtypes.py:526: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.\n",
" _np_qint8 = np.dtype([(\"qint8\", np.int8, 1)])\n",
"/Users/anurag/opt/anaconda3/lib/python3.7/site-packages/tensorflow/python/framework/dtypes.py:527: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.\n",
" _np_quint8 = np.dtype([(\"quint8\", np.uint8, 1)])\n",
"/Users/anurag/opt/anaconda3/lib/python3.7/site-packages/tensorflow/python/framework/dtypes.py:528: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.\n",
" _np_qint16 = np.dtype([(\"qint16\", np.int16, 1)])\n",
"/Users/anurag/opt/anaconda3/lib/python3.7/site-packages/tensorflow/python/framework/dtypes.py:529: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.\n",
" _np_quint16 = np.dtype([(\"quint16\", np.uint16, 1)])\n",
"/Users/anurag/opt/anaconda3/lib/python3.7/site-packages/tensorflow/python/framework/dtypes.py:530: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.\n",
" _np_qint32 = np.dtype([(\"qint32\", np.int32, 1)])\n",
"/Users/anurag/opt/anaconda3/lib/python3.7/site-packages/tensorflow/python/framework/dtypes.py:535: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.\n",
" np_resource = np.dtype([(\"resource\", np.ubyte, 1)])\n"
]
}
],
"source": [
"import numpy as np\n",
"from tqdm import tqdm\n",
"from sklearn.svm import SVC\n",
"from keras.models import Sequential\n",
"from keras.layers.recurrent import LSTM, GRU\n",
"from keras.layers.core import Dense, Activation, Dropout\n",
"from keras.layers.embeddings import Embedding\n",
"from keras.layers.normalization import BatchNormalization\n",
"from keras.utils import np_utils\n",
"from sklearn import preprocessing, decomposition, model_selection, metrics, pipeline\n",
"from sklearn.model_selection import GridSearchCV\n",
"from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer\n",
"from sklearn.decomposition import TruncatedSVD\n",
"from sklearn.linear_model import LogisticRegression\n",
"from sklearn.model_selection import train_test_split\n",
"from sklearn.naive_bayes import MultinomialNB\n",
"from keras.layers import GlobalMaxPooling1D, Conv1D, MaxPooling1D, Flatten, Bidirectional, SpatialDropout1D\n",
"from keras.preprocessing import sequence, text\n",
"from keras.callbacks import EarlyStopping\n",
"import nltk\n",
"from nltk import word_tokenize\n",
"from nltk.corpus import stopwords\n"
]
},
{
"cell_type": "markdown",
"metadata": {
"_cell_guid": "60326be1-82d1-4677-8ef8-da5b1eac475c",
"_uuid": "adb496504ab8453ce2b4f91dd6e5f17cbdaf4f68",
"colab_type": "text",
"id": "erL7TVmYCOId"
},
"source": [
"Let's load the datasets"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 51
},
"colab_type": "code",
"id": "f-2g2ajRCOIe",
"outputId": "06d8c71f-741e-4c9d-9d2b-ab90cfb07a2e",
"scrolled": true
},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"[nltk_data] Downloading package stopwords to\n",
"[nltk_data] /Users/anurag/nltk_data...\n",
"[nltk_data] Package stopwords is already up-to-date!\n"
]
}
],
"source": [
"nltk.download('stopwords')\n",
"stop_words = stopwords.words('english')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Define the Scoring Metric"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {
"_cell_guid": "deb46a3c-6170-4323-8fac-2710662ae0b9",
"_uuid": "62cd92e75f858aa7c97234e8267a64b00c6d04d0",
"colab": {},
"colab_type": "code",
"id": "bBd9n3z9COIo"
},
"outputs": [],
"source": [
"def multiclass_logloss(actual, predicted, eps=1e-15):\n",
" \n",
" \"\"\"Multi class version of Logarithmic Loss metric.\n",
" \n",
" :param actual: Array containing the actual target classes\n",
" :param predicted: Matrix with class predictions, one probability per class\n",
" \"\"\"\n",
" # Convert 'actual' to a binary array if it's not already:\n",
" if len(actual.shape) == 1:\n",
" actual2 = np.zeros((actual.shape[0], predicted.shape[1]))\n",
" for i, val in enumerate(actual):\n",
" actual2[i, val] = 1\n",
" actual = actual2\n",
"\n",
" clip = np.clip(predicted, eps, 1 - eps)\n",
" rows = actual.shape[0]\n",
" print(clip)\n",
" vsota = np.sum(actual * np.log(clip.tolist()))\n",
" return -1.0 / rows * vsota"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "markdown",
"metadata": {
"_cell_guid": "65403e74-091f-43c4-9523-3e15d8a75a1e",
"_uuid": "4ffd04f40d9e921673d06ad64e01b9a7395d8e76",
"colab_type": "text",
"id": "8OtK6PLsCOIs"
},
"source": [
"### Before going further it is important that we split the data into training and validation sets. We can do it using \n",
"#### `train_test_split` from the `model_selection` module of scikit-learn."
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {
"_cell_guid": "ba8e606d-8dee-495e-8c3f-62aa916e9927",
"_uuid": "b45676b121e2b719d355619e24cfed13d0d33f74",
"colab": {},
"colab_type": "code",
"id": "tqXblO-mCOIt"
},
"outputs": [],
"source": [
"xtrain, xvalid, ytrain, yvalid = train_test_split(train_df.Script.values, train_df.Labels, \n",
" stratify=train_df.Labels, \n",
" random_state=42, \n",
" test_size=0.1, shuffle=True)"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {
"_cell_guid": "9e2fe6a9-8de0-4bbd-8264-f6b78e7993e2",
"_uuid": "6c8659049537836fdf00d19d6d656630a306d217",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 51
},
"colab_type": "code",
"id": "Ce_Eu4CrCOIu",
"outputId": "042067e3-a5ad-4f5a-c727-0581878fdd86"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"(1781,)\n",
"(198,)\n"
]
}
],
"source": [
"print (xtrain.shape)\n",
"print (xvalid.shape)"
]
},
{
"cell_type": "markdown",
"metadata": {
"_cell_guid": "3db70c26-d684-478a-bcd4-980ed6c6d65b",
"_uuid": "794fb768f4a8e42c4be4f1dbb27144aae4d00c79",
"colab_type": "text",
"id": "unBlwkZ1COIx"
},
"source": [
"# Building Basic Models\n",
"\n",
"### Let's start building our very first model. \n",
"\n",
"### Our very first model is a simple TF-IDF (Term Frequency - Inverse Document Frequency) followed by a simple Logistic Regression."
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {
"_cell_guid": "b387f2af-11b1-455d-ad8d-320ed1005be3",
"_uuid": "350d453dc982f494c3774dbdcf731d856546d611",
"colab": {},
"colab_type": "code",
"id": "qnJ8aPthCOIx"
},
"outputs": [],
"source": [
"# Always start with these features. They work (almost) everytime!\n",
"tfv = TfidfVectorizer(min_df=3, max_features=None, \n",
" strip_accents='unicode', analyzer='word',token_pattern=r'\\w{1,}',\n",
" ngram_range=(1, 3), use_idf=1,smooth_idf=1,sublinear_tf=1,\n",
" stop_words = 'english')\n",
"\n",
"# Fitting TF-IDF to both training and test sets (semi-supervised learning)\n",
"tfv.fit(list(xtrain) + list(xvalid))\n",
"\n",
"xtrain_tfv = tfv.transform(xtrain) \n",
"xvalid_tfv = tfv.transform(xvalid)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## TFIDF on test data"
]
},
{
"cell_type": "code",
"execution_count": 115,
"metadata": {},
"outputs": [],
"source": [
"x_test_tfv = tfv.transform(test_df['Script'].values)"
]
},
{
"cell_type": "code",
"execution_count": 116,
"metadata": {
"_cell_guid": "4106bbd1-dc35-4dc2-bda0-3024d3c056d3",
"_uuid": "3f5dd9ce043364fc61ba3a30298acd9cb72a2938",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 34
},
"colab_type": "code",
"id": "NhMszJiuCOIz",
"outputId": "d4060a49-dae8-4d78-e909-f752f83cdf8e"
},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/Users/anurag/opt/anaconda3/lib/python3.7/site-packages/sklearn/linear_model/logistic.py:432: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.\n",
" FutureWarning)\n",
"/Users/anurag/opt/anaconda3/lib/python3.7/site-packages/sklearn/linear_model/logistic.py:469: FutureWarning: Default multi_class will be changed to 'auto' in 0.22. Specify the multi_class option to silence this warning.\n",
" \"this warning.\", FutureWarning)\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"logloss: 2.486 \n"
]
}
],
"source": [
"# Fitting a simple Logistic Regression on TFIDF\n",
"clf = LogisticRegression(C=1.0)\n",
"clf.fit(xtrain_tfv, ytrain)\n",
"predictions = clf.predict_proba(xvalid_tfv)\n",
"\n",
"print (\"logloss: %0.3f \" % multiclass_logloss(yvalid, predictions))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Submission "
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Let's predict on the entire test data..."
]
},
{
"cell_type": "code",
"execution_count": 117,
"metadata": {},
"outputs": [],
"source": [
"test_set_preds = pd.DataFrame(columns = train_df.Genre.unique().tolist())\n",
"test_set_preds.insert(0, 'File_name', test_df.File_Name)"
]
},
{
"cell_type": "code",
"execution_count": 118,
"metadata": {},
"outputs": [],
"source": [
"test_set_preds[train_df.Genre.unique().tolist()] = clf.predict_proba(x_test_tfv)"
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {},
"outputs": [
{
"ename": "NameError",
"evalue": "name 'test_set_preds' is not defined",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)",
"\u001b[0;32m<ipython-input-28-d91266b0dd82>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mtest_set_preds\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mshape\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
"\u001b[0;31mNameError\u001b[0m: name 'test_set_preds' is not defined"
]
}
],
"source": [
"test_set_preds.shape"
]
},
{
"cell_type": "code",
"execution_count": 120,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>File_name</th>\n",
" <th>Fantasy</th>\n",
" <th>Comedy</th>\n",
" <th>Drama</th>\n",
" <th>Sci-Fi</th>\n",
" <th>Romance</th>\n",
" <th>Thriller</th>\n",
" <th>Adventure</th>\n",
" <th>Mystery</th>\n",
" <th>Action</th>\n",
" <th>...</th>\n",
" <th>Family</th>\n",
" <th>Biography</th>\n",
" <th>Musical</th>\n",
" <th>War</th>\n",
" <th>Western</th>\n",
" <th>Music</th>\n",
" <th>History</th>\n",
" <th>Short</th>\n",
" <th>Film-Noir</th>\n",
" <th>Sport</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <td>0</td>\n",
" <td>file_2300.txt</td>\n",
" <td>0.0621472</td>\n",
" <td>0.0421466</td>\n",
" <td>0.0130443</td>\n",
" <td>0.0038829</td>\n",
" <td>0.127008</td>\n",
" <td>0.0613026</td>\n",
" <td>0.196798</td>\n",
" <td>0.014007</td>\n",
" <td>0.0353658</td>\n",
" <td>...</td>\n",
" <td>0.00476286</td>\n",
" <td>0.00886165</td>\n",
" <td>0.0364149</td>\n",
" <td>0.0726348</td>\n",
" <td>0.042123</td>\n",
" <td>0.00388578</td>\n",
" <td>0.00382821</td>\n",
" <td>0.201259</td>\n",
" <td>0.0101705</td>\n",
" <td>0.00659258</td>\n",
" </tr>\n",
" <tr>\n",
" <td>1</td>\n",
" <td>file_809.txt</td>\n",
" <td>0.0618728</td>\n",
" <td>0.0424302</td>\n",
" <td>0.0128616</td>\n",
" <td>0.00384745</td>\n",
" <td>0.162379</td>\n",
" <td>0.0644784</td>\n",
" <td>0.195431</td>\n",
" <td>0.0140639</td>\n",
" <td>0.0337543</td>\n",
" <td>...</td>\n",
" <td>0.00473483</td>\n",
" <td>0.00875095</td>\n",
" <td>0.0326453</td>\n",
" <td>0.155985</td>\n",
" <td>0.0393143</td>\n",
" <td>0.00379855</td>\n",
" <td>0.00377858</td>\n",
" <td>0.0948981</td>\n",
" <td>0.0099209</td>\n",
" <td>0.00636009</td>\n",
" </tr>\n",
" <tr>\n",
" <td>2</td>\n",
" <td>file_1383.txt</td>\n",
" <td>0.0975445</td>\n",
" <td>0.144602</td>\n",
" <td>0.0133359</td>\n",
" <td>0.00399558</td>\n",
" <td>0.0758506</td>\n",
" <td>0.0479403</td>\n",
" <td>0.136187</td>\n",
" <td>0.0142812</td>\n",
" <td>0.0355864</td>\n",
" <td>...</td>\n",
" <td>0.00493601</td>\n",
" <td>0.00917026</td>\n",
" <td>0.0342179</td>\n",
" <td>0.0529838</td>\n",
" <td>0.151729</td>\n",
" <td>0.00399736</td>\n",
" <td>0.00395552</td>\n",
" <td>0.0972641</td>\n",
" <td>0.0110146</td>\n",
" <td>0.00689023</td>\n",
" </tr>\n",
" <tr>\n",
" <td>3</td>\n",
" <td>file_983.txt</td>\n",
" <td>0.081371</td>\n",
" <td>0.049697</td>\n",
" <td>0.0135967</td>\n",
" <td>0.00402333</td>\n",
" <td>0.103302</td>\n",
" <td>0.0638348</td>\n",
" <td>0.163699</td>\n",
" <td>0.0146887</td>\n",
" <td>0.0363712</td>\n",
" <td>...</td>\n",
" <td>0.00499925</td>\n",
" <td>0.00934587</td>\n",
" <td>0.036615</td>\n",
" <td>0.0638244</td>\n",
" <td>0.0479881</td>\n",
" <td>0.00404956</td>\n",
" <td>0.00401123</td>\n",
" <td>0.225464</td>\n",
" <td>0.0109523</td>\n",
" <td>0.00689013</td>\n",
" </tr>\n",
" <tr>\n",
" <td>4</td>\n",
" <td>file_1713.txt</td>\n",
" <td>0.153704</td>\n",
" <td>0.110646</td>\n",
" <td>0.0141907</td>\n",
" <td>0.00410832</td>\n",
" <td>0.0723873</td>\n",
" <td>0.0499341</td>\n",
" <td>0.18619</td>\n",
" <td>0.0153917</td>\n",
" <td>0.0376808</td>\n",
" <td>...</td>\n",
" <td>0.00506564</td>\n",
" <td>0.00963215</td>\n",
" <td>0.0720046</td>\n",
" <td>0.0489596</td>\n",
" <td>0.0497823</td>\n",
" <td>0.00413493</td>\n",
" <td>0.00411517</td>\n",
" <td>0.0888852</td>\n",
" <td>0.0111348</td>\n",
" <td>0.00707706</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>5 rows × 23 columns</p>\n",
"</div>"
],
"text/plain": [
" File_name Fantasy Comedy Drama Sci-Fi Romance \\\n",
"0 file_2300.txt 0.0621472 0.0421466 0.0130443 0.0038829 0.127008 \n",
"1 file_809.txt 0.0618728 0.0424302 0.0128616 0.00384745 0.162379 \n",
"2 file_1383.txt 0.0975445 0.144602 0.0133359 0.00399558 0.0758506 \n",
"3 file_983.txt 0.081371 0.049697 0.0135967 0.00402333 0.103302 \n",
"4 file_1713.txt 0.153704 0.110646 0.0141907 0.00410832 0.0723873 \n",
"\n",
" Thriller Adventure Mystery Action ... Family Biography \\\n",
"0 0.0613026 0.196798 0.014007 0.0353658 ... 0.00476286 0.00886165 \n",
"1 0.0644784 0.195431 0.0140639 0.0337543 ... 0.00473483 0.00875095 \n",
"2 0.0479403 0.136187 0.0142812 0.0355864 ... 0.00493601 0.00917026 \n",
"3 0.0638348 0.163699 0.0146887 0.0363712 ... 0.00499925 0.00934587 \n",
"4 0.0499341 0.18619 0.0153917 0.0376808 ... 0.00506564 0.00963215 \n",
"\n",
" Musical War Western Music History Short \\\n",
"0 0.0364149 0.0726348 0.042123 0.00388578 0.00382821 0.201259 \n",
"1 0.0326453 0.155985 0.0393143 0.00379855 0.00377858 0.0948981 \n",
"2 0.0342179 0.0529838 0.151729 0.00399736 0.00395552 0.0972641 \n",
"3 0.036615 0.0638244 0.0479881 0.00404956 0.00401123 0.225464 \n",
"4 0.0720046 0.0489596 0.0497823 0.00413493 0.00411517 0.0888852 \n",
"\n",
" Film-Noir Sport \n",
"0 0.0101705 0.00659258 \n",
"1 0.0099209 0.00636009 \n",
"2 0.0110146 0.00689023 \n",
"3 0.0109523 0.00689013 \n",
"4 0.0111348 0.00707706 \n",
"\n",
"[5 rows x 23 columns]"
]
},
"execution_count": 120,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"test_set_preds.head()"
]
},
{
"cell_type": "code",
"execution_count": 96,
"metadata": {},
"outputs": [],
"source": [
"# test_set_preds.to_excel('test_set_preds.xlsx', index=False)"
]
},
{
"cell_type": "code",
"execution_count": 84,
"metadata": {},
"outputs": [],
"source": [
"# pred_df = pd.read_excel(\"/Users/anurag/Downloads/Movie_Scripts_Sample_SubmissionL.xlsx\")"
]
},
{
"cell_type": "code",
"execution_count": 140,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[[0.06214723355175664 0.042146623610022714 0.013044332037354145 ...\n",
" 0.20125856438571393 0.010170535073625206 0.006592576535592437]\n",
" [0.06187280816682498 0.04243018124017416 0.012861594741520026 ...\n",
" 0.0948981163189656 0.009920896808506535 0.006360092595007451]\n",
" [0.09754452792196255 0.1446015828825774 0.013335926776471056 ...\n",
" 0.09726405512888492 0.011014628830312737 0.006890231855105332]\n",
" ...\n",
" [0.08640182590027343 0.11063912938749708 0.013709650461187581 ...\n",
" 0.09776645184552446 0.01073621514500798 0.006588622029522577]\n",
" [0.11033200163939762 0.05623496173057861 0.01356362469949817 ...\n",
" 0.12585100435340937 0.010992541569449196 0.006971135126053225]\n",
" [0.1911758586509735 0.043270688228447846 0.012024185208458112 ...\n",
" 0.13988146958682884 0.009851563910019954 0.006509688375101668]]\n"
]
},
{
"data": {
"text/plain": [
"2.4597586095272357"
]
},
"execution_count": 140,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"multiclass_logloss(test_df['Labels'].values, test_set_preds[train_df.Genre.unique().tolist()].values)"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {},
"outputs": [
{
"ename": "NameError",
"evalue": "name 'test_set_preds' is not defined",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)",
"\u001b[0;32m<ipython-input-27-152e5a3bf49c>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mtest_set_preds\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
"\u001b[0;31mNameError\u001b[0m: name 'test_set_preds' is not defined"
]
}
],
"source": [
"test_set_preds"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"accelerator": "GPU",
"colab": {
"collapsed_sections": [],
"name": "Starter Notebook - Movie Genre Classification",
"provenance": []
},
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.4"
}
},
"nbformat": 4,
"nbformat_minor": 1
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment