Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Save analyticsindiamagazine/8a9015b7f8d12d5b414e89007d9b069a to your computer and use it in GitHub Desktop.
Save analyticsindiamagazine/8a9015b7f8d12d5b414e89007d9b069a to your computer and use it in GitHub Desktop.
Starter_Notebook_For_Participants.ipynb
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "view-in-github"
},
"source": [
"<a href=\"https://colab.research.google.com/gist/analyticsindiamagazine/8a9015b7f8d12d5b414e89007d9b069a/starter_notebook_for_participants.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "IEFmR5J_COHU"
},
"source": [
"## Import Required Libraries"
]
},
{
"cell_type": "code",
"execution_count": 0,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "hEZXHo2YVq2B"
},
"outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"import os"
]
},
{
"cell_type": "code",
"execution_count": 0,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "YosGM4uOCOHZ"
},
"outputs": [],
"source": [
"## change it to the unzip path of the downloaded dataset..\n",
"data_folder = r'/Users/anurag/Downloads/movie genre classification/Scripts'"
]
},
{
"cell_type": "code",
"execution_count": 0,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "5MGrGh94COHh",
"outputId": "6f3c2d5e-d458-4909-afcd-1f66475bab18"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Total Number of Files : 2827\n"
]
}
],
"source": [
"all_files = os.listdir(data_folder)\n",
"print('Total Number of Files :', len(all_files))"
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "z3a0Y3Q0Vq2L"
},
"source": [
"# Read Train Files"
]
},
{
"cell_type": "code",
"execution_count": 0,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "Nps340zYVq2M",
"outputId": "2a0d42cd-f6b2-4884-b2b7-3547c4d18a1a"
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>File_Name</th>\n",
" <th>Labels</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <td>0</td>\n",
" <td>file_2180.txt</td>\n",
" <td>8</td>\n",
" </tr>\n",
" <tr>\n",
" <td>1</td>\n",
" <td>file_693.txt</td>\n",
" <td>4</td>\n",
" </tr>\n",
" <tr>\n",
" <td>2</td>\n",
" <td>file_2469.txt</td>\n",
" <td>6</td>\n",
" </tr>\n",
" <tr>\n",
" <td>3</td>\n",
" <td>file_2542.txt</td>\n",
" <td>6</td>\n",
" </tr>\n",
" <tr>\n",
" <td>4</td>\n",
" <td>file_378.txt</td>\n",
" <td>16</td>\n",
" </tr>\n",
" <tr>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <td>1973</td>\n",
" <td>file_1930.txt</td>\n",
" <td>19</td>\n",
" </tr>\n",
" <tr>\n",
" <td>1974</td>\n",
" <td>file_1821.txt</td>\n",
" <td>19</td>\n",
" </tr>\n",
" <tr>\n",
" <td>1975</td>\n",
" <td>file_350.txt</td>\n",
" <td>16</td>\n",
" </tr>\n",
" <tr>\n",
" <td>1976</td>\n",
" <td>file_1933.txt</td>\n",
" <td>19</td>\n",
" </tr>\n",
" <tr>\n",
" <td>1977</td>\n",
" <td>file_1210.txt</td>\n",
" <td>11</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>1978 rows × 2 columns</p>\n",
"</div>"
],
"text/plain": [
" File_Name Labels\n",
"0 file_2180.txt 8\n",
"1 file_693.txt 4\n",
"2 file_2469.txt 6\n",
"3 file_2542.txt 6\n",
"4 file_378.txt 16\n",
"... ... ...\n",
"1973 file_1930.txt 19\n",
"1974 file_1821.txt 19\n",
"1975 file_350.txt 16\n",
"1976 file_1933.txt 19\n",
"1977 file_1210.txt 11\n",
"\n",
"[1978 rows x 2 columns]"
]
},
"execution_count": 4,
"metadata": {
"tags": []
},
"output_type": "execute_result"
}
],
"source": [
"train_df = pd.read_csv('Train.csv')\n",
"train_df"
]
},
{
"cell_type": "code",
"execution_count": 0,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "ZIUam5eVVq2Q",
"outputId": "ddd148da-7dfc-4e25-bca7-e39bf1c506b6"
},
"outputs": [
{
"data": {
"text/plain": [
"6 405\n",
"19 261\n",
"4 243\n",
"0 203\n",
"5 141\n",
"15 134\n",
"1 116\n",
"16 109\n",
"11 104\n",
"8 79\n",
"14 75\n",
"7 27\n",
"2 25\n",
"20 18\n",
"13 15\n",
"21 9\n",
"12 4\n",
"9 3\n",
"3 2\n",
"17 2\n",
"10 2\n",
"18 1\n",
"Name: Labels, dtype: int64"
]
},
"execution_count": 5,
"metadata": {
"tags": []
},
"output_type": "execute_result"
}
],
"source": [
"train_df.Labels.value_counts()"
]
},
{
"cell_type": "code",
"execution_count": 0,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "SYZ9t9U8Vq2W",
"outputId": "92198597-c824-4597-9a5b-c74df7552130"
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>File_Name</th>\n",
" <th>Labels</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <td>0</td>\n",
" <td>file_2300.txt</td>\n",
" <td>6</td>\n",
" </tr>\n",
" <tr>\n",
" <td>1</td>\n",
" <td>file_809.txt</td>\n",
" <td>4</td>\n",
" </tr>\n",
" <tr>\n",
" <td>2</td>\n",
" <td>file_1383.txt</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <td>3</td>\n",
" <td>file_983.txt</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <td>4</td>\n",
" <td>file_1713.txt</td>\n",
" <td>19</td>\n",
" </tr>\n",
" <tr>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <td>844</td>\n",
" <td>file_2474.txt</td>\n",
" <td>6</td>\n",
" </tr>\n",
" <tr>\n",
" <td>845</td>\n",
" <td>file_863.txt</td>\n",
" <td>4</td>\n",
" </tr>\n",
" <tr>\n",
" <td>846</td>\n",
" <td>file_1547.txt</td>\n",
" <td>20</td>\n",
" </tr>\n",
" <tr>\n",
" <td>847</td>\n",
" <td>file_1292.txt</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <td>848</td>\n",
" <td>file_1910.txt</td>\n",
" <td>19</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>849 rows × 2 columns</p>\n",
"</div>"
],
"text/plain": [
" File_Name Labels\n",
"0 file_2300.txt 6\n",
"1 file_809.txt 4\n",
"2 file_1383.txt 0\n",
"3 file_983.txt 1\n",
"4 file_1713.txt 19\n",
".. ... ...\n",
"844 file_2474.txt 6\n",
"845 file_863.txt 4\n",
"846 file_1547.txt 20\n",
"847 file_1292.txt 0\n",
"848 file_1910.txt 19\n",
"\n",
"[849 rows x 2 columns]"
]
},
"execution_count": 6,
"metadata": {
"tags": []
},
"output_type": "execute_result"
}
],
"source": [
"test_df = pd.read_csv('/Users/anurag/Documents/Workspace/Our_test.csv')\n",
"test_df"
]
},
{
"cell_type": "code",
"execution_count": 0,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "c3x3bITvVq2Y",
"outputId": "c24ee160-1caa-4a07-db6c-6c0b3c3d9fe3"
},
"outputs": [
{
"data": {
"text/plain": [
"6 174\n",
"19 112\n",
"4 104\n",
"0 87\n",
"5 60\n",
"15 58\n",
"1 50\n",
"16 46\n",
"11 45\n",
"8 34\n",
"14 32\n",
"7 12\n",
"2 10\n",
"20 8\n",
"13 7\n",
"21 4\n",
"9 1\n",
"12 1\n",
"3 1\n",
"17 1\n",
"18 1\n",
"10 1\n",
"Name: Labels, dtype: int64"
]
},
"execution_count": 7,
"metadata": {
"tags": []
},
"output_type": "execute_result"
}
],
"source": [
"test_df.Labels.value_counts()"
]
},
{
"cell_type": "code",
"execution_count": 0,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "9UL0RRzLVq2b",
"outputId": "50d43098-7937-4531-9296-9f8b20ba1b42"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"22 22\n"
]
}
],
"source": [
"print(train_df.Labels.nunique(), test_df.Labels.nunique())"
]
},
{
"cell_type": "code",
"execution_count": 0,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "0DqMPYhrVq2d"
},
"outputs": [],
"source": [
"## let's read the text scripts in the train and test dataframes..\n",
"\n",
"train_df['Script'] = [open(data_folder + os.sep + file, \"r\").read() for file in train_df['File_Name']]\n",
"test_df['Script'] = [open(data_folder + os.sep + file, \"r\").read() for file in test_df['File_Name']]"
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "WU5cBRg5Vq2f"
},
"source": [
"# Let's look at a script file after reading it"
]
},
{
"cell_type": "code",
"execution_count": 0,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "uC2QnjwzVq2g",
"outputId": "6a6fc548-1566-40e0-97f4-b79bb5344bf8",
"scrolled": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<b><!--\n",
"\n",
"</b>if (window!= top)\n",
"\n",
"top.location.href=location.href\n",
"\n",
"<b>// -->\n",
"\n",
"</b>\n",
"\n",
"The Abyss - by James Cameron \n",
"\n",
" THE ABYSS\n",
"\n",
" AN ORIGINAL SCREENPLAY\n",
"\n",
" BY\n",
"\n",
" JAMES CAMERON\n",
"\n",
" August 2, 1988\n",
"\n",
" Director's Revision\n",
"\n",
"------------------------------------------------------------------------------\n",
"\n",
" THE ABYSS\n",
"\n",
"OMITTED 1\n",
"\n",
"OMITTED 2\n",
"\n",
"TITLE: THE ABYSS -- ON BLACK, DISSOLVING TO COBALT BLUE\n",
"\n",
"EXT. OCEAN/UNDERWATER -- DAY 3\n",
"\n",
"Blue, deep and featureless, the twilight of five hundred feet down.\n",
"\n",
"PROPELLER SOUND. Materializing out of the blue limbo is the enormous but\n",
"\n",
"sleek form of an Ohio-class SSBN ballistic missile submarine.\n",
"\n",
"INT. U.S.S. MONTANA -- DAY 4\n",
"\n",
"In the attack center, darkened to womb-red, the crew's faces shine with sweat\n",
"\n",
"in the glow of their instruments. The SKIPPER and his EXEC crowd around\n",
"\n",
"BARNES, the sonarman.\n",
"\n",
" CAPTAIN\n",
"\n",
" Sixty knots? No way, Barnes... the reds don't\n",
"\n",
" have anything that fast.\n",
"\n",
" BARNES\n",
"\n",
" Checked it twice, skipper. It's a real unique\n",
"\n",
" signature. No cavitation, no reactor noise...\n",
"\n",
" doesn't even sound like screws.\n",
"\n",
"He puts the signal onto a speaker and everyone in the attack room listens to\n",
"\n",
"the intruder's acoustic signature, a strange THRUMMING. The captain studies\n",
"\n",
"the electronic position board, a graphic representation of the contours of\n",
"\n",
"the steep-walled canyon, a symbol for the Montana, and converging with it, an\n",
"\n",
"amorphous trace, representing the bogey.\n",
"\n",
" CAPTAIN\n",
"\n",
" What the hell is it?\n",
"\n",
" EXEC\n",
"\n",
" I'll tell you what it's not, it's not one of\n",
"\n",
" ours.\n",
"\n",
" BARNES\n",
"\n",
" Sir! Contact changing heading to two-one-four,\n",
"\n",
" diving. Speed eighty knots! Eighty knots!\n",
"\n",
" EXEC\n",
"\n",
" Eighty knots...\n",
"\n",
" BARNES\n",
"\n",
" Still diving, depth nine hundred feet. Port\n",
"\n",
" clearance to cliff wall, one hundred fifty feet.\n",
"\n",
" FRANK\n",
"\n",
" (simultaneously)\n",
"\n",
" Still diving, depth nine hundred feet. Port\n",
"\n",
" clearance to cliff wall, one hundred fifty feet.\n",
"\n",
"Tension builds in the attack room as the Montana surges to intercept the\n",
"\n",
"intruder. The exec tensely watches the vector-graphic readout for the side-\n",
"\n",
"scan sonar array. The sub is running uncomfortably\n"
]
}
],
"source": [
"#lets check one of the scripts..\n",
"print(train_df['Script'][4][:3000])"
]
},
{
"cell_type": "code",
"execution_count": 0,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "_vvQ6vOcVq2j",
"outputId": "a3d04d6a-c61d-43ea-cb48-ac18a07ae219"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" KING KONG\n",
"\n",
" Written by\n",
"\n",
" Fran Walsh, Philippa Boyens and Peter Jackson\n",
"\n",
" Based on a Story by\n",
"\n",
" Merian C. Cooper and Edgar Wallace\n",
"\n",
" 1.\n",
"\n",
" EXT. CENTRAL PARK - DAY\n",
"\n",
" CLOSE ON: A scrawny MONKEY scratches.\n",
"\n",
" ANGLES ON: Defeated, listless ANIMALS, in the bleak environs of a\n",
"\n",
" dilapidated ZOO.\n",
"\n",
" WIDER: It is CENTRAL PARK ZOO in depression era NEW YORK. The PARK\n",
"\n",
" itself is like a GARBAGE DUMP, dotted with squalid SHANTY TOWNS.\n",
"\n",
" Against these BLEAK IMAGES, the SOUND of a BRIGHT, BRASSY SONG\n",
"\n",
" fades up: Al Jolson, singing \"I'm Sitting on Top of the World\".\n",
"\n",
" The sky line of MANHATTAN rises in the background, a grim steaming\n",
"\n",
" jungle on this cold FALL day.\n",
"\n",
" I\n",
"\n",
" EXT. NY STREETS - DAY\n",
"\n",
" LONG continues over:\n",
"\n",
" IMAGES: The CROWDED STREETS of NEW YORK ... beneath the bustle is\n",
"\n",
" a sense of despair.\n",
"\n",
" LONG SOUP LINES snake along the STREETS.\n",
"\n",
" The HUNGRY search through RUBBISH BINS for FOOD. SKYSCRAPERS rise\n",
"\n",
" steadily upwards as more people are evicted from their homes.\n",
"\n",
" HOMELESS sleep amid steaming VENTS and GARBAGE STREWN GUTTERS.\n",
"\n",
" Intercut:\n",
"\n",
" INT. VAUDEVILLE THEATRE - NIGHT\n",
"\n",
" SONG continues over:\n",
"\n",
" I\n",
"\n",
" SANNY, an old-time VAUDEVILLIAN, hurriedly fixes a large DROOPY\n",
"\n",
" MOUSTACHE on to a YOUNG WOMAN'S TOP LIP ... this is ANN DARROW.\n",
"\n",
" IMAGES: Weird and wonderful snatches of VAUDEVILLE ACTS follow ...\n",
"\n",
" singers, jugglers, boxing ladies.\n",
"\n",
" E\n",
"\n",
" Intercut with:\n",
"\n",
" EXT. NY STREETS - DAY\n",
"\n",
" The COLOR and MUSIC contrast with the SOUP LINES and SLUMPED\n",
"\n",
" SHOULDERS of the REAL WORLD.\n",
"\n",
" INT. VAUDEVILLE THEATRE - NIGHT\n",
"\n",
" ANGLE ON: ANN on STAGE ... dressed as an ELEGANT GENT, she\n",
"\n",
" launches into `I'm Just Wild About Harry' with HARRY, a larger-\n",
"\n",
" than-life PERFORMER dressed in a FRILLY DRESS, BRASSY RED WIG and\n",
"\n",
" FALSIES.\n",
"\n",
" 2.\n",
"\n",
" MANNY's CHARACTER joins in ... SNEEZING LOUDLY and causing ANN to\n",
"\n",
" take a SUDDEN PRAT FALL.\n",
"\n",
" nd so the ROUTINE BUILDS ... ANN and HARRY singing and dancing\n",
"\n",
" ... MANNY SNEEZING ... ANN falling.\n",
"\n",
" The AUDIENCE look on with bored expressions on their faces. All\n",
"\n",
" except ONE MAN at the BACK, who is LAUGHING HYSTERICALLY.\n",
"\n",
" CLOSE ON: ANN throwing everything into her ACT ... SWEAT rolls\n",
"\n",
" down her face ... she tries\n"
]
}
],
"source": [
"print(test_df['Script'][4][:3000])"
]
},
{
"cell_type": "code",
"execution_count": 0,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 408
},
"colab_type": "code",
"id": "5Z6GyVjFD9aH",
"outputId": "309eb172-19cc-40a4-e198-bc24dbf4d804"
},
"outputs": [
{
"data": {
"text/plain": [
"6 405\n",
"19 261\n",
"4 243\n",
"0 203\n",
"5 141\n",
"15 134\n",
"1 116\n",
"16 109\n",
"11 104\n",
"8 79\n",
"14 75\n",
"7 27\n",
"2 25\n",
"20 18\n",
"13 15\n",
"21 9\n",
"12 4\n",
"9 3\n",
"3 2\n",
"17 2\n",
"10 2\n",
"18 1\n",
"Name: Labels, dtype: int64"
]
},
"execution_count": 11,
"metadata": {
"tags": []
},
"output_type": "execute_result"
}
],
"source": [
"train_df.Labels.value_counts()"
]
},
{
"cell_type": "code",
"execution_count": 0,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "LdCtl00oCOIZ"
},
"outputs": [],
"source": [
"# !pip install keras\n",
"# !pip install nltk"
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "nxPpQJw0Vq2q"
},
"source": [
"### There is a single instance of one class (Label == 18); let's duplicate that row so a stratified split is possible"
]
},
{
"cell_type": "code",
"execution_count": 0,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "lplIhy8NVq2r"
},
"outputs": [],
"source": [
"train_df = train_df.append(train_df[train_df['Labels'] == 18])\n",
"train_df.reset_index(drop=True, inplace=True)"
]
},
{
"cell_type": "code",
"execution_count": 0,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "rkZO1-p1Vq2t",
"outputId": "91becbaf-df92-4200-ca21-59d66bb44981"
},
"outputs": [
{
"data": {
"text/plain": [
"6 405\n",
"19 261\n",
"4 243\n",
"0 203\n",
"5 141\n",
"15 134\n",
"1 116\n",
"16 109\n",
"11 104\n",
"8 79\n",
"14 75\n",
"7 27\n",
"2 25\n",
"20 18\n",
"13 15\n",
"21 9\n",
"12 4\n",
"9 3\n",
"3 2\n",
"17 2\n",
"18 2\n",
"10 2\n",
"Name: Labels, dtype: int64"
]
},
"execution_count": 14,
"metadata": {
"tags": []
},
"output_type": "execute_result"
}
],
"source": [
"train_df.Labels.value_counts()"
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "3ZzXe4SqVq2x"
},
"source": [
"# Import the Modeling Libraries "
]
},
{
"cell_type": "code",
"execution_count": 0,
"metadata": {
"_cell_guid": "d46ba3fd-26f1-4635-b2f9-fca916ff3066",
"_uuid": "21f3ccd962d1556dc2346699d45a29e9ef791367",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 34
},
"colab_type": "code",
"id": "XvGfW6yECOIb",
"outputId": "3f818c87-cc25-4cf7-cd09-2a530b458b7a"
},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Using TensorFlow backend.\n",
"/Users/anurag/opt/anaconda3/lib/python3.7/site-packages/tensorflow/python/framework/dtypes.py:526: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.\n",
" _np_qint8 = np.dtype([(\"qint8\", np.int8, 1)])\n",
"/Users/anurag/opt/anaconda3/lib/python3.7/site-packages/tensorflow/python/framework/dtypes.py:527: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.\n",
" _np_quint8 = np.dtype([(\"quint8\", np.uint8, 1)])\n",
"/Users/anurag/opt/anaconda3/lib/python3.7/site-packages/tensorflow/python/framework/dtypes.py:528: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.\n",
" _np_qint16 = np.dtype([(\"qint16\", np.int16, 1)])\n",
"/Users/anurag/opt/anaconda3/lib/python3.7/site-packages/tensorflow/python/framework/dtypes.py:529: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.\n",
" _np_quint16 = np.dtype([(\"quint16\", np.uint16, 1)])\n",
"/Users/anurag/opt/anaconda3/lib/python3.7/site-packages/tensorflow/python/framework/dtypes.py:530: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.\n",
" _np_qint32 = np.dtype([(\"qint32\", np.int32, 1)])\n",
"/Users/anurag/opt/anaconda3/lib/python3.7/site-packages/tensorflow/python/framework/dtypes.py:535: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.\n",
" np_resource = np.dtype([(\"resource\", np.ubyte, 1)])\n"
]
}
],
"source": [
"import numpy as np\n",
"from tqdm import tqdm\n",
"from sklearn.svm import SVC\n",
"from keras.models import Sequential\n",
"from keras.layers.recurrent import LSTM, GRU\n",
"from keras.layers.core import Dense, Activation, Dropout\n",
"from keras.layers.embeddings import Embedding\n",
"from keras.layers.normalization import BatchNormalization\n",
"from keras.utils import np_utils\n",
"from sklearn import preprocessing, decomposition, model_selection, metrics, pipeline\n",
"from sklearn.model_selection import GridSearchCV\n",
"from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer\n",
"from sklearn.decomposition import TruncatedSVD\n",
"from sklearn.linear_model import LogisticRegression\n",
"from sklearn.model_selection import train_test_split\n",
"from sklearn.naive_bayes import MultinomialNB\n",
"from keras.layers import GlobalMaxPooling1D, Conv1D, MaxPooling1D, Flatten, Bidirectional, SpatialDropout1D\n",
"from keras.preprocessing import sequence, text\n",
"from keras.callbacks import EarlyStopping\n",
"import nltk\n",
"from nltk import word_tokenize\n",
"from nltk.corpus import stopwords\n"
]
},
{
"cell_type": "markdown",
"metadata": {
"_cell_guid": "60326be1-82d1-4677-8ef8-da5b1eac475c",
"_uuid": "adb496504ab8453ce2b4f91dd6e5f17cbdaf4f68",
"colab_type": "text",
"id": "erL7TVmYCOId"
},
"source": [
"Let's load the English stop-word list from NLTK"
]
},
{
"cell_type": "code",
"execution_count": 0,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 51
},
"colab_type": "code",
"id": "f-2g2ajRCOIe",
"outputId": "06d8c71f-741e-4c9d-9d2b-ab90cfb07a2e",
"scrolled": true
},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"[nltk_data] Downloading package stopwords to\n",
"[nltk_data] /Users/anurag/nltk_data...\n",
"[nltk_data] Package stopwords is already up-to-date!\n"
]
}
],
"source": [
"nltk.download('stopwords')\n",
"stop_words = stopwords.words('english')"
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "ARKw1PXBVq25"
},
"source": [
"# Define the Scoring Metric"
]
},
{
"cell_type": "code",
"execution_count": 0,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "G_agx-HVVq25"
},
"outputs": [],
"source": [
"from sklearn.metrics import log_loss"
]
},
{
"cell_type": "markdown",
"metadata": {
"_cell_guid": "b4a37951-7a53-43b9-bb5a-0335f1259be3",
"_uuid": "14ede2221105fb84bb6b2d3a85f9a1f483e8b124",
"colab_type": "text",
"id": "a81kkCgECOIq"
},
"source": [
"### Let's use LabelEncoder from scikit-learn to convert the genre labels to integers: 0, 1, 2, ..."
]
},
{
"cell_type": "code",
"execution_count": 0,
"metadata": {
"_cell_guid": "d59a646d-7739-496c-814f-594d371d76eb",
"_uuid": "19eb8c10f06df8e0f543ee12f794df5f88b0ff1a",
"colab": {},
"colab_type": "code",
"id": "avnO-wf9COIq"
},
"outputs": [],
"source": [
"lbl_enc = preprocessing.LabelEncoder()\n",
"y = lbl_enc.fit_transform(train_df.Labels.values)"
]
},
{
"cell_type": "markdown",
"metadata": {
"_cell_guid": "65403e74-091f-43c4-9523-3e15d8a75a1e",
"_uuid": "4ffd04f40d9e921673d06ad64e01b9a7395d8e76",
"colab_type": "text",
"id": "8OtK6PLsCOIs"
},
"source": [
"### Before going further it is important that we split the data into training and validation sets. We can do it using \n",
"#### `train_test_split` from the `model_selection` module of scikit-learn."
]
},
{
"cell_type": "code",
"execution_count": 0,
"metadata": {
"_cell_guid": "ba8e606d-8dee-495e-8c3f-62aa916e9927",
"_uuid": "b45676b121e2b719d355619e24cfed13d0d33f74",
"colab": {},
"colab_type": "code",
"id": "tqXblO-mCOIt"
},
"outputs": [],
"source": [
"xtrain, xvalid, ytrain, yvalid = train_test_split(train_df.Script.values, y, \n",
" stratify=y, \n",
" random_state=42, \n",
" test_size=0.30, shuffle=True)"
]
},
{
"cell_type": "code",
"execution_count": 0,
"metadata": {
"_cell_guid": "9e2fe6a9-8de0-4bbd-8264-f6b78e7993e2",
"_uuid": "6c8659049537836fdf00d19d6d656630a306d217",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 51
},
"colab_type": "code",
"id": "Ce_Eu4CrCOIu",
"outputId": "042067e3-a5ad-4f5a-c727-0581878fdd86"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"(1385,)\n",
"(594,)\n"
]
}
],
"source": [
"print (xtrain.shape)\n",
"print (xvalid.shape)"
]
},
{
"cell_type": "markdown",
"metadata": {
"_cell_guid": "3db70c26-d684-478a-bcd4-980ed6c6d65b",
"_uuid": "794fb768f4a8e42c4be4f1dbb27144aae4d00c79",
"colab_type": "text",
"id": "unBlwkZ1COIx"
},
"source": [
"# Building Basic Models\n",
"\n",
"### Let's start building our very first model. \n",
"\n",
"### Our very first model is a simple TF-IDF (Term Frequency - Inverse Document Frequency) followed by a simple Logistic Regression."
]
},
{
"cell_type": "code",
"execution_count": 0,
"metadata": {
"_cell_guid": "b387f2af-11b1-455d-ad8d-320ed1005be3",
"_uuid": "350d453dc982f494c3774dbdcf731d856546d611",
"colab": {},
"colab_type": "code",
"id": "qnJ8aPthCOIx"
},
"outputs": [],
"source": [
"# Always start with these features. They work (almost) everytime!\n",
"tfv = TfidfVectorizer(min_df=3, max_features=None, \n",
" strip_accents='unicode', analyzer='word',token_pattern=r'\\w{1,}',\n",
" ngram_range=(1, 3), use_idf=1,smooth_idf=1,sublinear_tf=1,\n",
" stop_words = 'english')\n",
"\n",
"# Fitting TF-IDF to both training and test sets (semi-supervised learning)\n",
"tfv.fit(list(xtrain) + list(xvalid))\n",
"xtrain_tfv = tfv.transform(xtrain) \n",
"xvalid_tfv = tfv.transform(xvalid)"
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "7DA_QK7aVq3I"
},
"source": [
"## TFIDF on test data"
]
},
{
"cell_type": "code",
"execution_count": 0,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "dzSPvE6pVq3J"
},
"outputs": [],
"source": [
"x_test_tfv = tfv.transform(test_df['Script'].values)"
]
},
{
"cell_type": "code",
"execution_count": 0,
"metadata": {
"_cell_guid": "4106bbd1-dc35-4dc2-bda0-3024d3c056d3",
"_uuid": "3f5dd9ce043364fc61ba3a30298acd9cb72a2938",
"colab": {},
"colab_type": "code",
"id": "NhMszJiuCOIz"
},
"outputs": [],
"source": [
"## Fitting a simple Logistic Regression on TFIDF\n",
"clf = LogisticRegression(C=1.0)\n",
"clf.fit(xtrain_tfv, ytrain)\n",
"predictions = clf.predict_proba(xvalid_tfv)\n",
"\n",
"print(\"logloss: %0.3f \" % log_loss(yvalid, predictions))"
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "OsW7SEHUVq3M"
},
"source": [
"# Submission "
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "qgN8HVaUVq3N"
},
"source": [
"### Let's predict on the entire test data..."
]
},
{
"cell_type": "code",
"execution_count": 0,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "j6RV2jDiVq3N"
},
"outputs": [],
"source": [
"test_set_preds = pd.DataFrame(columns = train_df.Labels.unique().tolist())\n",
"test_set_preds.insert(0, 'File_Name', test_df.File_Name)"
]
},
{
"cell_type": "code",
"execution_count": 0,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "7gOI9x-0Vq3P"
},
"outputs": [],
"source": [
"test_set_preds[test_df.Labels.unique().tolist()] = clf.predict_proba(x_test_tfv)"
]
},
{
"cell_type": "code",
"execution_count": 0,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "E2wHkWIyVq3S",
"outputId": "52af681f-882d-4f13-abf2-8e03041daf13"
},
"outputs": [
{
"data": {
"text/plain": [
"(849, 23)"
]
},
"execution_count": 33,
"metadata": {
"tags": []
},
"output_type": "execute_result"
}
],
"source": [
"test_set_preds.shape"
]
},
{
"cell_type": "code",
"execution_count": 0,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "wRIKKTeWVq3W",
"outputId": "18d12eb3-26b4-473b-e3ab-b930d1b3c945"
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>File_name</th>\n",
" <th>8</th>\n",
" <th>4</th>\n",
" <th>6</th>\n",
" <th>16</th>\n",
" <th>15</th>\n",
" <th>19</th>\n",
" <th>1</th>\n",
" <th>14</th>\n",
" <th>0</th>\n",
" <th>...</th>\n",
" <th>7</th>\n",
" <th>3</th>\n",
" <th>13</th>\n",
" <th>20</th>\n",
" <th>21</th>\n",
" <th>12</th>\n",
" <th>10</th>\n",
" <th>17</th>\n",
" <th>9</th>\n",
" <th>18</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <td>0</td>\n",
" <td>file_2300.txt</td>\n",
" <td>0.00476286</td>\n",
" <td>0.0421466</td>\n",
" <td>0.0621472</td>\n",
" <td>0.00388802</td>\n",
" <td>0.0353658</td>\n",
" <td>0.127008</td>\n",
" <td>0.0038829</td>\n",
" <td>0.0455528</td>\n",
" <td>0.0130443</td>\n",
" <td>...</td>\n",
" <td>0.0364149</td>\n",
" <td>0.0101705</td>\n",
" <td>0.0726348</td>\n",
" <td>0.014007</td>\n",
" <td>0.00432342</td>\n",
" <td>0.042123</td>\n",
" <td>0.00382821</td>\n",
" <td>0.00659258</td>\n",
" <td>0.00388578</td>\n",
" <td>0.201259</td>\n",
" </tr>\n",
" <tr>\n",
" <td>1</td>\n",
" <td>file_809.txt</td>\n",
" <td>0.00473483</td>\n",
" <td>0.0424302</td>\n",
" <td>0.0618728</td>\n",
" <td>0.00380366</td>\n",
" <td>0.0337543</td>\n",
" <td>0.162379</td>\n",
" <td>0.00384745</td>\n",
" <td>0.0406282</td>\n",
" <td>0.0128616</td>\n",
" <td>...</td>\n",
" <td>0.0326453</td>\n",
" <td>0.0099209</td>\n",
" <td>0.155985</td>\n",
" <td>0.0140639</td>\n",
" <td>0.00426309</td>\n",
" <td>0.0393143</td>\n",
" <td>0.00377858</td>\n",
" <td>0.00636009</td>\n",
" <td>0.00379855</td>\n",
" <td>0.0948981</td>\n",
" </tr>\n",
" <tr>\n",
" <td>2</td>\n",
" <td>file_1383.txt</td>\n",
" <td>0.00493601</td>\n",
" <td>0.144602</td>\n",
" <td>0.0975445</td>\n",
" <td>0.00401702</td>\n",
" <td>0.0355864</td>\n",
" <td>0.0758506</td>\n",
" <td>0.00399558</td>\n",
" <td>0.0460768</td>\n",
" <td>0.0133359</td>\n",
" <td>...</td>\n",
" <td>0.0342179</td>\n",
" <td>0.0110146</td>\n",
" <td>0.0529838</td>\n",
" <td>0.0142812</td>\n",
" <td>0.00442489</td>\n",
" <td>0.151729</td>\n",
" <td>0.00395552</td>\n",
" <td>0.00689023</td>\n",
" <td>0.00399736</td>\n",
" <td>0.0972641</td>\n",
" </tr>\n",
" <tr>\n",
" <td>3</td>\n",
" <td>file_983.txt</td>\n",
" <td>0.00499925</td>\n",
" <td>0.049697</td>\n",
" <td>0.081371</td>\n",
" <td>0.00403428</td>\n",
" <td>0.0363712</td>\n",
" <td>0.103302</td>\n",
" <td>0.00402333</td>\n",
" <td>0.0467117</td>\n",
" <td>0.0135967</td>\n",
" <td>...</td>\n",
" <td>0.036615</td>\n",
" <td>0.0109523</td>\n",
" <td>0.0638244</td>\n",
" <td>0.0146887</td>\n",
" <td>0.00453059</td>\n",
" <td>0.0479881</td>\n",
" <td>0.00401123</td>\n",
" <td>0.00689013</td>\n",
" <td>0.00404956</td>\n",
" <td>0.225464</td>\n",
" </tr>\n",
" <tr>\n",
" <td>4</td>\n",
" <td>file_1713.txt</td>\n",
" <td>0.00506564</td>\n",
" <td>0.110646</td>\n",
" <td>0.153704</td>\n",
" <td>0.00412486</td>\n",
" <td>0.0376808</td>\n",
" <td>0.0723873</td>\n",
" <td>0.00410832</td>\n",
" <td>0.0462569</td>\n",
" <td>0.0141907</td>\n",
" <td>...</td>\n",
" <td>0.0720046</td>\n",
" <td>0.0111348</td>\n",
" <td>0.0489596</td>\n",
" <td>0.0153917</td>\n",
" <td>0.00459447</td>\n",
" <td>0.0497823</td>\n",
" <td>0.00411517</td>\n",
" <td>0.00707706</td>\n",
" <td>0.00413493</td>\n",
" <td>0.0888852</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>5 rows × 23 columns</p>\n",
"</div>"
],
"text/plain": [
" File_name 8 4 6 16 15 \\\n",
"0 file_2300.txt 0.00476286 0.0421466 0.0621472 0.00388802 0.0353658 \n",
"1 file_809.txt 0.00473483 0.0424302 0.0618728 0.00380366 0.0337543 \n",
"2 file_1383.txt 0.00493601 0.144602 0.0975445 0.00401702 0.0355864 \n",
"3 file_983.txt 0.00499925 0.049697 0.081371 0.00403428 0.0363712 \n",
"4 file_1713.txt 0.00506564 0.110646 0.153704 0.00412486 0.0376808 \n",
"\n",
" 19 1 14 0 ... 7 3 \\\n",
"0 0.127008 0.0038829 0.0455528 0.0130443 ... 0.0364149 0.0101705 \n",
"1 0.162379 0.00384745 0.0406282 0.0128616 ... 0.0326453 0.0099209 \n",
"2 0.0758506 0.00399558 0.0460768 0.0133359 ... 0.0342179 0.0110146 \n",
"3 0.103302 0.00402333 0.0467117 0.0135967 ... 0.036615 0.0109523 \n",
"4 0.0723873 0.00410832 0.0462569 0.0141907 ... 0.0720046 0.0111348 \n",
"\n",
" 13 20 21 12 10 17 \\\n",
"0 0.0726348 0.014007 0.00432342 0.042123 0.00382821 0.00659258 \n",
"1 0.155985 0.0140639 0.00426309 0.0393143 0.00377858 0.00636009 \n",
"2 0.0529838 0.0142812 0.00442489 0.151729 0.00395552 0.00689023 \n",
"3 0.0638244 0.0146887 0.00453059 0.0479881 0.00401123 0.00689013 \n",
"4 0.0489596 0.0153917 0.00459447 0.0497823 0.00411517 0.00707706 \n",
"\n",
" 9 18 \n",
"0 0.00388578 0.201259 \n",
"1 0.00379855 0.0948981 \n",
"2 0.00399736 0.0972641 \n",
"3 0.00404956 0.225464 \n",
"4 0.00413493 0.0888852 \n",
"\n",
"[5 rows x 23 columns]"
]
},
"execution_count": 34,
"metadata": {
"tags": []
},
"output_type": "execute_result"
}
],
"source": [
"test_set_preds.head()"
]
},
{
"cell_type": "code",
"execution_count": 0,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "hEeQA9VGVq3a"
},
"outputs": [],
"source": [
"#Reorder the columns to match the Sample_submission_file\n",
"test_set_preds = test_set_preds[['File_name',0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21]"
]
},
{
"cell_type": "code",
"execution_count": 0,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "052WdJ-ZVq3Y"
},
"outputs": [],
"source": [
"#Write your submissions to an excel file\n",
"test_set_preds.to_excel('test_set_preds_v1.0.xlsx', index=False)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Follow the sample submission format for your submission file\n",
"\n",
"#### Please verify the following before submitting your solution to avoid an invalid submission.\n",
"\n",
"1. The format of the file is Excel (.xlsx)\n",
"\n",
"2. The file doesn’t contain additional styling elements such as bold headings or table borders\n",
"\n",
"3. The length of the submission exactly matches that of the sample submission and test set\n",
"\n",
"4. The file name does not have any spaces or special characters\n",
"\n",
"5. All the columns are present"
]
}
],
"metadata": {
"accelerator": "GPU",
"colab": {
"collapsed_sections": [],
"include_colab_link": true,
"name": "Starter_Notebook_For_Participants.ipynb",
"provenance": []
},
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.0"
}
},
"nbformat": 4,
"nbformat_minor": 1
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment