Created
February 25, 2021 17:47
-
-
Save sujnesh/79a95e6274056831a5ea239618aff156 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{"nbformat":4,"nbformat_minor":0,"metadata":{"colab":{"name":"DLNLP_1_training.ipynb","provenance":[{"file_id":"1EITQsi_KfxiDa-0n2H3B60ya9eDZxmlk","timestamp":1614273981603},{"file_id":"1OQWcwvlFXpVa4jY3xwgWsv1YNP82zPsh","timestamp":1614187490944},{"file_id":"1IkS10nv6TZlyEye7XP5mOH5FN9ZOSMzp","timestamp":1613390197346}],"collapsed_sections":[],"toc_visible":true},"kernelspec":{"name":"python3","display_name":"Python 3"},"widgets":{"application/vnd.jupyter.widget-state+json":{"5f2953a72f2042d49184ead9117cbf73":{"model_module":"@jupyter-widgets/controls","model_name":"HBoxModel","state":{"_view_name":"HBoxView","_dom_classes":[],"_model_name":"HBoxModel","_view_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_view_count":null,"_view_module_version":"1.5.0","box_style":"","layout":"IPY_MODEL_461f11d60378466e8bb4aa35661c5bc7","_model_module":"@jupyter-widgets/controls","children":["IPY_MODEL_c1d26b5ee62c4f6d88f6f23017a466b0","IPY_MODEL_75027d1d70f6453196ff4c5515026b1b"]}},"461f11d60378466e8bb4aa35661c5bc7":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","state":{"_view_name":"LayoutView","grid_template_rows":null,"right":null,"justify_content":null,"_view_module":"@jupyter-widgets/base","overflow":null,"_model_module_version":"1.2.0","_view_count":null,"flex_flow":null,"width":null,"min_width":null,"border":null,"align_items":null,"bottom":null,"_model_module":"@jupyter-widgets/base","top":null,"grid_column":null,"overflow_y":null,"overflow_x":null,"grid_auto_flow":null,"grid_area":null,"grid_template_columns":null,"flex":null,"_model_name":"LayoutModel","justify_items":null,"grid_row":null,"max_height":null,"align_content":null,"visibility":null,"align_self":null,"height":null,"min_height":null,"padding":null,"grid_auto_rows":null,"grid_gap":null,"max_width":null,"order":null,"_view_module_version":"1.2.0","grid_template_areas":null,"object_position":null,"object_fit":null,"grid_auto_columns":null,"margin":null,"display":null,"l
eft":null}},"c1d26b5ee62c4f6d88f6f23017a466b0":{"model_module":"@jupyter-widgets/controls","model_name":"FloatProgressModel","state":{"_view_name":"ProgressView","style":"IPY_MODEL_6aba6c7f2f0a4793b75728e00429313e","_dom_classes":[],"description":"test.csv: 100%","_model_name":"FloatProgressModel","bar_style":"success","max":929146,"_view_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","value":929146,"_view_count":null,"_view_module_version":"1.5.0","orientation":"horizontal","min":0,"description_tooltip":null,"_model_module":"@jupyter-widgets/controls","layout":"IPY_MODEL_1455993f735749dbb12e6b37feb3a0be"}},"75027d1d70f6453196ff4c5515026b1b":{"model_module":"@jupyter-widgets/controls","model_name":"HTMLModel","state":{"_view_name":"HTMLView","style":"IPY_MODEL_629fcbd9e79e4e3db097aaab74d8ce49","_dom_classes":[],"description":"","_model_name":"HTMLModel","placeholder":"","_view_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","value":" 929k/929k [00:02<00:00, 
332kB/s]","_view_count":null,"_view_module_version":"1.5.0","description_tooltip":null,"_model_module":"@jupyter-widgets/controls","layout":"IPY_MODEL_3c97da95822749ffaaf4fc6f366e95de"}},"6aba6c7f2f0a4793b75728e00429313e":{"model_module":"@jupyter-widgets/controls","model_name":"ProgressStyleModel","state":{"_view_name":"StyleView","_model_name":"ProgressStyleModel","description_width":"initial","_view_module":"@jupyter-widgets/base","_model_module_version":"1.5.0","_view_count":null,"_view_module_version":"1.2.0","bar_color":null,"_model_module":"@jupyter-widgets/controls"}},"1455993f735749dbb12e6b37feb3a0be":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","state":{"_view_name":"LayoutView","grid_template_rows":null,"right":null,"justify_content":null,"_view_module":"@jupyter-widgets/base","overflow":null,"_model_module_version":"1.2.0","_view_count":null,"flex_flow":null,"width":null,"min_width":null,"border":null,"align_items":null,"bottom":null,"_model_module":"@jupyter-widgets/base","top":null,"grid_column":null,"overflow_y":null,"overflow_x":null,"grid_auto_flow":null,"grid_area":null,"grid_template_columns":null,"flex":null,"_model_name":"LayoutModel","justify_items":null,"grid_row":null,"max_height":null,"align_content":null,"visibility":null,"align_self":null,"height":null,"min_height":null,"padding":null,"grid_auto_rows":null,"grid_gap":null,"max_width":null,"order":null,"_view_module_version":"1.2.0","grid_template_areas":null,"object_position":null,"object_fit":null,"grid_auto_columns":null,"margin":null,"display":null,"left":null}},"629fcbd9e79e4e3db097aaab74d8ce49":{"model_module":"@jupyter-widgets/controls","model_name":"DescriptionStyleModel","state":{"_view_name":"StyleView","_model_name":"DescriptionStyleModel","description_width":"","_view_module":"@jupyter-widgets/base","_model_module_version":"1.5.0","_view_count":null,"_view_module_version":"1.2.0","_model_module":"@jupyter-widgets/controls"}},"3c97da95822749ffaaf4fc6f366e95d
e":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","state":{"_view_name":"LayoutView","grid_template_rows":null,"right":null,"justify_content":null,"_view_module":"@jupyter-widgets/base","overflow":null,"_model_module_version":"1.2.0","_view_count":null,"flex_flow":null,"width":null,"min_width":null,"border":null,"align_items":null,"bottom":null,"_model_module":"@jupyter-widgets/base","top":null,"grid_column":null,"overflow_y":null,"overflow_x":null,"grid_auto_flow":null,"grid_area":null,"grid_template_columns":null,"flex":null,"_model_name":"LayoutModel","justify_items":null,"grid_row":null,"max_height":null,"align_content":null,"visibility":null,"align_self":null,"height":null,"min_height":null,"padding":null,"grid_auto_rows":null,"grid_gap":null,"max_width":null,"order":null,"_view_module_version":"1.2.0","grid_template_areas":null,"object_position":null,"object_fit":null,"grid_auto_columns":null,"margin":null,"display":null,"left":null}},"6c5e759b25764fa28fb67cf9890bb082":{"model_module":"@jupyter-widgets/controls","model_name":"HBoxModel","state":{"_view_name":"HBoxView","_dom_classes":[],"_model_name":"HBoxModel","_view_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_view_count":null,"_view_module_version":"1.5.0","box_style":"","layout":"IPY_MODEL_25b640cb177f43549f579a61ca41932f","_model_module":"@jupyter-widgets/controls","children":["IPY_MODEL_6d6fceaec873483d8e5056be146f0039","IPY_MODEL_3f98b0437e6d4ab4a79cc3c7a56d6e0e"]}},"25b640cb177f43549f579a61ca41932f":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","state":{"_view_name":"LayoutView","grid_template_rows":null,"right":null,"justify_content":null,"_view_module":"@jupyter-widgets/base","overflow":null,"_model_module_version":"1.2.0","_view_count":null,"flex_flow":null,"width":null,"min_width":null,"border":null,"align_items":null,"bottom":null,"_model_module":"@jupyter-widgets/base","top":null,"grid_column":null,"overflow_y":null,"overflow_x":n
ull,"grid_auto_flow":null,"grid_area":null,"grid_template_columns":null,"flex":null,"_model_name":"LayoutModel","justify_items":null,"grid_row":null,"max_height":null,"align_content":null,"visibility":null,"align_self":null,"height":null,"min_height":null,"padding":null,"grid_auto_rows":null,"grid_gap":null,"max_width":null,"order":null,"_view_module_version":"1.2.0","grid_template_areas":null,"object_position":null,"object_fit":null,"grid_auto_columns":null,"margin":null,"display":null,"left":null}},"6d6fceaec873483d8e5056be146f0039":{"model_module":"@jupyter-widgets/controls","model_name":"FloatProgressModel","state":{"_view_name":"ProgressView","style":"IPY_MODEL_3c17cab91bca448a8f4eed615b45ccc1","_dom_classes":[],"description":"train.csv: 100%","_model_name":"FloatProgressModel","bar_style":"success","max":5140653,"_view_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","value":5140653,"_view_count":null,"_view_module_version":"1.5.0","orientation":"horizontal","min":0,"description_tooltip":null,"_model_module":"@jupyter-widgets/controls","layout":"IPY_MODEL_dfb5b47e326e498883f2dec217711489"}},"3f98b0437e6d4ab4a79cc3c7a56d6e0e":{"model_module":"@jupyter-widgets/controls","model_name":"HTMLModel","state":{"_view_name":"HTMLView","style":"IPY_MODEL_37d2e5f6b813417fa67fb79754516a70","_dom_classes":[],"description":"","_model_name":"HTMLModel","placeholder":"","_view_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","value":" 5.14M/5.14M [00:01<00:00, 
4.26MB/s]","_view_count":null,"_view_module_version":"1.5.0","description_tooltip":null,"_model_module":"@jupyter-widgets/controls","layout":"IPY_MODEL_e78e88f074204ed7b3e96e74e23fb48d"}},"3c17cab91bca448a8f4eed615b45ccc1":{"model_module":"@jupyter-widgets/controls","model_name":"ProgressStyleModel","state":{"_view_name":"StyleView","_model_name":"ProgressStyleModel","description_width":"initial","_view_module":"@jupyter-widgets/base","_model_module_version":"1.5.0","_view_count":null,"_view_module_version":"1.2.0","bar_color":null,"_model_module":"@jupyter-widgets/controls"}},"dfb5b47e326e498883f2dec217711489":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","state":{"_view_name":"LayoutView","grid_template_rows":null,"right":null,"justify_content":null,"_view_module":"@jupyter-widgets/base","overflow":null,"_model_module_version":"1.2.0","_view_count":null,"flex_flow":null,"width":null,"min_width":null,"border":null,"align_items":null,"bottom":null,"_model_module":"@jupyter-widgets/base","top":null,"grid_column":null,"overflow_y":null,"overflow_x":null,"grid_auto_flow":null,"grid_area":null,"grid_template_columns":null,"flex":null,"_model_name":"LayoutModel","justify_items":null,"grid_row":null,"max_height":null,"align_content":null,"visibility":null,"align_self":null,"height":null,"min_height":null,"padding":null,"grid_auto_rows":null,"grid_gap":null,"max_width":null,"order":null,"_view_module_version":"1.2.0","grid_template_areas":null,"object_position":null,"object_fit":null,"grid_auto_columns":null,"margin":null,"display":null,"left":null}},"37d2e5f6b813417fa67fb79754516a70":{"model_module":"@jupyter-widgets/controls","model_name":"DescriptionStyleModel","state":{"_view_name":"StyleView","_model_name":"DescriptionStyleModel","description_width":"","_view_module":"@jupyter-widgets/base","_model_module_version":"1.5.0","_view_count":null,"_view_module_version":"1.2.0","_model_module":"@jupyter-widgets/controls"}},"e78e88f074204ed7b3e96e74e23fb4
8d":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","state":{"_view_name":"LayoutView","grid_template_rows":null,"right":null,"justify_content":null,"_view_module":"@jupyter-widgets/base","overflow":null,"_model_module_version":"1.2.0","_view_count":null,"flex_flow":null,"width":null,"min_width":null,"border":null,"align_items":null,"bottom":null,"_model_module":"@jupyter-widgets/base","top":null,"grid_column":null,"overflow_y":null,"overflow_x":null,"grid_auto_flow":null,"grid_area":null,"grid_template_columns":null,"flex":null,"_model_name":"LayoutModel","justify_items":null,"grid_row":null,"max_height":null,"align_content":null,"visibility":null,"align_self":null,"height":null,"min_height":null,"padding":null,"grid_auto_rows":null,"grid_gap":null,"max_width":null,"order":null,"_view_module_version":"1.2.0","grid_template_areas":null,"object_position":null,"object_fit":null,"grid_auto_columns":null,"margin":null,"display":null,"left":null}}}}},"cells":[{"cell_type":"markdown","metadata":{"id":"Yl2Ccct-22CN"},"source":["<div style=\"text-align: center\">\n"," <a href=\"https://www.aicrowd.com/challenges/dlnlp-note\"><img alt=\"AIcrowd\" src=\"https://gitlab.aicrowd.com/S.Rathi/iit-b-notebook-misc/-/raw/S.Rathi-master-patch-59012/creative_updated%20on%208.2.21_1%20_desktopbanner.jpg\"></a>\n","</div>"]},{"cell_type":"markdown","metadata":{"id":"OqD6TR0UiuMy"},"source":["# How to use this notebook? 📝\n","1. **Copy the notebook**. This is a shared template and any edits you make here will not be saved. _You should copy it into your own drive folder._ For this, click the \"File\" menu (top-left), then \"Save a Copy in Drive\". You can edit your copy however you like.\n","2. **Link it to your AIcrowd account**. In order to submit your code to AICrowd, you need to provide your account's API key [here](https://colab.research.google.com/drive/1EITQsi_KfxiDa-0n2H3B60ya9eDZxmlk#scrollTo=H7iqy5XcWeHN&line=4&uniqifier=1).\n","3. 
**Modify** the predefined functions for preprocessing, softmax, training etc.\n","4. **Save** your trained model.\n","\n"]},{"cell_type":"markdown","metadata":{"id":"7xo2UdutLUpg"},"source":["# Dataset Specifications 💾\n","\n","* **train.csv**: has 3 columns with the latter two being 'reviews' & corresponding 'ratings'.\n","* **test.csv**: has 2 columns with the latter being 'reviews'. You will have to predict the corresponding 'ratings'.\n","* 'ratings' in predictions should be integers in range \\[1,5\\] i.e. {1,2,3,4,5}"]},{"cell_type":"markdown","metadata":{"id":"iIw4tX5SdMVn"},"source":["# Install AIcrowd Utilities 🧰\n","\n","We will install `aicrowd-cli` that can help us manage some trivial tasks and make our lives easier."]},{"cell_type":"code","metadata":{"id":"qjHTtoBG5aAh","colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"status":"ok","timestamp":1614266927478,"user_tz":-330,"elapsed":10604,"user":{"displayName":"Dipam Chakraborty","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GhjYbzjRrx303amHrTPU3lUOY9OMpxKTrTzhtYd=s64","userId":"04496869744334527762"}},"outputId":"ad89348f-2328-458b-8c23-96d413f7f473"},"source":["!pip install -U git+https://gitlab.aicrowd.com/aicrowd/aicrowd-cli.git > /dev/null"],"execution_count":null,"outputs":[{"output_type":"stream","text":[" Running command git clone -q https://gitlab.aicrowd.com/aicrowd/aicrowd-cli.git /tmp/pip-req-build-0awb7ht1\n"],"name":"stdout"}]},{"cell_type":"markdown","metadata":{"id":"QDpkSmY6jf9c"},"source":["Load AIcrowd magic commands"]},{"cell_type":"code","metadata":{"id":"5mMLkbvTjfd-"},"source":["%load_ext aicrowd.magic"],"execution_count":null,"outputs":[]},{"cell_type":"markdown","metadata":{"id":"z2VrYX9KowAR"},"source":["# Import necessary modules and packages 📚"]},{"cell_type":"code","metadata":{"id":"K7J7E7RWWXju"},"source":["import os\n","import pandas as pd\n","import numpy as np\n","\n","#Add your necessary modules & packages 
here"],"execution_count":null,"outputs":[]},{"cell_type":"markdown","metadata":{"id":"0rUE152lf4wv"},"source":["# AIcrowd Runtime Configuration ⚙️\n","\n","The contents of the `ASSETS_DIR` are copied to your GDrive."]},{"cell_type":"code","metadata":{"id":"H7iqy5XcWeHN"},"source":["class AIcrowdConfig:\n"," DATASET_DIR = \"data\"\n"," TEST_DATA_PATH = os.path.join(DATASET_DIR, \"test.csv\")\n"," TRAIN_DATA_PATH = os.path.join(DATASET_DIR, \"train.csv\")\n"," ASSETS_DIR = \"assets\"\n"," API_KEY = \"\" # Get your key from https://www.aicrowd.com/participants/me (ctrl + click the link)"],"execution_count":null,"outputs":[]},{"cell_type":"markdown","metadata":{"id":"l8iIbBlCf8sf"},"source":["# Download training data 📲\n","AIcrowd magic functions will download the dataset after authenticating your API key."]},{"cell_type":"code","metadata":{"id":"Mfd1YA-6W3-F","colab":{"base_uri":"https://localhost:8080/","height":149,"referenced_widgets":["5f2953a72f2042d49184ead9117cbf73","461f11d60378466e8bb4aa35661c5bc7","c1d26b5ee62c4f6d88f6f23017a466b0","75027d1d70f6453196ff4c5515026b1b","6aba6c7f2f0a4793b75728e00429313e","1455993f735749dbb12e6b37feb3a0be","629fcbd9e79e4e3db097aaab74d8ce49","3c97da95822749ffaaf4fc6f366e95de","6c5e759b25764fa28fb67cf9890bb082","25b640cb177f43549f579a61ca41932f","6d6fceaec873483d8e5056be146f0039","3f98b0437e6d4ab4a79cc3c7a56d6e0e","3c17cab91bca448a8f4eed615b45ccc1","dfb5b47e326e498883f2dec217711489","37d2e5f6b813417fa67fb79754516a70","e78e88f074204ed7b3e96e74e23fb48d"]},"executionInfo":{"status":"ok","timestamp":1614266970915,"user_tz":-330,"elapsed":7000,"user":{"displayName":"Dipam Chakraborty","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GhjYbzjRrx303amHrTPU3lUOY9OMpxKTrTzhtYd=s64","userId":"04496869744334527762"}},"outputId":"b15248db-84cf-42b4-d4d9-98d4d487db44"},"source":["%aicrowd login --api-key \"$AIcrowdConfig.API_KEY\"\n","%aicrowd dataset download -c 
dlnlp-note"],"execution_count":null,"outputs":[{"output_type":"stream","text":["\u001b[32mAPI Key valid\u001b[0m\n","\u001b[32mSaved API Key successfully!\u001b[0m\n"],"name":"stdout"},{"output_type":"display_data","data":{"application/vnd.jupyter.widget-view+json":{"model_id":"5f2953a72f2042d49184ead9117cbf73","version_minor":0,"version_major":2},"text/plain":["HBox(children=(FloatProgress(value=0.0, description='test.csv', max=929146.0, style=ProgressStyle(description_…"]},"metadata":{"tags":[]}},{"output_type":"stream","text":["\n"],"name":"stdout"},{"output_type":"display_data","data":{"application/vnd.jupyter.widget-view+json":{"model_id":"6c5e759b25764fa28fb67cf9890bb082","version_minor":0,"version_major":2},"text/plain":["HBox(children=(FloatProgress(value=0.0, description='train.csv', max=5140653.0, style=ProgressStyle(descriptio…"]},"metadata":{"tags":[]}},{"output_type":"stream","text":["\n"],"name":"stdout"}]},{"cell_type":"markdown","metadata":{"id":"LDFjY8lrk6-5"},"source":["Extract the downloaded dataset to `data` directory"]},{"cell_type":"code","metadata":{"id":"g7qa_wvIaFkT"},"source":["!mkdir $AIcrowdConfig.DATASET_DIR\n","!mv train.csv $AIcrowdConfig.DATASET_DIR\n","!mv test.csv $AIcrowdConfig.DATASET_DIR"],"execution_count":null,"outputs":[]},{"cell_type":"markdown","metadata":{"id":"ukXy2a0Qlbq2"},"source":["# Load training data 💻"]},{"cell_type":"code","metadata":{"colab":{"base_uri":"https://localhost:8080/","height":204},"id":"MSn0e6bQZ-QV","executionInfo":{"status":"ok","timestamp":1614266979225,"user_tz":-330,"elapsed":1265,"user":{"displayName":"Dipam Chakraborty","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GhjYbzjRrx303amHrTPU3lUOY9OMpxKTrTzhtYd=s64","userId":"04496869744334527762"}},"outputId":"16cec0e4-da2f-4bff-9de3-a5f81abe276b"},"source":["train_data = pd.read_csv(AIcrowdConfig.TRAIN_DATA_PATH)\n","train_data.head()"],"execution_count":null,"outputs":[{"output_type":"execute_result","data":{"text/html":["<div>\n","<style 
scoped>\n"," .dataframe tbody tr th:only-of-type {\n"," vertical-align: middle;\n"," }\n","\n"," .dataframe tbody tr th {\n"," vertical-align: top;\n"," }\n","\n"," .dataframe thead th {\n"," text-align: right;\n"," }\n","</style>\n","<table border=\"1\" class=\"dataframe\">\n"," <thead>\n"," <tr style=\"text-align: right;\">\n"," <th></th>\n"," <th>Unnamed: 0</th>\n"," <th>reviews</th>\n"," <th>ratings</th>\n"," </tr>\n"," </thead>\n"," <tbody>\n"," <tr>\n"," <th>0</th>\n"," <td>0</td>\n"," <td>This book was very informative, covering all a...</td>\n"," <td>4</td>\n"," </tr>\n"," <tr>\n"," <th>1</th>\n"," <td>1</td>\n"," <td>I am already a baseball fan and knew a bit abo...</td>\n"," <td>5</td>\n"," </tr>\n"," <tr>\n"," <th>2</th>\n"," <td>2</td>\n"," <td>I didn't like this product it smudged all unde...</td>\n"," <td>1</td>\n"," </tr>\n"," <tr>\n"," <th>3</th>\n"," <td>3</td>\n"," <td>I simply love the product. I appreciate print ...</td>\n"," <td>5</td>\n"," </tr>\n"," <tr>\n"," <th>4</th>\n"," <td>4</td>\n"," <td>It goes on very easily and makes my eyes look ...</td>\n"," <td>5</td>\n"," </tr>\n"," </tbody>\n","</table>\n","</div>"],"text/plain":[" Unnamed: 0 reviews ratings\n","0 0 This book was very informative, covering all a... 4\n","1 1 I am already a baseball fan and knew a bit abo... 5\n","2 2 I didn't like this product it smudged all unde... 1\n","3 3 I simply love the product. I appreciate print ... 5\n","4 4 It goes on very easily and makes my eyes look ... 
5"]},"metadata":{"tags":[]},"execution_count":11}]},{"cell_type":"markdown","metadata":{"id":"RIRuYtFklgp8"},"source":["## Preprocess the training data 🧹"]},{"cell_type":"code","metadata":{"id":"wAXwyPnhamHc"},"source":["'''\n","About the task:\n","\n","You are provided with a codeflow- which consists of functions to be implemented(MANDATORY).\n","\n","You need to implement each of the functions mentioned below, you may add your own function parameters if needed.\n","'''\n","\n","\n","def encode_data(text):\n"," # This function will be used to encode the reviews using a dictionary(created using corpus vocabulary) \n"," \n"," # Example of encoding :\"The food was fabulous but pricey\" has a vocabulary of 6 words, each one has to be mapped to an integer like: \n"," # {'The':1,'food':2,'was':3,'fabulous':4,'but':5,'pricey':6} this vocabulary has to be created for the entire corpus and then be used to \n"," # encode the words into integers \n","\n"," # return encoded examples\n"," pass\n","\n","\n","\n","def convert_to_lower(text):\n"," # return the reviews after converting them to lowercase\n"," pass\n","\n","\n","def remove_punctuation(text):\n"," # return the reviews after removing punctuations\n"," pass\n","\n","\n","def remove_stopwords(text):\n"," # return the reviews after removing the stopwords\n"," pass\n","\n","def perform_tokenization(text):\n"," # return the reviews after performing tokenization\n"," pass\n","\n","\n","def perform_padding(data):\n"," # return the reviews after padding the reviews to maximum length\n"," pass\n","\n","def preprocess_data(data):\n"," # make all the following function calls on your data\n","\n"," review = data[\"reviews\"]\n"," review = convert_to_lower(review)\n"," review = remove_punctuation(review)\n"," review = remove_stopwords(review)\n"," review = perform_tokenization(review)\n"," review = encode_data(review)\n"," processed_data = perform_padding(review)\n","\n"," # return processed_data # Uncomment this\n"," # Remove this 
dummy code at the bottom\n"," return np.zeros( (len(data[\"reviews\"]), 100) ) \n"],"execution_count":null,"outputs":[]},{"cell_type":"markdown","metadata":{"id":"H2MtfZ6cfF6i"},"source":["# Define your Softmax function\n","\n","You have to write your own implementation from scratch and return softmax values(using predefined softmax is prohibited)"]},{"cell_type":"code","metadata":{"id":"XDLPaP6DfKjy"},"source":["def softmax_activation(x):\n"," # write your implementation here\n"," pass"],"execution_count":null,"outputs":[]},{"cell_type":"markdown","metadata":{"id":"US2eZwsMgbnu"},"source":["# Training Time ⏰"]},{"cell_type":"markdown","metadata":{"id":"ES_qPFSklr9v"},"source":["## Define your model\n","You should define your model-related methods here using the given template"]},{"cell_type":"code","metadata":{"id":"9GlHfaCe0c5W"},"source":["# Example with tensorflow, but you can replace with pytorch\n","# For better code add all imports to the top cell marked for imports\n","import tensorflow\n","from tensorflow import keras"],"execution_count":null,"outputs":[]},{"cell_type":"code","metadata":{"id":"XaZpngdTVNuG"},"source":["class NeuralNet:\n","\n"," def __init__(self, reviews, ratings):\n","\n"," self.reviews = reviews\n"," self.ratings = ratings\n","\n","\n"," def build_nn(self):\n"," #add the input and output layer here; you can use either tensorflow or pytorch\n"," model = keras.models.Sequential()\n"," model.add(keras.layers.Input((100,)))\n"," model.add(keras.layers.Dense(np.max(self.ratings)+1, activation='softmax') )\n","\n"," ####### Use the softmax activation that you wrote code for above #####\n"," \n"," model.compile(optimizer='adam', loss='sparse_categorical_crossentropy')\n","\n"," self.model = model\n","\n"," def train_nn(self,batch_size,epochs):\n"," # write the training loop here; you can use either tensorflow or pytorch\n"," # print validation accuracy\n"," self.model.fit(x=self.reviews, y=self.ratings, epochs=3)\n","\n"," def predict(self, 
reviews):\n"," # return a list containing all the ratings predicted by the trained model\n","\n"," self.model.predict(reviews)"],"execution_count":null,"outputs":[]},{"cell_type":"markdown","metadata":{"id":"gYZrv0q-mJlr"},"source":["\n","## Initialize & Train your model"]},{"cell_type":"code","metadata":{"id":"IhJXjbj_Z4v1","colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"status":"ok","timestamp":1614268774053,"user_tz":-330,"elapsed":5577,"user":{"displayName":"Dipam Chakraborty","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GhjYbzjRrx303amHrTPU3lUOY9OMpxKTrTzhtYd=s64","userId":"04496869744334527762"}},"outputId":"1a33ad74-7a28-45af-b10f-d6d2d635ed52"},"source":["batch_size, epochs= 1000, 3\n"," \n","train_reviews=preprocess_data(train_data)\n","train_ratings=train_data['ratings'].values - 1\n","\n","model=NeuralNet(train_reviews,train_ratings)\n","model.build_nn()\n","model.train_nn(batch_size,epochs)"],"execution_count":null,"outputs":[{"output_type":"stream","text":["Epoch 1/3\n","1563/1563 [==============================] - 2s 898us/step - loss: 1.3788\n","Epoch 2/3\n","1563/1563 [==============================] - 1s 886us/step - loss: 1.0885\n","Epoch 3/3\n","1563/1563 [==============================] - 1s 897us/step - loss: 1.0759\n"],"name":"stdout"}]},{"cell_type":"markdown","metadata":{"id":"4L5sKWOMmciy"},"source":["## Save your trained model\n","Save your model for later. Your model will be loaded from here for predictions. 
Replace \\[example\\] below accordingly"]},{"cell_type":"code","metadata":{"id":"CrEnW27Vzup0"},"source":["if not os.path.isdir(AIcrowdConfig.ASSETS_DIR):\n"," os.mkdir(AIcrowdConfig.ASSETS_DIR)\n","# This is the example for a keras model, save your model according to your framework\n","model.model.save(os.path.join(AIcrowdConfig.ASSETS_DIR, \"dummy_model.h5\"))"],"execution_count":null,"outputs":[]},{"cell_type":"markdown","metadata":{"id":"B1YQfCZoCrDS"},"source":["# Copy trained model to GDrive\n","\n","These assets can be loaded again in another notebook by running `%aicrowd submission load-assets`"]},{"cell_type":"code","metadata":{"id":"9POzV8uBN_Lq","colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"status":"ok","timestamp":1614269054044,"user_tz":-330,"elapsed":20948,"user":{"displayName":"Dipam Chakraborty","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GhjYbzjRrx303amHrTPU3lUOY9OMpxKTrTzhtYd=s64","userId":"04496869744334527762"}},"outputId":"7dd5a9f4-a1fe-4a5c-82da-1c832f014f16"},"source":["%aicrowd submission save-assets -c dlnlp-note"],"execution_count":null,"outputs":[{"output_type":"stream","text":["Mounting Google Drive 💾\n","Google drive is needed to store your assets\n","Mounted at /content/drive\n"],"name":"stdout"}]},{"cell_type":"markdown","metadata":{"id":"Tc_VkaIuDhJ5"},"source":["# Submitting to AIcrowd\n","\n","You need to use the submission notebook that will load the trained model and predict the outputs for given inputs.\n","\n","You can use this as a starting point for your submission notebook.\n","\n","Inference notebook [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1qesKuphCpa6dKsx8_AXsw2z7X22NWum5?usp=sharing)\n"]}]} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment