Created
February 25, 2021 17:49
-
-
Save sujnesh/69237f2bd59b64abd6ae9c0524347239 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{"nbformat":4,"nbformat_minor":0,"metadata":{"colab":{"name":"DLNLP_1_inference.ipynb","provenance":[{"file_id":"1mNjZLd5eKe9JJ4NVLPSNgGR5UiMsZIDz","timestamp":1614274189832},{"file_id":"1qesKuphCpa6dKsx8_AXsw2z7X22NWum5","timestamp":1614233365227},{"file_id":"1IkS10nv6TZlyEye7XP5mOH5FN9ZOSMzp","timestamp":1613420987089}],"collapsed_sections":[],"toc_visible":true},"kernelspec":{"name":"python3","display_name":"Python 3"},"widgets":{"application/vnd.jupyter.widget-state+json":{"f43ddcf72d51486a807353f51d62c2cb":{"model_module":"@jupyter-widgets/controls","model_name":"HBoxModel","state":{"_view_name":"HBoxView","_dom_classes":[],"_model_name":"HBoxModel","_view_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_view_count":null,"_view_module_version":"1.5.0","box_style":"","layout":"IPY_MODEL_cc35995f78a3489689b3c6c396f49941","_model_module":"@jupyter-widgets/controls","children":["IPY_MODEL_494b4a1bd2614b909819b953874c4927","IPY_MODEL_338c077dbd2440f8a440f8eb69880cdf"]}},"cc35995f78a3489689b3c6c396f49941":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","state":{"_view_name":"LayoutView","grid_template_rows":null,"right":null,"justify_content":null,"_view_module":"@jupyter-widgets/base","overflow":null,"_model_module_version":"1.2.0","_view_count":null,"flex_flow":null,"width":null,"min_width":null,"border":null,"align_items":null,"bottom":null,"_model_module":"@jupyter-widgets/base","top":null,"grid_column":null,"overflow_y":null,"overflow_x":null,"grid_auto_flow":null,"grid_area":null,"grid_template_columns":null,"flex":null,"_model_name":"LayoutModel","justify_items":null,"grid_row":null,"max_height":null,"align_content":null,"visibility":null,"align_self":null,"height":null,"min_height":null,"padding":null,"grid_auto_rows":null,"grid_gap":null,"max_width":null,"order":null,"_view_module_version":"1.2.0","grid_template_areas":null,"object_position":null,"object_fit":null,"grid_auto_columns":null,"margin":null,"display":null,"
left":null}},"494b4a1bd2614b909819b953874c4927":{"model_module":"@jupyter-widgets/controls","model_name":"FloatProgressModel","state":{"_view_name":"ProgressView","style":"IPY_MODEL_75b7dd331cfc45c3810d9faaae5ee460","_dom_classes":[],"description":"test.csv: 100%","_model_name":"FloatProgressModel","bar_style":"success","max":929146,"_view_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","value":929146,"_view_count":null,"_view_module_version":"1.5.0","orientation":"horizontal","min":0,"description_tooltip":null,"_model_module":"@jupyter-widgets/controls","layout":"IPY_MODEL_eb1778f417bb46538f8a05001206d4fe"}},"338c077dbd2440f8a440f8eb69880cdf":{"model_module":"@jupyter-widgets/controls","model_name":"HTMLModel","state":{"_view_name":"HTMLView","style":"IPY_MODEL_b198a9ece4e049a3a03d1804eaedd55a","_dom_classes":[],"description":"","_model_name":"HTMLModel","placeholder":"โ","_view_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","value":" 929k/929k [00:33<00:00, 
27.9kB/s]","_view_count":null,"_view_module_version":"1.5.0","description_tooltip":null,"_model_module":"@jupyter-widgets/controls","layout":"IPY_MODEL_c71704350c2242f7b1ed87112d92d493"}},"75b7dd331cfc45c3810d9faaae5ee460":{"model_module":"@jupyter-widgets/controls","model_name":"ProgressStyleModel","state":{"_view_name":"StyleView","_model_name":"ProgressStyleModel","description_width":"initial","_view_module":"@jupyter-widgets/base","_model_module_version":"1.5.0","_view_count":null,"_view_module_version":"1.2.0","bar_color":null,"_model_module":"@jupyter-widgets/controls"}},"eb1778f417bb46538f8a05001206d4fe":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","state":{"_view_name":"LayoutView","grid_template_rows":null,"right":null,"justify_content":null,"_view_module":"@jupyter-widgets/base","overflow":null,"_model_module_version":"1.2.0","_view_count":null,"flex_flow":null,"width":null,"min_width":null,"border":null,"align_items":null,"bottom":null,"_model_module":"@jupyter-widgets/base","top":null,"grid_column":null,"overflow_y":null,"overflow_x":null,"grid_auto_flow":null,"grid_area":null,"grid_template_columns":null,"flex":null,"_model_name":"LayoutModel","justify_items":null,"grid_row":null,"max_height":null,"align_content":null,"visibility":null,"align_self":null,"height":null,"min_height":null,"padding":null,"grid_auto_rows":null,"grid_gap":null,"max_width":null,"order":null,"_view_module_version":"1.2.0","grid_template_areas":null,"object_position":null,"object_fit":null,"grid_auto_columns":null,"margin":null,"display":null,"left":null}},"b198a9ece4e049a3a03d1804eaedd55a":{"model_module":"@jupyter-widgets/controls","model_name":"DescriptionStyleModel","state":{"_view_name":"StyleView","_model_name":"DescriptionStyleModel","description_width":"","_view_module":"@jupyter-widgets/base","_model_module_version":"1.5.0","_view_count":null,"_view_module_version":"1.2.0","_model_module":"@jupyter-widgets/controls"}},"c71704350c2242f7b1ed87112d92d4
93":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","state":{"_view_name":"LayoutView","grid_template_rows":null,"right":null,"justify_content":null,"_view_module":"@jupyter-widgets/base","overflow":null,"_model_module_version":"1.2.0","_view_count":null,"flex_flow":null,"width":null,"min_width":null,"border":null,"align_items":null,"bottom":null,"_model_module":"@jupyter-widgets/base","top":null,"grid_column":null,"overflow_y":null,"overflow_x":null,"grid_auto_flow":null,"grid_area":null,"grid_template_columns":null,"flex":null,"_model_name":"LayoutModel","justify_items":null,"grid_row":null,"max_height":null,"align_content":null,"visibility":null,"align_self":null,"height":null,"min_height":null,"padding":null,"grid_auto_rows":null,"grid_gap":null,"max_width":null,"order":null,"_view_module_version":"1.2.0","grid_template_areas":null,"object_position":null,"object_fit":null,"grid_auto_columns":null,"margin":null,"display":null,"left":null}}}}},"cells":[{"cell_type":"markdown","metadata":{"id":"3-HLhpfJ2tWf"},"source":["<div style=\"text-align: center\">\n"," <a href=\"https://www.aicrowd.com/challenges/dlnlp-note\"><img alt=\"AIcrowd\" src=\"https://gitlab.aicrowd.com/S.Rathi/iit-b-notebook-misc/-/raw/S.Rathi-master-patch-59012/creative_updated%20on%208.2.21_1%20_desktopbanner.jpg\"></a>\n","</div>"]},{"cell_type":"markdown","metadata":{"id":"Rk3nRu08nIsn"},"source":["\n","# How to use this notebook? ๐\n","1. **Copy the notebook**. This is a shared template and any edits you make here will not be saved. _You should copy it into your own drive folder._ For this, click the \"File\" menu (top-left), then \"Save a Copy in Drive\". You can edit your copy however you like.\n","2. **Link it to your AICrowd account**. In order to submit your code to AICrowd, you need to provide your account's API key [here](https://colab.research.google.com/drive/1mNjZLd5eKe9JJ4NVLPSNgGR5UiMsZIDz#scrollTo=H7iqy5XcWeHN&line=4&uniqifier=1).\n","3. 
**Load** your trained model.\n","4. **Modify** the predefined functions for preprocessing, prediction etc.\n","\n","5. **Make a submission**. You have to run all the code in the notebook & use it to make your submission.\n","\n"]},{"cell_type":"markdown","metadata":{"id":"TiSy9UAsJgHa"},"source":["# Dataset Specifications 💾\n","\n","* **train.csv**: has 3 columns, with the latter two being 'reviews' & the corresponding 'ratings'.\n","* **test.csv**: has 2 columns, with the latter being 'reviews'. You will have to predict the corresponding 'ratings'.\n","* 'ratings' in predictions should be integers in range \\[1,5\\] i.e. {1,2,3,4,5}"]},{"cell_type":"markdown","metadata":{"id":"iIw4tX5SdMVn"},"source":["# Install AIcrowd Utilities 🧰\n","\n","We will install `aicrowd-cli` that can help us manage some trivial tasks and make our lives easier."]},{"cell_type":"code","metadata":{"id":"qjHTtoBG5aAh","colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"status":"ok","timestamp":1614269135145,"user_tz":-330,"elapsed":10459,"user":{"displayName":"Dipam Chakraborty","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GhjYbzjRrx303amHrTPU3lUOY9OMpxKTrTzhtYd=s64","userId":"04496869744334527762"}},"outputId":"0970b5a6-e53c-4948-e899-538325e0bc29"},"source":["!pip install -U git+https://gitlab.aicrowd.com/aicrowd/aicrowd-cli.git > /dev/null"],"execution_count":null,"outputs":[{"output_type":"stream","text":[" Running command git clone -q https://gitlab.aicrowd.com/aicrowd/aicrowd-cli.git /tmp/pip-req-build-_2ogoy30\n"],"name":"stdout"}]},{"cell_type":"markdown","metadata":{"id":"UZ5AVIxWna9B"},"source":["Load AIcrowd magic commands"]},{"cell_type":"code","metadata":{"id":"5pGvuw2dnack"},"source":["%load_ext aicrowd.magic"],"execution_count":null,"outputs":[]},{"cell_type":"markdown","metadata":{"id":"wmC1uk_ZniGi"},"source":["# Import necessary modules and packages ๐"]},{"cell_type":"code","metadata":{"id":"eYgfWwhJnioq"},"source":["import os\n","import pandas as pd\n","import
numpy as np\n","\n","#Add your necessary modules & packages here"],"execution_count":null,"outputs":[]},{"cell_type":"markdown","metadata":{"id":"0rUE152lf4wv"},"source":["# AIcrowd Runtime Configuration ⚙️\n","\n","Define configuration parameters. Please include any files needed for the notebook to run under `ASSETS_DIR`. We will copy the contents of this directory to your final submission file ๐"]},{"cell_type":"code","metadata":{"id":"H7iqy5XcWeHN"},"source":["class AIcrowdConfig:\n"," DATASET_PATH = \"test.csv\"\n"," PREDICTIONS_PATH = \"predictions.csv\"\n"," ASSETS_DIR = \"assets\"\n"," API_KEY = \"\" # Get your key from https://www.aicrowd.com/participants/me"],"execution_count":null,"outputs":[]},{"cell_type":"markdown","metadata":{"id":"l8iIbBlCf8sf"},"source":["# Download test data 📲"]},{"cell_type":"code","metadata":{"id":"Mfd1YA-6W3-F","colab":{"base_uri":"https://localhost:8080/","height":100,"referenced_widgets":["f43ddcf72d51486a807353f51d62c2cb","cc35995f78a3489689b3c6c396f49941","494b4a1bd2614b909819b953874c4927","338c077dbd2440f8a440f8eb69880cdf","75b7dd331cfc45c3810d9faaae5ee460","eb1778f417bb46538f8a05001206d4fe","b198a9ece4e049a3a03d1804eaedd55a","c71704350c2242f7b1ed87112d92d493"]},"executionInfo":{"status":"ok","timestamp":1614269203693,"user_tz":-330,"elapsed":4235,"user":{"displayName":"Dipam Chakraborty","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GhjYbzjRrx303amHrTPU3lUOY9OMpxKTrTzhtYd=s64","userId":"04496869744334527762"}},"outputId":"1ee3c1c7-ec9f-4cca-f4ff-c91499e06984"},"source":["%aicrowd login --api-key \"$AIcrowdConfig.API_KEY\"\n","%aicrowd dataset download -c dlnlp-note test*"],"execution_count":null,"outputs":[{"output_type":"stream","text":["\u001b[32mAPI Key valid\u001b[0m\n","\u001b[32mSaved API Key
successfully!\u001b[0m\n"],"name":"stdout"},{"output_type":"display_data","data":{"application/vnd.jupyter.widget-view+json":{"model_id":"f43ddcf72d51486a807353f51d62c2cb","version_minor":0,"version_major":2},"text/plain":["HBox(children=(FloatProgress(value=0.0, description='test.csv', max=929146.0, style=ProgressStyle(description_…"]},"metadata":{"tags":[]}},{"output_type":"stream","text":["\n"],"name":"stdout"}]},{"cell_type":"markdown","metadata":{"id":"CEy92x00PaaJ"},"source":["# Load the saved assets from GDrive"]},{"cell_type":"code","metadata":{"id":"BKddC8LBParb","colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"status":"ok","timestamp":1614269226485,"user_tz":-330,"elapsed":17729,"user":{"displayName":"Dipam Chakraborty","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GhjYbzjRrx303amHrTPU3lUOY9OMpxKTrTzhtYd=s64","userId":"04496869744334527762"}},"outputId":"8eaa3bcb-42bc-479c-e9b6-8665ed7fc722"},"source":["%aicrowd submission load-assets -c dlnlp-note"],"execution_count":null,"outputs":[{"output_type":"stream","text":["Mounting Google Drive 💾\n","Google drive is needed to store your assets\n","Mounted at /content/drive\n"],"name":"stdout"}]},{"cell_type":"markdown","metadata":{"id":"j35PzDltlrvM"},"source":["# Load test data 💻"]},{"cell_type":"code","metadata":{"id":"t-3g5JeBluL8","colab":{"base_uri":"https://localhost:8080/","height":204},"executionInfo":{"status":"ok","timestamp":1614269228434,"user_tz":-330,"elapsed":1202,"user":{"displayName":"Dipam Chakraborty","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GhjYbzjRrx303amHrTPU3lUOY9OMpxKTrTzhtYd=s64","userId":"04496869744334527762"}},"outputId":"f4fffb10-b593-4205-bb7c-887a5a2d0b8f"},"source":["test_data = pd.read_csv(AIcrowdConfig.DATASET_PATH)\n","test_data.head()"],"execution_count":null,"outputs":[{"output_type":"execute_result","data":{"text/html":["<div>\n","<style scoped>\n"," .dataframe tbody tr th:only-of-type {\n"," vertical-align: middle;\n"," }\n","\n","
.dataframe tbody tr th {\n"," vertical-align: top;\n"," }\n","\n"," .dataframe thead th {\n"," text-align: right;\n"," }\n","</style>\n","<table border=\"1\" class=\"dataframe\">\n"," <thead>\n"," <tr style=\"text-align: right;\">\n"," <th></th>\n"," <th>Unnamed: 0</th>\n"," <th>reviews</th>\n"," </tr>\n"," </thead>\n"," <tbody>\n"," <tr>\n"," <th>0</th>\n"," <td>0</td>\n"," <td>Doesn't work at ALL. Don't waste your money or...</td>\n"," </tr>\n"," <tr>\n"," <th>1</th>\n"," <td>1</td>\n"," <td>What crap. Would need a lot more power to do ...</td>\n"," </tr>\n"," <tr>\n"," <th>2</th>\n"," <td>2</td>\n"," <td>Has no suction and didn't work. Not worth trying.</td>\n"," </tr>\n"," <tr>\n"," <th>3</th>\n"," <td>3</td>\n"," <td>That is definitely a trash. Unable to clean an...</td>\n"," </tr>\n"," <tr>\n"," <th>4</th>\n"," <td>4</td>\n"," <td>Didn't even worked on cleaning the ears at all...</td>\n"," </tr>\n"," </tbody>\n","</table>\n","</div>"],"text/plain":[" Unnamed: 0 reviews\n","0 0 Doesn't work at ALL. Don't waste your money or...\n","1 1 What crap. Would need a lot more power to do ...\n","2 2 Has no suction and didn't work. Not worth trying.\n","3 3 That is definitely a trash. 
Unable to clean an...\n","4 4 Didn't even worked on cleaning the ears at all..."]},"metadata":{"tags":[]},"execution_count":14}]},{"cell_type":"markdown","metadata":{"id":"_ps7jlE3jmDp"},"source":["## Preprocess the test data 🧹"]},{"cell_type":"code","metadata":{"id":"xTdMs_MBjr-a"},"source":["##### Important note - Make sure to add the preprocessing code here correctly\n","\n","\n","'''\n","About the task:\n","\n","You are provided with a codeflow- which consists of functions to be implemented(MANDATORY).\n","\n","You need to implement each of the functions mentioned below, you may add your own function parameters if needed.\n","'''\n","\n","\n","def encode_data(text):\n"," # This function will be used to encode the reviews using a dictionary(created using corpus vocabulary) \n"," \n"," # Example of encoding: \"The food was fabulous but pricey\" has a vocabulary of 6 words, each one has to be mapped to an integer like: \n"," # {'The':1,'food':2,'was':3,'fabulous':4,'but':5,'pricey':6} this vocabulary has to be created for the entire corpus and then be used to \n"," # encode the words into integers \n","\n"," # return encoded examples\n"," pass\n","\n","\n","\n","def convert_to_lower(text):\n"," # return the reviews after converting them to lowercase\n"," pass\n","\n","\n","def remove_punctuation(text):\n"," # return the reviews after removing punctuation\n"," pass\n","\n","\n","def remove_stopwords(text):\n"," # return the reviews after removing the stopwords\n"," pass\n","\n","def perform_tokenization(text):\n"," # return the reviews after performing tokenization\n"," pass\n","\n","\n","def perform_padding(data):\n"," # return the reviews after padding the reviews to maximum length\n"," pass\n","\n","def preprocess_data(data):\n"," # make all the following function calls on your data\n","\n"," review = data[\"reviews\"]\n"," review = convert_to_lower(review)\n"," review = remove_punctuation(review)\n"," review = remove_stopwords(review)\n"," review =
perform_tokenization(review)\n"," review = encode_data(review)\n"," processed_data = perform_padding(review)\n","\n"," # return processed_data # Uncomment this\n"," # Remove this dummy code at the bottom\n"," return np.zeros( (len(data[\"reviews\"]), 100) ) \n"],"execution_count":null,"outputs":[]},{"cell_type":"markdown","metadata":{"id":"4Bo9Fmr1p-hj"},"source":["## Prediction time ⏰"]},{"cell_type":"markdown","metadata":{"id":"75RLUyIhlW6T"},"source":["#### Read and preprocess the data"]},{"cell_type":"code","metadata":{"id":"VEsgymgJYBVR"},"source":["test_reviews=preprocess_data(test_data)"],"execution_count":null,"outputs":[]},{"cell_type":"code","metadata":{"id":"1ZKMU3tp_Qo3"},"source":["from tensorflow import keras\n","model = keras.models.load_model(os.path.join(AIcrowdConfig.ASSETS_DIR, \"dummy_model.h5\"))"],"execution_count":null,"outputs":[]},{"cell_type":"code","metadata":{"id":"XniRSW0fYHGq"},"source":["#Make your predictions here based on your model\n","raw_predictions = model.predict(test_reviews)\n","predictions = np.argmax(raw_predictions, axis=-1)\n","\n","pd.DataFrame(predictions, columns=[\"ratings\"]).to_csv(AIcrowdConfig.PREDICTIONS_PATH)"],"execution_count":null,"outputs":[]},{"cell_type":"code","metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"8Rj6IceQAiPo","executionInfo":{"status":"ok","timestamp":1614270333246,"user_tz":-330,"elapsed":1211,"user":{"displayName":"Dipam Chakraborty","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GhjYbzjRrx303amHrTPU3lUOY9OMpxKTrTzhtYd=s64","userId":"04496869744334527762"}},"outputId":"11c7d7d2-39f1-4cf1-ad04-cc5bb9f252d3"},"source":["!ls $AIcrowdConfig.PREDICTIONS_PATH"],"execution_count":null,"outputs":[{"output_type":"stream","text":["predictions.csv\n"],"name":"stdout"}]},{"cell_type":"markdown","metadata":{"id":"ndJ64fKkymTj"},"source":["# Submit to AIcrowd
๐"]},{"cell_type":"code","metadata":{"id":"AtxH12yzZBtd","colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"status":"ok","timestamp":1614270302405,"user_tz":-330,"elapsed":1798,"user":{"displayName":"Dipam Chakraborty","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GhjYbzjRrx303amHrTPU3lUOY9OMpxKTrTzhtYd=s64","userId":"04496869744334527762"}},"outputId":"f10d63c8-6933-4f36-c6b1-4371ca634d40"},"source":["%aicrowd submission create --jupyter -c dlnlp-note"],"execution_count":null,"outputs":[{"output_type":"stream","text":["An error occured: [Errno 2] No such file or directory: '/content/drive/My Drive/Colab Notebooks/dlnlp-note-inference.ipynb'\n"],"name":"stdout"}]}]} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment