Created
February 25, 2021 20:02
-
-
Save aicrowd-bot/290de190970341c1f8242ac94c2b2a5d to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{"nbformat":4,"nbformat_minor":0,"metadata":{"kernelspec":{"display_name":"Python 3","language":"python","name":"python3"},"language_info":{"codemirror_mode":{"name":"ipython","version":3},"file_extension":".py","mimetype":"text/x-python","name":"python","nbconvert_exporter":"python","pygments_lexer":"ipython3","version":"3.7.3"},"colab":{"name":"aicrowd-learning-to-smell-baseline_solution.ipynb","provenance":[],"collapsed_sections":[]}},"cells":[{"cell_type":"code","metadata":{"id":"wRLeurVsV6Li","executionInfo":{"status":"ok","timestamp":1601846778108,"user_tz":-180,"elapsed":1131,"user":{"displayName":"Tatiana Malygina","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GhCqlqXvsPEGhS3I2NKze_aTOa6y3To6ci_3I4HSnE=s64","userId":"01712087614426971441"}}},"source":["import os\n","import pandas as pd\n","import numpy as np\n","from sklearn.neighbors import NearestNeighbors"],"execution_count":93,"outputs":[]},{"cell_type":"code","metadata":{"id":"vKy3l_yvWAUQ","executionInfo":{"status":"ok","timestamp":1601846779748,"user_tz":-180,"elapsed":2751,"user":{"displayName":"Tatiana Malygina","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GhCqlqXvsPEGhS3I2NKze_aTOa6y3To6ci_3I4HSnE=s64","userId":"01712087614426971441"}},"outputId":"cbe343bf-da44-454a-9439-dbee90bcaeea","colab":{"base_uri":"https://localhost:8080/","height":445}},"source":["!wget https://www.dropbox.com/s/3b2ta3qr706d1ua/aicrowd-learning-to-smell-data.zip\n"],"execution_count":94,"outputs":[{"output_type":"stream","text":["--2020-10-04 21:26:18-- https://www.dropbox.com/s/3b2ta3qr706d1ua/aicrowd-learning-to-smell-data.zip\n","Resolving www.dropbox.com (www.dropbox.com)... 162.125.3.1, 2620:100:6018:1::a27d:301\n","Connecting to www.dropbox.com (www.dropbox.com)|162.125.3.1|:443... connected.\n","HTTP request sent, awaiting response... 
301 Moved Permanently\n","Location: /s/raw/3b2ta3qr706d1ua/aicrowd-learning-to-smell-data.zip [following]\n","--2020-10-04 21:26:18-- https://www.dropbox.com/s/raw/3b2ta3qr706d1ua/aicrowd-learning-to-smell-data.zip\n","Reusing existing connection to www.dropbox.com:443.\n","HTTP request sent, awaiting response... 302 Found\n","Location: https://uc29ccbd4c826b54d9e4d4306de4.dl.dropboxusercontent.com/cd/0/inline/BApvdQ-un7z_yczvNGMd6IeXZtoBgClGSpMwWXjfyO2ZKJi3I0ihigar1U5eh9f8zN3Vv4xhPS6PdAahtbFED218gHKEsUpFgWXYIchCJNYuJj2RIMMQxRqmhvoN7uDz3u0/file# [following]\n","--2020-10-04 21:26:18-- https://uc29ccbd4c826b54d9e4d4306de4.dl.dropboxusercontent.com/cd/0/inline/BApvdQ-un7z_yczvNGMd6IeXZtoBgClGSpMwWXjfyO2ZKJi3I0ihigar1U5eh9f8zN3Vv4xhPS6PdAahtbFED218gHKEsUpFgWXYIchCJNYuJj2RIMMQxRqmhvoN7uDz3u0/file\n","Resolving uc29ccbd4c826b54d9e4d4306de4.dl.dropboxusercontent.com (uc29ccbd4c826b54d9e4d4306de4.dl.dropboxusercontent.com)... 162.125.3.15, 2620:100:6018:15::a27d:30f\n","Connecting to uc29ccbd4c826b54d9e4d4306de4.dl.dropboxusercontent.com (uc29ccbd4c826b54d9e4d4306de4.dl.dropboxusercontent.com)|162.125.3.15|:443... connected.\n","HTTP request sent, awaiting response... 
302 Found\n","Location: /cd/0/inline2/BApLZWzS3Rc-uTVYiTx2MzXcdnJQrAaEfAVW5Zowwmm5O-WXBaDX05HT3GbYqUlkz2Q-ZVyEBne5Q3f0LkHk4aoGEl9pCg1UbelpokG9xWbrqcfYIvX-f1NO_bPX1g3pqYRAVdp5V88ZyUZbiCJYcViE3AWXc5K5UHKx18m-dfq54TgJwwEnWuKo6bPf6qduztcvX9F2E0Xq8yleam_tlwPJAeQScnj1DfF4HSUhj57Q9PEAgVb0yi1TPFVUGGF8_zCyfpHNlHSwbl2EiOgMbTUoJEEt4Amryz-rtanq12Jd5ReU-b-8lvD7V_jkvVK1rxu9tE5PCVPkukkp5gjhpCdSYt04mweexIuq4o3bIgjxFQ/file [following]\n","--2020-10-04 21:26:19-- https://uc29ccbd4c826b54d9e4d4306de4.dl.dropboxusercontent.com/cd/0/inline2/BApLZWzS3Rc-uTVYiTx2MzXcdnJQrAaEfAVW5Zowwmm5O-WXBaDX05HT3GbYqUlkz2Q-ZVyEBne5Q3f0LkHk4aoGEl9pCg1UbelpokG9xWbrqcfYIvX-f1NO_bPX1g3pqYRAVdp5V88ZyUZbiCJYcViE3AWXc5K5UHKx18m-dfq54TgJwwEnWuKo6bPf6qduztcvX9F2E0Xq8yleam_tlwPJAeQScnj1DfF4HSUhj57Q9PEAgVb0yi1TPFVUGGF8_zCyfpHNlHSwbl2EiOgMbTUoJEEt4Amryz-rtanq12Jd5ReU-b-8lvD7V_jkvVK1rxu9tE5PCVPkukkp5gjhpCdSYt04mweexIuq4o3bIgjxFQ/file\n","Reusing existing connection to uc29ccbd4c826b54d9e4d4306de4.dl.dropboxusercontent.com:443.\n","HTTP request sent, awaiting response... 
200 OK\n","Length: 56694 (55K) [application/zip]\n","Saving to: ‘aicrowd-learning-to-smell-data.zip.6’\n","\n","aicrowd-learning-to 100%[===================>] 55.37K --.-KB/s in 0.01s \n","\n","2020-10-04 21:26:19 (4.40 MB/s) - ‘aicrowd-learning-to-smell-data.zip.6’ saved [56694/56694]\n","\n"],"name":"stdout"}]},{"cell_type":"code","metadata":{"id":"u1OHgESNXlMF","executionInfo":{"status":"ok","timestamp":1601846780063,"user_tz":-180,"elapsed":2972,"user":{"displayName":"Tatiana Malygina","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GhCqlqXvsPEGhS3I2NKze_aTOa6y3To6ci_3I4HSnE=s64","userId":"01712087614426971441"}},"outputId":"795c0d2c-65b4-46d0-f4af-c3e6337ab31e","colab":{"base_uri":"https://localhost:8080/","height":85}},"source":["!unzip -o aicrowd-learning-to-smell-data.zip"],"execution_count":95,"outputs":[{"output_type":"stream","text":["Archive: aicrowd-learning-to-smell-data.zip\n"," inflating: data/test.csv \n"," inflating: data/train.csv \n"," inflating: data/vocabulary.txt \n"],"name":"stdout"}]},{"cell_type":"code","metadata":{"id":"UgjQaTO2Zkjc","executionInfo":{"status":"ok","timestamp":1601846783185,"user_tz":-180,"elapsed":1626,"user":{"displayName":"Tatiana Malygina","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GhCqlqXvsPEGhS3I2NKze_aTOa6y3To6ci_3I4HSnE=s64","userId":"01712087614426971441"}},"outputId":"53dfb1c5-7de7-4625-a540-18c1f96d9ab7","colab":{"base_uri":"https://localhost:8080/","height":34}},"source":["os.listdir(\"./data\")"],"execution_count":96,"outputs":[{"output_type":"execute_result","data":{"text/plain":["['data', 'test.csv', 'vocabulary.txt', 'train.csv']"]},"metadata":{"tags":[]},"execution_count":96}]},{"cell_type":"code","metadata":{"id":"6DuPytRYV6Lm","executionInfo":{"status":"ok","timestamp":1601846783925,"user_tz":-180,"elapsed":2358,"user":{"displayName":"Tatiana 
Malygina","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GhCqlqXvsPEGhS3I2NKze_aTOa6y3To6ci_3I4HSnE=s64","userId":"01712087614426971441"}}},"source":["train = pd.read_csv(\"data/train.csv\")\n","test = pd.read_csv(\"data/test.csv\")\n","vocab = pd.read_csv(\"data/vocabulary.txt\", header=None)"],"execution_count":97,"outputs":[]},{"cell_type":"markdown","metadata":{"id":"jfx-yAIwV6Lp"},"source":["I used precomputed fingerprints from PubChem. To reproduce, you can run `python download_data_from_pubchem.py`, which is available on [GitHub](https://github.com/latticetower/learning-to-smell-baseline), or simply download the file with the collected fingerprints from there:"]},{"cell_type":"code","metadata":{"id":"sPr3pMmjaMaU","executionInfo":{"status":"ok","timestamp":1601846783927,"user_tz":-180,"elapsed":2347,"user":{"displayName":"Tatiana Malygina","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GhCqlqXvsPEGhS3I2NKze_aTOa6y3To6ci_3I4HSnE=s64","userId":"01712087614426971441"}},"outputId":"b6717c95-eb3f-4aeb-ffa6-00808c514e6c","colab":{"base_uri":"https://localhost:8080/","height":204}},"source":["!wget https://raw.githubusercontent.com/latticetower/learning-to-smell-baseline/main/pubchem_fingerprints.csv"],"execution_count":98,"outputs":[{"output_type":"stream","text":["--2020-10-04 21:26:23-- https://raw.githubusercontent.com/latticetower/learning-to-smell-baseline/main/pubchem_fingerprints.csv\n","Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.0.133, 151.101.64.133, 151.101.128.133, ...\n","Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.0.133|:443... connected.\n","HTTP request sent, awaiting response... 
200 OK\n","Length: 1359150 (1.3M) [text/plain]\n","Saving to: ‘pubchem_fingerprints.csv.3’\n","\n","pubchem_fingerprint 100%[===================>] 1.30M --.-KB/s in 0.1s \n","\n","2020-10-04 21:26:23 (13.0 MB/s) - ‘pubchem_fingerprints.csv.3’ saved [1359150/1359150]\n","\n"],"name":"stdout"}]},{"cell_type":"code","metadata":{"id":"N_TLEthSV6Lp","executionInfo":{"status":"ok","timestamp":1601846783928,"user_tz":-180,"elapsed":2343,"user":{"displayName":"Tatiana Malygina","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GhCqlqXvsPEGhS3I2NKze_aTOa6y3To6ci_3I4HSnE=s64","userId":"01712087614426971441"}}},"source":["fingerprints = pd.read_csv(\"pubchem_fingerprints.csv\")"],"execution_count":99,"outputs":[]},{"cell_type":"code","metadata":{"id":"0eLif2wxV6Ls","executionInfo":{"status":"ok","timestamp":1601846783930,"user_tz":-180,"elapsed":2332,"user":{"displayName":"Tatiana Malygina","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GhCqlqXvsPEGhS3I2NKze_aTOa6y3To6ci_3I4HSnE=s64","userId":"01712087614426971441"}},"outputId":"fcbda0d2-580d-41e8-b51c-37b5716cb069","colab":{"base_uri":"https://localhost:8080/","height":51}},"source":["train_df = train.merge(fingerprints, on=\"SMILES\", how=\"left\")\n","test_df = test.merge(fingerprints, on=\"SMILES\", how=\"left\")\n","print(train_df.fingerprint.isnull().sum(), \"train molecules have no associated fingerprint\")\n","print(test_df.fingerprint.isnull().sum(), \"test molecules have no associated fingerprint\")"],"execution_count":100,"outputs":[{"output_type":"stream","text":["33 train molecules have no associated fingerprint\n","5 test molecules have no associated fingerprint\n"],"name":"stdout"}]},{"cell_type":"markdown","metadata":{"id":"ZhXeFjkFV6Lz"},"source":["Only molecules that have a fingerprint available can be used to find the k nearest neighbours, so I filter both the train and test data and use the unpacked fingerprints to compute the k nearest neighbours. 
"]},{"cell_type":"code","metadata":{"id":"4ueyDwpsV6L0","executionInfo":{"status":"ok","timestamp":1601846783931,"user_tz":-180,"elapsed":1623,"user":{"displayName":"Tatiana Malygina","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GhCqlqXvsPEGhS3I2NKze_aTOa6y3To6ci_3I4HSnE=s64","userId":"01712087614426971441"}}},"source":["def to_bits(x):\n"," try:\n"," unpacked = np.unpackbits(np.frombuffer(bytes.fromhex(x), dtype=np.uint8))\n"," except Exception as e:\n"," print(e)\n"," print(x)\n"," \n"," return unpacked\n","\n","\n","train_df = train_df[~train_df.fingerprint.isnull()]\n","train_fingerprints = train_df.fingerprint.apply(to_bits)#lambda fingerprint_string: [x=='1' for x in fingerprint_string])\n","train_fingerprints = np.stack(train_fingerprints.values)\n","\n","test_df = test_df[~test_df.fingerprint.isnull()]\n","test_fingerprints = test_df.fingerprint.apply(to_bits)#lambda fingerprint_string: [x=='1' for x in fingerprint_string])\n","test_fingerprints = np.stack(test_fingerprints.values)"],"execution_count":101,"outputs":[]},{"cell_type":"code","metadata":{"id":"T_xig60KV6L2","executionInfo":{"status":"ok","timestamp":1601846792680,"user_tz":-180,"elapsed":9649,"user":{"displayName":"Tatiana Malygina","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GhCqlqXvsPEGhS3I2NKze_aTOa6y3To6ci_3I4HSnE=s64","userId":"01712087614426971441"}}},"source":["nbrs = NearestNeighbors(n_neighbors=5, algorithm='ball_tree').fit(train_fingerprints)\n","distances, neighbour_indices = nbrs.kneighbors(test_fingerprints)"],"execution_count":102,"outputs":[]},{"cell_type":"code","metadata":{"id":"2khr1jBjV6L5","executionInfo":{"status":"ok","timestamp":1601846792685,"user_tz":-180,"elapsed":8946,"user":{"displayName":"Tatiana Malygina","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GhCqlqXvsPEGhS3I2NKze_aTOa6y3To6ci_3I4HSnE=s64","userId":"01712087614426971441"}}},"source":["for i, neighbours in zip(test_df.index, neighbour_indices):\n"," test.loc[i, \"PREDICTIONS\"] = 
\";\".join([train.loc[train_df.index[x], \"SENTENCE\"] for x in neighbours])"],"execution_count":103,"outputs":[]},{"cell_type":"code","metadata":{"id":"lTNOY0zDV6L9","executionInfo":{"status":"ok","timestamp":1601846792687,"user_tz":-180,"elapsed":8182,"user":{"displayName":"Tatiana Malygina","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GhCqlqXvsPEGhS3I2NKze_aTOa6y3To6ci_3I4HSnE=s64","userId":"01712087614426971441"}},"outputId":"aad4ea2d-1d19-4e4e-be90-7367aba3c14b","colab":{"base_uri":"https://localhost:8080/","height":34}},"source":["test.PREDICTIONS.isnull().sum()"],"execution_count":104,"outputs":[{"output_type":"execute_result","data":{"text/plain":["5"]},"metadata":{"tags":[]},"execution_count":104}]},{"cell_type":"markdown","metadata":{"id":"9DyogHV0V6MA"},"source":["We still need to fill several predictions, for this we use top-5 most common molecular scents from train dataset."]},{"cell_type":"code","metadata":{"id":"U-N07x75V6MA","executionInfo":{"status":"ok","timestamp":1601846793471,"user_tz":-180,"elapsed":7249,"user":{"displayName":"Tatiana Malygina","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GhCqlqXvsPEGhS3I2NKze_aTOa6y3To6ci_3I4HSnE=s64","userId":"01712087614426971441"}},"outputId":"f26bc2bd-8889-416d-f48e-879a07a83069","colab":{"base_uri":"https://localhost:8080/","height":119}},"source":["train.SENTENCE.value_counts()[:5]"],"execution_count":105,"outputs":[{"output_type":"execute_result","data":{"text/plain":["odorless 57\n","mint 36\n","fruity 32\n","woody 28\n","oily 24\n","Name: SENTENCE, dtype: int64"]},"metadata":{"tags":[]},"execution_count":105}]},{"cell_type":"code","metadata":{"id":"bBGDE5cUV6MD","executionInfo":{"status":"ok","timestamp":1601846793472,"user_tz":-180,"elapsed":5562,"user":{"displayName":"Tatiana Malygina","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GhCqlqXvsPEGhS3I2NKze_aTOa6y3To6ci_3I4HSnE=s64","userId":"01712087614426971441"}}},"source":["default_prediction = 
\";\".join(train.SENTENCE.value_counts()[:5].index)"],"execution_count":106,"outputs":[]},{"cell_type":"code","metadata":{"id":"LKV6eUO1V6MH","executionInfo":{"status":"ok","timestamp":1601846793473,"user_tz":-180,"elapsed":3989,"user":{"displayName":"Tatiana Malygina","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GhCqlqXvsPEGhS3I2NKze_aTOa6y3To6ci_3I4HSnE=s64","userId":"01712087614426971441"}}},"source":["test.loc[test.PREDICTIONS.isnull(), \"PREDICTIONS\"] = default_prediction"],"execution_count":107,"outputs":[]},{"cell_type":"code","metadata":{"id":"UHhHfGBpV6MK","executionInfo":{"status":"ok","timestamp":1601846793474,"user_tz":-180,"elapsed":3277,"user":{"displayName":"Tatiana Malygina","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GhCqlqXvsPEGhS3I2NKze_aTOa6y3To6ci_3I4HSnE=s64","userId":"01712087614426971441"}}},"source":["test.to_csv(\"baseline_submission.csv\", index=None)"],"execution_count":108,"outputs":[]},{"cell_type":"code","metadata":{"id":"e8fGwomtV6MO","executionInfo":{"status":"ok","timestamp":1601846797593,"user_tz":-180,"elapsed":1140,"user":{"displayName":"Tatiana Malygina","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GhCqlqXvsPEGhS3I2NKze_aTOa6y3To6ci_3I4HSnE=s64","userId":"01712087614426971441"}},"outputId":"4e1e950d-a6c0-45b0-c259-93e372642f80","colab":{"base_uri":"https://localhost:8080/","height":34}},"source":["from google.colab import files\n","files.download(\"baseline_submission.csv\")"],"execution_count":109,"outputs":[{"output_type":"display_data","data":{"application/javascript":["\n"," async function download(id, filename, size) {\n"," if (!google.colab.kernel.accessAllowed) {\n"," return;\n"," }\n"," const div = document.createElement('div');\n"," const label = document.createElement('label');\n"," label.textContent = `Downloading \"${filename}\": `;\n"," div.appendChild(label);\n"," const progress = document.createElement('progress');\n"," progress.max = size;\n"," div.appendChild(progress);\n"," 
document.body.appendChild(div);\n","\n"," const buffers = [];\n"," let downloaded = 0;\n","\n"," const channel = await google.colab.kernel.comms.open(id);\n"," // Send a message to notify the kernel that we're ready.\n"," channel.send({})\n","\n"," for await (const message of channel.messages) {\n"," // Send a message to notify the kernel that we're ready.\n"," channel.send({})\n"," if (message.buffers) {\n"," for (const buffer of message.buffers) {\n"," buffers.push(buffer);\n"," downloaded += buffer.byteLength;\n"," progress.value = downloaded;\n"," }\n"," }\n"," }\n"," const blob = new Blob(buffers, {type: 'application/binary'});\n"," const a = document.createElement('a');\n"," a.href = window.URL.createObjectURL(blob);\n"," a.download = filename;\n"," div.appendChild(a);\n"," a.click();\n"," div.remove();\n"," }\n"," "],"text/plain":["<IPython.core.display.Javascript object>"]},"metadata":{"tags":[]}},{"output_type":"display_data","data":{"application/javascript":["download(\"download_593a351f-3f83-464f-9d25-1471b217af54\", \"baseline_submission.csv\", 144933)"],"text/plain":["<IPython.core.display.Javascript object>"]},"metadata":{"tags":[]}}]},{"cell_type":"code","metadata":{"id":"gtcmeoAga98u"},"source":[""],"execution_count":null,"outputs":[]}]} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment