Skip to content

Instantly share code, notes, and snippets.

@skbly7
Created December 22, 2020 13:26
Show Gist options
  • Save skbly7/af83b824b7d82af1790deeff22283e3a to your computer and use it in GitHub Desktop.
Save skbly7/af83b824b7d82af1790deeff22283e3a to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{"nbformat":4,"nbformat_minor":0,"metadata":{"kernelspec":{"display_name":"Python 3","language":"python","name":"python3"},"language_info":{"codemirror_mode":{"name":"ipython","version":3},"file_extension":".py","mimetype":"text/x-python","name":"python","nbconvert_exporter":"python","pygments_lexer":"ipython3","version":"3.7.3"},"colab":{"name":"aicrowd-learning-to-smell-baseline_solution.ipynb","provenance":[],"collapsed_sections":[]}},"cells":[{"cell_type":"code","metadata":{"id":"wRLeurVsV6Li","executionInfo":{"status":"ok","timestamp":1601846778108,"user_tz":-180,"elapsed":1131,"user":{"displayName":"Tatiana Malygina","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GhCqlqXvsPEGhS3I2NKze_aTOa6y3To6ci_3I4HSnE=s64","userId":"01712087614426971441"}}},"source":["import os\n","import pandas as pd\n","import numpy as np\n","from sklearn.neighbors import NearestNeighbors"],"execution_count":93,"outputs":[]},{"cell_type":"code","metadata":{"id":"vKy3l_yvWAUQ","executionInfo":{"status":"ok","timestamp":1601846779748,"user_tz":-180,"elapsed":2751,"user":{"displayName":"Tatiana Malygina","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GhCqlqXvsPEGhS3I2NKze_aTOa6y3To6ci_3I4HSnE=s64","userId":"01712087614426971441"}},"outputId":"cbe343bf-da44-454a-9439-dbee90bcaeea","colab":{"base_uri":"https://localhost:8080/","height":445}},"source":["!wget https://www.dropbox.com/s/3b2ta3qr706d1ua/aicrowd-learning-to-smell-data.zip\n"],"execution_count":94,"outputs":[{"output_type":"stream","text":["--2020-10-04 21:26:18-- https://www.dropbox.com/s/3b2ta3qr706d1ua/aicrowd-learning-to-smell-data.zip\n","Resolving www.dropbox.com (www.dropbox.com)... 162.125.3.1, 2620:100:6018:1::a27d:301\n","Connecting to www.dropbox.com (www.dropbox.com)|162.125.3.1|:443... connected.\n","HTTP request sent, awaiting response... 
301 Moved Permanently\n","Location: /s/raw/3b2ta3qr706d1ua/aicrowd-learning-to-smell-data.zip [following]\n","--2020-10-04 21:26:18-- https://www.dropbox.com/s/raw/3b2ta3qr706d1ua/aicrowd-learning-to-smell-data.zip\n","Reusing existing connection to www.dropbox.com:443.\n","HTTP request sent, awaiting response... 302 Found\n","Location: https://uc29ccbd4c826b54d9e4d4306de4.dl.dropboxusercontent.com/cd/0/inline/BApvdQ-un7z_yczvNGMd6IeXZtoBgClGSpMwWXjfyO2ZKJi3I0ihigar1U5eh9f8zN3Vv4xhPS6PdAahtbFED218gHKEsUpFgWXYIchCJNYuJj2RIMMQxRqmhvoN7uDz3u0/file# [following]\n","--2020-10-04 21:26:18-- https://uc29ccbd4c826b54d9e4d4306de4.dl.dropboxusercontent.com/cd/0/inline/BApvdQ-un7z_yczvNGMd6IeXZtoBgClGSpMwWXjfyO2ZKJi3I0ihigar1U5eh9f8zN3Vv4xhPS6PdAahtbFED218gHKEsUpFgWXYIchCJNYuJj2RIMMQxRqmhvoN7uDz3u0/file\n","Resolving uc29ccbd4c826b54d9e4d4306de4.dl.dropboxusercontent.com (uc29ccbd4c826b54d9e4d4306de4.dl.dropboxusercontent.com)... 162.125.3.15, 2620:100:6018:15::a27d:30f\n","Connecting to uc29ccbd4c826b54d9e4d4306de4.dl.dropboxusercontent.com (uc29ccbd4c826b54d9e4d4306de4.dl.dropboxusercontent.com)|162.125.3.15|:443... connected.\n","HTTP request sent, awaiting response... 
302 Found\n","Location: /cd/0/inline2/BApLZWzS3Rc-uTVYiTx2MzXcdnJQrAaEfAVW5Zowwmm5O-WXBaDX05HT3GbYqUlkz2Q-ZVyEBne5Q3f0LkHk4aoGEl9pCg1UbelpokG9xWbrqcfYIvX-f1NO_bPX1g3pqYRAVdp5V88ZyUZbiCJYcViE3AWXc5K5UHKx18m-dfq54TgJwwEnWuKo6bPf6qduztcvX9F2E0Xq8yleam_tlwPJAeQScnj1DfF4HSUhj57Q9PEAgVb0yi1TPFVUGGF8_zCyfpHNlHSwbl2EiOgMbTUoJEEt4Amryz-rtanq12Jd5ReU-b-8lvD7V_jkvVK1rxu9tE5PCVPkukkp5gjhpCdSYt04mweexIuq4o3bIgjxFQ/file [following]\n","--2020-10-04 21:26:19-- https://uc29ccbd4c826b54d9e4d4306de4.dl.dropboxusercontent.com/cd/0/inline2/BApLZWzS3Rc-uTVYiTx2MzXcdnJQrAaEfAVW5Zowwmm5O-WXBaDX05HT3GbYqUlkz2Q-ZVyEBne5Q3f0LkHk4aoGEl9pCg1UbelpokG9xWbrqcfYIvX-f1NO_bPX1g3pqYRAVdp5V88ZyUZbiCJYcViE3AWXc5K5UHKx18m-dfq54TgJwwEnWuKo6bPf6qduztcvX9F2E0Xq8yleam_tlwPJAeQScnj1DfF4HSUhj57Q9PEAgVb0yi1TPFVUGGF8_zCyfpHNlHSwbl2EiOgMbTUoJEEt4Amryz-rtanq12Jd5ReU-b-8lvD7V_jkvVK1rxu9tE5PCVPkukkp5gjhpCdSYt04mweexIuq4o3bIgjxFQ/file\n","Reusing existing connection to uc29ccbd4c826b54d9e4d4306de4.dl.dropboxusercontent.com:443.\n","HTTP request sent, awaiting response... 
200 OK\n","Length: 56694 (55K) [application/zip]\n","Saving to: ‘aicrowd-learning-to-smell-data.zip.6’\n","\n","aicrowd-learning-to 100%[===================>] 55.37K --.-KB/s in 0.01s \n","\n","2020-10-04 21:26:19 (4.40 MB/s) - ‘aicrowd-learning-to-smell-data.zip.6’ saved [56694/56694]\n","\n"],"name":"stdout"}]},{"cell_type":"code","metadata":{"id":"u1OHgESNXlMF","executionInfo":{"status":"ok","timestamp":1601846780063,"user_tz":-180,"elapsed":2972,"user":{"displayName":"Tatiana Malygina","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GhCqlqXvsPEGhS3I2NKze_aTOa6y3To6ci_3I4HSnE=s64","userId":"01712087614426971441"}},"outputId":"795c0d2c-65b4-46d0-f4af-c3e6337ab31e","colab":{"base_uri":"https://localhost:8080/","height":85}},"source":["!unzip -o aicrowd-learning-to-smell-data.zip"],"execution_count":95,"outputs":[{"output_type":"stream","text":["Archive: aicrowd-learning-to-smell-data.zip\n"," inflating: data/test.csv \n"," inflating: data/train.csv \n"," inflating: data/vocabulary.txt \n"],"name":"stdout"}]},{"cell_type":"code","metadata":{"id":"UgjQaTO2Zkjc","executionInfo":{"status":"ok","timestamp":1601846783185,"user_tz":-180,"elapsed":1626,"user":{"displayName":"Tatiana Malygina","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GhCqlqXvsPEGhS3I2NKze_aTOa6y3To6ci_3I4HSnE=s64","userId":"01712087614426971441"}},"outputId":"53dfb1c5-7de7-4625-a540-18c1f96d9ab7","colab":{"base_uri":"https://localhost:8080/","height":34}},"source":["os.listdir(\"./data\")"],"execution_count":96,"outputs":[{"output_type":"execute_result","data":{"text/plain":["['data', 'test.csv', 'vocabulary.txt', 'train.csv']"]},"metadata":{"tags":[]},"execution_count":96}]},{"cell_type":"code","metadata":{"id":"6DuPytRYV6Lm","executionInfo":{"status":"ok","timestamp":1601846783925,"user_tz":-180,"elapsed":2358,"user":{"displayName":"Tatiana 
Malygina","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GhCqlqXvsPEGhS3I2NKze_aTOa6y3To6ci_3I4HSnE=s64","userId":"01712087614426971441"}}},"source":["train = pd.read_csv(\"data/train.csv\")\n","test = pd.read_csv(\"data/test.csv\")\n","vocab = pd.read_csv(\"data/vocabulary.txt\", header=None)"],"execution_count":97,"outputs":[]},{"cell_type":"markdown","metadata":{"id":"jfx-yAIwV6Lp"},"source":["I used precomputed fingerprints from PubChem. To reproduce them, you can run `python download_data_from_pubchem.py`, which is available on [GitHub](https://github.com/latticetower/learning-to-smell-baseline), or simply download the file with the collected fingerprints from there:"]},{"cell_type":"code","metadata":{"id":"sPr3pMmjaMaU","executionInfo":{"status":"ok","timestamp":1601846783927,"user_tz":-180,"elapsed":2347,"user":{"displayName":"Tatiana Malygina","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GhCqlqXvsPEGhS3I2NKze_aTOa6y3To6ci_3I4HSnE=s64","userId":"01712087614426971441"}},"outputId":"b6717c95-eb3f-4aeb-ffa6-00808c514e6c","colab":{"base_uri":"https://localhost:8080/","height":204}},"source":["!wget https://raw.githubusercontent.com/latticetower/learning-to-smell-baseline/main/pubchem_fingerprints.csv"],"execution_count":98,"outputs":[{"output_type":"stream","text":["--2020-10-04 21:26:23-- https://raw.githubusercontent.com/latticetower/learning-to-smell-baseline/main/pubchem_fingerprints.csv\n","Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.0.133, 151.101.64.133, 151.101.128.133, ...\n","Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.0.133|:443... connected.\n","HTTP request sent, awaiting response... 
200 OK\n","Length: 1359150 (1.3M) [text/plain]\n","Saving to: ‘pubchem_fingerprints.csv.3’\n","\n","pubchem_fingerprint 100%[===================>] 1.30M --.-KB/s in 0.1s \n","\n","2020-10-04 21:26:23 (13.0 MB/s) - ‘pubchem_fingerprints.csv.3’ saved [1359150/1359150]\n","\n"],"name":"stdout"}]},{"cell_type":"code","metadata":{"id":"N_TLEthSV6Lp","executionInfo":{"status":"ok","timestamp":1601846783928,"user_tz":-180,"elapsed":2343,"user":{"displayName":"Tatiana Malygina","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GhCqlqXvsPEGhS3I2NKze_aTOa6y3To6ci_3I4HSnE=s64","userId":"01712087614426971441"}}},"source":["fingerprints = pd.read_csv(\"pubchem_fingerprints.csv\")"],"execution_count":99,"outputs":[]},{"cell_type":"code","metadata":{"id":"0eLif2wxV6Ls","executionInfo":{"status":"ok","timestamp":1601846783930,"user_tz":-180,"elapsed":2332,"user":{"displayName":"Tatiana Malygina","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GhCqlqXvsPEGhS3I2NKze_aTOa6y3To6ci_3I4HSnE=s64","userId":"01712087614426971441"}},"outputId":"fcbda0d2-580d-41e8-b51c-37b5716cb069","colab":{"base_uri":"https://localhost:8080/","height":51}},"source":["train_df = train.merge(fingerprints, on=\"SMILES\", how=\"left\")\n","test_df = test.merge(fingerprints, on=\"SMILES\", how=\"left\")\n","print(train_df.fingerprint.isnull().sum(), \"train molecules have no associated fingerprint\")\n","print(test_df.fingerprint.isnull().sum(), \"test molecules have no associated fingerprint\")"],"execution_count":100,"outputs":[{"output_type":"stream","text":["33 train molecules have no associated fingerprint\n","5 test molecules have no associated fingerprint\n"],"name":"stdout"}]},{"cell_type":"markdown","metadata":{"id":"ZhXeFjkFV6Lz"},"source":["Only molecules with an available fingerprint can be used to find the k nearest neighbours, so I filter both the train and test data and use the unpacked fingerprint bits to compute the k nearest neighbours. 
"]},{"cell_type":"code","metadata":{"id":"4ueyDwpsV6L0","executionInfo":{"status":"ok","timestamp":1601846783931,"user_tz":-180,"elapsed":1623,"user":{"displayName":"Tatiana Malygina","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GhCqlqXvsPEGhS3I2NKze_aTOa6y3To6ci_3I4HSnE=s64","userId":"01712087614426971441"}}},"source":["def to_bits(x):\n","    \"\"\"Unpack a hex-encoded fingerprint string into a 1-D uint8 array of bits.\n","\n","    Raises ValueError (chained to the underlying error) if `x` cannot be decoded.\n","    \"\"\"\n","    try:\n","        return np.unpackbits(np.frombuffer(bytes.fromhex(x), dtype=np.uint8))\n","    except Exception as e:\n","        # Report the offending value and stop, rather than falling through\n","        # to a return of a variable that was never assigned.\n","        raise ValueError(f\"cannot decode fingerprint {x!r}\") from e\n","\n","\n","train_df = train_df[~train_df.fingerprint.isnull()]\n","train_fingerprints = train_df.fingerprint.apply(to_bits)#lambda fingerprint_string: [x=='1' for x in fingerprint_string])\n","train_fingerprints = np.stack(train_fingerprints.values)\n","\n","test_df = test_df[~test_df.fingerprint.isnull()]\n","test_fingerprints = test_df.fingerprint.apply(to_bits)#lambda fingerprint_string: [x=='1' for x in fingerprint_string])\n","test_fingerprints = np.stack(test_fingerprints.values)"],"execution_count":101,"outputs":[]},{"cell_type":"code","metadata":{"id":"T_xig60KV6L2","executionInfo":{"status":"ok","timestamp":1601846792680,"user_tz":-180,"elapsed":9649,"user":{"displayName":"Tatiana Malygina","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GhCqlqXvsPEGhS3I2NKze_aTOa6y3To6ci_3I4HSnE=s64","userId":"01712087614426971441"}}},"source":["nbrs = NearestNeighbors(n_neighbors=5, algorithm='ball_tree').fit(train_fingerprints)\n","distances, neighbour_indices = nbrs.kneighbors(test_fingerprints)"],"execution_count":102,"outputs":[]},{"cell_type":"code","metadata":{"id":"2khr1jBjV6L5","executionInfo":{"status":"ok","timestamp":1601846792685,"user_tz":-180,"elapsed":8946,"user":{"displayName":"Tatiana Malygina","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GhCqlqXvsPEGhS3I2NKze_aTOa6y3To6ci_3I4HSnE=s64","userId":"01712087614426971441"}}},"source":["for i, neighbours in zip(test_df.index, neighbour_indices):\n"," test.loc[i, \"PREDICTIONS\"] = 
\";\".join([train.loc[train_df.index[x], \"SENTENCE\"] for x in neighbours])"],"execution_count":103,"outputs":[]},{"cell_type":"code","metadata":{"id":"lTNOY0zDV6L9","executionInfo":{"status":"ok","timestamp":1601846792687,"user_tz":-180,"elapsed":8182,"user":{"displayName":"Tatiana Malygina","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GhCqlqXvsPEGhS3I2NKze_aTOa6y3To6ci_3I4HSnE=s64","userId":"01712087614426971441"}},"outputId":"aad4ea2d-1d19-4e4e-be90-7367aba3c14b","colab":{"base_uri":"https://localhost:8080/","height":34}},"source":["test.PREDICTIONS.isnull().sum()"],"execution_count":104,"outputs":[{"output_type":"execute_result","data":{"text/plain":["5"]},"metadata":{"tags":[]},"execution_count":104}]},{"cell_type":"markdown","metadata":{"id":"9DyogHV0V6MA"},"source":["We still need to fill in several missing predictions (the test molecules without a fingerprint); for these we use the top-5 most common molecular scents from the train dataset."]},{"cell_type":"code","metadata":{"id":"U-N07x75V6MA","executionInfo":{"status":"ok","timestamp":1601846793471,"user_tz":-180,"elapsed":7249,"user":{"displayName":"Tatiana Malygina","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GhCqlqXvsPEGhS3I2NKze_aTOa6y3To6ci_3I4HSnE=s64","userId":"01712087614426971441"}},"outputId":"f26bc2bd-8889-416d-f48e-879a07a83069","colab":{"base_uri":"https://localhost:8080/","height":119}},"source":["train.SENTENCE.value_counts()[:5]"],"execution_count":105,"outputs":[{"output_type":"execute_result","data":{"text/plain":["odorless 57\n","mint 36\n","fruity 32\n","woody 28\n","oily 24\n","Name: SENTENCE, dtype: int64"]},"metadata":{"tags":[]},"execution_count":105}]},{"cell_type":"code","metadata":{"id":"bBGDE5cUV6MD","executionInfo":{"status":"ok","timestamp":1601846793472,"user_tz":-180,"elapsed":5562,"user":{"displayName":"Tatiana Malygina","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GhCqlqXvsPEGhS3I2NKze_aTOa6y3To6ci_3I4HSnE=s64","userId":"01712087614426971441"}}},"source":["default_prediction = 
\";\".join(train.SENTENCE.value_counts()[:5].index)"],"execution_count":106,"outputs":[]},{"cell_type":"code","metadata":{"id":"LKV6eUO1V6MH","executionInfo":{"status":"ok","timestamp":1601846793473,"user_tz":-180,"elapsed":3989,"user":{"displayName":"Tatiana Malygina","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GhCqlqXvsPEGhS3I2NKze_aTOa6y3To6ci_3I4HSnE=s64","userId":"01712087614426971441"}}},"source":["test.loc[test.PREDICTIONS.isnull(), \"PREDICTIONS\"] = default_prediction"],"execution_count":107,"outputs":[]},{"cell_type":"code","metadata":{"id":"UHhHfGBpV6MK","executionInfo":{"status":"ok","timestamp":1601846793474,"user_tz":-180,"elapsed":3277,"user":{"displayName":"Tatiana Malygina","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GhCqlqXvsPEGhS3I2NKze_aTOa6y3To6ci_3I4HSnE=s64","userId":"01712087614426971441"}}},"source":["test.to_csv(\"baseline_submission.csv\", index=None)"],"execution_count":108,"outputs":[]},{"cell_type":"code","metadata":{"id":"e8fGwomtV6MO","executionInfo":{"status":"ok","timestamp":1601846797593,"user_tz":-180,"elapsed":1140,"user":{"displayName":"Tatiana Malygina","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GhCqlqXvsPEGhS3I2NKze_aTOa6y3To6ci_3I4HSnE=s64","userId":"01712087614426971441"}},"outputId":"4e1e950d-a6c0-45b0-c259-93e372642f80","colab":{"base_uri":"https://localhost:8080/","height":34}},"source":["from google.colab import files\n","files.download(\"baseline_submission.csv\")"],"execution_count":109,"outputs":[{"output_type":"display_data","data":{"application/javascript":["\n"," async function download(id, filename, size) {\n"," if (!google.colab.kernel.accessAllowed) {\n"," return;\n"," }\n"," const div = document.createElement('div');\n"," const label = document.createElement('label');\n"," label.textContent = `Downloading \"${filename}\": `;\n"," div.appendChild(label);\n"," const progress = document.createElement('progress');\n"," progress.max = size;\n"," div.appendChild(progress);\n"," 
document.body.appendChild(div);\n","\n"," const buffers = [];\n"," let downloaded = 0;\n","\n"," const channel = await google.colab.kernel.comms.open(id);\n"," // Send a message to notify the kernel that we're ready.\n"," channel.send({})\n","\n"," for await (const message of channel.messages) {\n"," // Send a message to notify the kernel that we're ready.\n"," channel.send({})\n"," if (message.buffers) {\n"," for (const buffer of message.buffers) {\n"," buffers.push(buffer);\n"," downloaded += buffer.byteLength;\n"," progress.value = downloaded;\n"," }\n"," }\n"," }\n"," const blob = new Blob(buffers, {type: 'application/binary'});\n"," const a = document.createElement('a');\n"," a.href = window.URL.createObjectURL(blob);\n"," a.download = filename;\n"," div.appendChild(a);\n"," a.click();\n"," div.remove();\n"," }\n"," "],"text/plain":["<IPython.core.display.Javascript object>"]},"metadata":{"tags":[]}},{"output_type":"display_data","data":{"application/javascript":["download(\"download_593a351f-3f83-464f-9d25-1471b217af54\", \"baseline_submission.csv\", 144933)"],"text/plain":["<IPython.core.display.Javascript object>"]},"metadata":{"tags":[]}}]},{"cell_type":"code","metadata":{"id":"gtcmeoAga98u"},"source":[""],"execution_count":null,"outputs":[]}]}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment