ljmartin/chembl_assay_download.ipynb

## chembl_assay_download.ipynb
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import bq_helper\n",
    "from bq_helper import BigQueryHelper\n",
    "\n",
    "#you will need to set credentials to use google's BigQuery\n",
    "#see: https://cloud.google.com/docs/authentication/getting-started\n",
    "\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# How to download activity data from ChEMBL from the cloud\n",
    "\n",
    "Google performs the queries and returns only the relevant info. Saves ~15GB since you don't need a local copy of the chembl database. Takes about 10-15mins. Must have bq_helper, see https://github.com/SohierDane/BigQuery_Helper"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Set up a BigQuery helper instance:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "ebi_chembl = bq_helper.BigQueryHelper(active_project=\"patents-public-data\",\n",
    "                                   dataset_name=\"ebi_chembl\")\n",
    "\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### View available tables as sanity check:\n",
    "Just showing first ten here"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 147,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['action_type',\n",
       " 'action_type_23',\n",
       " 'action_type_24',\n",
       " 'activities',\n",
       " 'activities_23',\n",
       " 'activities_24',\n",
       " 'activity_properties',\n",
       " 'activity_properties_24',\n",
       " 'activity_smid',\n",
       " 'activity_smid_24']"
      ]
     },
     "execution_count": 147,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# View table names under the ebi_chembl data table\n",
    "bq_assistant = BigQueryHelper(\"patents-public-data\", \"ebi_chembl\")\n",
    "bq_assistant.list_tables()[:10]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### View the available columns in a given table:\n",
    "\n",
    "This helped with knowing what to ask for in the queries below. Example here prints the columns for the 'target_dictionary' table."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 117,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Index(['tid', 'target_type', 'pref_name', 'tax_id', 'organism', 'chembl_id',\n",
       "       'species_group_flag'],\n",
       "      dtype='object')"
      ]
     },
     "execution_count": 117,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "\n",
    "# View the first three rows of the tissue_dictionary_23 data table\n",
    "b =bq_assistant.head(\"target_dictionary\", num_rows=3)\n",
    "b.columns"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Save tables for targets, activities, assays, molecules, and structures\n",
    "\n",
    "Uncomment the 'estimate_query' line to check how big the download will be (in GB) before actually saving it.\n",
    "There is a (large) limit of 5TB per month apparently. "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 149,
   "metadata": {
    "scrolled": false
   },
   "outputs": [],
   "source": [
    "#Target dictionary:\n",
    "query1 = \"\"\"\n",
    "SELECT \n",
    "    chembl_id, organism, pref_name, tid\n",
    "FROM\n",
    "  `patents-public-data.ebi_chembl.target_dictionary`\n",
    "WHERE\n",
    "    organism in ('Homo sapiens')\n",
    "\"\"\"\n",
    "\n",
    "#bq_assistant.estimate_query_size(query1)\n",
    "target_dict = ebi_chembl.query_to_pandas_safe(query1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 165,
   "metadata": {},
   "outputs": [],
   "source": [
    "##Activities:\n",
    "\n",
    "query1 = \"\"\"\n",
    "SELECT\n",
    "    assay_id, molregno, activity_comment, pchembl_value\n",
    "FROM\n",
    "  `patents-public-data.ebi_chembl.activities`\n",
    "\"\"\"\n",
    "#take only positives:\n",
    "#WHERE\n",
    "#    activity_comment in ('Active', 'active', 'Partial agonist', 'Agonist', 'Antagonist') or pchembl_value>\"5\"\n",
    "\n",
    "#bq_assistant.estimate_query_size(query1)\n",
    "activities = ebi_chembl.query_to_pandas_safe(query1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 108,
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "##Assays\n",
    "\n",
    "query1 = \"\"\"\n",
    "SELECT\n",
    "    chembl_id, assay_id, assay_organism, tid\n",
    "FROM\n",
    "  `patents-public-data.ebi_chembl.assays`\n",
    "WHERE\n",
    "    assay_organism in ('Homo sapiens')\n",
    "\n",
    "\"\"\"\n",
    "\n",
    "#bq_assistant.estimate_query_size(query1)\n",
    "assays = ebi_chembl.query_to_pandas_safe(query1)\n",
    "      "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 94,
   "metadata": {},
   "outputs": [],
   "source": [
    "##Molecule dictionary:\n",
    "\n",
    "query1 = \"\"\"\n",
    "SELECT\n",
    "    molregno, chembl_id\n",
    "FROM\n",
    "  `patents-public-data.ebi_chembl.molecule_dictionary`\n",
    "\n",
    "\"\"\"\n",
    "#bq_assistant.estimate_query_size(query1)\n",
    "molecule_dict = ebi_chembl.query_to_pandas_safe(query1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 95,
   "metadata": {},
   "outputs": [],
   "source": [
    "##Compound structures:\n",
    "\n",
    "query1 = \"\"\"\n",
    "SELECT\n",
    "    molregno, canonical_smiles\n",
    "FROM\n",
    "  `patents-public-data.ebi_chembl.compound_structures`\n",
    "\n",
    "\"\"\"\n",
    "bq_assistant.estimate_query_size(query1)\n",
    "compound_structures = ebi_chembl.query_to_pandas_safe(query1)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Perform inner joins as you would in SQL:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 166,
   "metadata": {},
   "outputs": [],
   "source": [
    "df = pd.merge(compound_structures, molecule_dict, on='molregno')\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 167,
   "metadata": {},
   "outputs": [],
   "source": [
    "df2 = pd.merge(df, activities, on='molregno')\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 169,
   "metadata": {},
   "outputs": [],
   "source": [
    "#assays = assays.drop('chembl_id', axis=1)\n",
    "df3 = pd.merge(df2, assays, on='assay_id')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "target_dict.columns = ['chembl_target_id', 'organism', 'pref_name', 'tid']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 171,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "df4 = pd.merge(df3, target_dict, on='tid')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Print out activites:\n",
    "The columns here contain enough data to parse and build a virtual screening or do any other analysis."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 174,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "4253823\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>molregno</th>\n",
       "      <th>canonical_smiles</th>\n",
       "      <th>chembl_id</th>\n",
       "      <th>assay_id</th>\n",
       "      <th>activity_comment</th>\n",
       "      <th>pchembl_value</th>\n",
       "      <th>assay_organism</th>\n",
       "      <th>tid</th>\n",
       "      <th>chembl_target_id</th>\n",
       "      <th>organism</th>\n",
       "      <th>pref_name</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>97</td>\n",
       "      <td>COc1cc2nc(nc(N)c2cc1OC)N3CCN(CC3)C(=O)c4occc4</td>\n",
       "      <td>CHEMBL2</td>\n",
       "      <td>688489</td>\n",
       "      <td>Active</td>\n",
       "      <td>4.8</td>\n",
       "      <td>Homo sapiens</td>\n",
       "      <td>100782</td>\n",
       "      <td>CHEMBL4159</td>\n",
       "      <td>Homo sapiens</td>\n",
       "      <td>Endoplasmic reticulum-associated amyloid beta-...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>97</td>\n",
       "      <td>COc1cc2nc(nc(N)c2cc1OC)N3CCN(CC3)C(=O)c4occc4</td>\n",
       "      <td>CHEMBL2</td>\n",
       "      <td>688489</td>\n",
       "      <td>Active</td>\n",
       "      <td>4.9</td>\n",
       "      <td>Homo sapiens</td>\n",
       "      <td>100782</td>\n",
       "      <td>CHEMBL4159</td>\n",
       "      <td>Homo sapiens</td>\n",
       "      <td>Endoplasmic reticulum-associated amyloid beta-...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>97</td>\n",
       "      <td>COc1cc2nc(nc(N)c2cc1OC)N3CCN(CC3)C(=O)c4occc4</td>\n",
       "      <td>CHEMBL2</td>\n",
       "      <td>688489</td>\n",
       "      <td>Active</td>\n",
       "      <td>4.5</td>\n",
       "      <td>Homo sapiens</td>\n",
       "      <td>100782</td>\n",
       "      <td>CHEMBL4159</td>\n",
       "      <td>Homo sapiens</td>\n",
       "      <td>Endoplasmic reticulum-associated amyloid beta-...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>2866</td>\n",
       "      <td>COc1cc2ncnc(Nc3cccc(Cl)c3)c2cc1OC</td>\n",
       "      <td>CHEMBL7917</td>\n",
       "      <td>688489</td>\n",
       "      <td>Active</td>\n",
       "      <td>4.8</td>\n",
       "      <td>Homo sapiens</td>\n",
       "      <td>100782</td>\n",
       "      <td>CHEMBL4159</td>\n",
       "      <td>Homo sapiens</td>\n",
       "      <td>Endoplasmic reticulum-associated amyloid beta-...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>37980</td>\n",
       "      <td>[O-][N+](=O)OCCNC(=O)c1cccnc1</td>\n",
       "      <td>CHEMBL284906</td>\n",
       "      <td>688489</td>\n",
       "      <td>Not Active</td>\n",
       "      <td>4.7</td>\n",
       "      <td>Homo sapiens</td>\n",
       "      <td>100782</td>\n",
       "      <td>CHEMBL4159</td>\n",
       "      <td>Homo sapiens</td>\n",
       "      <td>Endoplasmic reticulum-associated amyloid beta-...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "  molregno                               canonical_smiles     chembl_id  \\\n",
       "0       97  COc1cc2nc(nc(N)c2cc1OC)N3CCN(CC3)C(=O)c4occc4       CHEMBL2   \n",
       "1       97  COc1cc2nc(nc(N)c2cc1OC)N3CCN(CC3)C(=O)c4occc4       CHEMBL2   \n",
       "2       97  COc1cc2nc(nc(N)c2cc1OC)N3CCN(CC3)C(=O)c4occc4       CHEMBL2   \n",
       "3     2866              COc1cc2ncnc(Nc3cccc(Cl)c3)c2cc1OC    CHEMBL7917   \n",
       "4    37980                  [O-][N+](=O)OCCNC(=O)c1cccnc1  CHEMBL284906   \n",
       "\n",
       "  assay_id activity_comment pchembl_value assay_organism     tid  \\\n",
       "0   688489           Active           4.8   Homo sapiens  100782   \n",
       "1   688489           Active           4.9   Homo sapiens  100782   \n",
       "2   688489           Active           4.5   Homo sapiens  100782   \n",
       "3   688489           Active           4.8   Homo sapiens  100782   \n",
       "4   688489       Not Active           4.7   Homo sapiens  100782   \n",
       "\n",
       "  chembl_target_id      organism  \\\n",
       "0       CHEMBL4159  Homo sapiens   \n",
       "1       CHEMBL4159  Homo sapiens   \n",
       "2       CHEMBL4159  Homo sapiens   \n",
       "3       CHEMBL4159  Homo sapiens   \n",
       "4       CHEMBL4159  Homo sapiens   \n",
       "\n",
       "                                           pref_name  \n",
       "0  Endoplasmic reticulum-associated amyloid beta-...  \n",
       "1  Endoplasmic reticulum-associated amyloid beta-...  \n",
       "2  Endoplasmic reticulum-associated amyloid beta-...  \n",
       "3  Endoplasmic reticulum-associated amyloid beta-...  \n",
       "4  Endoplasmic reticulum-associated amyloid beta-...  "
      ]
     },
     "execution_count": 174,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "print(len(df4))\n",
    "df4.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 175,
   "metadata": {},
   "outputs": [],
   "source": [
    "df5 = df4.drop(['molregno', 'assay_id', 'assay_organism', 'tid'], axis=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 176,
   "metadata": {},
   "outputs": [],
   "source": [
    "import csv\n",
    "df5.to_csv('all_activities.csv', index=False, header=True, quoting=csv.QUOTE_ALL)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
	{
	"cells": [
	{
	"cell_type": "code",
	"execution_count": 1,
	"metadata": {},
	"outputs": [],
	"source": [
	"import pandas as pd\n",
	"import bq_helper\n",
	"from bq_helper import BigQueryHelper\n",
	"\n",
	"#you will need to set credentials to use google's BigQuery\n",
	"#see: https://cloud.google.com/docs/authentication/getting-started\n",
	"\n"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"# How to download activity data from ChEMBL from the cloud\n",
	"\n",
	"Google performs the queries and returns only the relevant info. Saves ~15GB since you don't need a local copy of the chembl database. Takes about 10-15mins. Must have bq_helper, see https://github.com/SohierDane/BigQuery_Helper"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"### Set up a BigQuery helper instance:"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 8,
	"metadata": {},
	"outputs": [],
	"source": [
	"ebi_chembl = bq_helper.BigQueryHelper(active_project=\"patents-public-data\",\n",
	" dataset_name=\"ebi_chembl\")\n",
	"\n"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"### View available tables as sanity check:\n",
	"Just showing first ten here"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 147,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"['action_type',\n",
	" 'action_type_23',\n",
	" 'action_type_24',\n",
	" 'activities',\n",
	" 'activities_23',\n",
	" 'activities_24',\n",
	" 'activity_properties',\n",
	" 'activity_properties_24',\n",
	" 'activity_smid',\n",
	" 'activity_smid_24']"
	]
	},
	"execution_count": 147,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"# View table names under the ebi_chembl data table\n",
	"bq_assistant = BigQueryHelper(\"patents-public-data\", \"ebi_chembl\")\n",
	"bq_assistant.list_tables()[:10]"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"### View the available columns in a given table:\n",
	"\n",
	"This helped with knowing what to ask for in the queries below. Example here prints the columns for the 'target_dictionary' table."
	]
	},
	{
	"cell_type": "code",
	"execution_count": 117,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"Index(['tid', 'target_type', 'pref_name', 'tax_id', 'organism', 'chembl_id',\n",
	" 'species_group_flag'],\n",
	" dtype='object')"
	]
	},
	"execution_count": 117,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"\n",
	"# View the first three rows of the tissue_dictionary_23 data table\n",
	"b =bq_assistant.head(\"target_dictionary\", num_rows=3)\n",
	"b.columns"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"# Save tables for targets, activities, assays, molecules, and structures\n",
	"\n",
	"Uncomment the 'estimate_query' line to check how big the download will be (in GB) before actually saving it.\n",
	"There is a (large) limit of 5TB per month apparently. "
	]
	},
	{
	"cell_type": "code",
	"execution_count": 149,
	"metadata": {
	"scrolled": false
	},
	"outputs": [],
	"source": [
	"#Target dictionary:\n",
	"query1 = \"\"\"\n",
	"SELECT \n",
	" chembl_id, organism, pref_name, tid\n",
	"FROM\n",
	" `patents-public-data.ebi_chembl.target_dictionary`\n",
	"WHERE\n",
	" organism in ('Homo sapiens')\n",
	"\"\"\"\n",
	"\n",
	"#bq_assistant.estimate_query_size(query1)\n",
	"target_dict = ebi_chembl.query_to_pandas_safe(query1)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 165,
	"metadata": {},
	"outputs": [],
	"source": [
	"##Activities:\n",
	"\n",
	"query1 = \"\"\"\n",
	"SELECT\n",
	" assay_id, molregno, activity_comment, pchembl_value\n",
	"FROM\n",
	" `patents-public-data.ebi_chembl.activities`\n",
	"\"\"\"\n",
	"#take only positives:\n",
	"#WHERE\n",
	"# activity_comment in ('Active', 'active', 'Partial agonist', 'Agonist', 'Antagonist') or pchembl_value>\"5\"\n",
	"\n",
	"#bq_assistant.estimate_query_size(query1)\n",
	"activities = ebi_chembl.query_to_pandas_safe(query1)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 108,
	"metadata": {},
	"outputs": [],
	"source": [
	"\n",
	"##Assays\n",
	"\n",
	"query1 = \"\"\"\n",
	"SELECT\n",
	" chembl_id, assay_id, assay_organism, tid\n",
	"FROM\n",
	" `patents-public-data.ebi_chembl.assays`\n",
	"WHERE\n",
	" assay_organism in ('Homo sapiens')\n",
	"\n",
	"\"\"\"\n",
	"\n",
	"#bq_assistant.estimate_query_size(query1)\n",
	"assays = ebi_chembl.query_to_pandas_safe(query1)\n",
	" "
	]
	},
	{
	"cell_type": "code",
	"execution_count": 94,
	"metadata": {},
	"outputs": [],
	"source": [
	"##Molecule dictionary:\n",
	"\n",
	"query1 = \"\"\"\n",
	"SELECT\n",
	" molregno, chembl_id\n",
	"FROM\n",
	" `patents-public-data.ebi_chembl.molecule_dictionary`\n",
	"\n",
	"\"\"\"\n",
	"#bq_assistant.estimate_query_size(query1)\n",
	"molecule_dict = ebi_chembl.query_to_pandas_safe(query1)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 95,
	"metadata": {},
	"outputs": [],
	"source": [
	"##Compound structures:\n",
	"\n",
	"query1 = \"\"\"\n",
	"SELECT\n",
	" molregno, canonical_smiles\n",
	"FROM\n",
	" `patents-public-data.ebi_chembl.compound_structures`\n",
	"\n",
	"\"\"\"\n",
	"bq_assistant.estimate_query_size(query1)\n",
	"compound_structures = ebi_chembl.query_to_pandas_safe(query1)"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"# Perform inner joins as you would in SQL:"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 166,
	"metadata": {},
	"outputs": [],
	"source": [
	"df = pd.merge(compound_structures, molecule_dict, on='molregno')\n"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 167,
	"metadata": {},
	"outputs": [],
	"source": [
	"df2 = pd.merge(df, activities, on='molregno')\n"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 169,
	"metadata": {},
	"outputs": [],
	"source": [
	"#assays = assays.drop('chembl_id', axis=1)\n",
	"df3 = pd.merge(df2, assays, on='assay_id')"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"scrolled": true
	},
	"outputs": [],
	"source": [
	"target_dict.columns = ['chembl_target_id', 'organism', 'pref_name', 'tid']"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 171,
	"metadata": {
	"scrolled": true
	},
	"outputs": [],
	"source": [
	"df4 = pd.merge(df3, target_dict, on='tid')"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"# Print out activites:\n",
	"The columns here contain enough data to parse and build a virtual screening or do any other analysis."
	]
	},
	{
	"cell_type": "code",
	"execution_count": 174,
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"4253823\n"
	]
	},
	{
	"data": {
	"text/html": [
	"<div>\n",
	"<style scoped>\n",
	" .dataframe tbody tr th:only-of-type {\n",
	" vertical-align: middle;\n",
	" }\n",
	"\n",
	" .dataframe tbody tr th {\n",
	" vertical-align: top;\n",
	" }\n",
	"\n",
	" .dataframe thead th {\n",
	" text-align: right;\n",
	" }\n",
	"</style>\n",
	"<table border=\"1\" class=\"dataframe\">\n",
	" <thead>\n",
	" <tr style=\"text-align: right;\">\n",
	" <th></th>\n",
	" <th>molregno</th>\n",
	" <th>canonical_smiles</th>\n",
	" <th>chembl_id</th>\n",
	" <th>assay_id</th>\n",
	" <th>activity_comment</th>\n",
	" <th>pchembl_value</th>\n",
	" <th>assay_organism</th>\n",
	" <th>tid</th>\n",
	" <th>chembl_target_id</th>\n",
	" <th>organism</th>\n",
	" <th>pref_name</th>\n",
	" </tr>\n",
	" </thead>\n",
	" <tbody>\n",
	" <tr>\n",
	" <th>0</th>\n",
	" <td>97</td>\n",
	" <td>COc1cc2nc(nc(N)c2cc1OC)N3CCN(CC3)C(=O)c4occc4</td>\n",
	" <td>CHEMBL2</td>\n",
	" <td>688489</td>\n",
	" <td>Active</td>\n",
	" <td>4.8</td>\n",
	" <td>Homo sapiens</td>\n",
	" <td>100782</td>\n",
	" <td>CHEMBL4159</td>\n",
	" <td>Homo sapiens</td>\n",
	" <td>Endoplasmic reticulum-associated amyloid beta-...</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>1</th>\n",
	" <td>97</td>\n",
	" <td>COc1cc2nc(nc(N)c2cc1OC)N3CCN(CC3)C(=O)c4occc4</td>\n",
	" <td>CHEMBL2</td>\n",
	" <td>688489</td>\n",
	" <td>Active</td>\n",
	" <td>4.9</td>\n",
	" <td>Homo sapiens</td>\n",
	" <td>100782</td>\n",
	" <td>CHEMBL4159</td>\n",
	" <td>Homo sapiens</td>\n",
	" <td>Endoplasmic reticulum-associated amyloid beta-...</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>2</th>\n",
	" <td>97</td>\n",
	" <td>COc1cc2nc(nc(N)c2cc1OC)N3CCN(CC3)C(=O)c4occc4</td>\n",
	" <td>CHEMBL2</td>\n",
	" <td>688489</td>\n",
	" <td>Active</td>\n",
	" <td>4.5</td>\n",
	" <td>Homo sapiens</td>\n",
	" <td>100782</td>\n",
	" <td>CHEMBL4159</td>\n",
	" <td>Homo sapiens</td>\n",
	" <td>Endoplasmic reticulum-associated amyloid beta-...</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>3</th>\n",
	" <td>2866</td>\n",
	" <td>COc1cc2ncnc(Nc3cccc(Cl)c3)c2cc1OC</td>\n",
	" <td>CHEMBL7917</td>\n",
	" <td>688489</td>\n",
	" <td>Active</td>\n",
	" <td>4.8</td>\n",
	" <td>Homo sapiens</td>\n",
	" <td>100782</td>\n",
	" <td>CHEMBL4159</td>\n",
	" <td>Homo sapiens</td>\n",
	" <td>Endoplasmic reticulum-associated amyloid beta-...</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>4</th>\n",
	" <td>37980</td>\n",
	" <td>[O-][N+](=O)OCCNC(=O)c1cccnc1</td>\n",
	" <td>CHEMBL284906</td>\n",
	" <td>688489</td>\n",
	" <td>Not Active</td>\n",
	" <td>4.7</td>\n",
	" <td>Homo sapiens</td>\n",
	" <td>100782</td>\n",
	" <td>CHEMBL4159</td>\n",
	" <td>Homo sapiens</td>\n",
	" <td>Endoplasmic reticulum-associated amyloid beta-...</td>\n",
	" </tr>\n",
	" </tbody>\n",
	"</table>\n",
	"</div>"
	],
	"text/plain": [
	" molregno canonical_smiles chembl_id \\\n",
	"0 97 COc1cc2nc(nc(N)c2cc1OC)N3CCN(CC3)C(=O)c4occc4 CHEMBL2 \n",
	"1 97 COc1cc2nc(nc(N)c2cc1OC)N3CCN(CC3)C(=O)c4occc4 CHEMBL2 \n",
	"2 97 COc1cc2nc(nc(N)c2cc1OC)N3CCN(CC3)C(=O)c4occc4 CHEMBL2 \n",
	"3 2866 COc1cc2ncnc(Nc3cccc(Cl)c3)c2cc1OC CHEMBL7917 \n",
	"4 37980 [O-][N+](=O)OCCNC(=O)c1cccnc1 CHEMBL284906 \n",
	"\n",
	" assay_id activity_comment pchembl_value assay_organism tid \\\n",
	"0 688489 Active 4.8 Homo sapiens 100782 \n",
	"1 688489 Active 4.9 Homo sapiens 100782 \n",
	"2 688489 Active 4.5 Homo sapiens 100782 \n",
	"3 688489 Active 4.8 Homo sapiens 100782 \n",
	"4 688489 Not Active 4.7 Homo sapiens 100782 \n",
	"\n",
	" chembl_target_id organism \\\n",
	"0 CHEMBL4159 Homo sapiens \n",
	"1 CHEMBL4159 Homo sapiens \n",
	"2 CHEMBL4159 Homo sapiens \n",
	"3 CHEMBL4159 Homo sapiens \n",
	"4 CHEMBL4159 Homo sapiens \n",
	"\n",
	" pref_name \n",
	"0 Endoplasmic reticulum-associated amyloid beta-... \n",
	"1 Endoplasmic reticulum-associated amyloid beta-... \n",
	"2 Endoplasmic reticulum-associated amyloid beta-... \n",
	"3 Endoplasmic reticulum-associated amyloid beta-... \n",
	"4 Endoplasmic reticulum-associated amyloid beta-... "
	]
	},
	"execution_count": 174,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"print(len(df4))\n",
	"df4.head()"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 175,
	"metadata": {},
	"outputs": [],
	"source": [
	"df5 = df4.drop(['molregno', 'assay_id', 'assay_organism', 'tid'], axis=1)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 176,
	"metadata": {},
	"outputs": [],
	"source": [
	"import csv\n",
	"df5.to_csv('all_activities.csv', index=False, header=True, quoting=csv.QUOTE_ALL)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": []
	}
	],
	"metadata": {
	"kernelspec": {
	"display_name": "Python 3",
	"language": "python",
	"name": "python3"
	},
	"language_info": {
	"codemirror_mode": {
	"name": "ipython",
	"version": 3
	},
	"file_extension": ".py",
	"mimetype": "text/x-python",
	"name": "python",
	"nbconvert_exporter": "python",
	"pygments_lexer": "ipython3",
	"version": "3.7.6"
	}
	},
	"nbformat": 4,
	"nbformat_minor": 4
	}