Skip to content

Instantly share code, notes, and snippets.

@PatWalters
Last active August 31, 2020 23:47
Show Gist options
  • Save PatWalters/7614dbadaa4b1dcd6650c80a6e3640e4 to your computer and use it in GitHub Desktop.
Save PatWalters/7614dbadaa4b1dcd6650c80a6e3640e4 to your computer and use it in GitHub Desktop.
A few ChEMBL query examples
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import mysql.connector as sql"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"hostname = 'localhost'\n",
"database = 'chembl_27'\n",
"user = 'pwalters'\n",
"password = 'imnottellin'\n",
"con = sql.connect(host=hostname, database=database, user=user, password=password)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"**Retrieve marketed drugs from ChEMBL**"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"drug_query = \"\"\"select distinct canonical_smiles, cs.molregno, f.ingredient from compound_structures cs\n",
"join formulations f on cs.molregno = f.molregno\n",
"join products p on p.product_id = f.product_id\n",
"join compound_properties cp on cp.molregno = cs.molregno\n",
"where p.oral = 1\n",
"and cp.mw_freebase < 1000\"\"\""
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"drug_df = pd.read_sql(drug_query,con=con)"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>canonical_smiles</th>\n",
" <th>molregno</th>\n",
" <th>ingredient</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>Nc1ccc(S(=O)(=O)Nc2ccccn2)cc1</td>\n",
" <td>32842</td>\n",
" <td>SULFAPYRIDINE</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>CCC(C)C1(CC)C(=O)[N-]C(=O)NC1=O.[Na+]</td>\n",
" <td>674933</td>\n",
" <td>BUTABARBITAL SODIUM</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>Cl.N=C(N)N</td>\n",
" <td>674679</td>\n",
" <td>GUANIDINE HYDROCHLORIDE</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>CC1=CC(=O)c2ccccc2C1=O</td>\n",
" <td>19344</td>\n",
" <td>MENADIONE</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>Cn1c(=O)c2[nH]cnc2n(C)c1=O.Cn1c(=O)c2[nH]cnc2n...</td>\n",
" <td>794445</td>\n",
" <td>AMINOPHYLLINE</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" canonical_smiles molregno \\\n",
"0 Nc1ccc(S(=O)(=O)Nc2ccccn2)cc1 32842 \n",
"1 CCC(C)C1(CC)C(=O)[N-]C(=O)NC1=O.[Na+] 674933 \n",
"2 Cl.N=C(N)N 674679 \n",
"3 CC1=CC(=O)c2ccccc2C1=O 19344 \n",
"4 Cn1c(=O)c2[nH]cnc2n(C)c1=O.Cn1c(=O)c2[nH]cnc2n... 794445 \n",
"\n",
" ingredient \n",
"0 SULFAPYRIDINE \n",
"1 BUTABARBITAL SODIUM \n",
"2 GUANIDINE HYDROCHLORIDE \n",
"3 MENADIONE \n",
"4 AMINOPHYLLINE "
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"drug_df.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"**Retrieve journal information along with structures from ChEMBL**"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"journal_query = \"\"\"select canonical_smiles, cs.molregno, journal from docs\n",
"join compound_records cr on docs.doc_id = cr.doc_id\n",
"join compound_structures cs on cs.molregno = cr.molregno\"\"\""
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"journal_df = pd.read_sql(journal_query,con=con)"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>canonical_smiles</th>\n",
" <th>molregno</th>\n",
" <th>journal</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>Cc1cc(-n2ncc(=O)[nH]c2=O)ccc1C(=O)c1ccccc1Cl</td>\n",
" <td>1</td>\n",
" <td>J. Med. Chem.</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>Cc1cc(-n2ncc(=O)[nH]c2=O)ccc1C(=O)c1ccc(C#N)cc1</td>\n",
" <td>2</td>\n",
" <td>J. Med. Chem.</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>Cc1cc(-n2ncc(=O)[nH]c2=O)cc(C)c1C(O)c1ccc(Cl)cc1</td>\n",
" <td>3</td>\n",
" <td>J. Med. Chem.</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>Cc1ccc(C(=O)c2ccc(-n3ncc(=O)[nH]c3=O)cc2)cc1</td>\n",
" <td>4</td>\n",
" <td>J. Med. Chem.</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>Cc1cc(-n2ncc(=O)[nH]c2=O)ccc1C(=O)c1ccc(Cl)cc1</td>\n",
" <td>5</td>\n",
" <td>J. Med. Chem.</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" canonical_smiles molregno journal\n",
"0 Cc1cc(-n2ncc(=O)[nH]c2=O)ccc1C(=O)c1ccccc1Cl 1 J. Med. Chem.\n",
"1 Cc1cc(-n2ncc(=O)[nH]c2=O)ccc1C(=O)c1ccc(C#N)cc1 2 J. Med. Chem.\n",
"2 Cc1cc(-n2ncc(=O)[nH]c2=O)cc(C)c1C(O)c1ccc(Cl)cc1 3 J. Med. Chem.\n",
"3 Cc1ccc(C(=O)c2ccc(-n3ncc(=O)[nH]c3=O)cc2)cc1 4 J. Med. Chem.\n",
"4 Cc1cc(-n2ncc(=O)[nH]c2=O)ccc1C(=O)c1ccc(Cl)cc1 5 J. Med. Chem."
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"journal_df.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"**Retrieve COVID-19 drug repurposing data from ChEMBL**"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
"covid_query = \"\"\"select cs.canonical_smiles, cs.molregno, usan_tbl.synonyms, act.assay_id, act.standard_value, act.standard_type, act.standard_units from activities act\n",
" join compound_structures cs on cs.molregno = act.molregno\n",
" join assays a on act.assay_id = a.assay_id\n",
" join source s on a.src_id = s.src_id\n",
" left join (select molregno, synonyms from molecule_synonyms where syn_type = 'USAN') as usan_tbl on usan_tbl.molregno = cs.molregno\n",
" where src_description = 'SARS-CoV-2 Screening Data'\"\"\""
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
"covid_df = pd.read_sql(covid_query,con=con)"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>canonical_smiles</th>\n",
" <th>molregno</th>\n",
" <th>synonyms</th>\n",
" <th>assay_id</th>\n",
" <th>standard_value</th>\n",
" <th>standard_type</th>\n",
" <th>standard_units</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>O=C(O)c1ccc(N2C(=O)/C(=C/c3ccc(-c4ccc([N+](=O)...</td>\n",
" <td>1033892</td>\n",
" <td>None</td>\n",
" <td>1804482</td>\n",
" <td>8.709636e+06</td>\n",
" <td>IC50</td>\n",
" <td>nM</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>O=C(O)/C(Cc1ccccc1[N+](=O)[O-])=N\\Nc1nc(-c2ccc...</td>\n",
" <td>425006</td>\n",
" <td>None</td>\n",
" <td>1804482</td>\n",
" <td>2.089300e+03</td>\n",
" <td>IC50</td>\n",
" <td>nM</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>CCNC(=O)c1cc2c(-c3cc(C(C)(C)O)ccc3Oc3c(C)cc(F)...</td>\n",
" <td>2335425</td>\n",
" <td>None</td>\n",
" <td>1804482</td>\n",
" <td>3.467368e+06</td>\n",
" <td>IC50</td>\n",
" <td>nM</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>CCC(=O)CCCCC[C@@H]1NC(=O)[C@H]2CCCCN2C(=O)[C@H...</td>\n",
" <td>275631</td>\n",
" <td>None</td>\n",
" <td>1804482</td>\n",
" <td>9.996846e+08</td>\n",
" <td>IC50</td>\n",
" <td>nM</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>C[C@@H](C1CCCCC1)n1c(-c2cc3c(cc2Br)OCO3)nc2cc(...</td>\n",
" <td>2260454</td>\n",
" <td>None</td>\n",
" <td>1804482</td>\n",
" <td>6.025600e+02</td>\n",
" <td>IC50</td>\n",
" <td>nM</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" canonical_smiles molregno synonyms \\\n",
"0 O=C(O)c1ccc(N2C(=O)/C(=C/c3ccc(-c4ccc([N+](=O)... 1033892 None \n",
"1 O=C(O)/C(Cc1ccccc1[N+](=O)[O-])=N\\Nc1nc(-c2ccc... 425006 None \n",
"2 CCNC(=O)c1cc2c(-c3cc(C(C)(C)O)ccc3Oc3c(C)cc(F)... 2335425 None \n",
"3 CCC(=O)CCCCC[C@@H]1NC(=O)[C@H]2CCCCN2C(=O)[C@H... 275631 None \n",
"4 C[C@@H](C1CCCCC1)n1c(-c2cc3c(cc2Br)OCO3)nc2cc(... 2260454 None \n",
"\n",
" assay_id standard_value standard_type standard_units \n",
"0 1804482 8.709636e+06 IC50 nM \n",
"1 1804482 2.089300e+03 IC50 nM \n",
"2 1804482 3.467368e+06 IC50 nM \n",
"3 1804482 9.996846e+08 IC50 nM \n",
"4 1804482 6.025600e+02 IC50 nM "
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"covid_df.head()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.6"
}
},
"nbformat": 4,
"nbformat_minor": 4
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment