Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save leelasd/746f3c447805fc6ea5aad1071bd77aef to your computer and use it in GitHub Desktop.
Save leelasd/746f3c447805fc6ea5aad1071bd77aef to your computer and use it in GitHub Desktop.
Python script to query the ChEMBL database to retrieve a list of compounds with bioactivities for certain target proteins, and then retrieve information on the molecular properties of those compounds
import pandas as pd # uses pandas python module to view and analyse data
import requests # this is used to access json files
#====================================================================#
# using a list of known targets, find compounds that are active on these targets:
def find_bioactivities_for_targets(targets):
targets = ",".join(targets) # join the targets into a suitable string to fulfil the search conditions of the ChEMBL api
assay_type = 'B' # only look for binding assays
pchembl_value = 5 # xxx specify a minimum threshold of the pCHEMBL activity value. Greater than or equal to 5 (10 microM) is a typical minimum rule of thumb for binding activity between a compound and a protein target
limit = 100 # limit the number of records pulled back for each url call xxx
# set up the call to the ChEMBL 'activity' API.
# Remember that there is a limit to the number of records returned in any one API call (default is 20 records, maximum is 1000 records).
# So we need to iterate over several pages of results to gather all relevant information together.
url_stem = "https://www.ebi.ac.uk" #This is the stem of the url
url_full_string = url_stem + "/chembl/api/data/activity.json?target_chembl_id__in={}&assay_type={}&pchembl_value__gte={}&limit={}".format(targets
, assay_type, pchembl_value, limit) #This is the full url with the specified input parameters
url_full = requests.get( url_full_string ).json() #This calls the information back from the API using the 'requests' module, and converts it to j
son format
url_activities = url_full['activities'] #This is a list of the results for activities
# This 'while' loop iterates over several pages of records (if required), and collates the list of results
while url_full['page_meta']['next']:
url_full = requests.get(url_stem + url_full['page_meta']['next']).json()
url_activities = url_activities + url_full['activities'] #Add result (as a list) to previous list of results
# Convert the list of results into a Pandas dataframe:
act_df = pd.DataFrame(url_activities)
# Print out some useful information:
print("This is the url string that calls the 'Activities' API with the initial query specification:\n{}".format(url_full_string) )
print("\nThese are the available columns for the Activities API:\n{}".format(act_df.columns))
#Specify which columns to keep so that the size of the dataframe becomes more manageable:
act_df = act_df[[ 'target_chembl_id','target_organism', 'target_pref_name'
, 'parent_molecule_chembl_id','molecule_chembl_id','molecule_pref_name'
, 'pchembl_value', 'standard_type','standard_relation', 'standard_value', 'standard_units'
, 'assay_chembl_id','document_chembl_id','src_id']]
return act_df
#====================================================================#
# Extract the list of compounds from the previous dataframe ('act_df'), and call the 'molecule' API to find their molecular properties etc, so that this list can be refined
def find_properties_of_compounds(act_df):
#First find the list of compounds that are within the act_df dataframe:
cmpd_chembl_ids = list(set(act_df['molecule_chembl_id']))
print("There are {} compounds initially identified as active on the known targets. e.g.".format(len(cmpd_chembl_ids)))
print(cmpd_chembl_ids[0:2])
#For the identified compounds, extract their molecular properties and other information from the 'molecule' ChEMBL API
#Specify the input parameters:
cmpd_chembl_ids = ",".join(cmpd_chembl_ids[0:]) #Amend the format of the text string of compounds so that it is suitable for the API call
limit = 100 #Limit the number of records pulled back for each url call
# Set up the call to the ChEMBL 'molecule' API
# Remember that there is a limit to the number of records returned in any one API call (default is 20 records, maximum is 1000 records)
# So need to iterate over several pages of records to gather all relevant information together!
url_stem = "https://www.ebi.ac.uk" #This is the stem of the url
url_full_string = url_stem + "/chembl/api/data/molecule.json?molecule_chembl_id__in={}&limit={}".format(cmpd_chembl_ids, limit) #This is the full
url with the specified input parameters
url_full = requests.get( url_full_string ).json() #This calls the information back from the API using the 'requests' module, and converts it to j
son format
url_molecules = url_full['molecules'] #This is a list of the results for activities
# This 'while' loop iterates over several pages of records (if required), and collates the list of results
while url_full['page_meta']['next']:
url_full = requests.get(url_stem + url_full['page_meta']['next']).json()
url_molecules = url_molecules + url_full['molecules'] #Add result (as a list) to previous list of results
#Convert the list of results into a Pandas dataframe:
mol_df = pd.DataFrame(url_molecules)
#Print out some useful information:
print("This is the url string that calls the 'Molecule' API with the specified query\n{}".format(url_full_string) )
print("\nThese are the available columns for the Molecule API:\n{}".format(mol_df.columns))
# Select only relevant columns:
mol_df = mol_df[[ 'molecule_chembl_id','pref_name', 'molecule_hierarchy'
, 'molecule_properties', 'max_phase']]
# And convert cells containing a dictionary to individual columns in the dataframe so that is it easier to filter!
# Molecule hierarchy:
mol_df['parent_chembl_id'] = mol_df['molecule_hierarchy'].apply(lambda x: x['parent_chembl_id'])
#Physicochemical properties (only report if cells are not null)
mol_df['acd_logd'] = mol_df.loc[ mol_df['molecule_properties'].notnull(), 'molecule_properties'].apply(lambda x: x['acd_logd'])
mol_df['acd_logp'] = mol_df.loc[ mol_df['molecule_properties'].notnull(), 'molecule_properties'].apply(lambda x: x['acd_logp'])
mol_df['acd_most_apka'] = mol_df.loc[ mol_df['molecule_properties'].notnull(), 'molecule_properties'].apply(lambda x: x['acd_most_apka'])
mol_df['acd_most_bpka'] = mol_df.loc[ mol_df['molecule_properties'].notnull(), 'molecule_properties'].apply(lambda x: x['acd_most_bpka'])
mol_df['alogp'] = mol_df.loc[ mol_df['molecule_properties'].notnull(), 'molecule_properties'].apply(lambda x: x['alogp'])
mol_df['hba'] = mol_df.loc[ mol_df['molecule_properties'].notnull(), 'molecule_properties'].apply(lambda x: x['hba'])
mol_df['hbd'] = mol_df.loc[ mol_df['molecule_properties'].notnull(), 'molecule_properties'].apply(lambda x: x['hbd'])
mol_df['mw_freebase'] = mol_df.loc[ mol_df['molecule_properties'].notnull(), 'molecule_properties'].apply(lambda x: x['mw_freebase']) #This is th
e mwt of the parent compound
mol_df['full_mwt'] = mol_df.loc[ mol_df['molecule_properties'].notnull(), 'molecule_properties'].apply(lambda x: x['full_mwt']) #This is the mwt
of the full compound including any salt
mol_df['num_ro5_violations'] = mol_df.loc[ mol_df['molecule_properties'].notnull(), 'molecule_properties'].apply(lambda x: x['num_ro5_violations'
])
mol_df['psa'] = mol_df.loc[ mol_df['molecule_properties'].notnull(), 'molecule_properties'].apply(lambda x: x['psa'])
mol_df['heavy_atoms'] = mol_df.loc[ mol_df['molecule_properties'].notnull(), 'molecule_properties'].apply(lambda x: x['heavy_atoms'])
print(mol_df)
return mol_df
#====================================================================#
# Filter the compound list based on relevant information
def filter_list_of_compounds(mol_df):
# Filter the compounds based on their molecular properties, or max_phase, for example:
# Now keep only molecules with max_phase = 4 (ie approved drugs), for example:
res = mol_df[ mol_df['max_phase'] == 4 ]
#Now keep only molecules with max_phase = 4 (ie approved drugs), for example:
res = mol_df[ mol_df['max_phase'] == 4 ]
# # OR keep only molecules with less than 400 amu, for example:
# # but first need to convert strings to float for 'full_mwt':
# mol_df['full_mwt'] = mol_df['full_mwt'].apply(lambda x: float(x) )
# res = mol_df[ mol_df['full_mwt'] < 400 ]
#Display only top few rows:
print(res.head())
return res
#====================================================================#
def main():
# using a list of known targets, find compounds that are active on these targets:
targets = ['CHEMBL1848', 'CHEMBL3394'] # xxx
act_df = find_bioactivities_for_targets(targets)
# extract the list of compounds, and find these compounds' properties:
mol_df = find_properties_of_compounds(act_df)
# Filter the compound list based on relevant information
filtered_mol_df = filter_list_of_compounds(mol_df)
print("FINISHED\n")
#====================================================================#
if __name__=="__main__":
main()
#====================================================================#
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment