Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save avrilcoghlan/133806c5caf9d0483dfae4b6741174f8 to your computer and use it in GitHub Desktop.
Save avrilcoghlan/133806c5caf9d0483dfae4b6741174f8 to your computer and use it in GitHub Desktop.
Script to retrieve predicted targets from ChEMBL for an input list of ChEMBL compounds
import os
import sys
import pandas as pd # uses pandas python module to view and analyse data
import requests # this is used to access json files
#====================================================================#
# call the 'target prediction' API to find the predicted targets of our list of compounds:
def find_predicted_targets_of_compounds(cmpd_chembl_ids):
    """Fetch the predicted targets of a batch of ChEMBL compounds.

    Calls the ChEMBL 'target_prediction' web service for the given compound
    ids and follows the 'next' link of each response page until every record
    has been collected.

    Args:
        cmpd_chembl_ids: ChEMBL compound ids, e.g. ["CHEMBL10", "CHEMBL25"].

    Returns:
        A pandas DataFrame with the columns 'molecule_chembl_id',
        'probability', 'target_accession', 'target_chembl_id' and 'value'.
        The frame has these columns but no rows when no ids are given or
        when the service returns no predictions.

    Raises:
        requests.HTTPError: if the service answers with an error status.
        requests.RequestException: on connection problems or a timeout.
    """
    # The only columns of the API response that are kept:
    relevant_columns = ['molecule_chembl_id', 'probability', 'target_accession', 'target_chembl_id', 'value']

    compound_ids = list(cmpd_chembl_ids)
    if not compound_ids:
        # Nothing to look up: do not send a query with an empty id filter.
        return pd.DataFrame(columns=relevant_columns)

    limit = 100   # Number of records requested per page (original author's note: API default is 20, maximum is 1000)
    timeout = 60  # Seconds to wait for the server before giving up, so a stalled connection cannot hang the script
    url_stem = "https://www.ebi.ac.uk"  # The 'next' links returned by the API are paths relative to this stem
    next_path = "/chembl/api/data/target_prediction.json?molecule_chembl_id__in={}&limit={}".format(",".join(compound_ids), limit)

    # Iterate over the pages of records (if there are several) and collate the results:
    predictions = []
    while next_path:
        response = requests.get(url_stem + next_path, timeout=timeout)
        response.raise_for_status()  # Fail loudly on an HTTP error instead of trying to parse an error page
        page = response.json()
        predictions.extend(page['target_predictions'])
        next_path = page['page_meta']['next']  # Falsy on the last page, which ends the loop

    if not predictions:
        # Keep the column names even without rows, so callers can still write a header.
        return pd.DataFrame(columns=relevant_columns)

    # Convert the list of results into a Pandas dataframe and select only the relevant columns:
    return pd.DataFrame(predictions)[relevant_columns]
#====================================================================#
# read in the input list of compounds of interest:
def read_input_list_of_compounds(input_compoundlist_file, output_file, batch_size=10):
    """Read compound ids from a file and write their predicted targets as TSV.

    The first whitespace-separated field of every non-blank input line is
    taken as a compound id (e.g. CHEMBL10). The ids are looked up in batches
    via find_predicted_targets_of_compounds() and the results are appended to
    the output file, with a header row before the first batch that is written.

    Args:
        input_compoundlist_file: path of the text file listing the compounds.
        output_file: path of the tab-separated file to create (overwritten).
        batch_size: number of compounds sent to the API per call (default 10).

    Raises:
        ValueError: if batch_size is smaller than 1.
        OSError: if the input file cannot be read or the output file cannot
            be written.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    cnt = 0                 # number of compounds read so far
    compounds = []          # compounds of the batch currently being filled
    header_written = False  # the header row must appear exactly once

    # Open the input first, so a missing input file does not truncate an existing output file.
    with open(input_compoundlist_file, "r") as in_handle:
        # newline="" is what pandas asks for when to_csv() is given an open text handle.
        with open(output_file, "w", newline="") as out_handle:
            for line in in_handle:
                fields = line.split()
                if not fields:
                    continue  # skip blank lines instead of failing on fields[0]
                cnt += 1
                compounds.append(fields[0])  # e.g. CHEMBL10
                # once the batch is full, find the predicted targets for it:
                if len(compounds) >= batch_size:
                    header_written = _write_predicted_targets(compounds, cnt, out_handle, header_written)
                    compounds = []
            # if there are some compounds left over, find their predicted targets too:
            if compounds:
                _write_predicted_targets(compounds, cnt, out_handle, header_written)


def _write_predicted_targets(compounds, cnt, out_handle, header_written):
    """Look up one batch of compounds and append the predictions to out_handle.

    Returns True if the header row is present in the output after this call
    (either it was there already or this call wrote it), otherwise False.
    """
    print(cnt,"Finding predicted targets for compounds",compounds)
    targ_df = find_predicted_targets_of_compounds(compounds)
    if len(targ_df.columns) == 0:
        # A frame without columns has neither rows nor header names to write.
        return header_written
    # Only write a header in front of the first set of results:
    targ_df.to_csv(out_handle, sep="\t", index=False, header=not header_written)
    return True
#====================================================================#
def main():
    """Command-line entry point.

    Expects two arguments: the input file listing the ChEMBL compounds of
    interest, and the output file to write the predicted targets to.
    Exits with status 1 (after printing a message to stderr) when the
    argument count is wrong or the input file does not exist.
    """
    usage = "Usage: %s input_compoundlist_file output_file" % sys.argv[0]

    # check the command-line arguments:
    if len(sys.argv) != 3:
        print(usage, file=sys.stderr)
        sys.exit(1)

    input_compoundlist_file = sys.argv[1]  # input file with a list of ChEMBL compounds of interest
    output_file = sys.argv[2]

    if not os.path.exists(input_compoundlist_file):
        # Say what is actually wrong; the usage line alone does not reveal a bad path.
        print("Error: input file '%s' does not exist" % input_compoundlist_file, file=sys.stderr)
        print(usage, file=sys.stderr)
        sys.exit(1)

    # read in the input list of compounds of interest:
    print("Reading in compound list...")
    read_input_list_of_compounds(input_compoundlist_file, output_file)
    print("FINISHED\n")
#====================================================================#
# Run the command-line entry point only when this file is executed as a script.
if __name__ == "__main__":
    main()
#====================================================================#
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment