# avrilcoghlan/retrieve_predictedtarget_info_from_chembl_for_compoundlist.py

## retrieve_predictedtarget_info_from_chembl_for_compoundlist.py
import os
import sys
import pandas as pd # uses pandas python module to view and analyse data
import requests # this is used to access json files

#====================================================================#

# call the 'target prediction' API to find the predicted targets of our list of compounds:

def find_predicted_targets_of_compounds(cmpd_chembl_ids, limit=100, timeout=60):
    """Fetch the predicted targets of some compounds from the ChEMBL 'target prediction' API.

    Every page of results for the query is downloaded and merged into one table.

    Parameters:
        cmpd_chembl_ids: iterable of ChEMBL compound identifier strings (e.g. 'CHEMBL10').
        limit: number of records requested per API call (i.e. per page of results).
        timeout: seconds to wait for each HTTP response before giving up.

    Returns:
        A pandas DataFrame with the columns 'molecule_chembl_id', 'probability',
        'target_accession', 'target_chembl_id' and 'value', one row per returned
        prediction record. The frame has these columns but no rows when no
        identifiers are given or when the API returns no predictions.

    Raises:
        requests.exceptions.RequestException: if a request times out, cannot be
            made, or the server answers with an HTTP error status.
        KeyError: if a response lacks 'target_predictions' or 'page_meta', or the
            records lack one of the selected columns.
    """
    # The only columns of the API records that are kept in the result:
    columns = ['molecule_chembl_id', 'probability', 'target_accession', 'target_chembl_id', 'value']

    # The API takes the compounds as one comma-separated string:
    id_query = ",".join(cmpd_chembl_ids)
    if not id_query:
        # Nothing to look up, so do not call the API at all.
        return pd.DataFrame(columns=columns)

    # Set up the call to the ChEMBL 'target prediction' API.
    # There is a limit to the number of records returned in any one API call
    # (default is 20 records, maximum is 1000 records), so we follow the 'next'
    # link of each page until there are no more pages.
    url_stem = "https://www.ebi.ac.uk" # 'next' links are relative to this stem
    next_path = "/chembl/api/data/target_prediction.json?molecule_chembl_id__in={}&limit={}".format(id_query, limit)

    predictions = []
    while next_path:
        # A timeout stops the script hanging forever on a stalled connection, and
        # raise_for_status() reports an HTTP error directly instead of failing
        # later on an unexpected response body.
        response = requests.get(url_stem + next_path, timeout=timeout)
        response.raise_for_status()
        page = response.json()
        predictions.extend(page['target_predictions'])
        next_path = page['page_meta']['next'] # falsy when this was the last page

    if not predictions:
        # Keep the column layout the same as for a non-empty result.
        return pd.DataFrame(columns=columns)

    # Convert the list of results into a Pandas dataframe, keeping only the relevant columns:
    return pd.DataFrame(predictions)[columns]

#====================================================================#

# read in the input list of compounds of interest:

def _iter_compound_batches(input_compoundlist_file, batch_size):
    """Yield successive lists of at most batch_size compound identifiers read from a file.

    The identifier is the first whitespace-separated field of each line
    (e.g. CHEMBL10); lines that hold only whitespace are skipped.
    """
    batch = []
    with open(input_compoundlist_file, "r") as input_handle:
        for line in input_handle:
            fields = line.split()
            if not fields:
                continue # blank line: there is no compound on it
            batch.append(fields[0])
            if len(batch) == batch_size:
                yield batch
                batch = []
    # Whatever is left over forms a final, shorter batch:
    if batch:
        yield batch


def read_input_list_of_compounds(input_compoundlist_file, output_file, batch_size=10):
    """Look up predicted targets for every compound in a file and write them to a tab-separated file.

    The compounds are sent to find_predicted_targets_of_compounds() in batches,
    and each returned table is appended to the output file. A single header row
    is written, ahead of the first table that has columns.

    Parameters:
        input_compoundlist_file: path of a text file with one compound per line;
            the first whitespace-separated field of each line is the compound identifier.
        output_file: path of the tab-separated file to write (overwritten if present).
        batch_size: number of compounds to look up per call.

    Raises:
        ValueError: if batch_size is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    cnt = 0 # number of compounds read so far
    header_written = False

    # newline='' leaves the line endings to pandas, as its documentation asks
    # for file objects passed to to_csv():
    with open(output_file, 'w', newline='') as f:
        for compounds in _iter_compound_batches(input_compoundlist_file, batch_size):
            cnt += len(compounds)
            # using a list of known compounds, find predicted targets for those compounds:
            print(cnt,"Finding predicted targets for compounds",compounds)
            targ_df = find_predicted_targets_of_compounds(compounds)

            # A table without any columns carries neither a header nor data to export:
            if len(targ_df.columns) == 0:
                continue

            # Export the data frame to the output file; only the first table written gets a header:
            targ_df.to_csv(f, sep="\t", index=False, header=not header_written)
            header_written = True

#====================================================================#

def main():
    """Run the script from the command line.

    Expects exactly two arguments: the path of an existing file listing the
    ChEMBL compounds of interest, and the path of the output file. Prints a
    usage message and exits with status 1 otherwise.
    """
    # check the command-line arguments:
    arguments = sys.argv[1:]
    arguments_ok = len(arguments) == 2 and os.path.exists(arguments[0])
    if not arguments_ok:
        print("Usage: %s input_compoundlist_file output_file" % sys.argv[0])
        sys.exit(1)

    # first argument: input file with a list of ChEMBL compounds of interest
    input_compoundlist_file, output_file = arguments

    # read in the input list of compounds of interest, and write out their predicted targets:
    print("Reading in compound list...")
    read_input_list_of_compounds(input_compoundlist_file, output_file)

    print("FINISHED\n")

#====================================================================#

# Only run the command-line entry point when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()

#====================================================================#
	import os
	import sys
	import pandas as pd # uses pandas python module to view and analyse data
	import requests # this is used to access json files

	#====================================================================#

	# call the 'target prediction' API to find the predicted targets of our list of compounds:

	def find_predicted_targets_of_compounds(cmpd_chembl_ids, limit=100, timeout=60):
	    """Fetch the predicted targets of some compounds from the ChEMBL 'target prediction' API.

	    Every page of results for the query is downloaded and merged into one table.

	    Parameters:
	        cmpd_chembl_ids: iterable of ChEMBL compound identifier strings (e.g. 'CHEMBL10').
	        limit: number of records requested per API call (i.e. per page of results).
	        timeout: seconds to wait for each HTTP response before giving up.

	    Returns:
	        A pandas DataFrame with the columns 'molecule_chembl_id', 'probability',
	        'target_accession', 'target_chembl_id' and 'value', one row per returned
	        prediction record. The frame has these columns but no rows when no
	        identifiers are given or when the API returns no predictions.

	    Raises:
	        requests.exceptions.RequestException: if a request times out, cannot be
	            made, or the server answers with an HTTP error status.
	        KeyError: if a response lacks 'target_predictions' or 'page_meta', or the
	            records lack one of the selected columns.
	    """
	    # The only columns of the API records that are kept in the result:
	    columns = ['molecule_chembl_id', 'probability', 'target_accession', 'target_chembl_id', 'value']

	    # The API takes the compounds as one comma-separated string:
	    id_query = ",".join(cmpd_chembl_ids)
	    if not id_query:
	        # Nothing to look up, so do not call the API at all.
	        return pd.DataFrame(columns=columns)

	    # Set up the call to the ChEMBL 'target prediction' API.
	    # There is a limit to the number of records returned in any one API call
	    # (default is 20 records, maximum is 1000 records), so we follow the 'next'
	    # link of each page until there are no more pages.
	    url_stem = "https://www.ebi.ac.uk" # 'next' links are relative to this stem
	    next_path = "/chembl/api/data/target_prediction.json?molecule_chembl_id__in={}&limit={}".format(id_query, limit)

	    predictions = []
	    while next_path:
	        # A timeout stops the script hanging forever on a stalled connection, and
	        # raise_for_status() reports an HTTP error directly instead of failing
	        # later on an unexpected response body.
	        response = requests.get(url_stem + next_path, timeout=timeout)
	        response.raise_for_status()
	        page = response.json()
	        predictions.extend(page['target_predictions'])
	        next_path = page['page_meta']['next'] # falsy when this was the last page

	    if not predictions:
	        # Keep the column layout the same as for a non-empty result.
	        return pd.DataFrame(columns=columns)

	    # Convert the list of results into a Pandas dataframe, keeping only the relevant columns:
	    return pd.DataFrame(predictions)[columns]

	#====================================================================#

	# read in the input list of compounds of interest:

	def _iter_compound_batches(input_compoundlist_file, batch_size):
	    """Yield successive lists of at most batch_size compound identifiers read from a file.

	    The identifier is the first whitespace-separated field of each line
	    (e.g. CHEMBL10); lines that hold only whitespace are skipped.
	    """
	    batch = []
	    with open(input_compoundlist_file, "r") as input_handle:
	        for line in input_handle:
	            fields = line.split()
	            if not fields:
	                continue # blank line: there is no compound on it
	            batch.append(fields[0])
	            if len(batch) == batch_size:
	                yield batch
	                batch = []
	    # Whatever is left over forms a final, shorter batch:
	    if batch:
	        yield batch


	def read_input_list_of_compounds(input_compoundlist_file, output_file, batch_size=10):
	    """Look up predicted targets for every compound in a file and write them to a tab-separated file.

	    The compounds are sent to find_predicted_targets_of_compounds() in batches,
	    and each returned table is appended to the output file. A single header row
	    is written, ahead of the first table that has columns.

	    Parameters:
	        input_compoundlist_file: path of a text file with one compound per line;
	            the first whitespace-separated field of each line is the compound identifier.
	        output_file: path of the tab-separated file to write (overwritten if present).
	        batch_size: number of compounds to look up per call.

	    Raises:
	        ValueError: if batch_size is smaller than 1.
	    """
	    if batch_size < 1:
	        raise ValueError("batch_size must be at least 1")

	    cnt = 0 # number of compounds read so far
	    header_written = False

	    # newline='' leaves the line endings to pandas, as its documentation asks
	    # for file objects passed to to_csv():
	    with open(output_file, 'w', newline='') as f:
	        for compounds in _iter_compound_batches(input_compoundlist_file, batch_size):
	            cnt += len(compounds)
	            # using a list of known compounds, find predicted targets for those compounds:
	            print(cnt,"Finding predicted targets for compounds",compounds)
	            targ_df = find_predicted_targets_of_compounds(compounds)

	            # A table without any columns carries neither a header nor data to export:
	            if len(targ_df.columns) == 0:
	                continue

	            # Export the data frame to the output file; only the first table written gets a header:
	            targ_df.to_csv(f, sep="\t", index=False, header=not header_written)
	            header_written = True

	#====================================================================#

	def main():
	    """Run the script from the command line.

	    Expects exactly two arguments: the path of an existing file listing the
	    ChEMBL compounds of interest, and the path of the output file. Prints a
	    usage message and exits with status 1 otherwise.
	    """
	    # check the command-line arguments:
	    if len(sys.argv) != 3 or not os.path.exists(sys.argv[1]):
	        print("Usage: %s input_compoundlist_file output_file" % sys.argv[0])
	        sys.exit(1)
	    input_compoundlist_file = sys.argv[1] # input file with a list of ChEMBL compounds of interest
	    output_file = sys.argv[2]

	    # read in the input list of compounds of interest, and write out their predicted targets:
	    print("Reading in compound list...")
	    read_input_list_of_compounds(input_compoundlist_file, output_file)

	    print("FINISHED\n")

	#====================================================================#

	# Only run the command-line entry point when this file is executed as a script,
	# not when it is imported as a module.
	if __name__ == "__main__":
	    main()

	#====================================================================#