mnowotka/mol2tar.r

## mol2tar.r
#!/usr/bin/env Rscript

# This script reads a list of compounds, identified by their ChEMBL ids, from a CSV file
# given as an input. For each compound it finds related targets, optionally filtered by
# organism. It saves a mapping between the compound and targets in the output CSV file.

# First, we import useful libraries. We need a library to parse JSON and another one to
# make HTTP requests.
library(jsonlite)
library(httr)

# Some constants/defaults
DEFAULT_INPUT <- "compounds_list_lite.csv"
DEFAULT_OUTPUT <- "./r_out.csv"
DEFAULT_ENCODING <- "UTF-8"
URL_ROOT <- "https://www.ebi.ac.uk/chembl/api/data"

# This part defines and handles command line arguments.
# optparse substitutes "%default" in a help string with the option's default
# value, so the usage message shows the real defaults instead of a bare "[default]".
suppressPackageStartupMessages(library("optparse"))
args <- commandArgs(TRUE)
parser <- OptionParser()
parser <- add_option(parser, c("-i", "--input"), default = DEFAULT_INPUT, dest = "input",
                     help = "Path to input CSV file with compound chembl ids [default %default]")
parser <- add_option(parser, c("-o", "--output"), default = DEFAULT_OUTPUT, dest = "output",
                     help = "Path to output CSV file with targets [default %default]")
# No default here: parsed$organism stays NULL unless the option is given
parser <- add_option(parser, c("-O", "--organism"), dest = "organism",
                     help = "Filter targets by organism (optional)")
parser <- add_option(parser, c("-U", "--url"), default = URL_ROOT, dest = "url",
                     help = "API URL root [default %default]")
parser <- add_option(parser, c("-e", "--encoding"), default = DEFAULT_ENCODING, dest = "encoding",
                     help = "API encoding [default %default]")
parsed <- parse_args(parser, args = args)

# Reading the input CSV file (no header row; compound ids are in the first column, V1).
# stringsAsFactors = FALSE keeps the ids as plain character strings on every R version,
# strip.white = TRUE drops stray blanks around unquoted ids so they are not sent to the API.
mols <- read.csv(file = parsed$input, header = FALSE, sep = ",",
                 stringsAsFactors = FALSE, strip.white = TRUE)

# Defining some auxiliary variables
# 'list' collects one character vector per compound: c(compound id, target ids...).
# It must start as a real (empty) list: with NULL, `list[[1]] <- "x"` produces an
# atomic vector, and appending a longer record afterwards fails.
list <- list()
# page size used for every API request
limit <- 20

# This function takes care of the interaction with the ChEMBL REST API.
# It performs an HTTP GET request against the 'activity.json' resource, warns on
# a non-success HTTP status, validates the response body and returns it parsed.
#
# Arguments:
#   query    - named list of query-string parameters handed to GET()
#   url      - API URL root; defaults to the --url command line option
#   encoding - encoding used to decode the response body; defaults to --encoding
#
# Returns the value of fromJSON() applied to the response text.
# Stops with an error when the response body is not valid JSON.
get_objects <- function(query, url = parsed$url, encoding = parsed$encoding) {
    req <- GET(file.path(url, "activity.json"), query = query)
    # A failed request only produces a warning; the body is still inspected below
    warn_for_status(req)
    json <- content(req, "text", encoding = encoding)

    # validate() does not raise by itself, it returns FALSE (with the parser
    # message in the "err" attribute), so its result has to be checked explicitly
    valid <- validate(json)
    if (!isTRUE(as.logical(valid))) {
        stop("ChEMBL API response is not valid JSON: ", attr(valid, "err"), call. = FALSE)
    }

    fromJSON(json)
}

# We loop through each compound from the first column (V1) of the input CSV file
for (mol in mols[["V1"]]) {

    # printing here just to see the progress
    print(mol)
    offset <- 0
    next_page <- TRUE
    target_ids <- c()

    # we need to handle pagination: while the response reports a 'next' page
    # we should read the following one
    while (!is.null(next_page)) {
        # we filter activities by the molecule and optionally target organism,
        # limit and offset control paging.
        # base::list is spelled out because the script also has a variable named 'list'.
        # parsed$organism is NULL when --organism is not given; presumably httr
        # drops NULL query parameters - TODO confirm
        query <- base::list("molecule_chembl_id" = mol, "offset" = offset,
                            "limit" = limit, "target_organism" = parsed$organism)
        objects <- get_objects(query)

        # extracting target ids of this page
        targets <- objects$activities$target_chembl_id
        target_ids <- c(target_ids, targets)
        next_page <- objects$page_meta$'next'
        offset <- offset + limit
    }

    # target list may not be unique and may be randomly ordered,
    # so we need to sort and remove duplicates.
    # The record is wrapped in a list before appending: assigning a one-element
    # record with `[[<-` to an empty (NULL) accumulator would create an atomic
    # vector and make the next, longer record fail to append.
    list <- c(list, base::list(c(mol, sort(unique(target_ids)))))
}

# creating a rectangular character matrix: one column per compound record,
# shorter records are padded with empty strings
# width of the widest record; 0 when there are no records at all
len <- max(0L, vapply(list, length, integer(1)))
padded <- lapply(list, function(x) c(x, rep("", len - length(x))))
# The dimensions are given explicitly: sapply() would collapse the result to a
# plain vector when every record has length 1, and t() would then put all
# compounds on a single row.
dfs <- matrix(as.character(unlist(padded)), nrow = len, ncol = length(padded))

# once we have the matrix we can save it, one row per compound, to the output CSV file
write.csv(t(dfs), file = parsed$output)
	#!/usr/bin/env Rscript

	# This script reads a list of compounds, identified by their ChEMBL ids, from a CSV file
	# given as an input. For each compound it finds related targets, optionally filtered by
	# organism. It saves a mapping between the compound and targets in the output CSV file.

	# First, we import useful libraries. We need a library to parse JSON and another one to
	# make HTTP requests.
	library(jsonlite)
	library(httr)

	# Some constants/defaults
	DEFAULT_INPUT <- "compounds_list_lite.csv"
	DEFAULT_OUTPUT <- "./r_out.csv"
	DEFAULT_ENCODING <- "UTF-8"
	URL_ROOT <- "https://www.ebi.ac.uk/chembl/api/data"

	# This part defines and handles command line arguments.
	# optparse substitutes "%default" in a help string with the option's default
	# value, so the usage message shows the real defaults instead of a bare "[default]".
	suppressPackageStartupMessages(library("optparse"))
	args <- commandArgs(TRUE)
	parser <- OptionParser()
	parser <- add_option(parser, c("-i", "--input"), default = DEFAULT_INPUT, dest = "input",
	                     help = "Path to input CSV file with compound chembl ids [default %default]")
	parser <- add_option(parser, c("-o", "--output"), default = DEFAULT_OUTPUT, dest = "output",
	                     help = "Path to output CSV file with targets [default %default]")
	# No default here: parsed$organism stays NULL unless the option is given
	parser <- add_option(parser, c("-O", "--organism"), dest = "organism",
	                     help = "Filter targets by organism (optional)")
	parser <- add_option(parser, c("-U", "--url"), default = URL_ROOT, dest = "url",
	                     help = "API URL root [default %default]")
	parser <- add_option(parser, c("-e", "--encoding"), default = DEFAULT_ENCODING, dest = "encoding",
	                     help = "API encoding [default %default]")
	parsed <- parse_args(parser, args = args)

	# Reading the input CSV file (no header row; compound ids are in the first column, V1).
	# stringsAsFactors = FALSE keeps the ids as plain character strings on every R version,
	# strip.white = TRUE drops stray blanks around unquoted ids so they are not sent to the API.
	mols <- read.csv(file = parsed$input, header = FALSE, sep = ",",
	                 stringsAsFactors = FALSE, strip.white = TRUE)

	# Defining some auxiliary variables
	# 'list' collects one character vector per compound: c(compound id, target ids...).
	# It must start as a real (empty) list: with NULL, `list[[1]] <- "x"` produces an
	# atomic vector, and appending a longer record afterwards fails.
	list <- list()
	# page size used for every API request
	limit <- 20

	# This function takes care of the interaction with the ChEMBL REST API.
	# It performs an HTTP GET request against the 'activity.json' resource, warns on
	# a non-success HTTP status, validates the response body and returns it parsed.
	#
	# Arguments:
	#   query    - named list of query-string parameters handed to GET()
	#   url      - API URL root; defaults to the --url command line option
	#   encoding - encoding used to decode the response body; defaults to --encoding
	#
	# Returns the value of fromJSON() applied to the response text.
	# Stops with an error when the response body is not valid JSON.
	get_objects <- function(query, url = parsed$url, encoding = parsed$encoding) {
	    req <- GET(file.path(url, "activity.json"), query = query)
	    # A failed request only produces a warning; the body is still inspected below
	    warn_for_status(req)
	    json <- content(req, "text", encoding = encoding)

	    # validate() does not raise by itself, it returns FALSE (with the parser
	    # message in the "err" attribute), so its result has to be checked explicitly
	    valid <- validate(json)
	    if (!isTRUE(as.logical(valid))) {
	        stop("ChEMBL API response is not valid JSON: ", attr(valid, "err"), call. = FALSE)
	    }

	    fromJSON(json)
	}

	# We loop through each compound from the first column (V1) of the input CSV file
	for (mol in mols[["V1"]]) {

	    # printing here just to see the progress
	    print(mol)
	    offset <- 0
	    next_page <- TRUE
	    target_ids <- c()

	    # we need to handle pagination: while the response reports a 'next' page
	    # we should read the following one
	    while (!is.null(next_page)) {
	        # we filter activities by the molecule and optionally target organism,
	        # limit and offset control paging.
	        # base::list is spelled out because the script also has a variable named 'list'.
	        # parsed$organism is NULL when --organism is not given; presumably httr
	        # drops NULL query parameters - TODO confirm
	        query <- base::list("molecule_chembl_id" = mol, "offset" = offset,
	                            "limit" = limit, "target_organism" = parsed$organism)
	        objects <- get_objects(query)

	        # extracting target ids of this page
	        targets <- objects$activities$target_chembl_id
	        target_ids <- c(target_ids, targets)
	        next_page <- objects$page_meta$'next'
	        offset <- offset + limit
	    }

	    # target list may not be unique and may be randomly ordered,
	    # so we need to sort and remove duplicates.
	    # The record is wrapped in a list before appending: assigning a one-element
	    # record with `[[<-` to an empty (NULL) accumulator would create an atomic
	    # vector and make the next, longer record fail to append.
	    list <- c(list, base::list(c(mol, sort(unique(target_ids)))))
	}

	# creating a rectangular character matrix: one column per compound record,
	# shorter records are padded with empty strings
	# width of the widest record; 0 when there are no records at all
	len <- max(0L, vapply(list, length, integer(1)))
	padded <- lapply(list, function(x) c(x, rep("", len - length(x))))
	# The dimensions are given explicitly: sapply() would collapse the result to a
	# plain vector when every record has length 1, and t() would then put all
	# compounds on a single row.
	dfs <- matrix(as.character(unlist(padded)), nrow = len, ncol = length(padded))

	# once we have the matrix we can save it, one row per compound, to the output CSV file
	write.csv(t(dfs), file = parsed$output)