Skip to content

Instantly share code, notes, and snippets.

@cells2numbers
Forked from mnowotka/mol2tar.r
Created October 24, 2018 19:22
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save cells2numbers/e6c762754698b8ac58df2987f61e1f96 to your computer and use it in GitHub Desktop.
Save cells2numbers/e6c762754698b8ac58df2987f61e1f96 to your computer and use it in GitHub Desktop.
#!/usr/bin/env Rscript
# This script reads a list of compounds, identified by their ChEMBL ids, from a CSV file
# given as input. For each compound it finds related targets, optionally filtered by
# organism. It saves a mapping between the compounds and their targets in the output CSV file.
# First, we import useful libraries. We need a library to parse JSON and another one to
# make HTTP requests.
library(jsonlite)
library(httr)
# Some constants/defaults
DEFAULT_INPUT <- "compounds_list_lite.csv"
DEFAULT_OUTPUT <- "./r_out.csv"
DEFAULT_ENCODING <- "UTF-8"
URL_ROOT <- "https://www.ebi.ac.uk/chembl/api/data"
# This part defines and handles command line arguments.
# optparse replaces "%default" inside a help string with the option's default
# value, so the usage message shows what is used when a flag is omitted.
suppressPackageStartupMessages(library("optparse"))
args <- commandArgs(TRUE)
parser <- OptionParser()
parser <- add_option(parser, c("-i", "--input"), default = DEFAULT_INPUT, dest = "input",
                     help = "Path to input CSV file with compound chembl ids [default %default]")
parser <- add_option(parser, c("-o", "--output"), default = DEFAULT_OUTPUT, dest = "output",
                     help = "Path to output CSV file with targets [default %default]")
# No default here: when the flag is omitted, parsed$organism is NULL.
parser <- add_option(parser, c("-O", "--organism"), dest = "organism",
                     help = "Filter targets by organism [default: no filtering]")
parser <- add_option(parser, c("-U", "--url"), default = URL_ROOT, dest = "url",
                     help = "API URL root [default %default]")
parser <- add_option(parser, c("-e", "--encoding"), default = DEFAULT_ENCODING, dest = "encoding",
                     help = "API encoding [default %default]")
parsed <- parse_args(parser, args = args)
# Reading the input CSV file. It is read without a header row, so the first
# column is named V1.
mols <- read.csv(file = parsed$input, header = FALSE, sep = ",")
# Defining some auxiliary variables
# `list` accumulates one character vector per compound. It is initialised as a
# real (empty) list rather than NULL so that appending a length-one entry with
# `[[<-` can never turn it into an atomic vector. Calls such as list(...) still
# resolve to the base function, because R skips non-function bindings when
# looking up a function name.
list <- list()
# page size used for the paginated API requests
limit <- 20
# This function takes care of the interaction with the ChEMBL REST API.
# It performs an HTTP GET request against the 'activity.json' endpoint under
# parsed$url, warns on an HTTP error status, validates that the response body
# is well-formed JSON and returns the parsed result.
#
# query: named list of query-string parameters passed to the request.
# Returns the parsed JSON as produced by fromJSON().
# Stops with an error if the response body is not valid JSON.
get_objects <- function(query) {
  req <- GET(file.path(parsed$url, "activity.json"), query = query)
  # An HTTP error status only raises a warning; the body is still inspected below.
  warn_for_status(req)
  json <- content(req, "text", encoding = parsed$encoding)
  # validate() does not signal an error by itself: it returns FALSE with the
  # parser message in the "err" attribute, so the result has to be checked.
  valid <- jsonlite::validate(json)
  if (!isTRUE(as.logical(valid))) {
    stop("ChEMBL API response is not valid JSON: ", attr(valid, "err"), call. = FALSE)
  }
  fromJSON(json)
}
# We loop through each compound from the input CSV file
for (mol in mols[["V1"]]) {
  # printing here just to see the progress
  print(mol)
  offset <- 0
  next_page <- TRUE
  targets <- character(0)
  # we need to handle pagination: while the response reports a next page
  # we should read it
  while (!is.null(next_page)) {
    # we filter activities by the molecule and optionally by target organism;
    # limit and offset control paging
    query <- list(
      "molecule_chembl_id" = mol,
      "offset" = offset,
      "limit" = limit,
      "target_organism" = parsed$organism
    )
    objects <- get_objects(query)
    # extracting target ids from this page and adding them to the ones seen so far
    targets <- c(targets, objects$activities$target_chembl_id)
    # NULL when the response carries no 'next' entry, which ends the loop
    next_page <- objects$page_meta[["next"]]
    offset <- offset + limit
  }
  # the target list may contain duplicates and is not ordered,
  # so we sort it and remove duplicates
  entry <- c(mol, sort(unique(targets)))
  # Append by concatenating with a one-element list. Unlike `[[<-` on an empty
  # (NULL) accumulator, this always yields a list, even when `entry` has
  # length one because the compound has no targets.
  list <- c(list, list(entry))
}
# creating a rectangular character matrix: every per-compound vector is padded
# with empty strings up to the length of the longest one
len <- max(vapply(list, length, integer(1)))
padded <- lapply(list, function(x) c(x, rep("", len - length(x))))
# One column per compound. Building the matrix explicitly (instead of relying
# on sapply() simplification) keeps it a matrix even when len is 1, so the
# transpose below always yields one row per compound.
dfs <- matrix(unlist(padded), nrow = len)
# once we have the matrix we can save it (one row per compound) to the output CSV file
write.csv(t(dfs), file = parsed$output)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment