-
-
Save mnowotka/99a232900116df22be84ead82f234d9e to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env Rscript
# This script reads a list of compounds, identified by their ChEMBL ids, from a CSV file
# given as input. For each compound it finds the related targets, optionally filtered by
# organism. It saves a mapping between the compounds and their targets in the output CSV file.
# First, we import useful libraries. We need a library to parse JSON and another one to
# make HTTP requests.
library(jsonlite) | |
library(httr) | |
# Fallback values for the command-line options
# Input CSV holding the compound ids
DEFAULT_INPUT <- "compounds_list_lite.csv"
# Output CSV receiving the compound/target mapping
DEFAULT_OUTPUT <- "./r_out.csv"
# Character encoding used when decoding API responses
DEFAULT_ENCODING <- "UTF-8"
# Root URL of the ChEMBL web services
URL_ROOT <- "https://www.ebi.ac.uk/chembl/api/data"
# Command-line interface. Every option is optional; input, output, url and
# encoding fall back to the DEFAULT_* / URL_ROOT constants, while organism has
# no default (parsed$organism is NULL when the flag is omitted).
# The help strings use optparse's "%default" placeholder so that --help shows
# the actual default value instead of a bare "[default]".
suppressPackageStartupMessages(library("optparse"))
args <- commandArgs(trailingOnly = TRUE)
parser <- OptionParser()
parser <- add_option(
  parser, c("-i", "--input"),
  default = DEFAULT_INPUT, dest = "input",
  help = "Path to input CSV file with compound chembl ids [default %default]"
)
parser <- add_option(
  parser, c("-o", "--output"),
  default = DEFAULT_OUTPUT, dest = "output",
  help = "Path to output CSV file with targets [default %default]"
)
parser <- add_option(
  parser, c("-O", "--organism"),
  dest = "organism",
  help = "Filter targets by organism [default: no filter]"
)
parser <- add_option(
  parser, c("-U", "--url"),
  default = URL_ROOT, dest = "url",
  help = "API URL root [default %default]"
)
parser <- add_option(
  parser, c("-e", "--encoding"),
  default = DEFAULT_ENCODING, dest = "encoding",
  help = "API encoding [default %default]"
)
# Named list of option values, read by the rest of the script as parsed$<dest>
parsed <- parse_args(parser, args = args)
# Reading the input CSV file (no header row, so the first column is named V1)
mols <- read.csv(file = parsed$input, header = FALSE, sep = ",")
# Defining some auxiliary variables
# Accumulator with one entry per compound. It has to start as a real list:
# `[[<-` on NULL with a length-one value produces a character vector, after
# which appending a multi-element entry fails.
list <- list()
# Number of records requested per API page
limit <- 20
# Performs one HTTP GET request against the `activity.json` resource of the
# ChEMBL REST API and returns the parsed response.
#
# Args:
#   query: named list of query-string parameters, passed to httr::GET().
# Returns:
#   The R object produced by jsonlite::fromJSON() for the response body.
# Raises:
#   An error when the response body is empty or is not valid JSON.
#
# The API root and the response encoding are read from the global `parsed`
# option list. HTTP error statuses only produce a warning; the body is still
# parsed.
get_objects <- function(query) {
  req <- GET(file.path(parsed$url, "activity.json"), query = query)
  warn_for_status(req)
  json <- content(req, "text", encoding = parsed$encoding)
  # content() yields NULL for an empty body; validate() would fail on it with
  # an unhelpful type error
  if (is.null(json) || is.na(json)) {
    stop("Empty response body received from ", req$url, call. = FALSE)
  }
  # validate() only *reports* validity (FALSE plus an "err" attribute), so the
  # result has to be checked explicitly
  is_valid <- validate(json)
  if (!is_valid) {
    stop(
      "Response from ", req$url, " is not valid JSON: ",
      attr(is_valid, "err"),
      call. = FALSE
    )
  }
  fromJSON(json)
}
# We loop through each compound from the first column (V1) of the input CSV
# file and collect the ChEMBL ids of the targets of its activities.
for (mol in mols[["V1"]]) {
  # printing here just to see the progress
  print(mol)
  # Paging is done with integers: a double offset such as 100000 would be
  # rendered as "1e+05" when converted to text for the query string.
  page_size <- as.integer(limit)
  offset <- 0L
  mol_targets <- c()
  # we need to handle pagination: while there are more results
  # we should read the next page
  repeat {
    # we filter activities by the molecule and optionally target organism
    # (parsed$organism is NULL when no organism was requested);
    # limit and offset control paging
    query <- list(
      "molecule_chembl_id" = mol,
      "offset" = offset,
      "limit" = page_size,
      "target_organism" = parsed$organism
    )
    objects <- get_objects(query)
    # extracting target ids
    mol_targets <- c(mol_targets, objects$activities$target_chembl_id)
    # the API reports no `next` page once the last page has been read
    if (is.null(objects$page_meta[["next"]])) {
      break
    }
    offset <- offset + page_size
  }
  # target list may not be unique and is randomly ordered,
  # so we need to sort and remove duplicates.
  # The entry is wrapped in list() and appended with c(): assigning a
  # length-one entry (compound without targets) via `[[<-` into an empty
  # accumulator would turn it into a character vector and break later appends.
  list <- c(list, list(c(mol, sort(unique(mol_targets)))))
}
# Building the output table: one row per compound, holding the compound id
# followed by its target ids.
# Width of the widest row (0 when there are no rows at all)
len <- max(0L, lengths(list))
# Pad shorter rows with empty strings so all rows have the same width
padded <- lapply(list, function(x) c(x, rep("", len - length(x))))
# rbind() always yields one row per compound. sapply() followed by t()
# collapsed everything into a single row when no compound had any target,
# because sapply() then simplifies to a plain vector instead of a matrix.
dfs <- do.call(rbind, padded)
# once we have the table we can save it to the output CSV file
write.csv(dfs, file = parsed$output)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment