Last active
March 8, 2018 15:35
-
-
Save tgirke/fb4d7ab4cc555968b521f4d8e624cd79 to your computer and use it in GitHub Desktop.
KEGG CMP Download into R
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#################################################### | |
## Import of KEGG Compounds into SDFset Container ## | |
#################################################### | |
## Date: March 07, 2018 | |
## Motivation: response to request on Bioc support list: https://support.bioconductor.org/p/106712/ | |
################################ | |
## Download KEGG CMPs via URL ## | |
################################ | |
library(ChemmineR) | |
keggquery <- c("D00040", "D00066") | |
## Example for importing single compound via URL | |
read.SDFset("http://www.genome.jp/dbget-bin/www_bget?-f+m+drug+D00040") | |
## For many compounds you could use this simple import function or | |
## use the PubChem getIds() function in ChemmineR (requires PubChem IDs) | |
importKEGG <- function(ids) { | |
sdfset <- SDFset() | |
for(i in ids) { | |
url <- paste0("http://www.genome.jp/dbget-bin/www_bget?-f+m+drug+", i) | |
tmp <- as(read.SDFset(url), "SDFset") | |
cid(tmp) <- i | |
sdfset <- c(sdfset, tmp) | |
} | |
sdfset | |
} | |
sdfset <- importKEGG(ids=keggquery) | |
sdfset | |
plot(sdfset) # Nicer compound plots are available in developer version of ChemmineR | |
######################## | |
## Download via ChEBI ## | |
######################## | |
## This solution is slower but more comprehensive, and it gives you most of the KEGG compounds along with a | |
## lot of additional information | |
library(ChemmineR) | |
## Download KEGG SDFs from ChEBI | |
download.file("ftp://ftp.ebi.ac.uk/pub/databases/chebi/SDF/ChEBI_complete_3star.sdf.gz", "ChEBI_complete_3star.sdf.gz") | |
## Import into R using first SDFstr container for faster subsetting | |
sdfstr <- read.SDFstr("ChEBI_complete_3star.sdf.gz") | |
keggindex <- grep("KEGG COMPOUND Database Links", as(sdfstr, "list")) # Identify KEGG compounds | |
## Coerce only KEGG compounds to SDFset object | |
sdfset <- read.SDFset(sdfstr[keggindex]) | |
valid <- validSDF(sdfset) | |
sdfset <- sdfset[valid] # Removes problematic structures (e.g. atoms only) | |
## Obtain data block of SDF in matrix format to get full set of KEGG IDs | |
blockmatrix <- datablock2ma(datablocklist=datablock(sdfset)) | |
head(blockmatrix) # Check what other userful information ChEBI provides | |
keggids <- blockmatrix[,"KEGG DRUG Database Links"] # Column with KEGG IDs | |
keggids <- keggids[!is.na(keggids)] | |
## Subset by KEGG IDs and use them as keys for simpler compound selection | |
keggsdfset <- sdfset[names(keggids)] | |
cid(keggsdfset) <- makeUnique(keggids) # Assures uniqueness of IDs | |
keggsdfset # contains all KEGG compounds with IDs from ChEBI in SDFset object | |
## Save KEGG compounds to SD file on disk for faster restart of routine in future | |
write.SDF(keggsdfset, "keggsdfset.sdf", cid=TRUE) | |
############################################################# | |
## Structure comparisons/searching with fmcsR or ChemmineR ## | |
############################################################# | |
library(fmcsR) | |
keggsdfset <- read.SDFset("keggsdfset.sdf") | |
cid(keggsdfset) <- sdfid(keggsdfset) # Use SDFIDs as CIDs | |
## Example for finding flexible maximum substructure among your two compounds | |
plotMCS(fmcs(keggsdfset["D00040"], keggsdfset["D00066"], au=2, bu=1)) | |
## Note MCS searching can be slow (NP complete problem) especially on large compounds. Removing | |
## very large compounds via a MW filter can help as well as prescreening with much faster fingerprint | |
## searches (see ChemmineR vignette for this). | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment