Skip to content

Instantly share code, notes, and snippets.

@tgirke
Last active March 8, 2018 15:35
Show Gist options
  • Save tgirke/fb4d7ab4cc555968b521f4d8e624cd79 to your computer and use it in GitHub Desktop.
Save tgirke/fb4d7ab4cc555968b521f4d8e624cd79 to your computer and use it in GitHub Desktop.
KEGG CMP Download into R
####################################################
## Import of KEGG Compounds into SDFset Container ##
####################################################
## Date: March 07, 2018
## Motivation: response to request on Bioc support list: https://support.bioconductor.org/p/106712/
################################
## Download KEGG CMPs via URL ##
################################
library(ChemmineR)
keggquery <- c("D00040", "D00066")
## Example for importing single compound via URL
read.SDFset("http://www.genome.jp/dbget-bin/www_bget?-f+m+drug+D00040")
## For many compounds you could use this simple import function or
## use the PubChem getIds() function in ChemmineR (requires PubChem IDs)
importKEGG <- function(ids) {
sdfset <- SDFset()
for(i in ids) {
url <- paste0("http://www.genome.jp/dbget-bin/www_bget?-f+m+drug+", i)
tmp <- as(read.SDFset(url), "SDFset")
cid(tmp) <- i
sdfset <- c(sdfset, tmp)
}
sdfset
}
sdfset <- importKEGG(ids=keggquery)
sdfset
plot(sdfset) # Nicer compound plots are available in developer version of ChemmineR
########################
## Download via ChEBI ##
########################
## This solution is slower but more comprehensive, and it gives you most of the KEGG compounds along with a
## lot of additional information
library(ChemmineR)
## Download KEGG SDFs from ChEBI
download.file("ftp://ftp.ebi.ac.uk/pub/databases/chebi/SDF/ChEBI_complete_3star.sdf.gz", "ChEBI_complete_3star.sdf.gz")
## Import into R using first SDFstr container for faster subsetting
sdfstr <- read.SDFstr("ChEBI_complete_3star.sdf.gz")
keggindex <- grep("KEGG COMPOUND Database Links", as(sdfstr, "list")) # Identify KEGG compounds
## Coerce only KEGG compounds to SDFset object
sdfset <- read.SDFset(sdfstr[keggindex])
valid <- validSDF(sdfset)
sdfset <- sdfset[valid] # Removes problematic structures (e.g. atoms only)
## Obtain data block of SDF in matrix format to get full set of KEGG IDs
blockmatrix <- datablock2ma(datablocklist=datablock(sdfset))
head(blockmatrix) # Check what other userful information ChEBI provides
keggids <- blockmatrix[,"KEGG DRUG Database Links"] # Column with KEGG IDs
keggids <- keggids[!is.na(keggids)]
## Subset by KEGG IDs and use them as keys for simpler compound selection
keggsdfset <- sdfset[names(keggids)]
cid(keggsdfset) <- makeUnique(keggids) # Assures uniqueness of IDs
keggsdfset # contains all KEGG compounds with IDs from ChEBI in SDFset object
## Save KEGG compounds to SD file on disk for faster restart of routine in future
write.SDF(keggsdfset, "keggsdfset.sdf", cid=TRUE)
#############################################################
## Structure comparisons/searching with fmcsR or ChemmineR ##
#############################################################
library(fmcsR)
keggsdfset <- read.SDFset("keggsdfset.sdf")
cid(keggsdfset) <- sdfid(keggsdfset) # Use SDFIDs as CIDs
## Example for finding flexible maximum substructure among your two compounds
plotMCS(fmcs(keggsdfset["D00040"], keggsdfset["D00066"], au=2, bu=1))
## Note MCS searching can be slow (NP complete problem) especially on large compounds. Removing
## very large compounds via a MW filter can help as well as prescreening with much faster fingerprint
## searches (see ChemmineR vignette for this).
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment