tgirke/kegg_cmp_download.R

## kegg_cmp_download.R
####################################################
## Import of KEGG Compounds into SDFset Container ##
####################################################
## Date: March 07, 2018
## Motivation: response to request on Bioc support list: https://support.bioconductor.org/p/106712/

################################
## Download KEGG CMPs via URL ##
################################
library(ChemmineR)
keggquery <- c("D00040", "D00066")
## Example for importing single compound via URL
read.SDFset("http://www.genome.jp/dbget-bin/www_bget?-f+m+drug+D00040")
## For many compounds you could use this simple import function or
## use the PubChem getIds() function in ChemmineR (requires PubChem IDs)
importKEGG <- function(ids) {
    sdfset <- SDFset()
    for(i in ids) {
        url <- paste0("http://www.genome.jp/dbget-bin/www_bget?-f+m+drug+", i)
        tmp <- as(read.SDFset(url), "SDFset")
        cid(tmp) <- i
        sdfset <- c(sdfset, tmp)
    }
    sdfset
}
sdfset <- importKEGG(ids=keggquery)
sdfset
plot(sdfset) # Nicer compound plots are available in developer version of ChemmineR

########################
## Download via ChEBI ##
########################
## This solution is slower but more comprehensive, and it gives you most of the KEGG compounds along with a
## lot of additional information
library(ChemmineR)
## Download KEGG SDFs from ChEBI
download.file("ftp://ftp.ebi.ac.uk/pub/databases/chebi/SDF/ChEBI_complete_3star.sdf.gz", "ChEBI_complete_3star.sdf.gz")

## Import into R using first SDFstr container for faster subsetting
sdfstr <- read.SDFstr("ChEBI_complete_3star.sdf.gz")
keggindex <- grep("KEGG COMPOUND Database Links", as(sdfstr, "list")) # Identify KEGG compounds

## Coerce only KEGG compounds to SDFset object
sdfset <- read.SDFset(sdfstr[keggindex])
valid <- validSDF(sdfset)
sdfset <- sdfset[valid] # Removes problematic structures (e.g. atoms only)

## Obtain data block of SDF in matrix format to get full set of KEGG IDs
blockmatrix <- datablock2ma(datablocklist=datablock(sdfset))
head(blockmatrix) # Check what other userful information ChEBI provides
keggids <- blockmatrix[,"KEGG DRUG Database Links"] # Column with KEGG IDs
keggids <- keggids[!is.na(keggids)]

## Subset by KEGG IDs and use them as keys for simpler compound selection
keggsdfset <- sdfset[names(keggids)]
cid(keggsdfset) <- makeUnique(keggids) # Assures uniqueness of IDs
keggsdfset # contains all KEGG compounds with IDs from ChEBI in SDFset object

## Save KEGG compounds to SD file on disk for faster restart of routine in future
write.SDF(keggsdfset, "keggsdfset.sdf", cid=TRUE)

#############################################################
## Structure comparisons/searching with fmcsR or ChemmineR ##
#############################################################
library(fmcsR)
keggsdfset <- read.SDFset("keggsdfset.sdf")
cid(keggsdfset) <- sdfid(keggsdfset) # Use SDFIDs as CIDs
## Example for finding flexible maximum substructure among your two compounds
plotMCS(fmcs(keggsdfset["D00040"], keggsdfset["D00066"], au=2, bu=1))

## Note MCS searching can be slow (NP complete problem) especially on large compounds. Removing
## very large compounds via a MW filter can help as well as prescreening with much faster fingerprint
## searches (see ChemmineR vignette for this).
	####################################################
	## Import of KEGG Compounds into SDFset Container ##
	####################################################
	## Date: March 07, 2018
	## Motivation: response to request on Bioc support list: https://support.bioconductor.org/p/106712/

	################################
	## Download KEGG CMPs via URL ##
	################################
	library(ChemmineR)
	keggquery <- c("D00040", "D00066")
	## Example for importing single compound via URL
	read.SDFset("http://www.genome.jp/dbget-bin/www_bget?-f+m+drug+D00040")
	## For many compounds you could use this simple import function or
	## use the PubChem getIds() function in ChemmineR (requires PubChem IDs)
	importKEGG <- function(ids) {
	sdfset <- SDFset()
	for(i in ids) {
	url <- paste0("http://www.genome.jp/dbget-bin/www_bget?-f+m+drug+", i)
	tmp <- as(read.SDFset(url), "SDFset")
	cid(tmp) <- i
	sdfset <- c(sdfset, tmp)
	}
	sdfset
	}
	sdfset <- importKEGG(ids=keggquery)
	sdfset
	plot(sdfset) # Nicer compound plots are available in developer version of ChemmineR

	########################
	## Download via ChEBI ##
	########################
	## This solution is slower but more comprehensive, and it gives you most of the KEGG compounds along with a
	## lot of additional information
	library(ChemmineR)
	## Download KEGG SDFs from ChEBI
	download.file("ftp://ftp.ebi.ac.uk/pub/databases/chebi/SDF/ChEBI_complete_3star.sdf.gz", "ChEBI_complete_3star.sdf.gz")

	## Import into R using first SDFstr container for faster subsetting
	sdfstr <- read.SDFstr("ChEBI_complete_3star.sdf.gz")
	keggindex <- grep("KEGG COMPOUND Database Links", as(sdfstr, "list")) # Identify KEGG compounds

	## Coerce only KEGG compounds to SDFset object
	sdfset <- read.SDFset(sdfstr[keggindex])
	valid <- validSDF(sdfset)
	sdfset <- sdfset[valid] # Removes problematic structures (e.g. atoms only)

	## Obtain data block of SDF in matrix format to get full set of KEGG IDs
	blockmatrix <- datablock2ma(datablocklist=datablock(sdfset))
	head(blockmatrix) # Check what other userful information ChEBI provides
	keggids <- blockmatrix[,"KEGG DRUG Database Links"] # Column with KEGG IDs
	keggids <- keggids[!is.na(keggids)]

	## Subset by KEGG IDs and use them as keys for simpler compound selection
	keggsdfset <- sdfset[names(keggids)]
	cid(keggsdfset) <- makeUnique(keggids) # Assures uniqueness of IDs
	keggsdfset # contains all KEGG compounds with IDs from ChEBI in SDFset object

	## Save KEGG compounds to SD file on disk for faster restart of routine in future
	write.SDF(keggsdfset, "keggsdfset.sdf", cid=TRUE)

	#############################################################
	## Structure comparisons/searching with fmcsR or ChemmineR ##
	#############################################################
	library(fmcsR)
	keggsdfset <- read.SDFset("keggsdfset.sdf")
	cid(keggsdfset) <- sdfid(keggsdfset) # Use SDFIDs as CIDs
	## Example for finding flexible maximum substructure among your two compounds
	plotMCS(fmcs(keggsdfset["D00040"], keggsdfset["D00066"], au=2, bu=1))

	## Note MCS searching can be slow (NP complete problem) especially on large compounds. Removing
	## very large compounds via a MW filter can help as well as prescreening with much faster fingerprint
	## searches (see ChemmineR vignette for this).