Skip to content

Instantly share code, notes, and snippets.

@ericminikel
Last active May 2, 2020 11:49
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save ericminikel/8229983 to your computer and use it in GitHub Desktop.
Save ericminikel/8229983 to your computer and use it in GitHub Desktop.
Introductory tutorial on how to use rcdk to conduct an SAR
require(rcdk) # chemical informatics functionality in R
require(gap) # for qq plots later
options(stringsAsFactors=FALSE)
setwd('c:/sci/037cinf/analysis/sar/')
# plot molecules in R plot window instead of separate Java window
rcdkplot = function(molecule,width=500,height=500,marg=0,main='') {
par(mar=c(marg,marg,marg,marg)) # set margins to zero since this isn't a real plot
temp1 = view.image.2d(molecule,width,height) # get Java representation into an image matrix. set number of pixels you want horiz and vertical
plot(NA,NA,xlim=c(1,10),ylim=c(1,10),xaxt='n',yaxt='n',xlab='',ylab='',main=main) # create an empty plot
rasterImage(temp1,1,1,10,10) # boundaries of raster: xmin, ymin, xmax, ymax. here i set them equal to plot boundaries
}
# first look at curcumin
curcumin = parse.smiles("O=C(\\C=C\\c1ccc(O)c(OC)c1)CC(=O)\\C=C\\c2cc(OC)c(O)cc2")[[1]]
rcdkplot(curcumin)
curc.frags = get.murcko.fragments(curcumin)
curc.frags
#[[1]]
#[[1]]$rings
#[1] "c1ccccc1"
#
#[[1]]$frameworks
#[1] "c1ccc(cc1)C=CCCCC=Cc2ccccc2"
png('curcumin.murcko.fragments.png',width=600,height=300)
par(mfrow=c(1,2))
rcdkplot(parse.smiles(curc.frags[[1]]$rings)[[1]])
rcdkplot(parse.smiles(curc.frags[[1]]$frameworks)[[1]])
dev.off()
anle138b = parse.smiles("C1OC2=C(O1)C=C(C=C2)C3=CC(=NN3)C4=CC(=CC=C4)Br")[[1]]
rcdkplot(anle138b)
anle138b.frags = get.murcko.fragments(anle138b)
anle138b.frags
# [[1]]
# [[1]]$rings
# [1] "c1ccccc1" "c1ccc2OCOc2(c1)"
#
# [[1]]$frameworks
# [1] "c1cccc(c1)c2n[nH]cc2" "c1n[nH]c(c1)c2ccc3OCOc3(c2)"
# [3] "c1cccc(c1)c2n[nH]c(c2)c3ccc4OCOc4(c3)"
png('anle138b.murcko.fragments.png',width=600,height=120)
par(mfrow=c(1,5))
rcdkplot(parse.smiles(anle138b.frags[[1]]$rings[1])[[1]])
rcdkplot(parse.smiles(anle138b.frags[[1]]$rings[2])[[1]])
rcdkplot(parse.smiles(anle138b.frags[[1]]$frameworks[1])[[1]])
rcdkplot(parse.smiles(anle138b.frags[[1]]$frameworks[2])[[1]])
rcdkplot(parse.smiles(anle138b.frags[[1]]$frameworks[3])[[1]])
dev.off()
anle138b.frags = get.murcko.fragments(anle138b,min.frag.size=3)
anle138b.frags[[1]]$rings
# [1] "c1ccccc1" "c1n[nH]cc1" "c1ccc2OCOc2(c1)"
# now for a pretend SAR
# load FDA drug list
drugs = read.table('http://www.cureffi.org/wp-content/uploads/2013/10/drugs.txt',
sep='\t',header=TRUE,allowEscapes=FALSE,quote='',comment.char='')
# remove those with no SMILES or with a really huge smiles
# otherwise R will hang on the macromolecules
drugs = drugs[nchar(drugs$smiles) > 0 & nchar(drugs$smiles) < 200,]
drug.objects = parse.smiles(drugs$smiles) # create rcdk IAtomContainer objects for each drug
# check that lengths are same
dim(drugs) # 1467 3
length(drug.objects) # 1467
statins = c("atorvastatin","fluvastatin","lovastatin","pitavastatin","pravastatin","rosuvastatin","simvastatin")
drugs[tolower(drugs$generic_name) %in% statins,] # check that the statins are in the drug list
# examine statin structures
png('statin.structures.png',width=1200,height=600)
par(mfrow=c(2,4))
for (statin in statins) {
rcdkplot(drug.objects[tolower(drugs$generic_name) == statin][[1]],marg=2,main=statin)
}
dev.off()
# get all murcko fragments
mfrags = get.murcko.fragments(drug.objects,min.frag.size=3)
# get a list of all possible fragments in any of these drugs
allfrags = unique(unlist(mfrags))
length(allfrags) # 2035
# get only the ring systems
allrings = unique(unlist(lapply(mfrags,"[[",1)))
length(allrings) # 556
# convert to a matrix
mat = matrix(nrow=length(drug.objects), ncol=length(allrings))
for (i in 1:length(drug.objects)) {
mat[i,] = allrings %in% mfrags[[i]]$rings
}
# aside: if you want rings and frameworks thrown in together,
# do.call(c,mfrags[[1]]) will concatenate the elements of mfrags[[1]] into a vector
# lapply then does this to each element of the list
mfrags_all = lapply(mfrags,do.call,what=c)
is_statin = tolower(drugs$generic_name) %in% statins # is each drug a statin?
pvals = numeric(length(allrings)) # vector to hold p vals
for (j in 1:length(allrings)) { # iterate over rings
hasfrag = mat[,j] # does each drug contain this fragment?
contingency_table = table(data.frame(is_statin,hasfrag)) # 2x2 table
pvals[j] = fisher.test(contingency_table)$p.value # Fisher test of independence
}
png('murcko.ring.statin.qqplot.png',width=500,height=500)
qqunif(pvals,pch=19,cex.main=.8,
main='Association of Murcko ring systems with statin activity\namong FDA-approved drugs')
dev.off()
png('statins.most.enriched.ring.system.png',width=300,height=300)
allrings[pvals < .000001]
# [1] "C=1CCC2CCCC=C2(C=1)"
# what statins contain it?
drugs$generic_name[is_statin & mat[,which(pvals < .000001)]]
# "Lovastatin" "Pravastatin" "Simvastatin"
# how many other drugs contain it
sum(!is_statin & mat[,which(pvals < .000001)])
# 0
rcdkplot(parse.smiles(allrings[pvals < .000001])[[1]],marg=1,main=allrings[pvals < .000001])
dev.off()
png('frag.freq.hist1.png',width=600,height=400)
hist(colSums(mat),col='yellow',breaks=100,
xlab='# of molecules containing a given fragment',
ylab='# of fragments like this',
main='Histogram of fragment frequencies')
dev.off()
sum(colSums(mat)==1)
# [1] 351
sum(colSums(mat[is_statin,]) > 0)
# 8
start_time = Sys.time()
get.exhaustive.fragments(drug.objects[[which(drugs$generic_name=='Rosuvastatin')]],min.frag.size=3)
Sys.time() - start_time
# Time difference of 3.394177 mins
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment