ericminikel/rcdk-sar-tutorial.r

## rcdk-sar-tutorial.r
require(rcdk) # chemical informatics functionality in R
require(gap)  # for qq plots later
options(stringsAsFactors=FALSE)
setwd('c:/sci/037cinf/analysis/sar/')

# plot molecules in R plot window instead of separate Java window
rcdkplot = function(molecule,width=500,height=500,marg=0,main='') {
    par(mar=c(marg,marg,marg,marg)) # set margins to zero since this isn't a real plot
    temp1 = view.image.2d(molecule,width,height) # get Java representation into an image matrix. set number of pixels you want horiz and vertical
    plot(NA,NA,xlim=c(1,10),ylim=c(1,10),xaxt='n',yaxt='n',xlab='',ylab='',main=main) # create an empty plot
    rasterImage(temp1,1,1,10,10) # boundaries of raster: xmin, ymin, xmax, ymax. here i set them equal to plot boundaries
}

# first look at curcumin
curcumin = parse.smiles("O=C(\\C=C\\c1ccc(O)c(OC)c1)CC(=O)\\C=C\\c2cc(OC)c(O)cc2")[[1]]
rcdkplot(curcumin)

curc.frags = get.murcko.fragments(curcumin)
curc.frags
#[[1]]
#[[1]]$rings
#[1] "c1ccccc1"
#
#[[1]]$frameworks
#[1] "c1ccc(cc1)C=CCCCC=Cc2ccccc2"
png('curcumin.murcko.fragments.png',width=600,height=300)
par(mfrow=c(1,2))
rcdkplot(parse.smiles(curc.frags[[1]]$rings)[[1]])
rcdkplot(parse.smiles(curc.frags[[1]]$frameworks)[[1]])
dev.off()

anle138b = parse.smiles("C1OC2=C(O1)C=C(C=C2)C3=CC(=NN3)C4=CC(=CC=C4)Br")[[1]]
rcdkplot(anle138b)

anle138b.frags = get.murcko.fragments(anle138b)
anle138b.frags
# [[1]]
# [[1]]$rings
# [1] "c1ccccc1"        "c1ccc2OCOc2(c1)"
#
# [[1]]$frameworks
# [1] "c1cccc(c1)c2n[nH]cc2"                  "c1n[nH]c(c1)c2ccc3OCOc3(c2)"
# [3] "c1cccc(c1)c2n[nH]c(c2)c3ccc4OCOc4(c3)"
png('anle138b.murcko.fragments.png',width=600,height=120)
par(mfrow=c(1,5))
rcdkplot(parse.smiles(anle138b.frags[[1]]$rings[1])[[1]])
rcdkplot(parse.smiles(anle138b.frags[[1]]$rings[2])[[1]])
rcdkplot(parse.smiles(anle138b.frags[[1]]$frameworks[1])[[1]])
rcdkplot(parse.smiles(anle138b.frags[[1]]$frameworks[2])[[1]])
rcdkplot(parse.smiles(anle138b.frags[[1]]$frameworks[3])[[1]])
dev.off()

anle138b.frags = get.murcko.fragments(anle138b,min.frag.size=3)
anle138b.frags[[1]]$rings
# [1] "c1ccccc1"        "c1n[nH]cc1"      "c1ccc2OCOc2(c1)"

# now for a pretend SAR

# load FDA drug list
drugs = read.table('http://www.cureffi.org/wp-content/uploads/2013/10/drugs.txt',
                   sep='\t',header=TRUE,allowEscapes=FALSE,quote='',comment.char='')
# remove those with no SMILES or with a really huge smiles
# otherwise R will hang on the macromolecules
drugs = drugs[nchar(drugs$smiles) > 0 & nchar(drugs$smiles) < 200,]
drug.objects = parse.smiles(drugs$smiles) # create rcdk IAtomContainer objects for each drug
# check that lengths are same
dim(drugs) # 1467 3
length(drug.objects) # 1467

statins = c("atorvastatin","fluvastatin","lovastatin","pitavastatin","pravastatin","rosuvastatin","simvastatin")
drugs[tolower(drugs$generic_name) %in% statins,] # check that the statins are in the drug list

# examine statin structures
png('statin.structures.png',width=1200,height=600)
par(mfrow=c(2,4))
for (statin in statins) {
    rcdkplot(drug.objects[tolower(drugs$generic_name) == statin][[1]],marg=2,main=statin)
}
dev.off()

# get all murcko fragments
mfrags = get.murcko.fragments(drug.objects,min.frag.size=3)

# get a list of all possible fragments in any of these drugs
allfrags = unique(unlist(mfrags))
length(allfrags) # 2035

# get only the ring systems
allrings = unique(unlist(lapply(mfrags,"[[",1)))
length(allrings) # 556


# convert to a matrix
mat = matrix(nrow=length(drug.objects), ncol=length(allrings))
for (i in 1:length(drug.objects)) {
    mat[i,] = allrings %in% mfrags[[i]]$rings
}

# aside: if you want rings and frameworks thrown in together,
# do.call(c,mfrags[[1]]) will concatenate the elements of mfrags[[1]] into a vector
# lapply then does this to each element of the list
mfrags_all = lapply(mfrags,do.call,what=c)

is_statin = tolower(drugs$generic_name) %in% statins # is each drug a statin?
pvals = numeric(length(allrings)) # vector to hold p vals
for (j in 1:length(allrings)) { # iterate over rings
    hasfrag = mat[,j] # does each drug contain this fragment?
    contingency_table = table(data.frame(is_statin,hasfrag)) # 2x2 table
    pvals[j] = fisher.test(contingency_table)$p.value # Fisher test of independence
}
png('murcko.ring.statin.qqplot.png',width=500,height=500)
qqunif(pvals,pch=19,cex.main=.8,
    main='Association of Murcko ring systems with statin activity\namong FDA-approved drugs')
dev.off()

png('statins.most.enriched.ring.system.png',width=300,height=300)
allrings[pvals < .000001]
# [1] "C=1CCC2CCCC=C2(C=1)"
# what statins contain it?
drugs$generic_name[is_statin & mat[,which(pvals < .000001)]]
# "Lovastatin"  "Pravastatin" "Simvastatin"
# how many other drugs contain it
sum(!is_statin & mat[,which(pvals < .000001)])
# 0
rcdkplot(parse.smiles(allrings[pvals < .000001])[[1]],marg=1,main=allrings[pvals < .000001])
dev.off()

png('frag.freq.hist1.png',width=600,height=400)
hist(colSums(mat),col='yellow',breaks=100,
    xlab='# of molecules containing a given fragment',
    ylab='# of fragments like this',
    main='Histogram of fragment frequencies')
dev.off()

sum(colSums(mat)==1)
# [1] 351

sum(colSums(mat[is_statin,]) > 0)
# 8

start_time = Sys.time()
get.exhaustive.fragments(drug.objects[[which(drugs$generic_name=='Rosuvastatin')]],min.frag.size=3)
Sys.time() - start_time
# Time difference of 3.394177 mins
	require(rcdk) # chemical informatics functionality in R
	require(gap) # for qq plots later
	options(stringsAsFactors=FALSE)
	setwd('c:/sci/037cinf/analysis/sar/')

	# plot molecules in R plot window instead of separate Java window
	rcdkplot = function(molecule,width=500,height=500,marg=0,main='') {
	par(mar=c(marg,marg,marg,marg)) # set margins to zero since this isn't a real plot
	temp1 = view.image.2d(molecule,width,height) # get Java representation into an image matrix. set number of pixels you want horiz and vertical
	plot(NA,NA,xlim=c(1,10),ylim=c(1,10),xaxt='n',yaxt='n',xlab='',ylab='',main=main) # create an empty plot
	rasterImage(temp1,1,1,10,10) # boundaries of raster: xmin, ymin, xmax, ymax. here i set them equal to plot boundaries
	}

	# first look at curcumin
	curcumin = parse.smiles("O=C(\\C=C\\c1ccc(O)c(OC)c1)CC(=O)\\C=C\\c2cc(OC)c(O)cc2")[[1]]
	rcdkplot(curcumin)

	curc.frags = get.murcko.fragments(curcumin)
	curc.frags
	#[[1]]
	#[[1]]$rings
	#[1] "c1ccccc1"
	#
	#[[1]]$frameworks
	#[1] "c1ccc(cc1)C=CCCCC=Cc2ccccc2"
	png('curcumin.murcko.fragments.png',width=600,height=300)
	par(mfrow=c(1,2))
	rcdkplot(parse.smiles(curc.frags[[1]]$rings)[[1]])
	rcdkplot(parse.smiles(curc.frags[[1]]$frameworks)[[1]])
	dev.off()

	anle138b = parse.smiles("C1OC2=C(O1)C=C(C=C2)C3=CC(=NN3)C4=CC(=CC=C4)Br")[[1]]
	rcdkplot(anle138b)

	anle138b.frags = get.murcko.fragments(anle138b)
	anle138b.frags
	# [[1]]
	# [[1]]$rings
	# [1] "c1ccccc1" "c1ccc2OCOc2(c1)"
	#
	# [[1]]$frameworks
	# [1] "c1cccc(c1)c2n[nH]cc2" "c1n[nH]c(c1)c2ccc3OCOc3(c2)"
	# [3] "c1cccc(c1)c2n[nH]c(c2)c3ccc4OCOc4(c3)"
	png('anle138b.murcko.fragments.png',width=600,height=120)
	par(mfrow=c(1,5))
	rcdkplot(parse.smiles(anle138b.frags[[1]]$rings[1])[[1]])
	rcdkplot(parse.smiles(anle138b.frags[[1]]$rings[2])[[1]])
	rcdkplot(parse.smiles(anle138b.frags[[1]]$frameworks[1])[[1]])
	rcdkplot(parse.smiles(anle138b.frags[[1]]$frameworks[2])[[1]])
	rcdkplot(parse.smiles(anle138b.frags[[1]]$frameworks[3])[[1]])
	dev.off()

	anle138b.frags = get.murcko.fragments(anle138b,min.frag.size=3)
	anle138b.frags[[1]]$rings
	# [1] "c1ccccc1" "c1n[nH]cc1" "c1ccc2OCOc2(c1)"

	# now for a pretend SAR

	# load FDA drug list
	drugs = read.table('http://www.cureffi.org/wp-content/uploads/2013/10/drugs.txt',
	sep='\t',header=TRUE,allowEscapes=FALSE,quote='',comment.char='')
	# remove those with no SMILES or with a really huge smiles
	# otherwise R will hang on the macromolecules
	drugs = drugs[nchar(drugs$smiles) > 0 & nchar(drugs$smiles) < 200,]
	drug.objects = parse.smiles(drugs$smiles) # create rcdk IAtomContainer objects for each drug
	# check that lengths are same
	dim(drugs) # 1467 3
	length(drug.objects) # 1467

	statins = c("atorvastatin","fluvastatin","lovastatin","pitavastatin","pravastatin","rosuvastatin","simvastatin")
	drugs[tolower(drugs$generic_name) %in% statins,] # check that the statins are in the drug list

	# examine statin structures
	png('statin.structures.png',width=1200,height=600)
	par(mfrow=c(2,4))
	for (statin in statins) {
	rcdkplot(drug.objects[tolower(drugs$generic_name) == statin][[1]],marg=2,main=statin)
	}
	dev.off()

	# get all murcko fragments
	mfrags = get.murcko.fragments(drug.objects,min.frag.size=3)

	# get a list of all possible fragments in any of these drugs
	allfrags = unique(unlist(mfrags))
	length(allfrags) # 2035

	# get only the ring systems
	allrings = unique(unlist(lapply(mfrags,"[[",1)))
	length(allrings) # 556


	# convert to a matrix
	mat = matrix(nrow=length(drug.objects), ncol=length(allrings))
	for (i in 1:length(drug.objects)) {
	mat[i,] = allrings %in% mfrags[[i]]$rings
	}

	# aside: if you want rings and frameworks thrown in together,
	# do.call(c,mfrags[[1]]) will concatenate the elements of mfrags[[1]] into a vector
	# lapply then does this to each element of the list
	mfrags_all = lapply(mfrags,do.call,what=c)

	is_statin = tolower(drugs$generic_name) %in% statins # is each drug a statin?
	pvals = numeric(length(allrings)) # vector to hold p vals
	for (j in 1:length(allrings)) { # iterate over rings
	hasfrag = mat[,j] # does each drug contain this fragment?
	contingency_table = table(data.frame(is_statin,hasfrag)) # 2x2 table
	pvals[j] = fisher.test(contingency_table)$p.value # Fisher test of independence
	}
	png('murcko.ring.statin.qqplot.png',width=500,height=500)
	qqunif(pvals,pch=19,cex.main=.8,
	main='Association of Murcko ring systems with statin activity\namong FDA-approved drugs')
	dev.off()

	png('statins.most.enriched.ring.system.png',width=300,height=300)
	allrings[pvals < .000001]
	# [1] "C=1CCC2CCCC=C2(C=1)"
	# what statins contain it?
	drugs$generic_name[is_statin & mat[,which(pvals < .000001)]]
	# "Lovastatin" "Pravastatin" "Simvastatin"
	# how many other drugs contain it
	sum(!is_statin & mat[,which(pvals < .000001)])
	# 0
	rcdkplot(parse.smiles(allrings[pvals < .000001])[[1]],marg=1,main=allrings[pvals < .000001])
	dev.off()

	png('frag.freq.hist1.png',width=600,height=400)
	hist(colSums(mat),col='yellow',breaks=100,
	xlab='# of molecules containing a given fragment',
	ylab='# of fragments like this',
	main='Histogram of fragment frequencies')
	dev.off()

	sum(colSums(mat)==1)
	# [1] 351

	sum(colSums(mat[is_statin,]) > 0)
	# 8

	start_time = Sys.time()
	get.exhaustive.fragments(drug.objects[[which(drugs$generic_name=='Rosuvastatin')]],min.frag.size=3)
	Sys.time() - start_time
	# Time difference of 3.394177 mins