Created
July 17, 2014 20:41
-
-
Save abhik1368/88634a9c3185c40f7fa3 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
install.packages('RPostgreSQL', type='source') | |
#initiate connection | |
library(RPostgreSQL) | |
library(BMS) | |
library(ggplot2) | |
drv <- dbDriver("PostgreSQL") | |
con <- dbConnect(drv, dbname="chembl_18") | |
#query the database | |
rs <- dbSendQuery(con,"select chembl_id,standard_value,ecfp4 from rdkfps_1 where target_chembl_id ='CHEMBL224' and published_type ='IC50'") | |
#fetch all the results | |
d<-fetch(rs ,n =-1) | |
#close resultSet rs | |
dbClearResult(rs) | |
#remove duplicates ids | |
d<-d[!duplicated(d[,1]), ] | |
# remove the \\x from the hexadecimal string and convert hexadecimal to binary format | |
ad<-data.frame() | |
for (i in 1:dim(d)[1]){ | |
a = hex2bin(gsub("[\\x]","",d[i,3])) | |
ad<-rbind(a,ad) | |
} | |
# remove the third columns | |
d[,3]<-NULL | |
#append the descriptor set | |
d<-cbind(d,ad) | |
#Near constant columns and NA | |
descs <- d[, !apply(d, 2, function(x) any(is.na(x)) )] | |
descs <- d[, !apply( d, 2, function(x) length(unique(x)) == 1 )] | |
# Remove correlated columns > 0.25 | |
r2 <- which(cor(descs[3:511])^2 > .25, arr.ind=TRUE) | |
r2 <- r2[ r2[,1] > r2[,2] , ] | |
data <- descs[, -unique(r2[,2])] | |
#Check Dimensions of the dataset | |
dim(data) | |
#------------------------------------------------------------- | |
#train and test set | |
names(set)[2]<-"Activity" # change the column name | |
#Convert IC50 column to pIC50 | |
data[,2]<- -(log(data[,2]*10^-9)) | |
# split train and test set | |
ind<-sample(2,nrow(data),replace=TRUE,prob=c(0.8,0.2)) | |
trainsetX<-data[ind==1,2:dim(data)[2]] | |
trainY<-trainsetX$Activity | |
testsetX<-data[ind==2,2:dim(data)[2]] | |
testY<- testsetX$Activity | |
#------------------------------------------------------------- | |
# Function to compute RMSE and R2 | |
r2se<- function (obs,pred){ | |
rmse<-(mean((obs -pred)^2))^0.5 | |
ssr<-sum((obs - pred)^2) | |
sst<-sum((obs - mean(obs))^2) | |
R2<-1-(ssr/sst) | |
output<-list(RMSE=rmse,RSquared=R2) | |
return(output) | |
} | |
#------------------------------------------------------------- | |
# Function to plot qsar results with ggplot2 | |
plotsar <-function(results){ | |
myplot<-ggplot(results,aes(x=observed,y=predicted,color=factor(set),shape=factor(set)))+ | |
geom_point()+scale_colour_manual(values=c("blue", "yellow"))+ | |
theme(axis.line = element_line(colour = "white"))+ | |
theme(panel.grid.major = element_line(colour = "white", linetype = "dotted"))+ | |
theme(axis.ticks.y=element_line(size=1),axis.text.y=element_text(size=16),axis.ticks.length=unit(0.25,"cm"), | |
panel.background = element_rect(fill = "black", colour = NA))+ | |
theme(legend.key = element_rect(fill = "black"))+ | |
theme(axis.title=element_text(size=12,face="bold",colour = 'white'), | |
plot.background = element_rect(colour = 'black', fill = 'black'))+ | |
theme( legend.title = element_text(size = 12, face = "bold", hjust = 0, colour = 'white'), | |
legend.background=element_rect(color="black",fill="black"), | |
legend.text=element_text(size=12,color="white",face="bold"), | |
axis.title.x = element_text(size = 12, colour = 'white', vjust = 1) , | |
axis.title.y = element_text(size = 12, colour = 'white'))+ | |
ggtitle("Predicted v/s test set QSAR results")+ | |
theme(plot.title=element_text(lineheight=.8,face="bold",color="white",size=14))+stat_smooth(span = 0.9) | |
return(myplot) | |
} | |
#------------------------------------------------------------- | |
# Ridge Regression Model | |
library(elasticnet) | |
ridgeModel <- enet(x = as.matrix(trainsetX), y = trainY,lambda = 0.01) | |
ridgetest <- predict(ridgeModel, newx = as.matrix(testsetX), | |
s = 1, mode = "fraction",type = "fit") | |
ridgetrain <- predict(ridgeModel, newx = as.matrix(trainsetX), | |
s = 1, mode = "fraction",type = "fit") | |
ridgeval1<-data.frame(obs = trainY, pred = ridgetrain$fit) | |
ridgeval2<-data.frame(obs = testY, pred = ridgetest$fit) | |
r2se(ridgeval1$obs,ridgeval1$pred) | |
r2se(ridgeval2$obs,ridgeval2$pred) | |
train.results<-data.frame(observed=trainY,predicted=ridgetrain$fit,set="train") | |
test.results <- data.frame(observed=testY,predicted=ridgetest$fit,set="test") | |
results<-rbind(train.results,test.results) | |
plotsar(results) | |
## close connection con | |
dbDisconnect(con) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment