Skip to content

Instantly share code, notes, and snippets.

@dinovski
Last active September 28, 2018 09:29
Show Gist options
  • Save dinovski/298ee98efcd44ca9c3aae8c473bdf91b to your computer and use it in GitHub Desktop.
Save dinovski/298ee98efcd44ca9c3aae8c473bdf91b to your computer and use it in GitHub Desktop.
reanalysis of GSE26682
#!/usr/bin/Rscript
#source("http://bioconductor.org/biocLite.R")
#biocLite("curatedCRCData")
#biocLite("inSilicoMerging")
library(Biobase)
library(GEOquery)
library(rgl)
library(inSilicoMerging)
library(gdata)
library(scatterplot3d)
## ABOUT
## 176 (eset1) and 155 samples (eset2) MSI and MSS tumors
## http://www.ncbi.nlm.nih.gov/geo/geo2r/?acc=GSE26682&platform=GPL96
## Probes of the U133A array present in the U133A Plus 2.0 were selected and quantile normalized to mimic the distribution of the U133A array.
## For all 331 samples, MAS 5.0-calculated signal intensities were normalized using the quantile normalization procedure implemented in
## robust multiarray analysis and the normalized data were log 2 transformed.
## Sample specific median centering and scaling by the standard deviation were additionally applied.
## Probe sets that were not expressed or that exhibited low variability across samples were excluded.
gds<-getGEO("GSE26682", AnnotGPL=TRUE, getGPL=TRUE) #eset
## save locally
# saveRDS(gds, "gse26682.rds")
# saveRDS(exprs(gds[[1]]), "gse26682_exp1.rds")
# saveRDS(exprs(gds[[2]]), "gse26682_exp2.rds")
# exp1<-readRDS("~/Dropbox/brandeis/gse26682_exp1.rds")
# exp2<-readRDS("~/Dropbox/brandeis/gse26682_exp2.rds")
# exp.list<-list(exp1, exp2)
# exp.sets<-merge(exp1, expmethod="NONE")
####
dim(exprs(gds[[1]]))
dim(exprs(gds[[2]]))
## merge esets
esets<-merge(gds, method="NONE")
## gene names
gene.info<-fData(esets)
gene.id<-gene.info[,1] #ID
gene.name<-gene.info[,3] #gene
gene.go<-gene.info[,16:17] #GO function
gene.summary<-cbind(as.data.frame(gene.id), as.data.frame(gene.name), as.data.frame(gene.go))
gene.names<-cbind(as.data.frame(gene.id), as.data.frame(gene.name))
## phenotypic data
pdata<-pData(esets)
pdata<-data.frame(pdata$geo_accession, pdata$characteristics_ch1, pdata$characteristics_ch1.1, pdata$characteristics_ch1.2)
names(pdata)<-c("geo_accession", "age", "gender", "status")
## write table and reformat on command line
#write.table(pdata, "~/geo_pdata.txt", row.names=F, quote=F, sep=",")
####
## pdata
pdata<-read.csv("~/geo_pdata_reformat.txt", header=T)
rownames(pdata)<-pdata$geo_accession
pdata$geo_accession<-NULL
## expression data
#dim(exprs(esets))
exp<-exprs(esets)
ge<-as.data.frame(t(exp))
rownames(ge)==rownames(pdata)
## status = stable, low, high, unknown
e.m<-pdata$status
#####################
## initial analysis: MSS versus MSI high and low
## remove samples with unknown status (16)
sample.sel <- e.m != "Unknown"
ge<-ge[sample.sel,]
pdata<-pdata[sample.sel,]
e.m<-e.m[sample.sel]
rownames(ge)==rownames(pdata)
####2 factor class
## add column for stable (MSS) or unstable (MSI)
MSS<-as.factor("0")
MSI<-as.factor("1")
pdata$class<-apply(pdata, 1, function(x) if ( x[3] == "Stable" ) { return(MSS) } else { return(MSI) } )
e.m<-pdata[,4]
## expression values = (ge) and pdata (or e.m) = with microsatellite stability status
## and gene.names df with name corresponding to each probe ID = 300 samples total: 218 stable
############
## check expression levels across samples (that data were indeed quantile normalized and log2 transformed)
boxplot((t(ge)[,1:100]),las=3,cex.axis=0.8, col="lightgreen")
boxplot((t(ge)[,101:200]),las=3,cex.axis=0.8, col="lightgreen")
boxplot((t(ge)[,201:300]),las=3,cex.axis=0.8, col="lightgreen")
############
## PCA
pca<-prcomp(ge, scale=T)
class<-pdata$class
plot(pca$x[,1], pca$x[,2], col=class, pch=19)
legend("topright", legend=levels(class), pch=19, pt.lwd=2, lty=NULL, cex=0.5, col=seq_along(levels(class)))
plot(pca)
pca.rpm=prcomp(ge,scale=T)
sc3=scatterplot3d(pca.rpm$x[,1:3],pch=20,highlight.3d=TRUE,cex.symbol=2)
s3d.coords = sc3$xyz.convert(pca.rpm$x[,1:3])
text(s3d.coords$x,s3d.coords$y,labels=pdata[,3],cex=0.8,pos=4)
## kmeans : top BH corrected genes: 202836_s_at 213738_s_at
fit.e=kmeans(ge.top,3)
## assign cluster ID to sample
df<-data.frame(CLASS=pdata[,3],CLUST.ID=fit.e$cluster)
plot(ge.top[,1:2], pch=20,col=ifelse(fit.e$cluster==1,"blue","green"))
points(fit.e$centers,pch='x',cex=2,col='red')
fit.loge=kmeans(log2(ge+1),2)
df<-data.frame(CLASS=pdata[,3],CLUST.ID=fit.loge$cluster)
## 'ge' rows/obervations=samples and cols/variables=genes
## one outlier sample (GSM656860)
####
## t-test for association between gene expression and microsatellite stability (stable versus high/low)
tt.pvals<-apply(ge,2,function(x) t.test(x ~ e.m)$p.value)
sort(tt.pvals)[1:20]
## add actual gene name to probe id/p value
df.pvals<-data.frame(tt.pvals)
df.pvals$gene.id<-rownames(df.pvals)
df.pvals<-inner_join(df.pvals, gene.names)
df.pvals<-df.pvals[ order(df.pvals[,1]), ]
####### logistic regression model on top genes from t-test
## top genes from t test all 300 samples
sel<-order(tt.pvals, decreasing=F)
top<-ge[,sel][1:80]
M<-glm(e.m ~ ., data=top,family="binomial")
assess.prediction(e.m, as.numeric(predict(M,type="response")>0.5))
# Total cases that are not NA: 300
# Correct predictions (accuracy): 280(93.3%)
# TPR (sensitivity)=TP/P: 82.9%
# TNR (specificity)=TN/N: 97.2%
# PPV (precision)=TP/(TP+FP): 91.9%
# FDR (false discovery)=1-PPV: 8.11%
# FPR =FP/N=1-TNR: 2.75%
## see how samples cluster for top 100 most significant genes
df.pvals$gene.id == data.frame(rownames(t(ge)))
df.pvals.100<-df.pvals.s[1:100,]
top.sel<-rownames(t(ge)) %in% df.pvals.100$gene.id
## select only those genes from ge
top.exp<-t(ge)[top.sel,]
## PCA
## columns = variables (genes)
pca<-prcomp(t(top.exp), scale=T)
class<-pdata$class
plot(pca$x[,1], pca$x[,2], col=class, pch=19)
legend("topright", legend=levels(class), pch=19, pt.lwd=2, lty=NULL, cex=0.5, col=seq_along(levels(class)))
## weak separation separation between MSS (0) and MSI (1)
## hierarchical clustering of top 100 genes from the t-test
d = dist(t(top.exp))
d.log = dist(log2(t(top.exp)+1))
labels<-rownames(top.exp)
plot(hclust(d,method="average"),label=pdata[,3], main="Average, count")
plot(hclust(d,method="ward.D2"),label=pdata[,3], main="Ward,count")
plot(hclust(d.log,method="average"),label=pdata[,3], main="Average, log-count")
plot(hclust(d.log,method="ward.D2"),label=pdata[,3], main="Ward, log-count")
## MSI high do form a separate cluster when using average linkage but the branch is not very high
pca.rpm=prcomp(t(top.exp),scale=T)
sc3=scatterplot3d(pca.rpm$x[,1:3],pch=20,highlight.3d=TRUE,cex.symbol=2)
s3d.coords = sc3$xyz.convert(pca.rpm$x[,1:3])
text(s3d.coords$x,s3d.coords$y,labels=pdata[,3],cex=0.8,pos=4)
## can see a separate cluster for samples with high MSI, similar to the hierarchical clustering and PCA
## kmeans
fit.e=kmeans(ge,3)
## assign cluster ID to sample
df<-data.frame(CLASS=pdata[,3],CLUST.ID=fit.e$cluster)
## color by cluster
plot(ge[,1:2],pch=19,col=ifelse(fit.e$cluster==1,"green","lightblue"), main="kmeans, color by cluster")
points(fit.e$centers,pch='x',cex=2,col='red')
cr<-c("blue","green")
legend("topright", legend=levels(class), pch=19, pt.lwd=2, lty=NULL, cex=0.5, col=cr)
## Using k=2, can see more or less 2 clusters
fit.loge=kmeans(log2(ge+1),2)
data.frame(CLASS=pdata[,3],CLUST.ID=fit.loge$cluster)
#############
## following analysis is with 3 separate classes defined: (stable=0, low=1, high=2)
MSS<-as.factor("0")
MSI.low<-as.factor("1")
MSI.high<-as.factor("2")
pdata$class<-apply(pdata, 1, function(x) if ( x[3] == "Stable" ) { return(MSS) } else if (x[3] == "Low" ) { return(MSI.low) } else { return(MSI.high) } )
## add classes to e.m variable
e.m<-pdata[,4]
## anova, using 3 distinct classes
MSS<-as.factor("0")
MSI.low<-as.factor("1")
MSI.high<-as.factor("2")
pdata$class<-apply(pdata, 1, function(x) if ( x[3] == "Stable" ) { return(MSS) } else if (x[3] == "Low" ) { return(MSI.low) } else { return(MSI.high) } )
## add classes to e.m variable
e.m<-pdata[,4]
## anova for reg1a
gene<-ge[,c("209752_at")]
anova(lm(gene ~ pdata$class))
# pval=5.891e-09
## lowest variance
gene<-ge[,c("221131_at")]
anova(lm(gene ~ pdata$class))
# pval = 0.7202
## MLH1
gene<-ge[,c("202520_s_at")]
anova(lm(gene ~ pdata$class))
# pval = 2.2e-16
## expression, 3 classes
with(pdata, boxplot( ge[,c("209752_at")] ~ reorder(class), col=c("lightblue", "lightgreen", "orange"), ylim=c(6,16), main="REG1A expression" ))
with(pdata, boxplot(ge[,c("205886_at")] ~ reorder(class), col=c("lightblue", "lightgreen", "orange"), ylim=c(6,16), main="REG1B expression" ))
with(pdata, boxplot(ge[,c("204673_at")] ~ reorder(class), col=c("lightblue", "lightgreen", "orange"), ylim=c(6,16), main="MUC2 expression" ))
with(pdata, boxplot( ge[,c("221131_at")] ~ reorder(class), col=c("lightblue", "lightgreen", "orange"), ylim=c(6,16), main="A4GNT expression" ))
## MLH1
with(pdata, boxplot(ge[,c("202520_s_at")] ~ reorder(class), col=c("lightblue", "lightgreen", "orange"), ylim=c(6,16), main="MLH1 expression" ))
## variables=samples (columns)
pca<-prcomp(t(top.exp), scale=T)
class<-pdata$class
plot(pca$x[,1], pca$x[,2], col=class, pch=19, xlab="PC1", ylab="PC2")
legend("topright", legend=levels(class), pch=19, pt.lwd=2, lty=NULL, cex=0.5, col=seq_along(levels(class)))
## stable samples mostly overlap the low instablity samples
## kmeans top 2 genes (from t-test)
top2<-ge[,c("202836_s_at", "213738_s_at")]
fit.e=kmeans(ge,3)
## assign cluster ID to sample
df<-data.frame(CLASS=pdata[,3],CLUST.ID=fit.e$cluster)
attach(df) ; plot(top2, col=c("black","blue","green")[2]); detach(df)
c("black","orange","green")[df$CLUST.ID]
## color by cluster and add actual gene names
gene.names[gene.id == "202836_s_at",]
gene.names[gene.id == "213738_s_at",]
plot(top2,pch=19,col=c("black","orange","green")[df$CLUST.ID], main="kmeans, color by cluster",
xlab="TXNL4A", ylab="ATP5A1")
points(fit.e$centers,pch='x',cex=2,col='red')
legend("topright", legend=levels(as.factor(df$CLUST.ID)), pch=19, pt.lwd=2, lty=NULL, cex=0.5, col=c("black","orange","green") )
## color by microsatellite class
df$CLASS.n<-factor(df$CLASS) # need to factor to remove unused level
df$CLASS<-NULL
attach(df) ; plot(top2, col=c("black","blue","green")[1]); detach(df)
col.class<-c("black","blue","green")[df$CLASS.n]
plot(top2, pch=19, col=col.class, main="color by class",
xlab="TXNL4A", ylab="ATP5A1")
legend("topright", legend=levels(df$CLASS.n), pch=19, pt.lwd=2, lty=NULL, cex=0.5, col=c("black","blue","green"))
####
## try fit on top hit from t-test (stable versus unstable)
M<-glm(e.m ~ ge[,c("201464_x_at")],family="binomial")
predict(M, type="response")[1:5]
## convert fitted probabilites to category predictions
as.numeric(predict(M, type="response")[1:5]>0.5)
## compare predictions to known values of the tumor class model was fit to
assess.prediction=function(truth,predicted) {
predicted = predicted[ ! is.na(truth) ]
truth = truth[ ! is.na(truth) ]
truth = truth[ ! is.na(predicted) ]
predicted = predicted[ ! is.na(predicted) ]
cat("Total cases that are not NA: ",length(truth),"\n",sep="")
cat("Correct predictions (accuracy): ",sum(truth==predicted),
"(",signif(sum(truth==predicted)*100/length(truth),3),"%)\n",sep="")
TP = sum(truth==1 & predicted==1)
TN = sum(truth==0 & predicted==0)
FP = sum(truth==0 & predicted==1)
FN = sum(truth==1 & predicted==0)
P = TP+FN # total number of positives in the truth data
N = FP+TN # total number of negatives
cat("TPR (sensitivity)=TP/P: ", signif(100*TP/P,3),"%\n",sep="")
cat("TNR (specificity)=TN/N: ", signif(100*TN/N,3),"%\n",sep="")
cat("PPV (precision)=TP/(TP+FP): ", signif(100*TP/(TP+FP),3),"%\n",sep="")
cat("FDR (false discovery)=1-PPV: ", signif(100*FP/(TP+FP),3),"%\n",sep="")
cat("FPR =FP/N=1-TNR: ", signif(100*FP/N,3),"%\n",sep="")
}
## check predictions
assess.prediction(e.m,as.numeric(predict(M,type="response")>0.5))
##################################
## analysis without MSI:Low samples
## remove samples with unknown status and "Low" stability = 253 total samples
sample.sel <- e.m != "Unknown" & e.m != "Low"
## remove unknown and MSI:low samples from expression, phenotypic data, and 'outcome'
ge<-ge[sample.sel,]
pdata<-pdata[sample.sel,]
e.m<-e.m[sample.sel]
## remove unused levels
e.m<-factor(e.m)
rownames(ge)==rownames(pdata)
## add column for stable (MSS) or MSI
MSS<-as.factor("0")
MSI<-as.factor("1")
pdata$class<-apply(pdata, 1, function(x) if ( x[3] == "Stable" ) { return(MSS) } else { return(MSI) } )
e.m<-pdata[,4]
## PCA
pca<-prcomp(ge, scale=T)
class<-pdata$class
plot(pca$x[,1], pca$x[,2], col=class, pch=19, xlab="PC1", ylab="PC2")
legend("topright", legend=levels(class), pch=19, pt.lwd=2, lty=NULL, cex=0.5, col=seq_along(levels(class)))
## t-test:SI high versus MSS for 253 samples
tt.pvals<-apply(ge,2,function(x) t.test(x ~ e.m.filtered)$p.value)
## add actual gene name to probe id/p value
df.pvals<-data.frame(tt.pvals)
## FDR corrected p values
df.pvals$BH<-c(p.adjust(df.pvals$tt.pvals, method = "BH"))
df.pvals$gene.id<-rownames(df.pvals)
df.pvals<-inner_join(df.pvals, gene.names)
## sort from lowest FDR corrected pval
df.pvals<-df.pvals[ order(df.pvals[,2]), ]
df.pvals[1:20,]
## LR top hit (pval)
which.min(tt.pvals)
M<-glm(e.m ~ ge[,c("215780_s_at")],family="binomial")
predict(M,type="response")[1:5]
## convert fitted probabilites to category predictions
as.numeric(predict(M,type="response")[1:5]>0.5)
## compare predictions to known values of the tumor class we were fitting the model to
assess.prediction(e.m, as.numeric(predict(M,type="response")>0.5))
## multiple top hits
sel<-order(tt.pvals, decreasing=F)
top<-ge[,sel][1:100]
M<-glm(e.m ~ ., data=top,family="binomial")
assess.prediction(e.m, as.numeric(predict(M,type="response")>0.5))
## t-test on 253 samples did not improve predictive accuracy or sensitivity
## LR top hit (fdr corrected)
bh<-c(p.adjust(tt.pvals, method="BH"))
which.min(bh)
gene.names[gene.id=="202589_at",]
M<-glm(e.m ~ ge[,c("202589_at")],family="binomial")
predict(M,type="response")[1:5]
## convert fitted probabilites to category predictions
as.numeric(predict(M,type="response")[1:5]>0.5)
## compare predictions to known values of the tumor class model was fit to
assess.prediction(e.m, as.numeric(predict(M,type="response")>0.5))
####
## cluster top genes by variance
## convenience function to calculate correlation-based distance
## note: cor() calculates the matrix of pairwise correlation coefficients between columns (genes)
corr.dist=function(x) { as.dist(1-cor(x)) }
## calculate variations and means across columns (probe/gene)
vars=apply(ge,2,var)
means=apply(ge,2,mean)
## add status to sample names
# pdata.mod<-pdata
# pdata.mod$sample<-rownames(pdata.mod)
# pdata.mod$labels <- do.call(paste, c(pdata.mod[c("sample", "status")], sep = ":"))
# labels<-pdata.mod$labels
select<-order(vars,decreasing=T)[1:300]
ge[,select]
labels<-pdata[,3]
## correlation, top 300 vars
plot(hclust(corr.dist(t(ge[,select])), method="ward.D2"), label=labels)
plot(hclust(dist(t(ge[,select])), method="average"),label=labels)
select<-order(means,decreasing=T)[1:300]
## correlation, top 300 means
plot(hclust(corr.dist(t(ge[,select])), method="ward.D2"), label=labels)
plot(hclust(dist(ge[,select]), method="average"), label=labels)
##############
## naive bayes, 'ge' ordered by FDR corrected t-test significance: 108 with q<0.01
bh.pvals<-p.adjust(tt.pvals, method = "BH")
sel<-order(bh.pvals, decreasing=F)
sel<-bh.pvals<0.01
ge.top<-ge[,sel] #300 samples, 108 genes
Mb1=naiveBayes(e.m ~ . , data=data.frame(G=ge.top[,1]))
assess.prediction(e.m,predict(Mb1,data.frame(G=ge.top[,1])))
n<-100
Mb=naiveBayes(e.m ~ . , data=ge.top[,1:n])
assess.prediction(e.m,predict(Mb,ge.top[,1:n]))
## leave n (samples) out, top 300 variances
library(class)
df<-ge.top
n = 0
n.out = 5
Nrep = 1000
for ( i in 1:Nrep ) {
## randomly choose 5 observations to withhold and keep as the test set:
leave.out = sample(nrow(df),size=n.out)
pred = knn(df[-leave.out,1,drop=F],
df[leave.out,1,drop=F],e.m[-leave.out],k=1)
n = n + sum( pred==e.m[leave.out] )
}
n/(n.out*Nrep)
#~65% accuracy
## supervised: SVM
M=svm(e.m~.,ge.top[,1:20],kernel="polynomial",degree=3)
assess.prediction(e.m,predict(M,ge.top[,1:20]))
M=svm(e.m~.,ge.top[,1:20],kernel="radial")
assess.prediction(e.m,predict(M,ge.top[,1:20]))
M=svm(e.m~.,ge.top[,1:20,drop=F],kernel="radial",gamma=0.1)
assess.prediction(e.m,predict(M,ge.top[,1:20]))
## leave-n-out cross validation
df<-ge.top #300 samples, 108 genes (t-test FDR cutoff <0.01)
predictor.LR$train(e.m ~ . , df[1:20])
assess.prediction(e.m, predictor.LR$predict(df[1:20]))
cv.LR.20=cross.validate(predictor.LR, e.m ~ . , df[1:20])
assess.prediction(cv.LR.20$truth,cv.LR.20$prediction)
## similar to LR on top variance and top hits from t-test
#[1] 80.46
#$TPR
#[1] 45.17804
#$TNR
#[1] 93.48302
#$PPV
#[1] 71.90083
#$FDR
#[1] 28.09917
#$FPR
#[1] 6.516977
#######################
#http://journals.plos.org/plosone/article?id=10.1371/journal.pone.0071755
#getGEO("GSE36335")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment