Skip to content

Instantly share code, notes, and snippets.

@alyssafrazee
Created October 21, 2014 04:08
Show Gist options
  • Save alyssafrazee/5eb6b13f282a030c471f to your computer and use it in GitHub Desktop.
Save alyssafrazee/5eb6b13f282a030c471f to your computer and use it in GitHub Desktop.
PCA for GEUVADIS data
## simple analysis code for GEUVADIS data
## AF Oct 2014
library(ballgown) #biocLite
library(RSkittleBrewer) #install_github
library(RColorBrewer) #CRAN
library(usefulstuff) #install_github
library(RCurl) #CRAN
load('fpkm.rda') # download at http://files.figshare.com/1625419/fpkm.rda
expressed = exprfilter(fpkm, cutoff=1)
expr = subset(expressed, 'UseThisDup == 1', genomesubset=FALSE)
# add lab ID to pData:
QCurl = getURL('https://raw.githubusercontent.com/alyssafrazee/ballgown_code/master/GEUVADIS_preprocessing/GD667.QCstats.masterfile.txt')
qcstats = read.table(textConnection(QCurl), sep='\t', header=TRUE)
pData(expr)$lab = qcstats$SeqLabNumber[match(pData(expr)$SampleID, rownames(qcstats))]
y = log2(texpr(expr)+1)
pca = prcomp(t(y))
pcmat = pca$x
# take the transpose because we want to reduce the *transcript*
# dimension (not the replicate dimension)
# so we put the transcripts in columns)
# make some plots to figure out what the important stuff is:
cols = brewer.pal(5, 'Dark2')
sid = ballgown:::ss(rownames(pcmat), pattern='\\.', slot=2)
sum(sid != pData(expr)$dirname) #just make sure in same order
n = nrow(pData(expr))
popinds = split(1:n, pData(expr)$population)
labinds = split(1:n, pData(expr)$lab)
rinmed = median(pData(expr)$RIN)
pData(expr)$hirin = ifelse(pData(expr)$RIN > rinmed, 'high', 'low')
rininds = split(1:n, pData(expr)$hirin)
plot(pcmat[,1], pcmat[,2], type='n', xlab='PC1', ylab='PC2')
for(i in seq_along(rininds)){
points(pcmat[rininds[[i]], 1], pcmat[rininds[[i]], 2],
pch=19, col=makeTransparent(cols[i]))
}
pdf('population_pc.pdf')
plot(pcmat[,2], pcmat[,1], type='n', xlab='PC2', ylab='PC1')
for(i in seq_along(popinds)){
ii = popinds[[i]]
points(pcmat[ii,2], pcmat[ii,1], pch=19, col=makeTransparent(cols[i]))
}
legend('topleft', names(popinds), col=cols, pch=19)
dev.off()
pdf('eur_yri_pc.pdf')
plot(pcmat[,2], pcmat[,1], type='n', xlab='PC2', ylab='PC1')
for(i in seq_along(popinds)){
ii = popinds[[i]]
thecolor = ifelse(i==5, 'red', 'black')
points(pcmat[ii,2], pcmat[ii,1], pch=19, col=makeTransparent(thecolor))
}
legend('topleft', c('EUR', 'YRI'), col=c('black', 'red'), pch=19)
dev.off()
## PC1 is seems to be associated with lab:
labcolors = brewer.pal(7, 'Set1')
labcolors[6] = 'black'
labcolors = sample(labcolors) #so the red/pink labs aren't next to each other
pdf('lab_effect.pdf')
plot(pcmat[,2], pcmat[,1], type='n', xlab='PC2', ylab='PC1')
for(i in seq_along(labinds)){
points(pcmat[labinds[[i]], 2], pcmat[labinds[[i]], 1],
pch=19, col=makeTransparent(labcolors[i]))
} #zomg.
legend('topleft', col=labcolors, pch=19, paste('lab', 1:7))
dev.off()
## PC2 is correlated with RIN:
cor(pcmat[,2], pData(expr)$RIN) #0.72
(cor(pcmat[,2], pData(expr)$RIN))^2 #0.52
pdf('rin_effect.pdf')
plot(pcmat[,2], pcmat[,1], type='n', xlab='PC2', ylab='PC1')
points(pcmat[pData(expr)$hirin=='high', 2],
pcmat[pData(expr)$hirin=='high', 1],
col=makeTransparent('dodgerblue'),
pch=19)
points(pcmat[pData(expr)$hirin=='low', 2],
pcmat[pData(expr)$hirin=='low', 1],
col=makeTransparent('orange'),
pch=19)
legend('topleft', col=c('dodgerblue', 'orange'),
c('high RNA quality', 'low RNA quality'), pch=19)
dev.off()
## can see some population structure in PC2 vs. PC3:
pdf('pc3.pdf')
plot(pcmat[,2], pcmat[,3], type='n', xlab='PC2', ylab='PC3', ylim=c(-50, 45))
for(i in seq_along(popinds)){
ii = popinds[[i]]
points(pcmat[ii,2], pcmat[ii,3], pch=19, col=makeTransparent(cols[i]))
}
legend('topleft', names(popinds), col=cols, pch=19)
dev.off()
## uncolored plot
pdf('nocolor.pdf')
plot(pcmat[,2], pcmat[,1], xlab='PC2', ylab='PC1', col=makeTransparent('black'), pch=19)
dev.off()
## percent variation explained:
names(pca)
pctvar = pca$sdev^2 / sum(pca$sdev^2)
cumulative_pctvar = cumsum(pca$sdev^2) / sum(pca$sdev^2)
pdf('pctvar.pdf')
plot(cumulative_pctvar, pch=21, bg='gray80', xlab='Principal Component Index', ylab='Cumulative Percent Variance Explained')
dev.off()
pdf('pc3_ceu_gbr_yri.pdf')
plot(pcmat[,2], pcmat[,3], type='n', xlab='PC2', ylab='PC3', ylim=c(-50, 45))
for(i in seq_along(popinds)){
ii = popinds[[i]]
if(i %% 2 == 1){
points(pcmat[ii,2], pcmat[ii,3], pch=19, col=makeTransparent(cols[i]))
}
}
legend('topleft', names(popinds)[c(1,3,5)], col=cols[c(1,3,5)], pch=19)
dev.off()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment