alyssafrazee/geuvadis_analysis.R

## geuvadis_analysis.R
## simple analysis code for GEUVADIS data
## AF Oct 2014

library(ballgown) #biocLite
library(RSkittleBrewer) #install_github
library(RColorBrewer) #CRAN
library(usefulstuff) #install_github
library(RCurl) #CRAN

load('fpkm.rda') # download at http://files.figshare.com/1625419/fpkm.rda
expressed = exprfilter(fpkm, cutoff=1)
expr = subset(expressed, 'UseThisDup == 1', genomesubset=FALSE)

# add lab ID to pData:
QCurl = getURL('https://raw.githubusercontent.com/alyssafrazee/ballgown_code/master/GEUVADIS_preprocessing/GD667.QCstats.masterfile.txt')
qcstats = read.table(textConnection(QCurl), sep='\t', header=TRUE)
pData(expr)$lab = qcstats$SeqLabNumber[match(pData(expr)$SampleID, rownames(qcstats))]

y = log2(texpr(expr)+1)
pca = prcomp(t(y))
pcmat = pca$x
# take the transpose because we want to reduce the *transcript*
# dimension (not the replicate dimension)
# so we put the transcripts in columns)

# make some plots to figure out what the important stuff is:
cols = brewer.pal(5, 'Dark2')
sid = ballgown:::ss(rownames(pcmat), pattern='\\.', slot=2)
sum(sid != pData(expr)$dirname) #just make sure in same order

n = nrow(pData(expr))
popinds = split(1:n, pData(expr)$population)
labinds = split(1:n, pData(expr)$lab)
rinmed = median(pData(expr)$RIN)
pData(expr)$hirin = ifelse(pData(expr)$RIN > rinmed, 'high', 'low')
rininds = split(1:n, pData(expr)$hirin)

plot(pcmat[,1], pcmat[,2], type='n', xlab='PC1', ylab='PC2')
for(i in seq_along(rininds)){
    points(pcmat[rininds[[i]], 1], pcmat[rininds[[i]], 2],
        pch=19, col=makeTransparent(cols[i]))
}

pdf('population_pc.pdf')
    plot(pcmat[,2], pcmat[,1], type='n', xlab='PC2', ylab='PC1')
    for(i in seq_along(popinds)){
        ii = popinds[[i]]
        points(pcmat[ii,2], pcmat[ii,1], pch=19, col=makeTransparent(cols[i]))
    }
    legend('topleft', names(popinds), col=cols, pch=19)
dev.off()

pdf('eur_yri_pc.pdf')
    plot(pcmat[,2], pcmat[,1], type='n', xlab='PC2', ylab='PC1')
    for(i in seq_along(popinds)){
        ii = popinds[[i]]
        thecolor = ifelse(i==5, 'red', 'black')
        points(pcmat[ii,2], pcmat[ii,1], pch=19, col=makeTransparent(thecolor))
    }
    legend('topleft', c('EUR', 'YRI'), col=c('black', 'red'), pch=19)
dev.off()

## PC1 is seems to be associated with lab:
labcolors = brewer.pal(7, 'Set1')
labcolors[6] = 'black'
labcolors = sample(labcolors) #so the red/pink labs aren't next to each other
pdf('lab_effect.pdf')
    plot(pcmat[,2], pcmat[,1], type='n', xlab='PC2', ylab='PC1')
    for(i in seq_along(labinds)){
        points(pcmat[labinds[[i]], 2], pcmat[labinds[[i]], 1],
        pch=19, col=makeTransparent(labcolors[i]))
    } #zomg.
    legend('topleft', col=labcolors, pch=19, paste('lab', 1:7))
dev.off()

## PC2 is correlated with RIN:
cor(pcmat[,2], pData(expr)$RIN) #0.72
(cor(pcmat[,2], pData(expr)$RIN))^2 #0.52
pdf('rin_effect.pdf')
    plot(pcmat[,2], pcmat[,1], type='n', xlab='PC2', ylab='PC1')
    points(pcmat[pData(expr)$hirin=='high', 2],
        pcmat[pData(expr)$hirin=='high', 1],
        col=makeTransparent('dodgerblue'),
        pch=19)
    points(pcmat[pData(expr)$hirin=='low', 2],
        pcmat[pData(expr)$hirin=='low', 1],
        col=makeTransparent('orange'),
        pch=19)
    legend('topleft', col=c('dodgerblue', 'orange'),
        c('high RNA quality', 'low RNA quality'), pch=19)
dev.off()

## can see some population structure in PC2 vs. PC3:
pdf('pc3.pdf')
    plot(pcmat[,2], pcmat[,3], type='n', xlab='PC2', ylab='PC3', ylim=c(-50, 45))
    for(i in seq_along(popinds)){
        ii = popinds[[i]]
        points(pcmat[ii,2], pcmat[ii,3], pch=19, col=makeTransparent(cols[i]))
    }
    legend('topleft', names(popinds), col=cols, pch=19)
dev.off()

## uncolored plot
pdf('nocolor.pdf')
    plot(pcmat[,2], pcmat[,1], xlab='PC2', ylab='PC1', col=makeTransparent('black'), pch=19)
dev.off()

## percent variation explained:
names(pca)
pctvar = pca$sdev^2 / sum(pca$sdev^2)
cumulative_pctvar = cumsum(pca$sdev^2) / sum(pca$sdev^2)
pdf('pctvar.pdf')
    plot(cumulative_pctvar, pch=21, bg='gray80', xlab='Principal Component Index', ylab='Cumulative Percent Variance Explained')
dev.off()

pdf('pc3_ceu_gbr_yri.pdf')
    plot(pcmat[,2], pcmat[,3], type='n', xlab='PC2', ylab='PC3', ylim=c(-50, 45))
    for(i in seq_along(popinds)){
        ii = popinds[[i]]
        if(i %% 2 == 1){
            points(pcmat[ii,2], pcmat[ii,3], pch=19, col=makeTransparent(cols[i]))
        }
    }
    legend('topleft', names(popinds)[c(1,3,5)], col=cols[c(1,3,5)], pch=19)
dev.off()
	## simple analysis code for GEUVADIS data
	## AF Oct 2014

	library(ballgown) #biocLite
	library(RSkittleBrewer) #install_github
	library(RColorBrewer) #CRAN
	library(usefulstuff) #install_github
	library(RCurl) #CRAN

	load('fpkm.rda') # download at http://files.figshare.com/1625419/fpkm.rda
	expressed = exprfilter(fpkm, cutoff=1)
	expr = subset(expressed, 'UseThisDup == 1', genomesubset=FALSE)

	# add lab ID to pData:
	QCurl = getURL('https://raw.githubusercontent.com/alyssafrazee/ballgown_code/master/GEUVADIS_preprocessing/GD667.QCstats.masterfile.txt')
	qcstats = read.table(textConnection(QCurl), sep='\t', header=TRUE)
	pData(expr)$lab = qcstats$SeqLabNumber[match(pData(expr)$SampleID, rownames(qcstats))]

	y = log2(texpr(expr)+1)
	pca = prcomp(t(y))
	pcmat = pca$x
	# take the transpose because we want to reduce the transcript
	# dimension (not the replicate dimension)
	# so we put the transcripts in columns)

	# make some plots to figure out what the important stuff is:
	cols = brewer.pal(5, 'Dark2')
	sid = ballgown:::ss(rownames(pcmat), pattern='\\.', slot=2)
	sum(sid != pData(expr)$dirname) #just make sure in same order

	n = nrow(pData(expr))
	popinds = split(1:n, pData(expr)$population)
	labinds = split(1:n, pData(expr)$lab)
	rinmed = median(pData(expr)$RIN)
	pData(expr)$hirin = ifelse(pData(expr)$RIN > rinmed, 'high', 'low')
	rininds = split(1:n, pData(expr)$hirin)

	plot(pcmat[,1], pcmat[,2], type='n', xlab='PC1', ylab='PC2')
	for(i in seq_along(rininds)){
	points(pcmat[rininds[[i]], 1], pcmat[rininds[[i]], 2],
	pch=19, col=makeTransparent(cols[i]))
	}

	pdf('population_pc.pdf')
	plot(pcmat[,2], pcmat[,1], type='n', xlab='PC2', ylab='PC1')
	for(i in seq_along(popinds)){
	ii = popinds[[i]]
	points(pcmat[ii,2], pcmat[ii,1], pch=19, col=makeTransparent(cols[i]))
	}
	legend('topleft', names(popinds), col=cols, pch=19)
	dev.off()

	pdf('eur_yri_pc.pdf')
	plot(pcmat[,2], pcmat[,1], type='n', xlab='PC2', ylab='PC1')
	for(i in seq_along(popinds)){
	ii = popinds[[i]]
	thecolor = ifelse(i==5, 'red', 'black')
	points(pcmat[ii,2], pcmat[ii,1], pch=19, col=makeTransparent(thecolor))
	}
	legend('topleft', c('EUR', 'YRI'), col=c('black', 'red'), pch=19)
	dev.off()

	## PC1 is seems to be associated with lab:
	labcolors = brewer.pal(7, 'Set1')
	labcolors[6] = 'black'
	labcolors = sample(labcolors) #so the red/pink labs aren't next to each other
	pdf('lab_effect.pdf')
	plot(pcmat[,2], pcmat[,1], type='n', xlab='PC2', ylab='PC1')
	for(i in seq_along(labinds)){
	points(pcmat[labinds[[i]], 2], pcmat[labinds[[i]], 1],
	pch=19, col=makeTransparent(labcolors[i]))
	} #zomg.
	legend('topleft', col=labcolors, pch=19, paste('lab', 1:7))
	dev.off()

	## PC2 is correlated with RIN:
	cor(pcmat[,2], pData(expr)$RIN) #0.72
	(cor(pcmat[,2], pData(expr)$RIN))^2 #0.52
	pdf('rin_effect.pdf')
	plot(pcmat[,2], pcmat[,1], type='n', xlab='PC2', ylab='PC1')
	points(pcmat[pData(expr)$hirin=='high', 2],
	pcmat[pData(expr)$hirin=='high', 1],
	col=makeTransparent('dodgerblue'),
	pch=19)
	points(pcmat[pData(expr)$hirin=='low', 2],
	pcmat[pData(expr)$hirin=='low', 1],
	col=makeTransparent('orange'),
	pch=19)
	legend('topleft', col=c('dodgerblue', 'orange'),
	c('high RNA quality', 'low RNA quality'), pch=19)
	dev.off()

	## can see some population structure in PC2 vs. PC3:
	pdf('pc3.pdf')
	plot(pcmat[,2], pcmat[,3], type='n', xlab='PC2', ylab='PC3', ylim=c(-50, 45))
	for(i in seq_along(popinds)){
	ii = popinds[[i]]
	points(pcmat[ii,2], pcmat[ii,3], pch=19, col=makeTransparent(cols[i]))
	}
	legend('topleft', names(popinds), col=cols, pch=19)
	dev.off()

	## uncolored plot
	pdf('nocolor.pdf')
	plot(pcmat[,2], pcmat[,1], xlab='PC2', ylab='PC1', col=makeTransparent('black'), pch=19)
	dev.off()

	## percent variation explained:
	names(pca)
	pctvar = pca$sdev^2 / sum(pca$sdev^2)
	cumulative_pctvar = cumsum(pca$sdev^2) / sum(pca$sdev^2)
	pdf('pctvar.pdf')
	plot(cumulative_pctvar, pch=21, bg='gray80', xlab='Principal Component Index', ylab='Cumulative Percent Variance Explained')
	dev.off()

	pdf('pc3_ceu_gbr_yri.pdf')
	plot(pcmat[,2], pcmat[,3], type='n', xlab='PC2', ylab='PC3', ylim=c(-50, 45))
	for(i in seq_along(popinds)){
	ii = popinds[[i]]
	if(i %% 2 == 1){
	points(pcmat[ii,2], pcmat[ii,3], pch=19, col=makeTransparent(cols[i]))
	}
	}
	legend('topleft', names(popinds)[c(1,3,5)], col=cols[c(1,3,5)], pch=19)
	dev.off()