alyssafrazee/bioc.md

## bioc.md

      
    Raw
  

              bioc.md
            
          
    notes from BioC 2014

packages and the git-svn bridge

instructions on website
I didn't get my invite to get an SVN account - mystery.

add bioc-sync as a collaborator on your git repo
then, webhooks & services --> add webhook. (there's a URL for this on the bioc instruction page.)
git-svn bridge doesn't really do merging very well: it's "winner-take all." you have to pick whether git or svn wins on merge conflicts.
only deals with master branch of git repo
super useful for dealing wtih pull requests and issues

pbdR (programming with big data in R)

cluster --> "distributing"

multicore machine --> "multithreading"

GPU/MIC/co-processor --> "offloading"

pdbR: connecting R to the HPC libraries that have been developed. r-pbd.org
perceptions: "R? Isn't that slow?" (HPC people). "HPC? Isn't that hard?" (R people).
GenomicRanges

mygrl = structure(bg)$trans
unspliced = range(mygrl)
introns = psetdiff(unspliced, mygrl) #**i think** (we are on the next slide now)
# potentially easier than what I did with ballgownrsem
clever way to get junctions from read alignments!
# read in the aligned READS as a GRangesList, where splices and such are represented as different ranges in an element
x = psetdiff(ranges(mygrl), grl)
unique(x) #gives the introns
AWS and Bioconductor ☁️ ☁️ ☁️

You can use Bioconductor in the cloud!

  
## diffexpression.R
## edgeR tutorial
# http://www.bioconductor.org/help/course-materials/2014/BioC2014/BioC2014_edgeR_voom.html
library('edgeR')
rm(list=ls())
setwd("/data/RNA_SEQ_edgeR_voom_featureCounts/") ## on the amazon machine
load('ds1.Rdata')  # stem cell data!
setwd('-')

colnames(counts) # first 2 column names give the cell type
dim(counts) #30,727 by 27.

grp = as.factor(substr(colnames(counts), 1, 2))
table(grp)

## simple explorations
o = order(grp)
pairs(log2(counts[,o[1:7]]), pch=".", lower.panel=NULL)
# expect that cells of same type will be most similar to each other

# mds plots
d = DGEList(counts=counts, group=grp)
d = calcNormFactors(d)
head(d$samples)
cps = cpm(d)
k = rowSums(cps>=1)>2 #filter to genes with expression > 2 cpm
d = d[k,]
dim(d) #21,707 rows

cols = as.numeric(d$samples$group)
plotMDS(d, col=cols)

par(mfrow=c(2,2))
plotMDS(d, col=cols, main='500 / 1LFC')
plotMDS(d, col=cols, method='bcv', main='500 / BCV')
plotMDS(d, col=cols, top=2000, main=' 2000 / 1LFC')
plotMDS(d, col=cols, top=2000, method='bcv', main=' 2000 / BCV')

mm = model.matrix(~ -1 + grp)
d = estimateGLMCommonDisp(d, mm)
d = estimateGLMTrendedDisp(d, mm)
d = estimateGLMTagwiseDisp(d, mm)
par(mfrow=c(1,1))
plotBCV(d) # plot biological coefficient of variation

plotMeanVar(d, show.raw=TRUE, show.tagwise=TRUE, show.binned=TRUE)
# used less frequently than mean/dispersion plot

# MA plot in lecture...

## fit model
f = glmFit(d, mm)
con = makeContrasts("DE-ES"=grpDE-grpES, levels=colnames(mm))
con

lrt = glmLRT(f, contrast=con)
topTags(lrt, 20)

## spot checks: plot expression for DE genes
cps = cpm(d)
o = order(colnames(counts))
barplot(cps['ENSG00000095596',o], col=cols[o], las=2) #this is a huge difference


## same test without constructing contrasts (parameterization detail)
grp = relevel(grp, ref="ES") # make ES the reference category
mm1 = model.matrix(~grp) # model now has intercept
f1 = glmFit(d, mm1)
lrt1 = glmLRT(f1, coef=2)
topTags(lrt1, 10)


## doing counting!
library(parallel)
detectCores() # this is a nice function

# need a GTF file
# BAM files: be careful about whether data is paired or not
# also options to count multimapping, etc.

## NOT RUN
library(Rsubread)
anno = dir(, ".gtf$")
anno
f = dir(, ".bam$") # not on the instance, so this doesn't work
f
fcs = featureCounts(f[2], annot.ext=anno, nthreads=2, isGTFAnnotationFile=TRUE)
fcpaired = featureCounts(f[1], annot.ext=anno, nthreads=2, isGTFAnnotationFile=TRUE, isPairedEnd=TRUE)

##
setwd('/data/RNA_SEQ_edgeR_voom_featureCounts')
load('ds3.Rdata')
head(gid)
head(md) # the metadata
dx = DGEList(ecounts, group=md$treatment)
dx = calcNormFactors(dx)
xmm = model.matrix(~libtype + treatment, data=md)
plotMDS(dx)

# differential splicing
vx = voom(dx, xmm, plot=TRUE)
fx = lmFit(vx, xmm) #heteroskedastic regression at exon level
# TODO find out about this heteroskedastic business
# I mean, it's not heteroskedastic variance within an exon, ya?
ex = diffSplice(fx, geneid=gid, exonid=eid)
topSplice(ex)


par(mfrow=c(1,2))
plotSplice(ex, geneid='FBgn0005630')
plotSplice(ex, geneid='FBgn0034072') # this is sweet.

## rbioc_for_everyone.R
## Martin Morgan's R/Bioconductor for everyone!
## BioC 2014, 7/31/2014

source('http://bioconductor.org/biocLite.R')

# topics discussed:
# everything is a vector
# R makes it easy to conceputalize things from the inside out
# "tilde" == "as a function of"

# exercise!
library(pasillaBamSubset)
library(GenomicAlignments)
library(ShortRead)

flag = scanBamFlag(isMinusStrand=FALSE)
param = ScanBamParam(what=c('seq', 'qual'), flag=flag)

fl = untreated1_chr4() #it knows, I'm running this on my EC2 Bioc instance
res = readGAlignments(fl, param=param)
tail(sort(table(cigar(res))))
# 91810 reads aligned with cigar 75M, no insertions or deletions.
# 337 reads had 52 Matches, a 101-bp gap, and 23 more Matches

abc = alphabetByCycle(mcols(res)$seq) ###OMG useful
# this is a matrix
abc = abc[1:4,] # M, R, etc. are not relevant nucleotides for this example
matplot(t(abc)) #ohh that's nice
matplot(t(abc), type='l', lwd=2, lty=1)
legend('topright', legend=rownames(abc), col=1:4, lty=1, lwd=2)

# look at the data
# understand what you see
# act appropriately

# more examples/exercises available at:
# http://www.bioconductor.org/help/course-materials/2014/BioC2014/RBiocForEveryone-lab.pdf
	## edgeR tutorial
	# http://www.bioconductor.org/help/course-materials/2014/BioC2014/BioC2014_edgeR_voom.html
	library('edgeR')
	rm(list=ls())
	setwd("/data/RNA_SEQ_edgeR_voom_featureCounts/") ## on the amazon machine
	load('ds1.Rdata') # stem cell data!
	setwd('-')

	colnames(counts) # first 2 column names give the cell type
	dim(counts) #30,727 by 27.

	grp = as.factor(substr(colnames(counts), 1, 2))
	table(grp)

	## simple explorations
	o = order(grp)
	pairs(log2(counts[,o[1:7]]), pch=".", lower.panel=NULL)
	# expect that cells of same type will be most similar to each other

	# mds plots
	d = DGEList(counts=counts, group=grp)
	d = calcNormFactors(d)
	head(d$samples)
	cps = cpm(d)
	k = rowSums(cps>=1)>2 #filter to genes with expression > 2 cpm
	d = d[k,]
	dim(d) #21,707 rows

	cols = as.numeric(d$samples$group)
	plotMDS(d, col=cols)

	par(mfrow=c(2,2))
	plotMDS(d, col=cols, main='500 / 1LFC')
	plotMDS(d, col=cols, method='bcv', main='500 / BCV')
	plotMDS(d, col=cols, top=2000, main=' 2000 / 1LFC')
	plotMDS(d, col=cols, top=2000, method='bcv', main=' 2000 / BCV')

	mm = model.matrix(~ -1 + grp)
	d = estimateGLMCommonDisp(d, mm)
	d = estimateGLMTrendedDisp(d, mm)
	d = estimateGLMTagwiseDisp(d, mm)
	par(mfrow=c(1,1))
	plotBCV(d) # plot biological coefficient of variation

	plotMeanVar(d, show.raw=TRUE, show.tagwise=TRUE, show.binned=TRUE)
	# used less frequently than mean/dispersion plot

	# MA plot in lecture...

	## fit model
	f = glmFit(d, mm)
	con = makeContrasts("DE-ES"=grpDE-grpES, levels=colnames(mm))
	con

	lrt = glmLRT(f, contrast=con)
	topTags(lrt, 20)

	## spot checks: plot expression for DE genes
	cps = cpm(d)
	o = order(colnames(counts))
	barplot(cps['ENSG00000095596',o], col=cols[o], las=2) #this is a huge difference


	## same test without constructing contrasts (parameterization detail)
	grp = relevel(grp, ref="ES") # make ES the reference category
	mm1 = model.matrix(~grp) # model now has intercept
	f1 = glmFit(d, mm1)
	lrt1 = glmLRT(f1, coef=2)
	topTags(lrt1, 10)



	## doing counting!
	library(parallel)
	detectCores() # this is a nice function

	# need a GTF file
	# BAM files: be careful about whether data is paired or not
	# also options to count multimapping, etc.

	## NOT RUN
	library(Rsubread)
	anno = dir(, ".gtf$")
	anno
	f = dir(, ".bam$") # not on the instance, so this doesn't work
	f
	fcs = featureCounts(f[2], annot.ext=anno, nthreads=2, isGTFAnnotationFile=TRUE)
	fcpaired = featureCounts(f[1], annot.ext=anno, nthreads=2, isGTFAnnotationFile=TRUE, isPairedEnd=TRUE)

	##
	setwd('/data/RNA_SEQ_edgeR_voom_featureCounts')
	load('ds3.Rdata')
	head(gid)
	head(md) # the metadata
	dx = DGEList(ecounts, group=md$treatment)
	dx = calcNormFactors(dx)
	xmm = model.matrix(~libtype + treatment, data=md)
	plotMDS(dx)

	# differential splicing
	vx = voom(dx, xmm, plot=TRUE)
	fx = lmFit(vx, xmm) #heteroskedastic regression at exon level
	# TODO find out about this heteroskedastic business
	# I mean, it's not heteroskedastic variance within an exon, ya?
	ex = diffSplice(fx, geneid=gid, exonid=eid)
	topSplice(ex)


	par(mfrow=c(1,2))
	plotSplice(ex, geneid='FBgn0005630')
	plotSplice(ex, geneid='FBgn0034072') # this is sweet.
	## Martin Morgan's R/Bioconductor for everyone!
	## BioC 2014, 7/31/2014

	source('http://bioconductor.org/biocLite.R')

	# topics discussed:
	# everything is a vector
	# R makes it easy to conceputalize things from the inside out
	# "tilde" == "as a function of"

	# exercise!
	library(pasillaBamSubset)
	library(GenomicAlignments)
	library(ShortRead)

	flag = scanBamFlag(isMinusStrand=FALSE)
	param = ScanBamParam(what=c('seq', 'qual'), flag=flag)

	fl = untreated1_chr4() #it knows, I'm running this on my EC2 Bioc instance
	res = readGAlignments(fl, param=param)
	tail(sort(table(cigar(res))))
	# 91810 reads aligned with cigar 75M, no insertions or deletions.
	# 337 reads had 52 Matches, a 101-bp gap, and 23 more Matches

	abc = alphabetByCycle(mcols(res)$seq) ###OMG useful
	# this is a matrix
	abc = abc[1:4,] # M, R, etc. are not relevant nucleotides for this example
	matplot(t(abc)) #ohh that's nice
	matplot(t(abc), type='l', lwd=2, lty=1)
	legend('topright', legend=rownames(abc), col=1:4, lty=1, lwd=2)

	# look at the data
	# understand what you see
	# act appropriately

	# more examples/exercises available at:
	# http://www.bioconductor.org/help/course-materials/2014/BioC2014/RBiocForEveryone-lab.pdf