Skip to content

Instantly share code, notes, and snippets.

@alyssafrazee
Last active August 29, 2015 14:04
Show Gist options
  • Save alyssafrazee/4b8c1a45e3d2983a889d to your computer and use it in GitHub Desktop.
Save alyssafrazee/4b8c1a45e3d2983a889d to your computer and use it in GitHub Desktop.
notes from BioC 2014

notes from BioC 2014

packages and the git-svn bridge

instructions on website I didn't get my invite to get an SVN account - mystery.

  • add bioc-sync as a collaborator on your git repo
  • then, webhooks & services --> add webhook. (there's a URL for this on the bioc instruction page.)
  • git-svn bridge doesn't really do merging very well: it's "winner-take all." you have to pick whether git or svn wins on merge conflicts.
  • only deals with master branch of git repo
  • super useful for dealing wtih pull requests and issues

pbdR (programming with big data in R)

cluster --> "distributing"
multicore machine --> "multithreading"
GPU/MIC/co-processor --> "offloading"
pdbR: connecting R to the HPC libraries that have been developed. r-pbd.org

perceptions: "R? Isn't that slow?" (HPC people). "HPC? Isn't that hard?" (R people).

GenomicRanges

mygrl = structure(bg)$trans
unspliced = range(mygrl)
introns = psetdiff(unspliced, mygrl) #**i think** (we are on the next slide now)
# potentially easier than what I did with ballgownrsem

clever way to get junctions from read alignments!

# read in the aligned READS as a GRangesList, where splices and such are represented as different ranges in an element
x = psetdiff(ranges(mygrl), grl)
unique(x) #gives the introns

AWS and Bioconductor ☁️ ☁️ ☁️

You can use Bioconductor in the cloud!

## edgeR tutorial
# http://www.bioconductor.org/help/course-materials/2014/BioC2014/BioC2014_edgeR_voom.html
library('edgeR')
rm(list=ls())
setwd("/data/RNA_SEQ_edgeR_voom_featureCounts/") ## on the amazon machine
load('ds1.Rdata') # stem cell data!
setwd('-')
colnames(counts) # first 2 column names give the cell type
dim(counts) #30,727 by 27.
grp = as.factor(substr(colnames(counts), 1, 2))
table(grp)
## simple explorations
o = order(grp)
pairs(log2(counts[,o[1:7]]), pch=".", lower.panel=NULL)
# expect that cells of same type will be most similar to each other
# mds plots
d = DGEList(counts=counts, group=grp)
d = calcNormFactors(d)
head(d$samples)
cps = cpm(d)
k = rowSums(cps>=1)>2 #filter to genes with expression > 2 cpm
d = d[k,]
dim(d) #21,707 rows
cols = as.numeric(d$samples$group)
plotMDS(d, col=cols)
par(mfrow=c(2,2))
plotMDS(d, col=cols, main='500 / 1LFC')
plotMDS(d, col=cols, method='bcv', main='500 / BCV')
plotMDS(d, col=cols, top=2000, main=' 2000 / 1LFC')
plotMDS(d, col=cols, top=2000, method='bcv', main=' 2000 / BCV')
mm = model.matrix(~ -1 + grp)
d = estimateGLMCommonDisp(d, mm)
d = estimateGLMTrendedDisp(d, mm)
d = estimateGLMTagwiseDisp(d, mm)
par(mfrow=c(1,1))
plotBCV(d) # plot biological coefficient of variation
plotMeanVar(d, show.raw=TRUE, show.tagwise=TRUE, show.binned=TRUE)
# used less frequently than mean/dispersion plot
# MA plot in lecture...
## fit model
f = glmFit(d, mm)
con = makeContrasts("DE-ES"=grpDE-grpES, levels=colnames(mm))
con
lrt = glmLRT(f, contrast=con)
topTags(lrt, 20)
## spot checks: plot expression for DE genes
cps = cpm(d)
o = order(colnames(counts))
barplot(cps['ENSG00000095596',o], col=cols[o], las=2) #this is a huge difference
## same test without constructing contrasts (parameterization detail)
grp = relevel(grp, ref="ES") # make ES the reference category
mm1 = model.matrix(~grp) # model now has intercept
f1 = glmFit(d, mm1)
lrt1 = glmLRT(f1, coef=2)
topTags(lrt1, 10)
## doing counting!
library(parallel)
detectCores() # this is a nice function
# need a GTF file
# BAM files: be careful about whether data is paired or not
# also options to count multimapping, etc.
## NOT RUN
library(Rsubread)
anno = dir(, ".gtf$")
anno
f = dir(, ".bam$") # not on the instance, so this doesn't work
f
fcs = featureCounts(f[2], annot.ext=anno, nthreads=2, isGTFAnnotationFile=TRUE)
fcpaired = featureCounts(f[1], annot.ext=anno, nthreads=2, isGTFAnnotationFile=TRUE, isPairedEnd=TRUE)
##
setwd('/data/RNA_SEQ_edgeR_voom_featureCounts')
load('ds3.Rdata')
head(gid)
head(md) # the metadata
dx = DGEList(ecounts, group=md$treatment)
dx = calcNormFactors(dx)
xmm = model.matrix(~libtype + treatment, data=md)
plotMDS(dx)
# differential splicing
vx = voom(dx, xmm, plot=TRUE)
fx = lmFit(vx, xmm) #heteroskedastic regression at exon level
# TODO find out about this heteroskedastic business
# I mean, it's not heteroskedastic variance within an exon, ya?
ex = diffSplice(fx, geneid=gid, exonid=eid)
topSplice(ex)
par(mfrow=c(1,2))
plotSplice(ex, geneid='FBgn0005630')
plotSplice(ex, geneid='FBgn0034072') # this is sweet.
## Martin Morgan's R/Bioconductor for everyone!
## BioC 2014, 7/31/2014
source('http://bioconductor.org/biocLite.R')
# topics discussed:
# everything is a vector
# R makes it easy to conceputalize things from the inside out
# "tilde" == "as a function of"
# exercise!
library(pasillaBamSubset)
library(GenomicAlignments)
library(ShortRead)
flag = scanBamFlag(isMinusStrand=FALSE)
param = ScanBamParam(what=c('seq', 'qual'), flag=flag)
fl = untreated1_chr4() #it knows, I'm running this on my EC2 Bioc instance
res = readGAlignments(fl, param=param)
tail(sort(table(cigar(res))))
# 91810 reads aligned with cigar 75M, no insertions or deletions.
# 337 reads had 52 Matches, a 101-bp gap, and 23 more Matches
abc = alphabetByCycle(mcols(res)$seq) ###OMG useful
# this is a matrix
abc = abc[1:4,] # M, R, etc. are not relevant nucleotides for this example
matplot(t(abc)) #ohh that's nice
matplot(t(abc), type='l', lwd=2, lty=1)
legend('topright', legend=rownames(abc), col=1:4, lty=1, lwd=2)
# look at the data
# understand what you see
# act appropriately
# more examples/exercises available at:
# http://www.bioconductor.org/help/course-materials/2014/BioC2014/RBiocForEveryone-lab.pdf
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment