Created
August 13, 2020 05:30
-
-
Save chrisamiller/0f584d184762ce35f1ba4187937803ed to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(edgeR); | |
library(gplots); | |
library(RColorBrewer); | |
library(tximport); | |
# takes three arguments - config file, transcript to gene table, and output directory | |
# config file specifies the samples to import, groupings, and paths to abundance.tsv files from kallisto | |
# groups should be either 0 or 1 | |
# header: sample \t group \t /path/to/abundance.tsv | |
##--------------------------------------------------------------------------------- | |
## Functions | |
## make a qc plot | |
mdsPlot <- function(y,file="mdsplot.pdf"){ | |
pdf(file=file,width=6,height=6,useDingbats=FALSE); | |
plotMDS(y, top=1000, method="bcv", cex=0.5); | |
##legend("bottomleft",as.character(unique(y$samples$group)),col=1:3,pch=20); | |
dev <- dev.off(); | |
} | |
## hierarchical clustering/heatmap | |
doClustering <- function(data, type="top", num=1000, outviz="heatmap.pdf", outfile="clustered.tsv"){ | |
rmean = rowMeans(data); # row mean | |
rsd = apply(data,1,sd); # row SD | |
## do row-normalization: | |
data.z = (data-rmean)/rsd; | |
data = data.z; | |
measurements = length(data[1,]); | |
## turn data into numerical matrix: | |
data.mat <- data.matrix(data); | |
## create heatmap | |
col.sep=seq(0,length(data.mat[1,]),1); | |
row.sep=seq(0,length(data.mat[,1]),1); | |
##red-blue: | |
colors = unique(c(seq(-3,-0.5,length=27),seq(-0.5,0.5,length=25),seq(0.5,3,length=26))); | |
colfunc <- colorRampPalette(rev(c('#ca0020','#f4a582','#f7f7f7','#92c5de','#0571b0'))) | |
if(type=="all"){ | |
## huge heatmap of everything - mem intensive | |
pdf(outviz,width=8,height=10,useDingbats=FALSE); | |
#alternately: distfun=function(x) as.dist(1-abs(cor(t(x)))) | |
#or distfun=as.dist(1-cor(t(x))) | |
heat.out <- heatmap.2(data.mat, lhei=c(1,4), #lwid=c(3,1), | |
key=TRUE, key.xlab="Count",trace="none",density.info="none",dendrogram="col",Rowv=TRUE, | |
Colv=TRUE,col=colfunc(75),breaks=colors,symbreaks=FALSE,labCol=names(data.mat),labRow="", | |
cexCol=0.7,scale="none", margins=c(12,2)); | |
dev.off(); | |
ordered.data <- data.mat[heat.out$rowInd,heat.out$colInd]; # z-transformed data as ordered in the heatmap | |
write.table(file=outfile, ordered.data, sep="\t", quote=FALSE,col.names=NA); | |
return(); | |
} | |
if(type=="top"){ | |
##heatmap of top N differential genes | |
rsd.topN = sort(rsd, decreasing=TRUE)[num]; | |
data.topN = data.z[which(rsd >= rsd.topN),] | |
data.mat <- data.matrix(data.topN); | |
pdf(outviz,width=8,height=10,useDingbats=FALSE); | |
heat.out <- heatmap.2(data.mat, lhei=c(1,4), #lwid=c(3,10), | |
key=TRUE, key.xlab="Count",trace="none",density.info="none",dendrogram="col", | |
Rowv=TRUE,Colv=TRUE,col=colfunc(75), breaks=colors, symbreaks=FALSE, | |
labCol=names(data.mat),labRow="", cexCol=0.7, scale="none", margins=c(12,2)); | |
## was sepwidth=0.01,0.01, margins=10,10 for genewise plot | |
dev.off(); | |
ordered.data <- data.mat[heat.out$rowInd,heat.out$colInd]; # z-transformed data as ordered in the heatmap | |
write.table(file=outfile, ordered.data, sep="\t", quote=FALSE,col.names=NA); | |
return(); | |
} | |
## ##do the same thing with euclidean distance | |
## pdf("heatmap.top1000.euclidian.pdf",width=8,height=24,useDingbats=FALSE); | |
## heat.out <- heatmap.2(data.mat, lhei=c(0.5,9.5), lwid=c(3,10), key=TRUE, key.xlab="Count",trace="none", | |
## density.info="none", dendrogram="col", Rowv=TRUE, Colv=TRUE, col=colfunc(75), | |
## breaks=colors, symbreaks=FALSE, labCol=names(data.mat), labRow="", cexCol=0.7, | |
## scale="none", margins=c(15,5));# was sepwidth=0.01,0.01, margins=10,10 for genewise plot | |
## ordered.data <- data.mat[heat.out$rowInd,heat.out$colInd]; # z-transformed data as ordered in the heatmap | |
## write.table(file="out.lengthScaledTPM.counts.normalized.ordered1.tsv", ordered.data, sep="\t", quote=FALSE); | |
## dev.off(); | |
stop(paste0("unknown type ",type," for heatmap")) | |
} | |
##identify diff expr genes | |
edgeRclassic <- function(comparison, y, counts.cpm, outdir) { # "comparison" is just a label/name given to the comparison, y and counts.cpm are defined above | |
print("below warning about 'Design matrix not provided' is expected") | |
y.classic <- estimateDisp(y); # this performs dispersion (common and tagwise (ie gene wise)) estimation in the classic version of EdgeR | |
et <- exactTest(y.classic); # recall that y.classic already has the group information | |
total = length(et@.Data[[1]]$PValue); | |
results <- topTags(et, adjust.method="BH", sort.by="PValue", n=total); # table of the top differntially-expressed "tags" ie "genes" | |
results.table <- results@.Data[[1]]; | |
results.pos <- results.table[which(results.table$logFC >= 0 & results.table$FDR <= 0.05),]; | |
results.neg <- results.table[which(results.table$logFC < 0 & results.table$FDR<= 0.05),]; | |
data.pos <- counts.cpm[rownames(results.pos),] # note that this uses x.0.wt.cpm | |
data.neg <- counts.cpm[rownames(results.neg),] | |
## write.table(results@.Data[[1]],file=sprintf("%s/DEGs.%s.tsv", outdir, comparison), quote=FALSE,sep='\t',row.names=TRUE, col.names=NA); | |
## write.table(results.pos,file=sprintf("%s/DEGs.%s.pos.tsv", outdir, comparison), quote=FALSE,sep='\t',row.names=TRUE, col.names=NA); | |
## write.table(results.neg,file=sprintf("%s/DEGs.%s.neg.tsv", outdir, comparison), quote=FALSE,sep='\t',row.names=TRUE, col.names=NA); | |
## write.table(data.pos,file=sprintf("%s/DEGs.%s.pos.CPM.tsv", outdir, comparison), quote=FALSE,sep='\t',row.names=TRUE, col.names=NA); | |
## write.table(data.neg,file=sprintf("%s/DEGs.%s.neg.CPM.tsv", outdir, comparison), quote=FALSE,sep='\t',row.names=TRUE, col.names=NA); | |
result = list(table=results, pos=data.pos, neg=data.neg); | |
return(result); | |
} | |
#make groups vector | |
getGroups <- function(n,config){ | |
groups=c(); | |
for(i in n){ | |
g = config[config$sample==i,]$group | |
if(is.na(g) | g==""){ | |
stop(paste0("sample name ",i," not found in config file")) | |
} else { | |
groups=c(groups,g) | |
} | |
} | |
return(groups) | |
} | |
#create matrix from the given files | |
createMatrix <- function(config, conv.table){ | |
## list of kallisto output files to be read as input | |
k=config$path | |
names(k) = config$sample | |
# transcript-gene conversions for ENSEMBL annotations | |
tx2gene=read.table(conv.table, sep='\t',header=TRUE,quote='',check.names=FALSE, stringsAsFactors=F); | |
writeTables <- function(txi,suffix){ | |
write.table(txi$abundance, file=paste0(outdir,"/raw.abundance.tsv"), sep='\t', quote=FALSE); | |
write.table(txi$counts, file=paste0(outdir,"/raw.counts.tsv"), sep='\t', quote=FALSE); | |
write.table(txi$length, file=paste0(outdir,"/raw.length.tsv"), sep='\t', quote=FALSE); | |
} | |
# length-scaled genewise counts from abundance: | |
txi <- tximport(k, type="kallisto", tx2gene=tx2gene, countsFromAbundance="lengthScaledTPM",ignoreTxVersion=T); | |
print("txi created") | |
writeTables(txi,"lengthScaledTPM") | |
} | |
#take a matrix, add gene names from gene ids | |
addGeneNames <- function(m, conv.table, type="matrix"){ | |
if(type=="matrix"){ | |
m = as.data.frame(m) | |
m$gene_id = row.names(m) | |
row.names(m) = NULL | |
} else { #assume type = 'df' | |
names(m)[which(names(m)=="Row.names")] = "gene_id" | |
} | |
tx2gene=read.table(conv.table, sep='\t',header=TRUE,quote='',check.names=FALSE, stringsAsFactors=F); | |
#tx2gene=unique(tx2gene[,c("ensembl_gene_id","external_gene_name")]) | |
tx2gene=unique(tx2gene[,c(2,3)]) | |
names(tx2gene) = c("ensembl_gene_id","external_gene_name") | |
return(merge(m,tx2gene,by.x="gene_id",by.y="ensembl_gene_id")) | |
} | |
##------------------------------------------------------ | |
args <- commandArgs(trailingOnly = TRUE) | |
if(length(args) < 3){ | |
stop("requires three arguments: config file, transcript to gene table, and output directory") | |
} | |
config=read.table(args[1], header=F, stringsAsFactors=F, sep="\t") | |
names(config) = c("sample","group","path") | |
#create the output directory | |
outdir=args[3]; | |
dir.create(outdir) | |
mat = createMatrix(config,args[2]) | |
#read in the counts | |
print("reading count data") | |
rawcounts <- read.table(mat, sep='\t', quote="", check.names=FALSE,header=TRUE,row.names=1) | |
#subset to only those in the config file | |
y <- DGEList(counts=rawcounts); | |
print("filtering/normalizing...") | |
## filter here for low expression, requiring counts-per-million (CPM) be at least 1 in at least half of the samples, and recalculate library sizes: | |
min.thresh=ceiling(length(names(rawcounts))/2); | |
keep <- rowSums(cpm(y)>1) >= min.thresh; | |
y <- y[keep, , keep.lib.sizes=FALSE]; # forces recalculation of library sizes | |
#normalize the data | |
y <- calcNormFactors(y); | |
counts.cpm <- cpm(y, normalized.lib.sizes=TRUE) | |
write.table(addGeneNames(counts.cpm,args[2]),file=paste0(outdir,"/counts.normalized.tsv"),quote=FALSE, sep='\t',row.names=F); | |
print("creating mdsplot...") | |
##make mds plot for QC | |
mdsPlot(y,file=paste0(outdir,"/mdsplot.pdf")); | |
print("doing hierarchical clustering/creating heatmap...") | |
doClustering(counts.cpm, outviz=paste0(outdir,"/heatmap.top1000.pdf"), outfile=paste0(outdir,"/heatmap.clustered.top1000.tsv"), num=1000, type="top"); | |
doClustering(counts.cpm, outviz=paste0(outdir,"/heatmap.all.pdf"), outfile=paste0(outdir,"/heatmap.clustered.all.tsv"), type="all"); | |
##now, before we calculate diff expr, let's subset to just the samples we actually want to compare | |
counts.cpm.comp = counts.cpm[,colnames(counts.cpm) %in% config$sample] | |
y2 <- DGEList(counts=counts.cpm.comp, group=getGroups(colnames(counts.cpm.comp),config)); | |
print("calculating diff expression...") | |
results = edgeRclassic("comp1", y2, counts.cpm.comp, outdir) | |
z = merge(counts.cpm,results$table@.Data[[1]],by=0) | |
write.table(addGeneNames(z,args[2],type="df"),file=paste0(outdir,"/counts.normalized.pvals.tsv"),quote=FALSE,sep='\t',row.names=FALSE); | |
#make a heatmap of just the differentially expressed genes across all samples | |
#grab the DE genes | |
results.table=results$table@.Data[[1]]; | |
results.sig <- results.table[which(results.table$FDR <= 0.05),]; | |
if(length(results.sig[,1]) == 0){ | |
print("No differentially expressed genes detected - execution stopped") | |
system(paste0("touch ",outdir,"/counts.normalized.pvals.degs.tsv")) | |
stop("No differentially expressed genes detected") | |
} | |
counts.cpm.de <- counts.cpm[which(rownames(counts.cpm) %in% rownames(results.sig)),] # note that this uses x.0.wt.cpm | |
z = merge(counts.cpm.de,results.sig,by=0) | |
write.table(addGeneNames(z,args[2],type="df"),file=paste0(outdir,"/counts.normalized.pvals.degs.tsv"),quote=FALSE,sep='\t',row.names=FALSE); | |
print("doing hierarchical clustering/creating heatmap...") | |
doClustering(counts.cpm.de, outviz=paste0(outdir,"/heatmap.degenes.pdf"), outfile=paste0(outdir,"/heatmap.clustered.de.tsv"), type="all"); |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment