Skip to content

Instantly share code, notes, and snippets.

@chrisamiller
Created August 13, 2020 05:30
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save chrisamiller/0f584d184762ce35f1ba4187937803ed to your computer and use it in GitHub Desktop.
Save chrisamiller/0f584d184762ce35f1ba4187937803ed to your computer and use it in GitHub Desktop.
library(edgeR);
library(gplots);
library(RColorBrewer);
library(tximport);
# takes three arguments - config file, transcript to gene table, and output directory
# config file specifies the samples to import, groupings, and paths to abundance.tsv files from kallisto
# groups should be either 0 or 1
# header: sample \t group \t /path/to/abundance.tsv
##---------------------------------------------------------------------------------
## Functions
## make a qc plot
mdsPlot <- function(y,file="mdsplot.pdf"){
pdf(file=file,width=6,height=6,useDingbats=FALSE);
plotMDS(y, top=1000, method="bcv", cex=0.5);
##legend("bottomleft",as.character(unique(y$samples$group)),col=1:3,pch=20);
dev <- dev.off();
}
## hierarchical clustering/heatmap
doClustering <- function(data, type="top", num=1000, outviz="heatmap.pdf", outfile="clustered.tsv"){
rmean = rowMeans(data); # row mean
rsd = apply(data,1,sd); # row SD
## do row-normalization:
data.z = (data-rmean)/rsd;
data = data.z;
measurements = length(data[1,]);
## turn data into numerical matrix:
data.mat <- data.matrix(data);
## create heatmap
col.sep=seq(0,length(data.mat[1,]),1);
row.sep=seq(0,length(data.mat[,1]),1);
##red-blue:
colors = unique(c(seq(-3,-0.5,length=27),seq(-0.5,0.5,length=25),seq(0.5,3,length=26)));
colfunc <- colorRampPalette(rev(c('#ca0020','#f4a582','#f7f7f7','#92c5de','#0571b0')))
if(type=="all"){
## huge heatmap of everything - mem intensive
pdf(outviz,width=8,height=10,useDingbats=FALSE);
#alternately: distfun=function(x) as.dist(1-abs(cor(t(x))))
#or distfun=as.dist(1-cor(t(x)))
heat.out <- heatmap.2(data.mat, lhei=c(1,4), #lwid=c(3,1),
key=TRUE, key.xlab="Count",trace="none",density.info="none",dendrogram="col",Rowv=TRUE,
Colv=TRUE,col=colfunc(75),breaks=colors,symbreaks=FALSE,labCol=names(data.mat),labRow="",
cexCol=0.7,scale="none", margins=c(12,2));
dev.off();
ordered.data <- data.mat[heat.out$rowInd,heat.out$colInd]; # z-transformed data as ordered in the heatmap
write.table(file=outfile, ordered.data, sep="\t", quote=FALSE,col.names=NA);
return();
}
if(type=="top"){
##heatmap of top N differential genes
rsd.topN = sort(rsd, decreasing=TRUE)[num];
data.topN = data.z[which(rsd >= rsd.topN),]
data.mat <- data.matrix(data.topN);
pdf(outviz,width=8,height=10,useDingbats=FALSE);
heat.out <- heatmap.2(data.mat, lhei=c(1,4), #lwid=c(3,10),
key=TRUE, key.xlab="Count",trace="none",density.info="none",dendrogram="col",
Rowv=TRUE,Colv=TRUE,col=colfunc(75), breaks=colors, symbreaks=FALSE,
labCol=names(data.mat),labRow="", cexCol=0.7, scale="none", margins=c(12,2));
## was sepwidth=0.01,0.01, margins=10,10 for genewise plot
dev.off();
ordered.data <- data.mat[heat.out$rowInd,heat.out$colInd]; # z-transformed data as ordered in the heatmap
write.table(file=outfile, ordered.data, sep="\t", quote=FALSE,col.names=NA);
return();
}
## ##do the same thing with euclidean distance
## pdf("heatmap.top1000.euclidian.pdf",width=8,height=24,useDingbats=FALSE);
## heat.out <- heatmap.2(data.mat, lhei=c(0.5,9.5), lwid=c(3,10), key=TRUE, key.xlab="Count",trace="none",
## density.info="none", dendrogram="col", Rowv=TRUE, Colv=TRUE, col=colfunc(75),
## breaks=colors, symbreaks=FALSE, labCol=names(data.mat), labRow="", cexCol=0.7,
## scale="none", margins=c(15,5));# was sepwidth=0.01,0.01, margins=10,10 for genewise plot
## ordered.data <- data.mat[heat.out$rowInd,heat.out$colInd]; # z-transformed data as ordered in the heatmap
## write.table(file="out.lengthScaledTPM.counts.normalized.ordered1.tsv", ordered.data, sep="\t", quote=FALSE);
## dev.off();
stop(paste0("unknown type ",type," for heatmap"))
}
##identify diff expr genes
edgeRclassic <- function(comparison, y, counts.cpm, outdir) { # "comparison" is just a label/name given to the comparison, y and counts.cpm are defined above
print("below warning about 'Design matrix not provided' is expected")
y.classic <- estimateDisp(y); # this performs dispersion (common and tagwise (ie gene wise)) estimation in the classic version of EdgeR
et <- exactTest(y.classic); # recall that y.classic already has the group information
total = length(et@.Data[[1]]$PValue);
results <- topTags(et, adjust.method="BH", sort.by="PValue", n=total); # table of the top differntially-expressed "tags" ie "genes"
results.table <- results@.Data[[1]];
results.pos <- results.table[which(results.table$logFC >= 0 & results.table$FDR <= 0.05),];
results.neg <- results.table[which(results.table$logFC < 0 & results.table$FDR<= 0.05),];
data.pos <- counts.cpm[rownames(results.pos),] # note that this uses x.0.wt.cpm
data.neg <- counts.cpm[rownames(results.neg),]
## write.table(results@.Data[[1]],file=sprintf("%s/DEGs.%s.tsv", outdir, comparison), quote=FALSE,sep='\t',row.names=TRUE, col.names=NA);
## write.table(results.pos,file=sprintf("%s/DEGs.%s.pos.tsv", outdir, comparison), quote=FALSE,sep='\t',row.names=TRUE, col.names=NA);
## write.table(results.neg,file=sprintf("%s/DEGs.%s.neg.tsv", outdir, comparison), quote=FALSE,sep='\t',row.names=TRUE, col.names=NA);
## write.table(data.pos,file=sprintf("%s/DEGs.%s.pos.CPM.tsv", outdir, comparison), quote=FALSE,sep='\t',row.names=TRUE, col.names=NA);
## write.table(data.neg,file=sprintf("%s/DEGs.%s.neg.CPM.tsv", outdir, comparison), quote=FALSE,sep='\t',row.names=TRUE, col.names=NA);
result = list(table=results, pos=data.pos, neg=data.neg);
return(result);
}
#make groups vector
getGroups <- function(n,config){
groups=c();
for(i in n){
g = config[config$sample==i,]$group
if(is.na(g) | g==""){
stop(paste0("sample name ",i," not found in config file"))
} else {
groups=c(groups,g)
}
}
return(groups)
}
#create matrix from the given files
createMatrix <- function(config, conv.table){
## list of kallisto output files to be read as input
k=config$path
names(k) = config$sample
# transcript-gene conversions for ENSEMBL annotations
tx2gene=read.table(conv.table, sep='\t',header=TRUE,quote='',check.names=FALSE, stringsAsFactors=F);
writeTables <- function(txi,suffix){
write.table(txi$abundance, file=paste0(outdir,"/raw.abundance.tsv"), sep='\t', quote=FALSE);
write.table(txi$counts, file=paste0(outdir,"/raw.counts.tsv"), sep='\t', quote=FALSE);
write.table(txi$length, file=paste0(outdir,"/raw.length.tsv"), sep='\t', quote=FALSE);
}
# length-scaled genewise counts from abundance:
txi <- tximport(k, type="kallisto", tx2gene=tx2gene, countsFromAbundance="lengthScaledTPM",ignoreTxVersion=T);
print("txi created")
writeTables(txi,"lengthScaledTPM")
}
#take a matrix, add gene names from gene ids
addGeneNames <- function(m, conv.table, type="matrix"){
if(type=="matrix"){
m = as.data.frame(m)
m$gene_id = row.names(m)
row.names(m) = NULL
} else { #assume type = 'df'
names(m)[which(names(m)=="Row.names")] = "gene_id"
}
tx2gene=read.table(conv.table, sep='\t',header=TRUE,quote='',check.names=FALSE, stringsAsFactors=F);
#tx2gene=unique(tx2gene[,c("ensembl_gene_id","external_gene_name")])
tx2gene=unique(tx2gene[,c(2,3)])
names(tx2gene) = c("ensembl_gene_id","external_gene_name")
return(merge(m,tx2gene,by.x="gene_id",by.y="ensembl_gene_id"))
}
##------------------------------------------------------
args <- commandArgs(trailingOnly = TRUE)
if(length(args) < 3){
stop("requires three arguments: config file, transcript to gene table, and output directory")
}
config=read.table(args[1], header=F, stringsAsFactors=F, sep="\t")
names(config) = c("sample","group","path")
#create the output directory
outdir=args[3];
dir.create(outdir)
mat = createMatrix(config,args[2])
#read in the counts
print("reading count data")
rawcounts <- read.table(mat, sep='\t', quote="", check.names=FALSE,header=TRUE,row.names=1)
#subset to only those in the config file
y <- DGEList(counts=rawcounts);
print("filtering/normalizing...")
## filter here for low expression, requiring counts-per-million (CPM) be at least 1 in at least half of the samples, and recalculate library sizes:
min.thresh=ceiling(length(names(rawcounts))/2);
keep <- rowSums(cpm(y)>1) >= min.thresh;
y <- y[keep, , keep.lib.sizes=FALSE]; # forces recalculation of library sizes
#normalize the data
y <- calcNormFactors(y);
counts.cpm <- cpm(y, normalized.lib.sizes=TRUE)
write.table(addGeneNames(counts.cpm,args[2]),file=paste0(outdir,"/counts.normalized.tsv"),quote=FALSE, sep='\t',row.names=F);
print("creating mdsplot...")
##make mds plot for QC
mdsPlot(y,file=paste0(outdir,"/mdsplot.pdf"));
print("doing hierarchical clustering/creating heatmap...")
doClustering(counts.cpm, outviz=paste0(outdir,"/heatmap.top1000.pdf"), outfile=paste0(outdir,"/heatmap.clustered.top1000.tsv"), num=1000, type="top");
doClustering(counts.cpm, outviz=paste0(outdir,"/heatmap.all.pdf"), outfile=paste0(outdir,"/heatmap.clustered.all.tsv"), type="all");
##now, before we calculate diff expr, let's subset to just the samples we actually want to compare
counts.cpm.comp = counts.cpm[,colnames(counts.cpm) %in% config$sample]
y2 <- DGEList(counts=counts.cpm.comp, group=getGroups(colnames(counts.cpm.comp),config));
print("calculating diff expression...")
results = edgeRclassic("comp1", y2, counts.cpm.comp, outdir)
z = merge(counts.cpm,results$table@.Data[[1]],by=0)
write.table(addGeneNames(z,args[2],type="df"),file=paste0(outdir,"/counts.normalized.pvals.tsv"),quote=FALSE,sep='\t',row.names=FALSE);
#make a heatmap of just the differentially expressed genes across all samples
#grab the DE genes
results.table=results$table@.Data[[1]];
results.sig <- results.table[which(results.table$FDR <= 0.05),];
if(length(results.sig[,1]) == 0){
print("No differentially expressed genes detected - execution stopped")
system(paste0("touch ",outdir,"/counts.normalized.pvals.degs.tsv"))
stop("No differentially expressed genes detected")
}
counts.cpm.de <- counts.cpm[which(rownames(counts.cpm) %in% rownames(results.sig)),] # note that this uses x.0.wt.cpm
z = merge(counts.cpm.de,results.sig,by=0)
write.table(addGeneNames(z,args[2],type="df"),file=paste0(outdir,"/counts.normalized.pvals.degs.tsv"),quote=FALSE,sep='\t',row.names=FALSE);
print("doing hierarchical clustering/creating heatmap...")
doClustering(counts.cpm.de, outviz=paste0(outdir,"/heatmap.degenes.pdf"), outfile=paste0(outdir,"/heatmap.clustered.de.tsv"), type="all");
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment