chrisamiller/kallisto_to_degs.R

## kallisto_to_degs.R
library(edgeR);
library(gplots);
library(RColorBrewer);
library(tximport);

# takes three arguments - config file, transcript to gene table, and output directory

# config file specifies the samples to import, groupings, and paths to abundance.tsv files from kallisto
# groups should be either 0 or 1
# header: sample \t group \t /path/to/abundance.tsv


##---------------------------------------------------------------------------------
## Functions

## make a qc plot
mdsPlot <- function(y,file="mdsplot.pdf"){
  pdf(file=file,width=6,height=6,useDingbats=FALSE);
  plotMDS(y, top=1000, method="bcv", cex=0.5);
  ##legend("bottomleft",as.character(unique(y$samples$group)),col=1:3,pch=20);
  dev <- dev.off();
}

## hierarchical clustering/heatmap
doClustering <- function(data, type="top", num=1000, outviz="heatmap.pdf", outfile="clustered.tsv"){
  rmean = rowMeans(data); # row mean
  rsd = apply(data,1,sd); # row SD

  ## do row-normalization:
  data.z = (data-rmean)/rsd;
  data = data.z;

  measurements = length(data[1,]);

  ## turn data into numerical matrix:
  data.mat <- data.matrix(data);

  ## create heatmap
  col.sep=seq(0,length(data.mat[1,]),1);
  row.sep=seq(0,length(data.mat[,1]),1);

  ##red-blue:
  colors = unique(c(seq(-3,-0.5,length=27),seq(-0.5,0.5,length=25),seq(0.5,3,length=26)));
  colfunc <- colorRampPalette(rev(c('#ca0020','#f4a582','#f7f7f7','#92c5de','#0571b0')))
  if(type=="all"){
    ## huge heatmap of everything  - mem intensive
    pdf(outviz,width=8,height=10,useDingbats=FALSE);
    #alternately: distfun=function(x) as.dist(1-abs(cor(t(x))))
    #or           distfun=as.dist(1-cor(t(x)))
    heat.out <- heatmap.2(data.mat, lhei=c(1,4), #lwid=c(3,1),
              key=TRUE, key.xlab="Count",trace="none",density.info="none",dendrogram="col",Rowv=TRUE,
              Colv=TRUE,col=colfunc(75),breaks=colors,symbreaks=FALSE,labCol=names(data.mat),labRow="",
              cexCol=0.7,scale="none", margins=c(12,2));
    dev.off();
    ordered.data <- data.mat[heat.out$rowInd,heat.out$colInd]; # z-transformed data as ordered in the heatmap
    write.table(file=outfile, ordered.data, sep="\t", quote=FALSE,col.names=NA);
    return();
  }

  if(type=="top"){
    ##heatmap of top N differential genes
    rsd.topN = sort(rsd, decreasing=TRUE)[num];
    data.topN = data.z[which(rsd >= rsd.topN),]
    data.mat <- data.matrix(data.topN);

    pdf(outviz,width=8,height=10,useDingbats=FALSE);

    heat.out <- heatmap.2(data.mat, lhei=c(1,4), #lwid=c(3,10),
                          key=TRUE, key.xlab="Count",trace="none",density.info="none",dendrogram="col",
                          Rowv=TRUE,Colv=TRUE,col=colfunc(75), breaks=colors, symbreaks=FALSE,
                          labCol=names(data.mat),labRow="", cexCol=0.7, scale="none", margins=c(12,2));
    ##                    was sepwidth=0.01,0.01, margins=10,10 for genewise plot
    dev.off();
    ordered.data <- data.mat[heat.out$rowInd,heat.out$colInd]; # z-transformed data as ordered in the heatmap
    write.table(file=outfile, ordered.data, sep="\t", quote=FALSE,col.names=NA);
    return();
  }

## ##do the same thing with euclidean distance
## pdf("heatmap.top1000.euclidian.pdf",width=8,height=24,useDingbats=FALSE);
## heat.out <- heatmap.2(data.mat, lhei=c(0.5,9.5), lwid=c(3,10), key=TRUE, key.xlab="Count",trace="none",
##                       density.info="none", dendrogram="col", Rowv=TRUE, Colv=TRUE, col=colfunc(75),
##                       breaks=colors, symbreaks=FALSE, labCol=names(data.mat), labRow="", cexCol=0.7,
##                       scale="none", margins=c(15,5));# was sepwidth=0.01,0.01, margins=10,10 for genewise plot

## ordered.data <- data.mat[heat.out$rowInd,heat.out$colInd]; # z-transformed data as ordered in the heatmap
## write.table(file="out.lengthScaledTPM.counts.normalized.ordered1.tsv", ordered.data, sep="\t", quote=FALSE);
## dev.off();

  stop(paste0("unknown type ",type," for heatmap"))
}

##identify diff expr genes
edgeRclassic <- function(comparison, y, counts.cpm, outdir) { # "comparison" is just a label/name given to the comparison, y and counts.cpm are defined above
  print("below warning about 'Design matrix not provided' is expected")
  y.classic <- estimateDisp(y); # this performs dispersion (common and tagwise (ie gene wise)) estimation in the classic version of EdgeR
  et <- exactTest(y.classic); # recall that y.classic already has the group information
  total = length(et@.Data[[1]]$PValue);
  results <- topTags(et, adjust.method="BH", sort.by="PValue", n=total); # table of the top differntially-expressed "tags" ie "genes"
  results.table <- results@.Data[[1]];
  results.pos <- results.table[which(results.table$logFC >= 0 & results.table$FDR <= 0.05),];
  results.neg <- results.table[which(results.table$logFC < 0 & results.table$FDR<= 0.05),];
  data.pos <- counts.cpm[rownames(results.pos),] # note that this uses x.0.wt.cpm
  data.neg <- counts.cpm[rownames(results.neg),]
  ## write.table(results@.Data[[1]],file=sprintf("%s/DEGs.%s.tsv", outdir, comparison), quote=FALSE,sep='\t',row.names=TRUE, col.names=NA);
  ## write.table(results.pos,file=sprintf("%s/DEGs.%s.pos.tsv", outdir, comparison), quote=FALSE,sep='\t',row.names=TRUE, col.names=NA);
  ## write.table(results.neg,file=sprintf("%s/DEGs.%s.neg.tsv", outdir, comparison), quote=FALSE,sep='\t',row.names=TRUE, col.names=NA);
  ## write.table(data.pos,file=sprintf("%s/DEGs.%s.pos.CPM.tsv", outdir, comparison), quote=FALSE,sep='\t',row.names=TRUE, col.names=NA);
  ## write.table(data.neg,file=sprintf("%s/DEGs.%s.neg.CPM.tsv", outdir, comparison), quote=FALSE,sep='\t',row.names=TRUE, col.names=NA);

  result = list(table=results, pos=data.pos, neg=data.neg);
  return(result);
}

#make groups vector
getGroups <- function(n,config){
  groups=c();
  for(i in n){
    g = config[config$sample==i,]$group
    if(is.na(g) | g==""){
      stop(paste0("sample name ",i," not found in config file"))
    } else {
      groups=c(groups,g)
    }
  }
  return(groups)
}

#create matrix from the given files
createMatrix <- function(config, conv.table){
  ## list of kallisto output files to be read as input
  k=config$path
  names(k) = config$sample

  # transcript-gene conversions for ENSEMBL annotations
  tx2gene=read.table(conv.table, sep='\t',header=TRUE,quote='',check.names=FALSE, stringsAsFactors=F);

  writeTables <- function(txi,suffix){
    write.table(txi$abundance, file=paste0(outdir,"/raw.abundance.tsv"), sep='\t', quote=FALSE);
    write.table(txi$counts, file=paste0(outdir,"/raw.counts.tsv"), sep='\t', quote=FALSE);
    write.table(txi$length, file=paste0(outdir,"/raw.length.tsv"), sep='\t', quote=FALSE);
  }

  # length-scaled genewise counts from abundance:
  txi <- tximport(k, type="kallisto", tx2gene=tx2gene, countsFromAbundance="lengthScaledTPM",ignoreTxVersion=T);
  print("txi created")
  writeTables(txi,"lengthScaledTPM")
}

#take a matrix, add gene names from gene ids
addGeneNames <- function(m, conv.table, type="matrix"){
  if(type=="matrix"){
    m = as.data.frame(m)
    m$gene_id = row.names(m)
    row.names(m) = NULL
  } else { #assume type = 'df'
    names(m)[which(names(m)=="Row.names")] = "gene_id"
  }


  tx2gene=read.table(conv.table, sep='\t',header=TRUE,quote='',check.names=FALSE, stringsAsFactors=F);
  #tx2gene=unique(tx2gene[,c("ensembl_gene_id","external_gene_name")])
  tx2gene=unique(tx2gene[,c(2,3)])
  names(tx2gene) = c("ensembl_gene_id","external_gene_name")

  return(merge(m,tx2gene,by.x="gene_id",by.y="ensembl_gene_id"))
}

##------------------------------------------------------
args <- commandArgs(trailingOnly = TRUE)
if(length(args) < 3){
  stop("requires three arguments: config file, transcript to gene table, and output directory")
}
config=read.table(args[1], header=F, stringsAsFactors=F, sep="\t")
names(config) = c("sample","group","path")

#create the output directory
outdir=args[3];
dir.create(outdir)

mat = createMatrix(config,args[2])

#read in the counts
print("reading count data")
rawcounts <- read.table(mat, sep='\t', quote="", check.names=FALSE,header=TRUE,row.names=1)
#subset to only those in the config file
y <- DGEList(counts=rawcounts);

print("filtering/normalizing...")
## filter here for low expression, requiring counts-per-million (CPM) be at least 1 in at least half of the samples, and recalculate library sizes:
min.thresh=ceiling(length(names(rawcounts))/2);
keep <- rowSums(cpm(y)>1) >= min.thresh;
y <- y[keep, , keep.lib.sizes=FALSE]; # forces recalculation of library sizes

#normalize the data
y <- calcNormFactors(y);
counts.cpm <- cpm(y, normalized.lib.sizes=TRUE)

write.table(addGeneNames(counts.cpm,args[2]),file=paste0(outdir,"/counts.normalized.tsv"),quote=FALSE, sep='\t',row.names=F);

print("creating mdsplot...")
##make mds plot for QC
mdsPlot(y,file=paste0(outdir,"/mdsplot.pdf"));

print("doing hierarchical clustering/creating heatmap...")
doClustering(counts.cpm, outviz=paste0(outdir,"/heatmap.top1000.pdf"), outfile=paste0(outdir,"/heatmap.clustered.top1000.tsv"), num=1000, type="top");
doClustering(counts.cpm, outviz=paste0(outdir,"/heatmap.all.pdf"), outfile=paste0(outdir,"/heatmap.clustered.all.tsv"), type="all");

##now, before we calculate diff expr, let's subset to just the samples we actually want to compare
counts.cpm.comp = counts.cpm[,colnames(counts.cpm) %in% config$sample]
y2 <- DGEList(counts=counts.cpm.comp, group=getGroups(colnames(counts.cpm.comp),config));

print("calculating diff expression...")
results = edgeRclassic("comp1", y2, counts.cpm.comp, outdir)
z = merge(counts.cpm,results$table@.Data[[1]],by=0)
write.table(addGeneNames(z,args[2],type="df"),file=paste0(outdir,"/counts.normalized.pvals.tsv"),quote=FALSE,sep='\t',row.names=FALSE);

#make a heatmap of just the differentially expressed genes across all samples
#grab the DE genes
results.table=results$table@.Data[[1]];
results.sig <- results.table[which(results.table$FDR <= 0.05),];
if(length(results.sig[,1]) == 0){
  print("No differentially expressed genes detected - execution stopped")
  system(paste0("touch ",outdir,"/counts.normalized.pvals.degs.tsv"))
  stop("No differentially expressed genes detected")
}

counts.cpm.de <- counts.cpm[which(rownames(counts.cpm) %in% rownames(results.sig)),] # note that this uses x.0.wt.cpm
z = merge(counts.cpm.de,results.sig,by=0)
write.table(addGeneNames(z,args[2],type="df"),file=paste0(outdir,"/counts.normalized.pvals.degs.tsv"),quote=FALSE,sep='\t',row.names=FALSE);

print("doing hierarchical clustering/creating heatmap...")
doClustering(counts.cpm.de, outviz=paste0(outdir,"/heatmap.degenes.pdf"), outfile=paste0(outdir,"/heatmap.clustered.de.tsv"), type="all");
	library(edgeR);
	library(gplots);
	library(RColorBrewer);
	library(tximport);

	# takes three arguments - config file, transcript to gene table, and output directory

	# config file specifies the samples to import, groupings, and paths to abundance.tsv files from kallisto
	# groups should be either 0 or 1
	# header: sample \t group \t /path/to/abundance.tsv


	##---------------------------------------------------------------------------------
	## Functions

	## make a qc plot
	mdsPlot <- function(y,file="mdsplot.pdf"){
	pdf(file=file,width=6,height=6,useDingbats=FALSE);
	plotMDS(y, top=1000, method="bcv", cex=0.5);
	##legend("bottomleft",as.character(unique(y$samples$group)),col=1:3,pch=20);
	dev <- dev.off();
	}

	## hierarchical clustering/heatmap
	doClustering <- function(data, type="top", num=1000, outviz="heatmap.pdf", outfile="clustered.tsv"){
	rmean = rowMeans(data); # row mean
	rsd = apply(data,1,sd); # row SD

	## do row-normalization:
	data.z = (data-rmean)/rsd;
	data = data.z;

	measurements = length(data[1,]);

	## turn data into numerical matrix:
	data.mat <- data.matrix(data);

	## create heatmap
	col.sep=seq(0,length(data.mat[1,]),1);
	row.sep=seq(0,length(data.mat[,1]),1);

	##red-blue:
	colors = unique(c(seq(-3,-0.5,length=27),seq(-0.5,0.5,length=25),seq(0.5,3,length=26)));
	colfunc <- colorRampPalette(rev(c('#ca0020','#f4a582','#f7f7f7','#92c5de','#0571b0')))
	if(type=="all"){
	## huge heatmap of everything - mem intensive
	pdf(outviz,width=8,height=10,useDingbats=FALSE);
	#alternately: distfun=function(x) as.dist(1-abs(cor(t(x))))
	#or distfun=as.dist(1-cor(t(x)))
	heat.out <- heatmap.2(data.mat, lhei=c(1,4), #lwid=c(3,1),
	key=TRUE, key.xlab="Count",trace="none",density.info="none",dendrogram="col",Rowv=TRUE,
	Colv=TRUE,col=colfunc(75),breaks=colors,symbreaks=FALSE,labCol=names(data.mat),labRow="",
	cexCol=0.7,scale="none", margins=c(12,2));
	dev.off();
	ordered.data <- data.mat[heat.out$rowInd,heat.out$colInd]; # z-transformed data as ordered in the heatmap
	write.table(file=outfile, ordered.data, sep="\t", quote=FALSE,col.names=NA);
	return();
	}

	if(type=="top"){
	##heatmap of top N differential genes
	rsd.topN = sort(rsd, decreasing=TRUE)[num];
	data.topN = data.z[which(rsd >= rsd.topN),]
	data.mat <- data.matrix(data.topN);

	pdf(outviz,width=8,height=10,useDingbats=FALSE);

	heat.out <- heatmap.2(data.mat, lhei=c(1,4), #lwid=c(3,10),
	key=TRUE, key.xlab="Count",trace="none",density.info="none",dendrogram="col",
	Rowv=TRUE,Colv=TRUE,col=colfunc(75), breaks=colors, symbreaks=FALSE,
	labCol=names(data.mat),labRow="", cexCol=0.7, scale="none", margins=c(12,2));
	## was sepwidth=0.01,0.01, margins=10,10 for genewise plot
	dev.off();
	ordered.data <- data.mat[heat.out$rowInd,heat.out$colInd]; # z-transformed data as ordered in the heatmap
	write.table(file=outfile, ordered.data, sep="\t", quote=FALSE,col.names=NA);
	return();
	}

	## ##do the same thing with euclidean distance
	## pdf("heatmap.top1000.euclidian.pdf",width=8,height=24,useDingbats=FALSE);
	## heat.out <- heatmap.2(data.mat, lhei=c(0.5,9.5), lwid=c(3,10), key=TRUE, key.xlab="Count",trace="none",
	## density.info="none", dendrogram="col", Rowv=TRUE, Colv=TRUE, col=colfunc(75),
	## breaks=colors, symbreaks=FALSE, labCol=names(data.mat), labRow="", cexCol=0.7,
	## scale="none", margins=c(15,5));# was sepwidth=0.01,0.01, margins=10,10 for genewise plot

	## ordered.data <- data.mat[heat.out$rowInd,heat.out$colInd]; # z-transformed data as ordered in the heatmap
	## write.table(file="out.lengthScaledTPM.counts.normalized.ordered1.tsv", ordered.data, sep="\t", quote=FALSE);
	## dev.off();

	stop(paste0("unknown type ",type," for heatmap"))
	}

	##identify diff expr genes
	edgeRclassic <- function(comparison, y, counts.cpm, outdir) { # "comparison" is just a label/name given to the comparison, y and counts.cpm are defined above
	print("below warning about 'Design matrix not provided' is expected")
	y.classic <- estimateDisp(y); # this performs dispersion (common and tagwise (ie gene wise)) estimation in the classic version of EdgeR
	et <- exactTest(y.classic); # recall that y.classic already has the group information
	total = length(et@.Data[[1]]$PValue);
	results <- topTags(et, adjust.method="BH", sort.by="PValue", n=total); # table of the top differntially-expressed "tags" ie "genes"
	results.table <- results@.Data[[1]];
	results.pos <- results.table[which(results.table$logFC >= 0 & results.table$FDR <= 0.05),];
	results.neg <- results.table[which(results.table$logFC < 0 & results.table$FDR<= 0.05),];
	data.pos <- counts.cpm[rownames(results.pos),] # note that this uses x.0.wt.cpm
	data.neg <- counts.cpm[rownames(results.neg),]
	## write.table(results@.Data[[1]],file=sprintf("%s/DEGs.%s.tsv", outdir, comparison), quote=FALSE,sep='\t',row.names=TRUE, col.names=NA);
	## write.table(results.pos,file=sprintf("%s/DEGs.%s.pos.tsv", outdir, comparison), quote=FALSE,sep='\t',row.names=TRUE, col.names=NA);
	## write.table(results.neg,file=sprintf("%s/DEGs.%s.neg.tsv", outdir, comparison), quote=FALSE,sep='\t',row.names=TRUE, col.names=NA);
	## write.table(data.pos,file=sprintf("%s/DEGs.%s.pos.CPM.tsv", outdir, comparison), quote=FALSE,sep='\t',row.names=TRUE, col.names=NA);
	## write.table(data.neg,file=sprintf("%s/DEGs.%s.neg.CPM.tsv", outdir, comparison), quote=FALSE,sep='\t',row.names=TRUE, col.names=NA);

	result = list(table=results, pos=data.pos, neg=data.neg);
	return(result);
	}

	#make groups vector
	getGroups <- function(n,config){
	groups=c();
	for(i in n){
	g = config[config$sample==i,]$group
	if(is.na(g) \| g==""){
	stop(paste0("sample name ",i," not found in config file"))
	} else {
	groups=c(groups,g)
	}
	}
	return(groups)
	}

	#create matrix from the given files
	createMatrix <- function(config, conv.table){
	## list of kallisto output files to be read as input
	k=config$path
	names(k) = config$sample

	# transcript-gene conversions for ENSEMBL annotations
	tx2gene=read.table(conv.table, sep='\t',header=TRUE,quote='',check.names=FALSE, stringsAsFactors=F);

	writeTables <- function(txi,suffix){
	write.table(txi$abundance, file=paste0(outdir,"/raw.abundance.tsv"), sep='\t', quote=FALSE);
	write.table(txi$counts, file=paste0(outdir,"/raw.counts.tsv"), sep='\t', quote=FALSE);
	write.table(txi$length, file=paste0(outdir,"/raw.length.tsv"), sep='\t', quote=FALSE);
	}

	# length-scaled genewise counts from abundance:
	txi <- tximport(k, type="kallisto", tx2gene=tx2gene, countsFromAbundance="lengthScaledTPM",ignoreTxVersion=T);
	print("txi created")
	writeTables(txi,"lengthScaledTPM")
	}

	#take a matrix, add gene names from gene ids
	addGeneNames <- function(m, conv.table, type="matrix"){
	if(type=="matrix"){
	m = as.data.frame(m)
	m$gene_id = row.names(m)
	row.names(m) = NULL
	} else { #assume type = 'df'
	names(m)[which(names(m)=="Row.names")] = "gene_id"
	}


	tx2gene=read.table(conv.table, sep='\t',header=TRUE,quote='',check.names=FALSE, stringsAsFactors=F);
	#tx2gene=unique(tx2gene[,c("ensembl_gene_id","external_gene_name")])
	tx2gene=unique(tx2gene[,c(2,3)])
	names(tx2gene) = c("ensembl_gene_id","external_gene_name")

	return(merge(m,tx2gene,by.x="gene_id",by.y="ensembl_gene_id"))
	}

	##------------------------------------------------------
	args <- commandArgs(trailingOnly = TRUE)
	if(length(args) < 3){
	stop("requires three arguments: config file, transcript to gene table, and output directory")
	}
	config=read.table(args[1], header=F, stringsAsFactors=F, sep="\t")
	names(config) = c("sample","group","path")

	#create the output directory
	outdir=args[3];
	dir.create(outdir)

	mat = createMatrix(config,args[2])

	#read in the counts
	print("reading count data")
	rawcounts <- read.table(mat, sep='\t', quote="", check.names=FALSE,header=TRUE,row.names=1)
	#subset to only those in the config file
	y <- DGEList(counts=rawcounts);

	print("filtering/normalizing...")
	## filter here for low expression, requiring counts-per-million (CPM) be at least 1 in at least half of the samples, and recalculate library sizes:
	min.thresh=ceiling(length(names(rawcounts))/2);
	keep <- rowSums(cpm(y)>1) >= min.thresh;
	y <- y[keep, , keep.lib.sizes=FALSE]; # forces recalculation of library sizes

	#normalize the data
	y <- calcNormFactors(y);
	counts.cpm <- cpm(y, normalized.lib.sizes=TRUE)

	write.table(addGeneNames(counts.cpm,args[2]),file=paste0(outdir,"/counts.normalized.tsv"),quote=FALSE, sep='\t',row.names=F);

	print("creating mdsplot...")
	##make mds plot for QC
	mdsPlot(y,file=paste0(outdir,"/mdsplot.pdf"));

	print("doing hierarchical clustering/creating heatmap...")
	doClustering(counts.cpm, outviz=paste0(outdir,"/heatmap.top1000.pdf"), outfile=paste0(outdir,"/heatmap.clustered.top1000.tsv"), num=1000, type="top");
	doClustering(counts.cpm, outviz=paste0(outdir,"/heatmap.all.pdf"), outfile=paste0(outdir,"/heatmap.clustered.all.tsv"), type="all");

	##now, before we calculate diff expr, let's subset to just the samples we actually want to compare
	counts.cpm.comp = counts.cpm[,colnames(counts.cpm) %in% config$sample]
	y2 <- DGEList(counts=counts.cpm.comp, group=getGroups(colnames(counts.cpm.comp),config));

	print("calculating diff expression...")
	results = edgeRclassic("comp1", y2, counts.cpm.comp, outdir)
	z = merge(counts.cpm,results$table@.Data[[1]],by=0)
	write.table(addGeneNames(z,args[2],type="df"),file=paste0(outdir,"/counts.normalized.pvals.tsv"),quote=FALSE,sep='\t',row.names=FALSE);

	#make a heatmap of just the differentially expressed genes across all samples
	#grab the DE genes
	results.table=results$table@.Data[[1]];
	results.sig <- results.table[which(results.table$FDR <= 0.05),];
	if(length(results.sig[,1]) == 0){
	print("No differentially expressed genes detected - execution stopped")
	system(paste0("touch ",outdir,"/counts.normalized.pvals.degs.tsv"))
	stop("No differentially expressed genes detected")
	}

	counts.cpm.de <- counts.cpm[which(rownames(counts.cpm) %in% rownames(results.sig)),] # note that this uses x.0.wt.cpm
	z = merge(counts.cpm.de,results.sig,by=0)
	write.table(addGeneNames(z,args[2],type="df"),file=paste0(outdir,"/counts.normalized.pvals.degs.tsv"),quote=FALSE,sep='\t',row.names=FALSE);

	print("doing hierarchical clustering/creating heatmap...")
	doClustering(counts.cpm.de, outviz=paste0(outdir,"/heatmap.degenes.pdf"), outfile=paste0(outdir,"/heatmap.clustered.de.tsv"), type="all");