Instantly share code, notes, and snippets.

Embed
What would you like to do?
Template for analysis with DESeq2
## RNA-seq analysis with DESeq2
## Stephen Turner, @genetics_blog
# RNA-seq data from GSE52202
# http://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=gse52202. All patients with
# ALS, 4 with C9 expansion ("exp"), 4 controls without expansion ("ctl")
# Import & pre-process ----------------------------------------------------
# Import data from featureCounts
## Previously ran at command line something like this:
## featureCounts -a genes.gtf -o counts.txt -T 12 -t exon -g gene_id GSM*.sam
countdata <- read.table("counts.txt", header=TRUE, row.names=1)
# Remove first five columns (chr, start, end, strand, length)
countdata <- countdata[ ,6:ncol(countdata)]
# Remove .bam or .sam from filenames
colnames(countdata) <- gsub("\\.[sb]am$", "", colnames(countdata))
# Convert to matrix
countdata <- as.matrix(countdata)
head(countdata)
# Assign condition (first four are controls, second four contain the expansion)
(condition <- factor(c(rep("ctl", 4), rep("exp", 4))))
# Analysis with DESeq2 ----------------------------------------------------
library(DESeq2)
# Create a coldata frame and instantiate the DESeqDataSet. See ?DESeqDataSetFromMatrix
(coldata <- data.frame(row.names=colnames(countdata), condition))
dds <- DESeqDataSetFromMatrix(countData=countdata, colData=coldata, design=~condition)
dds
# Run the DESeq pipeline
dds <- DESeq(dds)
# Plot dispersions
png("qc-dispersions.png", 1000, 1000, pointsize=20)
plotDispEsts(dds, main="Dispersion plot")
dev.off()
# Regularized log transformation for clustering/heatmaps, etc
rld <- rlogTransformation(dds)
head(assay(rld))
hist(assay(rld))
# Colors for plots below
## Ugly:
## (mycols <- 1:length(unique(condition)))
## Use RColorBrewer, better
library(RColorBrewer)
(mycols <- brewer.pal(8, "Dark2")[1:length(unique(condition))])
# Sample distance heatmap
sampleDists <- as.matrix(dist(t(assay(rld))))
library(gplots)
png("qc-heatmap-samples.png", w=1000, h=1000, pointsize=20)
heatmap.2(as.matrix(sampleDists), key=F, trace="none",
col=colorpanel(100, "black", "white"),
ColSideColors=mycols[condition], RowSideColors=mycols[condition],
margin=c(10, 10), main="Sample Distance Matrix")
dev.off()
# Principal components analysis
## Could do with built-in DESeq2 function:
## DESeq2::plotPCA(rld, intgroup="condition")
## I like mine better:
rld_pca <- function (rld, intgroup = "condition", ntop = 500, colors=NULL, legendpos="bottomleft", main="PCA Biplot", textcx=1, ...) {
require(genefilter)
require(calibrate)
require(RColorBrewer)
rv = rowVars(assay(rld))
select = order(rv, decreasing = TRUE)[seq_len(min(ntop, length(rv)))]
pca = prcomp(t(assay(rld)[select, ]))
fac = factor(apply(as.data.frame(colData(rld)[, intgroup, drop = FALSE]), 1, paste, collapse = " : "))
if (is.null(colors)) {
if (nlevels(fac) >= 3) {
colors = brewer.pal(nlevels(fac), "Paired")
} else {
colors = c("black", "red")
}
}
pc1var <- round(summary(pca)$importance[2,1]*100, digits=1)
pc2var <- round(summary(pca)$importance[2,2]*100, digits=1)
pc1lab <- paste0("PC1 (",as.character(pc1var),"%)")
pc2lab <- paste0("PC1 (",as.character(pc2var),"%)")
plot(PC2~PC1, data=as.data.frame(pca$x), bg=colors[fac], pch=21, xlab=pc1lab, ylab=pc2lab, main=main, ...)
with(as.data.frame(pca$x), textxy(PC1, PC2, labs=rownames(as.data.frame(pca$x)), cex=textcx))
legend(legendpos, legend=levels(fac), col=colors, pch=20)
# rldyplot(PC2 ~ PC1, groups = fac, data = as.data.frame(pca$rld),
# pch = 16, cerld = 2, aspect = "iso", col = colours, main = draw.key(key = list(rect = list(col = colours),
# terldt = list(levels(fac)), rep = FALSE)))
}
png("qc-pca.png", 1000, 1000, pointsize=20)
rld_pca(rld, colors=mycols, intgroup="condition", xlim=c(-75, 35))
dev.off()
# Get differential expression results
res <- results(dds)
table(res$padj<0.05)
## Order by adjusted p-value
res <- res[order(res$padj), ]
## Merge with normalized count data
resdata <- merge(as.data.frame(res), as.data.frame(counts(dds, normalized=TRUE)), by="row.names", sort=FALSE)
names(resdata)[1] <- "Gene"
head(resdata)
## Write results
write.csv(resdata, file="diffexpr-results.csv")
## Examine plot of p-values
hist(res$pvalue, breaks=50, col="grey")
## Examine independent filtering
attr(res, "filterThreshold")
plot(attr(res,"filterNumRej"), type="b", xlab="quantiles of baseMean", ylab="number of rejections")
## MA plot
## Could do with built-in DESeq2 function:
## DESeq2::plotMA(dds, ylim=c(-1,1), cex=1)
## I like mine better:
maplot <- function (res, thresh=0.05, labelsig=TRUE, textcx=1, ...) {
with(res, plot(baseMean, log2FoldChange, pch=20, cex=.5, log="x", ...))
with(subset(res, padj<thresh), points(baseMean, log2FoldChange, col="red", pch=20, cex=1.5))
if (labelsig) {
require(calibrate)
with(subset(res, padj<thresh), textxy(baseMean, log2FoldChange, labs=Gene, cex=textcx, col=2))
}
}
png("diffexpr-maplot.png", 1500, 1000, pointsize=20)
maplot(resdata, main="MA Plot")
dev.off()
## Volcano plot with "significant" genes labeled
volcanoplot <- function (res, lfcthresh=2, sigthresh=0.05, main="Volcano Plot", legendpos="bottomright", labelsig=TRUE, textcx=1, ...) {
with(res, plot(log2FoldChange, -log10(pvalue), pch=20, main=main, ...))
with(subset(res, padj<sigthresh ), points(log2FoldChange, -log10(pvalue), pch=20, col="red", ...))
with(subset(res, abs(log2FoldChange)>lfcthresh), points(log2FoldChange, -log10(pvalue), pch=20, col="orange", ...))
with(subset(res, padj<sigthresh & abs(log2FoldChange)>lfcthresh), points(log2FoldChange, -log10(pvalue), pch=20, col="green", ...))
if (labelsig) {
require(calibrate)
with(subset(res, padj<sigthresh & abs(log2FoldChange)>lfcthresh), textxy(log2FoldChange, -log10(pvalue), labs=Gene, cex=textcx, ...))
}
legend(legendpos, xjust=1, yjust=1, legend=c(paste("FDR<",sigthresh,sep=""), paste("|LogFC|>",lfcthresh,sep=""), "both"), pch=20, col=c("red","orange","green"))
}
png("diffexpr-volcanoplot.png", 1200, 1000, pointsize=20)
volcanoplot(resdata, lfcthresh=1, sigthresh=0.05, textcx=.8, xlim=c(-2.3, 2))
dev.off()
@geparada

This comment has been minimized.

geparada commented Aug 19, 2015

Thanks!
This template help me a lot to start learning DE analysis.

@jmzeng1314

This comment has been minimized.

jmzeng1314 commented Mar 2, 2016

wah, it's really a long code

@aforntacc

This comment has been minimized.

aforntacc commented May 28, 2016

Hi, i am new to R, also using the DESEQ2 for the first time. i obtained my data using featureCounts and your code is just good for beginners like me, however i got the error below, please can you kindly assist me on how to resolve it. thank you.

Error in DESeqDataSet(se, design = design, ignoreRank) :
some values in assay are negative

i used the following code
countwtmt <- read.table ("C:/Users/charles167/Desktop/RNASeq wei/countwtmt.txt", sep="\t", header=TRUE, row.names=1)
countwtmt <- countwtmt[ ,6:ncol(countwtmt)]
countwtmt <- as.matrix(countwtmt)
condition <- factor(c(rep("ctl", 2), rep("exp", 2))) # i have two replicates for Wildtype and Mutant respectively.
library(DESeq2)
(coldata <- data.frame(row.names=colnames(countwtmt), condition))
dds <- DESeqDataSetFromMatrix (countData=countwtmt, colData=coldata, design= ~condition, ignoreRank = FALSE)
Error in DESeqDataSet(se, design = design, ignoreRank) :
some values in assay are negative
dds
Error: object 'dds' not found

@JeremyHorst

This comment has been minimized.

JeremyHorst commented Feb 7, 2017

Thank you very much for this beautiful code. I love the volcano plot. Nice contribution.

@ellango85

This comment has been minimized.

ellango85 commented Jul 13, 2017

Thank you for the detailed script. Useful script for beginners handling RNA-seq

@GildasLepennetier

This comment has been minimized.

GildasLepennetier commented Jul 26, 2017

Great code!
-> to answer to "aforntacc" on May 28, 2016 : you probably do not give count data, but something else, since the function 'DESeqDataSetFromMatrix' crashes. The error message is "some values in assay are negative", so I think it is quite clear that you have negative values, something not possible in count data.

@ccarlos-shanley

This comment has been minimized.

ccarlos-shanley commented Jul 31, 2017

Thank you!

@luizafp

This comment has been minimized.

luizafp commented Aug 21, 2017

Hello, Thanks for the code! really helpful for beginners.

When I'm trying to run the pipeline:
dds <- DESeq(dds)

I get this error message:
estimating size factors
Error in estimateSizeFactorsForMatrix(counts(object), locfunc = locfunc, :
every gene contains at least one zero, cannot compute log geometric means

Anyone could give me some advice? i cant have zeros?

@Mbrink94

This comment has been minimized.

Mbrink94 commented Nov 26, 2017

Hi, I'm struggling with the DESeqDataSetFromMatrix command and can't seem to find similar problems online. I have metagenomic count data and would like to follow the above template for differential abundance analysis.

I use this code:
countdata <- read.table("OTU_family.txt", fill=TRUE, header=TRUE, row.names=1)
countdata <- as.matrix(countdata)
(condition <- factor(c(rep("Natural", 5),rep("Healthy", 7), rep ("Diseased", 7), rep ("Stressed", 7))))
library(DESeq2)
(coldata <- data.frame(row.names=colnames(countdata), condition))
library(SummarizedExperiment)
dds <- DESeqDataSetFromMatrix(countData=countdata, colData=coldata, design=~condition)

and get this error:
Error in slot(value, what) :
no slot of name "exptData" for this object of class "SummarizedExperiment"

Does anyone have advice on what I'm doing wrong?

Thanks in advance.

@mcmahonjohnj

This comment has been minimized.

mcmahonjohnj commented May 28, 2018

When I get the DE results, I get values output for each cell. What are these values?

@AnimaTardeb

This comment has been minimized.

AnimaTardeb commented Jun 11, 2018

Thanks a lot for your code 👍

@ruby23

This comment has been minimized.

ruby23 commented Jun 28, 2018

Here's the code that I used:-
library(DESeq2)
countData = read.table(file = 'C:/Users/Dell/Desktop/M.BIOINFORMATICS/FYP/datafr.txt', header = TRUE, sep = '\t', row.names = 1)
colData = read.table(file = 'C:/Users/Dell/Desktop/M.BIOINFORMATICS/FYP/coldata.txt', header = TRUE, sep = '\t', row.names = 1)
colData[['condition']] = factor(colData[['condition']], levels = c('Normal', 'Tumour'))
dataset <- DESeqDataSetFromMatrix(countData=countData, colData=colData, design=~condition)

And here's the error that I'm getting:-

dataset <- DESeqDataSetFromMatrix(countData = countData,colData = colData,design = ~condition)
Error in DESeqDataSet(se, design = design, ignoreRank) :
some values in assay are negative

The countData is basically a breast cancer NGS dataset which comprises of 11 normal samples and 62 tumour samples. There are 1391 genes (rows) in total. There are negative values in the original dataset. I'm new to R and trying to cope up. Can anyone please tell me why there shouldn't be any negative values? Am i missing any steps? Please help me out.

@lbcp

This comment has been minimized.

lbcp commented Jul 26, 2018

Thanks a lot for the script. It really helped to get me started with the analysis.

@ruby23 There shouldn't be any negative values because the DESeq2 package requires raw counts. That means, you should have only positive integer values or zeros in your data. Since you probably didn't acquire the NGS data yourself, make sure that you use the raw counts and not some already normalised or log-transformed values.

@RichardBJ

This comment has been minimized.

RichardBJ commented Aug 31, 2018

@aforntacc As above should be no negative counts, min NGS "count" is zero :-). The template script assumes your counts are in a particular column, but other columns will have things like log2 changes. Check your input file and then modify this line accordingly.
countdata <- countdata[ ,6:ncol(countdata)]

@jbergsveinson

This comment has been minimized.

jbergsveinson commented Oct 26, 2018

Great consolidated script. Works seamlessly for me up until maplot and volcanoplot however. I receive the following error using the unedited script above:
Error in plot.window(...) : need finite 'xlim' values
Calls: plot -> plot -> plot.default -> localWindow -> plot.window
In addition: Warning messages:
1: In min(x) : no non-missing arguments to min; returning Inf
....

I've both checked my input for completely null/NA entries, and have attempted to edit script, by adding explicit xlim call (i.e., xlim=c(-2.5,2),) at line 126 (maplot) and line 139 (volcanoplot), however to no avail. What else could I be missing/am I not addressing?

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment