GabrielHoffman/gene_CV2.R

## gene_CV2.R
# some setup code from tximport
library("tximport")
library("readr")
library("tximportData")
dir <- system.file("extdata", package="tximportData")
samples <- read.table(file.path(dir,"samples.txt"), header=TRUE)
samples$condition <- factor(rep(c("A","B"),each=3))
rownames(samples) <- samples$run
samples[,c("pop","center","run","condition")]

files <- file.path(dir,"salmon", samples$run, "quant.sf.gz")
names(files) <- samples$run
tx2gene <- read_csv(file.path(dir, "tx2gene.gencode.v27.csv"))

txi <- tximport(files, type="salmon", tx2gene=tx2gene)


# Evalaute CV^2 for each gene or transcript
library(scran)

# only include genes with > 1 CPM in at least 2 samples
isexpr = rowSums(edgeR::cpm(txi$counts)>1) >= 2

# create a an object
se <- SummarizedExperiment( assays=list(counts = txi$counts[isexpr,]), colData = samples)

# Evaluate the square coefficient of variation for each gene
# Fit a background trend line, kind of like voom, under the assumption that most genes will not
# have true biological variation.  This might not be true is your case, so the trend line might not
# indicate the background, only average CV^2 at that expression level.
# Either way, ratio should be a good filtering metric.
res = modelGeneCV2( se )

# get the trend line
fit = metadata(res)

# plot genes and trend
plot(res$mean, res$total, pch=16, log="xy", xlab="Mean expression (log2 CPM)", ylab=bquote(CV^2))
curve(fit$trend(x), add=TRUE, col="dodgerblue")

# take a look
head(res)

# DataFrame with 6 rows and 6 columns
#                         mean     total     trend     ratio   p.value       FDR
#                    <numeric> <numeric> <numeric> <numeric> <numeric> <numeric>
# ENSG00000000419.12  1257.944 0.0444472 0.0437731  1.015401 0.4923557  0.881590
# ENSG00000000457.13   504.574 0.0244170 0.0540405  0.451828 0.7524034  0.881590
# ENSG00000000460.16   716.048 0.0428744 0.0489772  0.875394 0.5616063  0.881590
# ENSG00000000938.12  2707.572 0.0900935 0.0400914  2.247205 0.0603485  0.300738
# ENSG00000001036.13   822.391 0.0445398 0.0474151  0.939360 0.5300724  0.881590
# ENSG00000001084.10   439.084 0.0152997 0.0565975  0.270324 0.8180373  0.881590

# total: CV^2 of the gene
# trend: 'expected' CV^2 given genome-wide smoothing
# ratio: ratio between trend and total.  If greater than 1, CV^2 excees the backgroud level of variation
# p.value: tests the null that ratio <= 1
	# some setup code from tximport
	library("tximport")
	library("readr")
	library("tximportData")
	dir <- system.file("extdata", package="tximportData")
	samples <- read.table(file.path(dir,"samples.txt"), header=TRUE)
	samples$condition <- factor(rep(c("A","B"),each=3))
	rownames(samples) <- samples$run
	samples[,c("pop","center","run","condition")]

	files <- file.path(dir,"salmon", samples$run, "quant.sf.gz")
	names(files) <- samples$run
	tx2gene <- read_csv(file.path(dir, "tx2gene.gencode.v27.csv"))

	txi <- tximport(files, type="salmon", tx2gene=tx2gene)


	# Evalaute CV^2 for each gene or transcript
	library(scran)

	# only include genes with > 1 CPM in at least 2 samples
	isexpr = rowSums(edgeR::cpm(txi$counts)>1) >= 2

	# create a an object
	se <- SummarizedExperiment( assays=list(counts = txi$counts[isexpr,]), colData = samples)

	# Evaluate the square coefficient of variation for each gene
	# Fit a background trend line, kind of like voom, under the assumption that most genes will not
	# have true biological variation. This might not be true is your case, so the trend line might not
	# indicate the background, only average CV^2 at that expression level.
	# Either way, ratio should be a good filtering metric.
	res = modelGeneCV2( se )

	# get the trend line
	fit = metadata(res)

	# plot genes and trend
	plot(res$mean, res$total, pch=16, log="xy", xlab="Mean expression (log2 CPM)", ylab=bquote(CV^2))
	curve(fit$trend(x), add=TRUE, col="dodgerblue")

	# take a look
	head(res)

	# DataFrame with 6 rows and 6 columns
	# mean total trend ratio p.value FDR
	# <numeric> <numeric> <numeric> <numeric> <numeric> <numeric>
	# ENSG00000000419.12 1257.944 0.0444472 0.0437731 1.015401 0.4923557 0.881590
	# ENSG00000000457.13 504.574 0.0244170 0.0540405 0.451828 0.7524034 0.881590
	# ENSG00000000460.16 716.048 0.0428744 0.0489772 0.875394 0.5616063 0.881590
	# ENSG00000000938.12 2707.572 0.0900935 0.0400914 2.247205 0.0603485 0.300738
	# ENSG00000001036.13 822.391 0.0445398 0.0474151 0.939360 0.5300724 0.881590
	# ENSG00000001084.10 439.084 0.0152997 0.0565975 0.270324 0.8180373 0.881590

	# total: CV^2 of the gene
	# trend: 'expected' CV^2 given genome-wide smoothing
	# ratio: ratio between trend and total. If greater than 1, CV^2 excees the backgroud level of variation
	# p.value: tests the null that ratio <= 1