Last active
July 8, 2020 16:24
-
-
Save GabrielHoffman/7c17256e3ee024b7c52ec7cc74f70f2c to your computer and use it in GitHub Desktop.
Identifying gene with signal using coefficient of variation
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# some setup code from tximport | |
library("tximport") | |
library("readr") | |
library("tximportData") | |
dir <- system.file("extdata", package="tximportData") | |
samples <- read.table(file.path(dir,"samples.txt"), header=TRUE) | |
samples$condition <- factor(rep(c("A","B"),each=3)) | |
rownames(samples) <- samples$run | |
samples[,c("pop","center","run","condition")] | |
files <- file.path(dir,"salmon", samples$run, "quant.sf.gz") | |
names(files) <- samples$run | |
tx2gene <- read_csv(file.path(dir, "tx2gene.gencode.v27.csv")) | |
txi <- tximport(files, type="salmon", tx2gene=tx2gene) | |
# Evalaute CV^2 for each gene or transcript | |
library(scran) | |
# only include genes with > 1 CPM in at least 2 samples | |
isexpr = rowSums(edgeR::cpm(txi$counts)>1) >= 2 | |
# create a an object | |
se <- SummarizedExperiment( assays=list(counts = txi$counts[isexpr,]), colData = samples) | |
# Evaluate the square coefficient of variation for each gene | |
# Fit a background trend line, kind of like voom, under the assumption that most genes will not | |
# have true biological variation. This might not be true is your case, so the trend line might not | |
# indicate the background, only average CV^2 at that expression level. | |
# Either way, ratio should be a good filtering metric. | |
res = modelGeneCV2( se ) | |
# get the trend line | |
fit = metadata(res) | |
# plot genes and trend | |
plot(res$mean, res$total, pch=16, log="xy", xlab="Mean expression (log2 CPM)", ylab=bquote(CV^2)) | |
curve(fit$trend(x), add=TRUE, col="dodgerblue") | |
# take a look | |
head(res) | |
# DataFrame with 6 rows and 6 columns | |
# mean total trend ratio p.value FDR | |
# <numeric> <numeric> <numeric> <numeric> <numeric> <numeric> | |
# ENSG00000000419.12 1257.944 0.0444472 0.0437731 1.015401 0.4923557 0.881590 | |
# ENSG00000000457.13 504.574 0.0244170 0.0540405 0.451828 0.7524034 0.881590 | |
# ENSG00000000460.16 716.048 0.0428744 0.0489772 0.875394 0.5616063 0.881590 | |
# ENSG00000000938.12 2707.572 0.0900935 0.0400914 2.247205 0.0603485 0.300738 | |
# ENSG00000001036.13 822.391 0.0445398 0.0474151 0.939360 0.5300724 0.881590 | |
# ENSG00000001084.10 439.084 0.0152997 0.0565975 0.270324 0.8180373 0.881590 | |
# total: CV^2 of the gene | |
# trend: 'expected' CV^2 given genome-wide smoothing | |
# ratio: ratio between trend and total. If greater than 1, CV^2 excees the backgroud level of variation | |
# p.value: tests the null that ratio <= 1 |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment