Source code for my blog post
library(data.table) | |
library(ggplot2) | |
library(magrittr) | |
library(pbapply) | |
large <- fread("ERR260132_genes.csv") | |
#' Sample a rarefied version of a count vector. | |
#' | |
#' @param x A named vector of counts. | |
#' @param n Total number of counts in the new vector. | |
#' @return A named vector with counts for each observed event in the sample. | |
#' @examples | |
#' counts <- 1:100 | |
#' subsample(counts, 10) | |
#' @export | |
subsample <- function(x, n) { | |
sa <- tabulate(sample(1:length(x), n, replace=TRUE, prob=x)) | |
names(sa) <- names(x)[1:length(sa)] | |
return(sa[sa > 0]) | |
} | |
#' Orlitsky's diminishing attenuation estimator (q1/3). | |
#' | |
#' @param data A named vector of counts for which to approximate discrete | |
#' probablities. | |
#' @return A named vector `p` assigning a probability to each event in data. | |
#' Those do not sum up to one since there is also a remaining probability to | |
#' observe a new event given as p(new) = 1 - sum(p). | |
#' @examples | |
#' x <- sample(1:10, 100, replace=T) | |
#' p <- orlitsky(x) | |
#' @export | |
orlitsky <- function(data) { | |
n <- length(data) | |
phi <- c(tabulate(data), 0) | |
cn <- ceiling((n+1)^1/3) | |
new <- max(cn, phi[1] + 1) | |
probs <- (data + 1) * pmax(cn, phi[data + 1] + 1) / pmax(cn, phi[data]) | |
names(probs) <- names(data) | |
return(probs/sum(c(probs, new))) | |
} | |
real <- large[, expected_count] | |
names(real) <- large[, name] | |
#real <- rep(1, 1000) | |
#names(real) <- as.character(1:1000) | |
real <- real/sum(real) | |
error <- function(x, y) { | |
return(median(x / y[names(x)])) | |
} | |
#' Benchmark different probability estimators. | |
#' | |
#' @param real The real discrete distribution from which to generate samples. | |
#' @param n A series of sample sizes. | |
#' @return Nothing. | |
#' @examples | |
#' p <- (1:10)/10 | |
#' benchmark(p) | |
#' @export | |
benchmark <- function(real, n = 10^seq(1, 8, length.out=100)) { | |
samples <- pblapply(10^seq(1, 8, length.out=100), function(k) { | |
x <- subsample(real, k) | |
po <- orlitsky(x) # diminishing attenuation | |
pp <- x/sum(x) # proportions | |
pc <- (x + 1)/(sum(x + 1) + 1) # pseudo counts | |
data.table(n=k, | |
orlitsky=error(po, real), | |
orlitsky_new=1 - sum(po), | |
proportion=error(pp, real), | |
pseudo=error(pc, real)) | |
}) | |
return(rbindlist(samples)) | |
} | |
bench <- benchmark(real) | |
bench_long <- melt(bench, id.vars="n") | |
att_plot <- ggplot(rare_long[!grepl("new", variable)], | |
aes(x=n, y=value, col=variable)) + | |
geom_point() + geom_smooth() + | |
scale_x_log10() + scale_y_log10() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment