Skip to content

Instantly share code, notes, and snippets.

@cdiener
Created September 1, 2017 21:40
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save cdiener/069a24964d0b426de067020e20bca0ec to your computer and use it in GitHub Desktop.
Save cdiener/069a24964d0b426de067020e20bca0ec to your computer and use it in GitHub Desktop.
Source code for my blog post
library(data.table)
library(ggplot2)
library(magrittr)
library(pbapply)
large <- fread("ERR260132_genes.csv")
#' Sample a rarefied version of a count vector.
#'
#' @param x A named vector of counts.
#' @param n Total number of counts in the new vector.
#' @return A named vector with counts for each observed event in the sample.
#' @examples
#' counts <- 1:100
#' subsample(counts, 10)
#' @export
subsample <- function(x, n) {
sa <- tabulate(sample(1:length(x), n, replace=TRUE, prob=x))
names(sa) <- names(x)[1:length(sa)]
return(sa[sa > 0])
}
#' Orlitsky's diminishing attenuation estimator (q1/3).
#'
#' @param data A named vector of counts for which to approximate discrete
#' probablities.
#' @return A named vector `p` assigning a probability to each event in data.
#' Those do not sum up to one since there is also a remaining probability to
#' observe a new event given as p(new) = 1 - sum(p).
#' @examples
#' x <- sample(1:10, 100, replace=T)
#' p <- orlitsky(x)
#' @export
orlitsky <- function(data) {
n <- length(data)
phi <- c(tabulate(data), 0)
cn <- ceiling((n+1)^1/3)
new <- max(cn, phi[1] + 1)
probs <- (data + 1) * pmax(cn, phi[data + 1] + 1) / pmax(cn, phi[data])
names(probs) <- names(data)
return(probs/sum(c(probs, new)))
}
real <- large[, expected_count]
names(real) <- large[, name]
#real <- rep(1, 1000)
#names(real) <- as.character(1:1000)
real <- real/sum(real)
error <- function(x, y) {
return(median(x / y[names(x)]))
}
#' Benchmark different probability estimators.
#'
#' @param real The real discrete distribution from which to generate samples.
#' @param n A series of sample sizes.
#' @return Nothing.
#' @examples
#' p <- (1:10)/10
#' benchmark(p)
#' @export
benchmark <- function(real, n = 10^seq(1, 8, length.out=100)) {
samples <- pblapply(10^seq(1, 8, length.out=100), function(k) {
x <- subsample(real, k)
po <- orlitsky(x) # diminishing attenuation
pp <- x/sum(x) # proportions
pc <- (x + 1)/(sum(x + 1) + 1) # pseudo counts
data.table(n=k,
orlitsky=error(po, real),
orlitsky_new=1 - sum(po),
proportion=error(pp, real),
pseudo=error(pc, real))
})
return(rbindlist(samples))
}
bench <- benchmark(real)
bench_long <- melt(bench, id.vars="n")
att_plot <- ggplot(rare_long[!grepl("new", variable)],
aes(x=n, y=value, col=variable)) +
geom_point() + geom_smooth() +
scale_x_log10() + scale_y_log10()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment