cdiener/orlitsky.R

## orlitsky.R
library(data.table)
library(ggplot2)
library(magrittr)
library(pbapply)

large <- fread("ERR260132_genes.csv")

#' Sample a rarefied version of a count vector.
#'
#' @param x A named vector of counts.
#' @param n Total number of counts in the new vector.
#' @return A named vector with counts for each observed event in the sample.
#' @examples
#'  counts <- 1:100
#'  subsample(counts, 10)
#' @export
subsample <- function(x, n) {
    sa <- tabulate(sample(1:length(x), n, replace=TRUE, prob=x))
    names(sa) <- names(x)[1:length(sa)]
    return(sa[sa > 0])
}

#' Orlitsky's diminishing attenuation estimator (q1/3).
#'
#' @param data A named vector of counts for which to approximate discrete
#'  probablities.
#' @return A named vector `p` assigning a probability to each event in data.
#'  Those do not sum up to one since there is also a remaining probability to
#'  observe a new event given as p(new) = 1 - sum(p).
#' @examples
#'  x <- sample(1:10, 100, replace=T)
#'  p <- orlitsky(x)
#' @export
orlitsky <- function(data) {
    n <- length(data)
    phi <- c(tabulate(data), 0)
    cn <- ceiling((n+1)^1/3)
    new <- max(cn, phi[1] + 1)
    probs <- (data + 1) * pmax(cn, phi[data + 1] + 1) / pmax(cn, phi[data])
    names(probs) <- names(data)
    return(probs/sum(c(probs, new)))
}

real <- large[, expected_count]
names(real) <- large[, name]
#real <- rep(1, 1000)
#names(real) <- as.character(1:1000)
real <- real/sum(real)


error <- function(x, y) {
    return(median(x / y[names(x)]))
}

#' Benchmark different probability estimators.
#'
#' @param real The real discrete distribution from which to generate samples.
#' @param n A series of sample sizes.
#' @return Nothing.
#' @examples
#'  p <- (1:10)/10
#'  benchmark(p)
#' @export
benchmark <- function(real, n = 10^seq(1, 8, length.out=100)) {
    samples <- pblapply(10^seq(1, 8, length.out=100), function(k) {
        x <- subsample(real, k)
        po <- orlitsky(x)              # diminishing attenuation
        pp <- x/sum(x)                 # proportions
        pc <- (x + 1)/(sum(x + 1) + 1) # pseudo counts
        data.table(n=k,
                orlitsky=error(po, real),
                orlitsky_new=1 - sum(po),
                proportion=error(pp, real),
                pseudo=error(pc, real))
    })
    return(rbindlist(samples))
}

bench <- benchmark(real)
bench_long <- melt(bench, id.vars="n")

att_plot <- ggplot(rare_long[!grepl("new", variable)],
                   aes(x=n, y=value, col=variable)) +
            geom_point() + geom_smooth() +
            scale_x_log10() + scale_y_log10()
	library(data.table)
	library(ggplot2)
	library(magrittr)
	library(pbapply)

	large <- fread("ERR260132_genes.csv")

	#' Sample a rarefied version of a count vector.
	#'
	#' @param x A named vector of counts.
	#' @param n Total number of counts in the new vector.
	#' @return A named vector with counts for each observed event in the sample.
	#' @examples
	#' counts <- 1:100
	#' subsample(counts, 10)
	#' @export
	subsample <- function(x, n) {
	sa <- tabulate(sample(1:length(x), n, replace=TRUE, prob=x))
	names(sa) <- names(x)[1:length(sa)]
	return(sa[sa > 0])
	}

	#' Orlitsky's diminishing attenuation estimator (q1/3).
	#'
	#' @param data A named vector of counts for which to approximate discrete
	#' probablities.
	#' @return A named vector `p` assigning a probability to each event in data.
	#' Those do not sum up to one since there is also a remaining probability to
	#' observe a new event given as p(new) = 1 - sum(p).
	#' @examples
	#' x <- sample(1:10, 100, replace=T)
	#' p <- orlitsky(x)
	#' @export
	orlitsky <- function(data) {
	n <- length(data)
	phi <- c(tabulate(data), 0)
	cn <- ceiling((n+1)^1/3)
	new <- max(cn, phi[1] + 1)
	probs <- (data + 1) * pmax(cn, phi[data + 1] + 1) / pmax(cn, phi[data])
	names(probs) <- names(data)
	return(probs/sum(c(probs, new)))
	}

	real <- large[, expected_count]
	names(real) <- large[, name]
	#real <- rep(1, 1000)
	#names(real) <- as.character(1:1000)
	real <- real/sum(real)


	error <- function(x, y) {
	return(median(x / y[names(x)]))
	}

	#' Benchmark different probability estimators.
	#'
	#' @param real The real discrete distribution from which to generate samples.
	#' @param n A series of sample sizes.
	#' @return Nothing.
	#' @examples
	#' p <- (1:10)/10
	#' benchmark(p)
	#' @export
	benchmark <- function(real, n = 10^seq(1, 8, length.out=100)) {
	samples <- pblapply(10^seq(1, 8, length.out=100), function(k) {
	x <- subsample(real, k)
	po <- orlitsky(x) # diminishing attenuation
	pp <- x/sum(x) # proportions
	pc <- (x + 1)/(sum(x + 1) + 1) # pseudo counts
	data.table(n=k,
	orlitsky=error(po, real),
	orlitsky_new=1 - sum(po),
	proportion=error(pp, real),
	pseudo=error(pc, real))
	})
	return(rbindlist(samples))
	}

	bench <- benchmark(real)
	bench_long <- melt(bench, id.vars="n")

	att_plot <- ggplot(rare_long[!grepl("new", variable)],
	aes(x=n, y=value, col=variable)) +
	geom_point() + geom_smooth() +
	scale_x_log10() + scale_y_log10()