markrobinsonuzh/A difficult NB estimation problem

## A difficult NB estimation problem
# Initial problem posed at:
# http://stats.stackexchange.com/questions/173450/an-impossible-estimation-problem?stw=2

set.seed(167)
x = rnbinom(100, size=3.2, prob=.8);
#x = rnbinom(100, size=3.2, mu=.8);
# note: mu = size*(1-prob)/prob ..
# in this case, mu=prob (3.2*.2/.8 = .8)

n <- 100
ns <- 1000

# randomly create another 999 samples with this setting
z <- matrix(nrow=ns,ncol=n)
for(i in 2:ns)
   z[i,] <- rnbinom(n, size=3.2, mu=.8)
z[1,] <- x

library(edgeR)
library(matrixStats)

# estimate with edgeR i.e., adjusted profile likelihood w/ no moderation
d <- DGEList(z)
d$samples$lib.size <- 1
d <- estimateCommonDisp(d)
d <- estimateTagwiseDisp(d,prior.df=0)

# plot the estimates with the truth (blue), Guillaume's sample (red)
plot( 2^f$coefficients, 1/d$tagwise.dispersion, log="y", ylab="size", xlab="mu")
abline(v=.8, col="blue", lwd=3)
abline(h=3.2, col="blue", lwd=3)
points( 2^f$coefficients[1], 1/d$tagwise.dispersion[1], pch=19, cex=2, col="red")


# dispersion is basically zero (or size is large) whenever
# variance is less than mean
#> table( var_lt_mean=rowVars(z) < rowMeans(z),
#         size_gt_199=1/d$tagwise.dispersion > 199)
#           size_gt_199
#var_lt_mean FALSE TRUE
#      FALSE   871    4
#      TRUE      0  125
	# Initial problem posed at:
	# http://stats.stackexchange.com/questions/173450/an-impossible-estimation-problem?stw=2

	set.seed(167)
	x = rnbinom(100, size=3.2, prob=.8);
	#x = rnbinom(100, size=3.2, mu=.8);
	# note: mu = size*(1-prob)/prob ..
	# in this case, mu=prob (3.2*.2/.8 = .8)

	n <- 100
	ns <- 1000

	# randomly create another 999 samples with this setting
	z <- matrix(nrow=ns,ncol=n)
	for(i in 2:ns)
	z[i,] <- rnbinom(n, size=3.2, mu=.8)
	z[1,] <- x

	library(edgeR)
	library(matrixStats)

	# estimate with edgeR i.e., adjusted profile likelihood w/ no moderation
	d <- DGEList(z)
	d$samples$lib.size <- 1
	d <- estimateCommonDisp(d)
	d <- estimateTagwiseDisp(d,prior.df=0)

	# plot the estimates with the truth (blue), Guillaume's sample (red)
	plot( 2^f$coefficients, 1/d$tagwise.dispersion, log="y", ylab="size", xlab="mu")
	abline(v=.8, col="blue", lwd=3)
	abline(h=3.2, col="blue", lwd=3)
	points( 2^f$coefficients[1], 1/d$tagwise.dispersion[1], pch=19, cex=2, col="red")


	# dispersion is basically zero (or size is large) whenever
	# variance is less than mean
	#> table( var_lt_mean=rowVars(z) < rowMeans(z),
	# size_gt_199=1/d$tagwise.dispersion > 199)
	# size_gt_199
	#var_lt_mean FALSE TRUE
	# FALSE 871 4
	# TRUE 0 125