Created October 30, 2012 06:32
What's the power cost of using the exact test instead of the Chi-square for 2-by-2 table?
# http://www.r-bloggers.com/example-10-7-fisher-vs-pearson/
# Author: Ken Kleinman
# In R, we'll simulate observations from a multinomial distribution
# with the desired cell probabilities, and assemble the result into
# a table to calculate the p-values. This will make it easier to simulate
# tables under the alternative, as we need to do to assess power.
# If there are empty rows or columns, the chisq.test() function produces
# a p-value of "NaN", which will create problems later. To avoid this,
# we'll put the table generation inside a while() loop. This operates
# like the do-while construction in SAS (and other programming languages).
# The condition we check is whether any row or column has 0 observations;
# if so, we generate the data again. We begin by initializing the table with 0's.
makeitm = function(n.in.table, probs) {
  myt = matrix(rep(0, 4), ncol = 2)
  while ((min(colSums(myt)) == 0) | (min(rowSums(myt)) == 0)) {
    myt = matrix(rmultinom(n = 1, size = n.in.table, prob = probs),
                 ncol = 2, byrow = TRUE)
  }
  chisqp = chisq.test(myt, correct = FALSE)$p.value
  fishp = fisher.test(myt)$p.value
  return(c(chisqp, fishp))
}
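# As a sanity check (not part of the original gist), the two tests can be
# compared directly on a single fixed 2-by-2 table; the counts below are
# arbitrary illustrative values, not simulated data.

```r
# Hypothetical 2-by-2 table (arbitrary counts, for illustration only)
tab = matrix(c(12, 5, 3, 10), ncol = 2)
p.pearson = chisq.test(tab, correct = FALSE)$p.value  # uncorrected Pearson chi-square
p.fisher  = fisher.test(tab)$p.value                  # Fisher's exact test
round(c(Pearson = p.pearson, Fisher = p.fisher), 4)
```

# For a small table like this one, the exact test typically returns the
# larger (more conservative) p-value, which is the power gap the
# simulation below tries to quantify.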
# With this basic building block in place, we can write a function to call it
# repeatedly (using the replicate() function), then calculate the proportion of
# rejections and get the CI for the probability of rejection.
many.ns = function(tablen, nds, probs) {
  res1 = t(replicate(nds, makeitm(tablen, probs)))
  res2 = res1 < 0.05
  pear = sum(res2[, 1]) / nds
  fish = sum(res2[, 2]) / nds
  pearci = binom.test(sum(res2[, 1]), nds)$conf.int[1:2]
  fishci = binom.test(sum(res2[, 2]), nds)$conf.int[1:2]
  return(c("N.ds" = nds, "N.table" = tablen, probs,
           "Pearson.p" = pear, "PCIl" = pearci[1], "PCIu" = pearci[2],
           "Fisher.p" = fish, "FCIl" = fishci[1], "FCIu" = fishci[2]))
}
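# The confidence-interval step inside many.ns() can be illustrated on its
# own: binom.test() returns an exact (Clopper-Pearson) interval for a
# binomial proportion. The counts here (62 rejections out of 1000 simulated
# tables) are made-up numbers for the sketch, not simulation output.

```r
# Hypothetical simulation outcome: 62 rejections in 1000 simulated tables
ci = binom.test(62, 1000)$conf.int[1:2]  # exact 95% CI for the rejection probability
round(ci, 4)
```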
# Finally, we're ready to actually do the experiment, using sapply() to call
# the function that calls replicate() to call the function that makes a table.
# The result is converted to a data frame to make the plotting simpler.
# The first call below replicates the SAS result shown in the original post
# and has very similar estimates and CIs, but is not displayed here. The second
# shows an odds ratio of 3, the third (plotted below) has an OR of 1.75, and
# the last an OR of 1.5.
#res3 = data.frame(t(sapply(c(20, 50, 100, 200, 500, 1000), many.ns, nds=10000,
#                           probs = c(.56,.24,.14,.06))))
#res3 = data.frame(t(sapply(c(20, 50, 100, 200, 500, 1000), many.ns, nds=1000,
#                           probs = c(.6,.2,.1,.1))))
res3 = data.frame(t(sapply(c(20, 50, 100, 200, 500, 1000), many.ns, nds=1000,
                           probs = c(.58,.22,.12,.08))))
#res3 = data.frame(t(sapply(c(20, 50, 100, 200, 500, 1000), many.ns, nds=1000,
#                           probs = c(.57,.23,.13,.07))))
with(res3, plot(x = 1, y = 1, type = "n",
                ylim = c(0, max(c(PCIu, FCIu))), xlim = c(0, 1000),
                xlab = "N in table", ylab = "Rejections",
                main = "Fisher vs. Pearson"))
with(res3, points(y = Pearson.p, x = N.table, col = "blue"))  # blue is Pearson
# Draw line segments between pairs of points:
with(res3, segments(x0 = N.table, x1 = N.table, y0 = PCIl, y1 = PCIu, col = "blue"))
with(res3, points(y = Fisher.p, x = N.table + 2, col = "red"))  # red is Fisher
with(res3, segments(x0 = N.table + 2, x1 = N.table + 2, y0 = FCIl, y1 = FCIu, col = "red"))
abline(v = 83)    # reference line (presumably where the smallest expected cell, .06 * N, reaches 5)
abline(h = 0.05)  # nominal alpha level
# Overall, the results show (in the SAS plot in the original post) that the
# Pearson chi-square test does perfectly well at protecting the alpha level
# under the null with these margins, even when the expected number of cases
# in one cell is as small as 1. Moreover, for these cell probabilities, the
# chi-square test has a bit more power than the exact test. The power
# difference is greatest when N is smaller. Given this example, I would say
# that the rule of thumb may be too conservative, pushing people away from a
# more powerful test unnecessarily. In the future, I plan to be more
# positive about using the Pearson chi-square.
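# To make the rule-of-thumb remark concrete, here is a small sketch (not
# from the original post) of the expected cell counts implied by the null
# cell probabilities used in the first sapply() call; the usual textbook
# cutoff of 5 expected cases per cell is only reached for the smallest
# cell somewhere past N = 83.

```r
probs = c(.56, .24, .14, .06)  # null cell probabilities from the first call above
n = c(20, 50, 100, 200)        # a few of the table sizes simulated above
outer(n, probs)                # expected counts: rows are N, columns are cells
5 / min(probs)                 # N needed for the smallest expected count to reach 5
```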