BioSciEconomist/fun with CIs.r

## fun with CIs.r
#*-----------------------------------------------------------------
# | PROGRAM NAME: fun with CIs.r
# | DATE: 7/16/22
# | CREATED BY: MATT BOGARD
# | PROJECT FILE:
# *----------------------------------------------------------------
# | PURPOSE: basic CI simulation
# *----------------------------------------------------------------


rm(list=ls()) # get rid of any existing data
cat("\f") # clear console
dev.off() # clear plots
options("scipen" =100, "digits" = 4) # override R's tendency to use scientific notation


library(MASS)


#
# simulate population
#

mu <- 100  # mean 4000
sd <- 10 # standard deviation 3 x mean


# simulate costs based on parameter assumptions above
set.seed(1234567)
y <- rnorm(10000,100,10)

# check descriptive results
summary(y)
sd(y)
hist(y, breaks = 50)


# data frame
ID <- seq(1,10000)

df <- data.frame(ID,y)


#
# simualte sampling
#


Sn = 30
set.seed(9876) # make this repeatable 1234567

xbar <- c()
stderr <- c()
CI_Lower <- c()
CI_Upper <- c()
repNum <- c()

for (i in 1:500) {
  sample <- df[sample(nrow(df), Sn, replace = F), ] # sample from treatment group

  est <- mean(sample$y)
  se <- sd(sample$y)/sqrt(Sn) # SE
  #LB <- est - 1.96*se
  #UB <- est + 1.96*se
  LB <- est - se # just look at error bars
  UB <- est + se # just look at error bars
  xbar <- c(xbar,est)
  stderr <- c(stderr,se)
  CI_Lower <- c(CI_Lower, LB)
  CI_Upper <- c(CI_Upper, UB)
  repNum <- c(repNum,i) # get p-value from this bs sample
  print(cbind("Iteration:", i)) # comment this out to stop printing to log
}


rpt <- data.frame(repNum,xbar,stderr,CI_Lower,CI_Upper)

rm(LB,UB,se,est,i,sample) # cleanup


k <- 100 # population parameter we are estimating


# report the simulation results
rpt$flag <- ifelse((rpt$CI_Lower <= k & rpt$CI_Upper >= k),1,0) # interval contains 'k'
rpt$sig <- ifelse((rpt$CI_Lower <= 0 &  rpt$CI_Upper >= 0),0,1) # 1 if statistically different from 0
rpt$diffLB <- abs(k - rpt$CI_Lower) # difference between k and CI lower bound
rpt$diffUB <- abs(k - rpt$CI_Upper) # difference between k and CI upper bound
rpt$diffEst <- abs(k - rpt$xbar) # difference between point estimate and k
rpt$est_closer <- ifelse(rpt$diffEst < rpt$diffLB  & rpt$diffEst < rpt$diffUB,1,0) # flag when point estimate is closer to k
summary(rpt) # check

sd(rpt$xbar)

hist(rpt$xbar, breaks = 50)


# visualize

plot(rpt$xbar, pch=20, ylim=range(pretty(c(rpt$CI_Lower, rpt$CI_Upper))),
     xlab='', ylab='', las=1, cex.axis=0.8, cex=.5, xaxt='n')
segments(seq_len(nrow(rpt)), rpt$CI_Lower, y1= rpt$CI_Upper, lend=1)
segments(seq_len(nrow(rpt)), rpt$CI_Lower, y1= rpt$CI_Upper, lwd=1, lend=1)
abline(h = 0)

# order by estimate
tmp <- rpt[order(rpt$xbar),]
plot(tmp$xbar, pch=20, ylim=range(pretty(c(tmp$CI_Lower, tmp$CI_Upper))),
     xlab='', ylab='', las=1, cex.axis=0.8, cex=.5, xaxt='n')
segments(seq_len(nrow(tmp)), tmp$CI_Lower, y1= tmp$CI_Upper, lend=1)
segments(seq_len(nrow(tmp)), tmp$CI_Lower, y1= tmp$CI_Upper, lwd=1, lend=1)
abline(h = 0)\
abline(h = 100, col="green")
	#*-----------------------------------------------------------------
	# \| PROGRAM NAME: fun with CIs.r
	# \| DATE: 7/16/22
	# \| CREATED BY: MATT BOGARD
	# \| PROJECT FILE:
	# *----------------------------------------------------------------
	# \| PURPOSE: basic CI simulation
	# *----------------------------------------------------------------



	rm(list=ls()) # get rid of any existing data
	cat("\f") # clear console
	dev.off() # clear plots
	options("scipen" =100, "digits" = 4) # override R's tendency to use scientific notation


	library(MASS)



	#
	# simulate population
	#

	mu <- 100 # mean 4000
	sd <- 10 # standard deviation 3 x mean



	# simulate costs based on parameter assumptions above
	set.seed(1234567)
	y <- rnorm(10000,100,10)

	# check descriptive results
	summary(y)
	sd(y)
	hist(y, breaks = 50)



	# data frame
	ID <- seq(1,10000)

	df <- data.frame(ID,y)



	#
	# simualte sampling
	#


	Sn = 30
	set.seed(9876) # make this repeatable 1234567

	xbar <- c()
	stderr <- c()
	CI_Lower <- c()
	CI_Upper <- c()
	repNum <- c()

	for (i in 1:500) {
	sample <- df[sample(nrow(df), Sn, replace = F), ] # sample from treatment group

	est <- mean(sample$y)
	se <- sd(sample$y)/sqrt(Sn) # SE
	#LB <- est - 1.96*se
	#UB <- est + 1.96*se
	LB <- est - se # just look at error bars
	UB <- est + se # just look at error bars
	xbar <- c(xbar,est)
	stderr <- c(stderr,se)
	CI_Lower <- c(CI_Lower, LB)
	CI_Upper <- c(CI_Upper, UB)
	repNum <- c(repNum,i) # get p-value from this bs sample
	print(cbind("Iteration:", i)) # comment this out to stop printing to log
	}


	rpt <- data.frame(repNum,xbar,stderr,CI_Lower,CI_Upper)

	rm(LB,UB,se,est,i,sample) # cleanup


	k <- 100 # population parameter we are estimating


	# report the simulation results
	rpt$flag <- ifelse((rpt$CI_Lower <= k & rpt$CI_Upper >= k),1,0) # interval contains 'k'
	rpt$sig <- ifelse((rpt$CI_Lower <= 0 & rpt$CI_Upper >= 0),0,1) # 1 if statistically different from 0
	rpt$diffLB <- abs(k - rpt$CI_Lower) # difference between k and CI lower bound
	rpt$diffUB <- abs(k - rpt$CI_Upper) # difference between k and CI upper bound
	rpt$diffEst <- abs(k - rpt$xbar) # difference between point estimate and k
	rpt$est_closer <- ifelse(rpt$diffEst < rpt$diffLB & rpt$diffEst < rpt$diffUB,1,0) # flag when point estimate is closer to k
	summary(rpt) # check

	sd(rpt$xbar)

	hist(rpt$xbar, breaks = 50)


	# visualize

	plot(rpt$xbar, pch=20, ylim=range(pretty(c(rpt$CI_Lower, rpt$CI_Upper))),
	xlab='', ylab='', las=1, cex.axis=0.8, cex=.5, xaxt='n')
	segments(seq_len(nrow(rpt)), rpt$CI_Lower, y1= rpt$CI_Upper, lend=1)
	segments(seq_len(nrow(rpt)), rpt$CI_Lower, y1= rpt$CI_Upper, lwd=1, lend=1)
	abline(h = 0)

	# order by estimate
	tmp <- rpt[order(rpt$xbar),]
	plot(tmp$xbar, pch=20, ylim=range(pretty(c(tmp$CI_Lower, tmp$CI_Upper))),
	xlab='', ylab='', las=1, cex.axis=0.8, cex=.5, xaxt='n')
	segments(seq_len(nrow(tmp)), tmp$CI_Lower, y1= tmp$CI_Upper, lend=1)
	segments(seq_len(nrow(tmp)), tmp$CI_Lower, y1= tmp$CI_Upper, lwd=1, lend=1)
	abline(h = 0)\
	abline(h = 100, col="green")