adambear91/tobit_vs_OLS_simulations.R

## tobit_vs_OLS_simulations.R
require(ggplot2)
require(GGally)
require(VGAM)

# Function to create a simulated data set and run models on it
oneRun <- function(number_of_obs,eff_IV1,eff_IV2,baseline,sd,lowerCutoff,upperCutoff,roundOn){
  # partition data into 4 conditions, specified by IV1 and IV2
  partition <- number_of_obs /4
  df <- expand.grid(IV1 = c(T,F), IV2 = c(T,F))
  df <- df[rep(row.names(df), partition), 1:2]

  # sample uncensored values from normal distribution
  df$DV_UC <- rnorm(number_of_obs, mean = baseline, sd = sd)
  df$DV_UC <- eff_IV1*df$IV1 + eff_IV2*df$IV2 + df$DV_UC

  # create censored variable with upper and lower cutoffs
  df$DV_C <-df$DV_UC
  df$DV_C[df$DV_C>upperCutoff] <- upperCutoff
  df$DV_C[df$DV_C<lowerCutoff] <- lowerCutoff

  # if rounding is on, round values to nearest integer
  if (roundOn) {
    df$DV_C <- round(df$DV_C)
  }

  # Run OLS model and store interaction p-value
  fit1 <- lm(DV_C ~ IV1 * IV2, data=df)
  pOLS <- summary(fit1)$coefficients[[16]]

  # Run Tobit model and store interaction p-value
  fit2 <- vglm(DV_C ~ IV1 * IV2, data=df,family=tobit(Upper = upperCutoff, Lower = lowerCutoff))
  ctable <- coef(summary(fit2))
  pTobit <- ctable[20]

  return (c(pOLS,pTobit))
}

# Parameters for simulations
number_of_obs <- 800 # number of observations per data set
eff_IV1 <- 1.5 # main effect 1
eff_IV2 <- 1.5 # main effect 2
baseline <- 1.5 # baseline mean
sd <- 2 # standard deviation of all conditions
lowerCutoff <- 1 # lower bound on measure
upperCutoff <- 7 # upper bound on measure
roundOn <- TRUE # rounds values to nearest integer
numSims <- 5000 # number of simulated data sets

# data frame to store simulated p-values
df_out <- data.frame(pOLS=numeric(),
                     pTobit=numeric())

# Loop through all simulations and store resulting p-values
for ( i in 1:numSims) {
  if (i%%100==0) { print(i) } # print every 100 simulations to console
  ps <- oneRun(number_of_obs,eff_IV1,eff_IV2,baseline,sd,lowerCutoff,upperCutoff,roundOn)
  newdf <- data.frame(pOLS=ps[[1]], pTobit=ps[[2]])
  names(newdf)<-c("pOLS","pTobit")
  df_out <- rbind(df_out, newdf)
}

# Create overlaid histogram of p-values
OLS <- data.frame(ps = df_out$pOLS)
Tobit <- data.frame(ps = df_out$pTobit)
OLS$label <- 'OLS'
Tobit$label <- 'Tobit'
pDists <- rbind(OLS,Tobit)
ggplot(pDists, aes(ps, fill = label)) + geom_histogram(alpha = 0.5, aes(y = ..count..), position = 'identity', binwidth=.02)

# Thanks to Mohsen Mosleh for help writing this code.
	require(ggplot2)
	require(GGally)
	require(VGAM)

	# Function to create a simulated data set and run models on it
	oneRun <- function(number_of_obs,eff_IV1,eff_IV2,baseline,sd,lowerCutoff,upperCutoff,roundOn){
	# partition data into 4 conditions, specified by IV1 and IV2
	partition <- number_of_obs /4
	df <- expand.grid(IV1 = c(T,F), IV2 = c(T,F))
	df <- df[rep(row.names(df), partition), 1:2]

	# sample uncensored values from normal distribution
	df$DV_UC <- rnorm(number_of_obs, mean = baseline, sd = sd)
	df$DV_UC <- eff_IV1df$IV1 + eff_IV2df$IV2 + df$DV_UC

	# create censored variable with upper and lower cutoffs
	df$DV_C <-df$DV_UC
	df$DV_C[df$DV_C>upperCutoff] <- upperCutoff
	df$DV_C[df$DV_C<lowerCutoff] <- lowerCutoff

	# if rounding is on, round values to nearest integer
	if (roundOn) {
	df$DV_C <- round(df$DV_C)
	}

	# Run OLS model and store interaction p-value
	fit1 <- lm(DV_C ~ IV1 * IV2, data=df)
	pOLS <- summary(fit1)$coefficients[[16]]

	# Run Tobit model and store interaction p-value
	fit2 <- vglm(DV_C ~ IV1 * IV2, data=df,family=tobit(Upper = upperCutoff, Lower = lowerCutoff))
	ctable <- coef(summary(fit2))
	pTobit <- ctable[20]

	return (c(pOLS,pTobit))
	}

	# Parameters for simulations
	number_of_obs <- 800 # number of observations per data set
	eff_IV1 <- 1.5 # main effect 1
	eff_IV2 <- 1.5 # main effect 2
	baseline <- 1.5 # baseline mean
	sd <- 2 # standard deviation of all conditions
	lowerCutoff <- 1 # lower bound on measure
	upperCutoff <- 7 # upper bound on measure
	roundOn <- TRUE # rounds values to nearest integer
	numSims <- 5000 # number of simulated data sets

	# data frame to store simulated p-values
	df_out <- data.frame(pOLS=numeric(),
	pTobit=numeric())

	# Loop through all simulations and store resulting p-values
	for ( i in 1:numSims) {
	if (i%%100==0) { print(i) } # print every 100 simulations to console
	ps <- oneRun(number_of_obs,eff_IV1,eff_IV2,baseline,sd,lowerCutoff,upperCutoff,roundOn)
	newdf <- data.frame(pOLS=ps[[1]], pTobit=ps[[2]])
	names(newdf)<-c("pOLS","pTobit")
	df_out <- rbind(df_out, newdf)
	}

	# Create overlaid histogram of p-values
	OLS <- data.frame(ps = df_out$pOLS)
	Tobit <- data.frame(ps = df_out$pTobit)
	OLS$label <- 'OLS'
	Tobit$label <- 'Tobit'
	pDists <- rbind(OLS,Tobit)
	ggplot(pDists, aes(ps, fill = label)) + geom_histogram(alpha = 0.5, aes(y = ..count..), position = 'identity', binwidth=.02)

	# Thanks to Mohsen Mosleh for help writing this code.