ncray/Error Model Simulation Code.R

## Error Model Simulation Code.R
## There's a dependency on https://github.com/JohnLangford/vowpal_wabbit
library(plyr)
library(dplyr)
library(magrittr)
library(pander)

panderOptions('table.split.table', Inf)

LOW_MEAN <- .4
HIGH_MEAN <- .6
LOW_N <- 10
HIGH_N <- 40
LOW_SD <- .05
HIGH_SD <- .1
NUM_REPLICATES <- 20000
## WARNING!!, removes files in WORKING_DIR
WORKING_DIR <- tempdir()
set.seed(0)

## Writes a file in VW format.
WriteVW <- function(y, x, file.name) {
    write.table(data.frame(paste(y, "|"), x), file = file.name, sep = " ",
                quote = FALSE, row.names = FALSE, col.names = FALSE)
}

## Generates Normal sample data.
GenerateData <- function(N, Mean, SD, Replicate) {
    data.frame(y = rnorm(n = N, mean = Mean, sd = SD))
}

## Creates buckets based on quantiles.
QuantileBuckets <- function(x, num.buckets) {
    cut(x,
        quantile(x, seq(0, 1, 1 / num.buckets)) %>% unique,
        include.lowest = TRUE)
}

## Rounds numeric columns in a data frame to num.digits.
RoundDF <- function(x, num.digits = 3) {
    cols <- sapply(x, class) == 'numeric'
    x[cols] <-  round(x[cols], num.digits)
    x
}

## Generate parameters based on cartesian product.
params.df <- expand.grid(N = c(LOW_N, HIGH_N),
                         Mean = c(LOW_MEAN, HIGH_MEAN),
                         SD = c(LOW_SD, HIGH_SD),
                         Replicate = 1:NUM_REPLICATES)

setwd(WORKING_DIR)
## WARNING!!, removes files in WORKING_DIR
list.files(WORKING_DIR) %>% file.remove

## Generate data based on parameter data frame.
df <- mdply(params.df, GenerateData)

## Add description column, which is a feature in the VW model, so it learns a
## coefficient for each replicate/parameter combo.
df %<>% mutate(Description = paste(N, SD, Replicate, Mean, sep = "-"))

## Shuffle rows (online learning).
df <- df[sample(1:nrow(df)), ]
WriteVW(df$y, df$Description, "overall_data.txt")

## Train online model, and read in predictions.
## Calculating absolute error instead of squared error so the units will be nicer. The principles are the same.
system("vw overall_data.txt -p preds.txt -b 26")
df %<>% mutate(Prediction = scan("preds.txt"),
               AbsoluteError = abs(Prediction - y))

## Write error data and train basic error model.
WriteVW(df$AbsoluteError, df$Description, "overall_error_data.txt")
system("vw overall_error_data.txt -p overall_error_preds.txt -b 26")

## Train conservative (bootstrapped) error model.
system("vw overall_error_data.txt -p bootstrap_preds.txt --bootstrap 100 -b 26 --random_seed 0")

## Read in error model output.
bootstrap.df <- read.csv("bootstrap_preds.txt", header = FALSE, sep = " ")
df %<>% mutate(EstimatedError = scan("overall_error_preds.txt"),
               BootstrapAvg = bootstrap.df$V1,
               BootstrapMin = bootstrap.df$V2,
               BootstrapMax = bootstrap.df$V3,
               BootstrapWidth = BootstrapMax - BootstrapMin,
               ErrorModelDecile = QuantileBuckets(EstimatedError, 10),
               ConservativeErrorModelDecile = QuantileBuckets(BootstrapMax, 10),
               BootstrapAvgBucket = QuantileBuckets(BootstrapAvg, 10))

## Define summary function.
MySummary <- . %>%
  summarize(AvgEstimatedError = mean(EstimatedError),
            AvgAbsoluteError = mean(AbsoluteError),
            `%Outliers` = mean(AbsoluteError > .15) * 100,
            NumInBucket = n(),
            `%HighN_LowSD` = mean(N == HIGH_N & SD == LOW_SD) * 100,
            `%HighN_HighSD` = mean(N == HIGH_N & SD == HIGH_SD) * 100,
            `%LowN_LowSD` = mean(N == LOW_N & SD == LOW_SD) * 100,
            `%LowN_HighSD` = mean(N == LOW_N & SD == HIGH_SD) * 100,
            PropHighN = mean(N == HIGH_N),
            PropLowSD = mean(SD == LOW_SD),
            PropLowMean = mean(Mean == LOW_MEAN))


error.row <- df %>%
  group_by(ErrorModelDecile) %>%
  MySummary %>%
  RoundDF %>%
  mutate(Description = "Error Model Best Decile") %>%
  select(Description, NumInBucket, AvgAbsoluteError, `%Outliers`, `%HighN_LowSD`,
         `%HighN_HighSD`, `%LowN_LowSD`, `%LowN_HighSD`) %>%
  head(1)


conservative.row <- df %>%
  group_by(ConservativeErrorModelDecile) %>%
  MySummary %>%
  RoundDF %>%
  mutate(Description = "Conservative Error Model Best Decile") %>%
  select(Description, NumInBucket, AvgAbsoluteError, `%Outliers`, `%HighN_LowSD`,
         `%HighN_HighSD`, `%LowN_LowSD`, `%LowN_HighSD`) %>%
  head(1)

overall.row <- df %>%
  group_by() %>%
  MySummary %>%
  RoundDF %>%
  mutate(Description = "Overall") %>%
  select(Description, NumInBucket, AvgAbsoluteError, `%Outliers`, `%HighN_LowSD`,
         `%HighN_HighSD`, `%LowN_LowSD`, `%LowN_HighSD`) %>%
  head(1)

rbind(overall.row, error.row, conservative.row) %>%
  pandoc.table(caption = "Overall Error Metrics Versus Error Model Best Decile and Conservative Error Model Best Decile")

df %>%
  group_by() %>%
  MySummary %>%
  pandoc.table(caption = "Overall Metrics")

df %>%
  group_by(ErrorModelDecile) %>%
  MySummary %>%
  pandoc.table(caption = "Error Model Metrics By Decile")

df %>%
  group_by(ConservativeErrorModelDecile) %>%
  MySummary %>%
  pandoc.table(caption = "Conservative Error Model Model Metrics By Decile")
	## There's a dependency on https://github.com/JohnLangford/vowpal_wabbit
	library(plyr)
	library(dplyr)
	library(magrittr)
	library(pander)

	panderOptions('table.split.table', Inf)

	LOW_MEAN <- .4
	HIGH_MEAN <- .6
	LOW_N <- 10
	HIGH_N <- 40
	LOW_SD <- .05
	HIGH_SD <- .1
	NUM_REPLICATES <- 20000
	## WARNING!!, removes files in WORKING_DIR
	WORKING_DIR <- tempdir()
	set.seed(0)

	## Writes a file in VW format.
	WriteVW <- function(y, x, file.name) {
	write.table(data.frame(paste(y, "\|"), x), file = file.name, sep = " ",
	quote = FALSE, row.names = FALSE, col.names = FALSE)
	}

	## Generates Normal sample data.
	GenerateData <- function(N, Mean, SD, Replicate) {
	data.frame(y = rnorm(n = N, mean = Mean, sd = SD))
	}

	## Creates buckets based on quantiles.
	QuantileBuckets <- function(x, num.buckets) {
	cut(x,
	quantile(x, seq(0, 1, 1 / num.buckets)) %>% unique,
	include.lowest = TRUE)
	}

	## Rounds numeric columns in a data frame to num.digits.
	RoundDF <- function(x, num.digits = 3) {
	cols <- sapply(x, class) == 'numeric'
	x[cols] <- round(x[cols], num.digits)
	x
	}

	## Generate parameters based on cartesian product.
	params.df <- expand.grid(N = c(LOW_N, HIGH_N),
	Mean = c(LOW_MEAN, HIGH_MEAN),
	SD = c(LOW_SD, HIGH_SD),
	Replicate = 1:NUM_REPLICATES)

	setwd(WORKING_DIR)
	## WARNING!!, removes files in WORKING_DIR
	list.files(WORKING_DIR) %>% file.remove

	## Generate data based on parameter data frame.
	df <- mdply(params.df, GenerateData)

	## Add description column, which is a feature in the VW model, so it learns a
	## coefficient for each replicate/parameter combo.
	df %<>% mutate(Description = paste(N, SD, Replicate, Mean, sep = "-"))

	## Shuffle rows (online learning).
	df <- df[sample(1:nrow(df)), ]
	WriteVW(df$y, df$Description, "overall_data.txt")

	## Train online model, and read in predictions.
	## Calculating absolute error instead of squared error so the units will be nicer. The principles are the same.
	system("vw overall_data.txt -p preds.txt -b 26")
	df %<>% mutate(Prediction = scan("preds.txt"),
	AbsoluteError = abs(Prediction - y))

	## Write error data and train basic error model.
	WriteVW(df$AbsoluteError, df$Description, "overall_error_data.txt")
	system("vw overall_error_data.txt -p overall_error_preds.txt -b 26")

	## Train conservative (bootstrapped) error model.
	system("vw overall_error_data.txt -p bootstrap_preds.txt --bootstrap 100 -b 26 --random_seed 0")

	## Read in error model output.
	bootstrap.df <- read.csv("bootstrap_preds.txt", header = FALSE, sep = " ")
	df %<>% mutate(EstimatedError = scan("overall_error_preds.txt"),
	BootstrapAvg = bootstrap.df$V1,
	BootstrapMin = bootstrap.df$V2,
	BootstrapMax = bootstrap.df$V3,
	BootstrapWidth = BootstrapMax - BootstrapMin,
	ErrorModelDecile = QuantileBuckets(EstimatedError, 10),
	ConservativeErrorModelDecile = QuantileBuckets(BootstrapMax, 10),
	BootstrapAvgBucket = QuantileBuckets(BootstrapAvg, 10))

	## Define summary function.
	MySummary <- . %>%
	summarize(AvgEstimatedError = mean(EstimatedError),
	AvgAbsoluteError = mean(AbsoluteError),
	`%Outliers` = mean(AbsoluteError > .15) * 100,
	NumInBucket = n(),
	`%HighN_LowSD` = mean(N == HIGH_N & SD == LOW_SD) * 100,
	`%HighN_HighSD` = mean(N == HIGH_N & SD == HIGH_SD) * 100,
	`%LowN_LowSD` = mean(N == LOW_N & SD == LOW_SD) * 100,
	`%LowN_HighSD` = mean(N == LOW_N & SD == HIGH_SD) * 100,
	PropHighN = mean(N == HIGH_N),
	PropLowSD = mean(SD == LOW_SD),
	PropLowMean = mean(Mean == LOW_MEAN))


	error.row <- df %>%
	group_by(ErrorModelDecile) %>%
	MySummary %>%
	RoundDF %>%
	mutate(Description = "Error Model Best Decile") %>%
	select(Description, NumInBucket, AvgAbsoluteError, `%Outliers`, `%HighN_LowSD`,
	`%HighN_HighSD`, `%LowN_LowSD`, `%LowN_HighSD`) %>%
	head(1)


	conservative.row <- df %>%
	group_by(ConservativeErrorModelDecile) %>%
	MySummary %>%
	RoundDF %>%
	mutate(Description = "Conservative Error Model Best Decile") %>%
	select(Description, NumInBucket, AvgAbsoluteError, `%Outliers`, `%HighN_LowSD`,
	`%HighN_HighSD`, `%LowN_LowSD`, `%LowN_HighSD`) %>%
	head(1)

	overall.row <- df %>%
	group_by() %>%
	MySummary %>%
	RoundDF %>%
	mutate(Description = "Overall") %>%
	select(Description, NumInBucket, AvgAbsoluteError, `%Outliers`, `%HighN_LowSD`,
	`%HighN_HighSD`, `%LowN_LowSD`, `%LowN_HighSD`) %>%
	head(1)

	rbind(overall.row, error.row, conservative.row) %>%
	pandoc.table(caption = "Overall Error Metrics Versus Error Model Best Decile and Conservative Error Model Best Decile")

	df %>%
	group_by() %>%
	MySummary %>%
	pandoc.table(caption = "Overall Metrics")

	df %>%
	group_by(ErrorModelDecile) %>%
	MySummary %>%
	pandoc.table(caption = "Error Model Metrics By Decile")

	df %>%
	group_by(ConservativeErrorModelDecile) %>%
	MySummary %>%
	pandoc.table(caption = "Conservative Error Model Model Metrics By Decile")