Last active
August 29, 2015 14:27
-
-
Save ncray/4cffdfce696086eda079 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
## There's a dependency on https://github.com/JohnLangford/vowpal_wabbit | |
library(plyr) | |
library(dplyr) | |
library(magrittr) | |
library(pander) | |
panderOptions('table.split.table', Inf) | |
LOW_MEAN <- .4 | |
HIGH_MEAN <- .6 | |
LOW_N <- 10 | |
HIGH_N <- 40 | |
LOW_SD <- .05 | |
HIGH_SD <- .1 | |
NUM_REPLICATES <- 20000 | |
## WARNING!!, removes files in WORKING_DIR | |
WORKING_DIR <- tempdir() | |
set.seed(0) | |
## Writes a file in VW format. | |
WriteVW <- function(y, x, file.name) { | |
write.table(data.frame(paste(y, "|"), x), file = file.name, sep = " ", | |
quote = FALSE, row.names = FALSE, col.names = FALSE) | |
} | |
## Generates Normal sample data. | |
GenerateData <- function(N, Mean, SD, Replicate) { | |
data.frame(y = rnorm(n = N, mean = Mean, sd = SD)) | |
} | |
## Creates buckets based on quantiles. | |
QuantileBuckets <- function(x, num.buckets) { | |
cut(x, | |
quantile(x, seq(0, 1, 1 / num.buckets)) %>% unique, | |
include.lowest = TRUE) | |
} | |
## Rounds numeric columns in a data frame to num.digits. | |
RoundDF <- function(x, num.digits = 3) { | |
cols <- sapply(x, class) == 'numeric' | |
x[cols] <- round(x[cols], num.digits) | |
x | |
} | |
## Generate parameters based on cartesian product. | |
params.df <- expand.grid(N = c(LOW_N, HIGH_N), | |
Mean = c(LOW_MEAN, HIGH_MEAN), | |
SD = c(LOW_SD, HIGH_SD), | |
Replicate = 1:NUM_REPLICATES) | |
setwd(WORKING_DIR) | |
## WARNING!!, removes files in WORKING_DIR | |
list.files(WORKING_DIR) %>% file.remove | |
## Generate data based on parameter data frame. | |
df <- mdply(params.df, GenerateData) | |
## Add description column, which is a feature in the VW model, so it learns a | |
## coefficient for each replicate/parameter combo. | |
df %<>% mutate(Description = paste(N, SD, Replicate, Mean, sep = "-")) | |
## Shuffle rows (online learning). | |
df <- df[sample(1:nrow(df)), ] | |
WriteVW(df$y, df$Description, "overall_data.txt") | |
## Train online model, and read in predictions. | |
## Calculating absolute error instead of squared error so the units will be nicer. The principles are the same. | |
system("vw overall_data.txt -p preds.txt -b 26") | |
df %<>% mutate(Prediction = scan("preds.txt"), | |
AbsoluteError = abs(Prediction - y)) | |
## Write error data and train basic error model. | |
WriteVW(df$AbsoluteError, df$Description, "overall_error_data.txt") | |
system("vw overall_error_data.txt -p overall_error_preds.txt -b 26") | |
## Train conservative (bootstrapped) error model. | |
system("vw overall_error_data.txt -p bootstrap_preds.txt --bootstrap 100 -b 26 --random_seed 0") | |
## Read in error model output. | |
bootstrap.df <- read.csv("bootstrap_preds.txt", header = FALSE, sep = " ") | |
df %<>% mutate(EstimatedError = scan("overall_error_preds.txt"), | |
BootstrapAvg = bootstrap.df$V1, | |
BootstrapMin = bootstrap.df$V2, | |
BootstrapMax = bootstrap.df$V3, | |
BootstrapWidth = BootstrapMax - BootstrapMin, | |
ErrorModelDecile = QuantileBuckets(EstimatedError, 10), | |
ConservativeErrorModelDecile = QuantileBuckets(BootstrapMax, 10), | |
BootstrapAvgBucket = QuantileBuckets(BootstrapAvg, 10)) | |
## Define summary function. | |
MySummary <- . %>% | |
summarize(AvgEstimatedError = mean(EstimatedError), | |
AvgAbsoluteError = mean(AbsoluteError), | |
`%Outliers` = mean(AbsoluteError > .15) * 100, | |
NumInBucket = n(), | |
`%HighN_LowSD` = mean(N == HIGH_N & SD == LOW_SD) * 100, | |
`%HighN_HighSD` = mean(N == HIGH_N & SD == HIGH_SD) * 100, | |
`%LowN_LowSD` = mean(N == LOW_N & SD == LOW_SD) * 100, | |
`%LowN_HighSD` = mean(N == LOW_N & SD == HIGH_SD) * 100, | |
PropHighN = mean(N == HIGH_N), | |
PropLowSD = mean(SD == LOW_SD), | |
PropLowMean = mean(Mean == LOW_MEAN)) | |
error.row <- df %>% | |
group_by(ErrorModelDecile) %>% | |
MySummary %>% | |
RoundDF %>% | |
mutate(Description = "Error Model Best Decile") %>% | |
select(Description, NumInBucket, AvgAbsoluteError, `%Outliers`, `%HighN_LowSD`, | |
`%HighN_HighSD`, `%LowN_LowSD`, `%LowN_HighSD`) %>% | |
head(1) | |
conservative.row <- df %>% | |
group_by(ConservativeErrorModelDecile) %>% | |
MySummary %>% | |
RoundDF %>% | |
mutate(Description = "Conservative Error Model Best Decile") %>% | |
select(Description, NumInBucket, AvgAbsoluteError, `%Outliers`, `%HighN_LowSD`, | |
`%HighN_HighSD`, `%LowN_LowSD`, `%LowN_HighSD`) %>% | |
head(1) | |
overall.row <- df %>% | |
group_by() %>% | |
MySummary %>% | |
RoundDF %>% | |
mutate(Description = "Overall") %>% | |
select(Description, NumInBucket, AvgAbsoluteError, `%Outliers`, `%HighN_LowSD`, | |
`%HighN_HighSD`, `%LowN_LowSD`, `%LowN_HighSD`) %>% | |
head(1) | |
rbind(overall.row, error.row, conservative.row) %>% | |
pandoc.table(caption = "Overall Error Metrics Versus Error Model Best Decile and Conservative Error Model Best Decile") | |
df %>% | |
group_by() %>% | |
MySummary %>% | |
pandoc.table(caption = "Overall Metrics") | |
df %>% | |
group_by(ErrorModelDecile) %>% | |
MySummary %>% | |
pandoc.table(caption = "Error Model Metrics By Decile") | |
df %>% | |
group_by(ConservativeErrorModelDecile) %>% | |
MySummary %>% | |
pandoc.table(caption = "Conservative Error Model Model Metrics By Decile") |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment