Skip to content

Instantly share code, notes, and snippets.

@ncray
Last active August 29, 2015 14:27
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save ncray/4cffdfce696086eda079 to your computer and use it in GitHub Desktop.
Save ncray/4cffdfce696086eda079 to your computer and use it in GitHub Desktop.
## There's a dependency on https://github.com/JohnLangford/vowpal_wabbit
library(plyr)
library(dplyr)
library(magrittr)
library(pander)
panderOptions('table.split.table', Inf)
LOW_MEAN <- .4
HIGH_MEAN <- .6
LOW_N <- 10
HIGH_N <- 40
LOW_SD <- .05
HIGH_SD <- .1
NUM_REPLICATES <- 20000
## WARNING!!, removes files in WORKING_DIR
WORKING_DIR <- tempdir()
set.seed(0)
## Writes a file in VW format.
WriteVW <- function(y, x, file.name) {
write.table(data.frame(paste(y, "|"), x), file = file.name, sep = " ",
quote = FALSE, row.names = FALSE, col.names = FALSE)
}
## Generates Normal sample data.
GenerateData <- function(N, Mean, SD, Replicate) {
data.frame(y = rnorm(n = N, mean = Mean, sd = SD))
}
## Creates buckets based on quantiles.
QuantileBuckets <- function(x, num.buckets) {
cut(x,
quantile(x, seq(0, 1, 1 / num.buckets)) %>% unique,
include.lowest = TRUE)
}
## Rounds numeric columns in a data frame to num.digits.
RoundDF <- function(x, num.digits = 3) {
cols <- sapply(x, class) == 'numeric'
x[cols] <- round(x[cols], num.digits)
x
}
## Generate parameters based on cartesian product.
params.df <- expand.grid(N = c(LOW_N, HIGH_N),
Mean = c(LOW_MEAN, HIGH_MEAN),
SD = c(LOW_SD, HIGH_SD),
Replicate = 1:NUM_REPLICATES)
setwd(WORKING_DIR)
## WARNING!!, removes files in WORKING_DIR
list.files(WORKING_DIR) %>% file.remove
## Generate data based on parameter data frame.
df <- mdply(params.df, GenerateData)
## Add description column, which is a feature in the VW model, so it learns a
## coefficient for each replicate/parameter combo.
df %<>% mutate(Description = paste(N, SD, Replicate, Mean, sep = "-"))
## Shuffle rows (online learning).
df <- df[sample(1:nrow(df)), ]
WriteVW(df$y, df$Description, "overall_data.txt")
## Train online model, and read in predictions.
## Calculating absolute error instead of squared error so the units will be nicer. The principles are the same.
system("vw overall_data.txt -p preds.txt -b 26")
df %<>% mutate(Prediction = scan("preds.txt"),
AbsoluteError = abs(Prediction - y))
## Write error data and train basic error model.
WriteVW(df$AbsoluteError, df$Description, "overall_error_data.txt")
system("vw overall_error_data.txt -p overall_error_preds.txt -b 26")
## Train conservative (bootstrapped) error model.
system("vw overall_error_data.txt -p bootstrap_preds.txt --bootstrap 100 -b 26 --random_seed 0")
## Read in error model output.
bootstrap.df <- read.csv("bootstrap_preds.txt", header = FALSE, sep = " ")
df %<>% mutate(EstimatedError = scan("overall_error_preds.txt"),
BootstrapAvg = bootstrap.df$V1,
BootstrapMin = bootstrap.df$V2,
BootstrapMax = bootstrap.df$V3,
BootstrapWidth = BootstrapMax - BootstrapMin,
ErrorModelDecile = QuantileBuckets(EstimatedError, 10),
ConservativeErrorModelDecile = QuantileBuckets(BootstrapMax, 10),
BootstrapAvgBucket = QuantileBuckets(BootstrapAvg, 10))
## Define summary function.
MySummary <- . %>%
summarize(AvgEstimatedError = mean(EstimatedError),
AvgAbsoluteError = mean(AbsoluteError),
`%Outliers` = mean(AbsoluteError > .15) * 100,
NumInBucket = n(),
`%HighN_LowSD` = mean(N == HIGH_N & SD == LOW_SD) * 100,
`%HighN_HighSD` = mean(N == HIGH_N & SD == HIGH_SD) * 100,
`%LowN_LowSD` = mean(N == LOW_N & SD == LOW_SD) * 100,
`%LowN_HighSD` = mean(N == LOW_N & SD == HIGH_SD) * 100,
PropHighN = mean(N == HIGH_N),
PropLowSD = mean(SD == LOW_SD),
PropLowMean = mean(Mean == LOW_MEAN))
error.row <- df %>%
group_by(ErrorModelDecile) %>%
MySummary %>%
RoundDF %>%
mutate(Description = "Error Model Best Decile") %>%
select(Description, NumInBucket, AvgAbsoluteError, `%Outliers`, `%HighN_LowSD`,
`%HighN_HighSD`, `%LowN_LowSD`, `%LowN_HighSD`) %>%
head(1)
conservative.row <- df %>%
group_by(ConservativeErrorModelDecile) %>%
MySummary %>%
RoundDF %>%
mutate(Description = "Conservative Error Model Best Decile") %>%
select(Description, NumInBucket, AvgAbsoluteError, `%Outliers`, `%HighN_LowSD`,
`%HighN_HighSD`, `%LowN_LowSD`, `%LowN_HighSD`) %>%
head(1)
overall.row <- df %>%
group_by() %>%
MySummary %>%
RoundDF %>%
mutate(Description = "Overall") %>%
select(Description, NumInBucket, AvgAbsoluteError, `%Outliers`, `%HighN_LowSD`,
`%HighN_HighSD`, `%LowN_LowSD`, `%LowN_HighSD`) %>%
head(1)
rbind(overall.row, error.row, conservative.row) %>%
pandoc.table(caption = "Overall Error Metrics Versus Error Model Best Decile and Conservative Error Model Best Decile")
df %>%
group_by() %>%
MySummary %>%
pandoc.table(caption = "Overall Metrics")
df %>%
group_by(ErrorModelDecile) %>%
MySummary %>%
pandoc.table(caption = "Error Model Metrics By Decile")
df %>%
group_by(ConservativeErrorModelDecile) %>%
MySummary %>%
pandoc.table(caption = "Conservative Error Model Model Metrics By Decile")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment