# Example usage of R data.table in an incident analysis
# data was closed incidents in a tab-separated txt file that looked like this
# week category P1 P2 P3
# sucked up into R, it formed a data frame like so
# 1 28/05/2012 some category1 NA NA 1
# 2 28/05/2012 some category2 NA 2 1
# 3 28/05/2012 some category3 1 NA 1
# 4 28/05/2012 some category4 NA NA 2
# as it happens, this data was manually collected from excel spreadsheets
# unfortunately the spreadsheets were for differing week ranges and there was some overlap with
# occasionally differing values for P1, P2, P3
#
# the best approach here would be to take the last (i.e. latest) values for a given week and category
# because these would have been the most accurate (counting all the recently closed incidents)
# the data frame was ordered by recency of the data, so this could potentially work...
# otherwise a rough alternative was just to average the data :)
#
# how about we try both and check they roughly compare...
# start by loading up a data.frame
incidents <- read.delim("C:/Temp/incidents1.txt", header=F)
View(incidents)
colnames(incidents) <- c("week", "category", "P1", "P2", "P3")
nrow(incidents)
nrow(unique(incidents))
# 2587 rows in either case above
# plot out data - to do this we need to reshape so that P1, P2, P3 columns become two columns
# one being the type of the incident, the other the count of that incident type
library(reshape)
incidentsMelt <- melt(incidents[,c("week", "category", "P1", "P2", "P3")],id=c("week", "category"))
library(ggplot2)
qplot(as.Date(week,"%d/%m/%Y"), value, data=incidentsMelt, colour = variable)
# time to sort this data out
library(data.table)
incidentsDT <- data.table(incidents)
setkey(incidentsDT, week, category)
# data.table gives speedy queries when data > 100k rows (not the case here)
# fast binary search across keyed data - notice J() used to select on the key columns
incidentsDT[J("28/05/2012", "some categoryA")]
# a vector scan this time, but it allows wildcard/regex matching via %like%
incidentsDT[(week=="28/05/2012" & category %like% "Application Delvry")]
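# (an aside) newer data.table versions also allow ad hoc joins without setkey() via the
# on= argument - a sketch of the same keyed lookup as above:
incidentsDT[.("28/05/2012", "some categoryA"), on = .(week, category)]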
#
# let's try the mean approach to dealing with repeated, but differing, entries
#
incidentsDT2 <- copy(incidentsDT) # copy() so the := below doesn't also modify incidentsDT by reference
incidentsDT2[, MeanP1:=mean(P1), by=list(week, category)]
incidentsDT2[, MeanP2:=mean(P2), by=list(week, category)]
incidentsDT2[, MeanP3:=mean(P3), by=list(week, category)]
head(incidentsDT2)
nrow(incidentsDT2) # 2587 rows
tables()
sapply(incidentsDT2, class)
# selecting columns of a data frame
head(incidents[,c('week', 'category')])
# selecting columns of a data table - use list instead of c
tail(incidentsDT2[, list(week, category)])
nrow(unique(incidentsDT2[, list(week, category, MeanP1, MeanP2, MeanP3)])) # 2381 rows
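# note: the same per-group means can also be produced directly as a grouped aggregate,
# giving one row per week/category so no unique() step is needed - it should also give 2381 rows
incidentsDT[, list(MeanP1=mean(P1), MeanP2=mean(P2), MeanP3=mean(P3)), by=list(week, category)]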
#
# let's try a data.table aggregation approach
# look for most recent value of P1/P2/P3 based upon max index for a group by week & category
#
head(incidents)
# add an index value for the row
incidents$index <- as.numeric(rownames(incidents))
head(incidents)
incidentsDT3 <- data.table(incidents)
head(incidentsDT3)
# what does this aggregation look like...
head(incidentsDT3[,list(index1=max(index), P1, P2, P3), by=list(week,category)])
# giving 2587 rows...
nrow(incidentsDT3[,list(max(index), P1, P2, P3), by=list(week,category)])
# and 2416 unique rows - why is that different from the 2381 obtained above?
# because j here returns one row per original row (index1 just repeats the group max), and
# duplicate week/category entries can still carry different P1/P2/P3 values - there's no
# SQL-HAVING-style step yet to keep only the latest row in each group
# a better view is this
head(incidentsDT3[,list(index, index1=max(index), P1, P2, P3), by=list(week,category)])
# notice that index1 is repeated with the max value for each row:
# week category index index1 P1 P2 P3
# 1: 28/05/2012 some categoryA 1 1156 NA NA 1
# 2: 28/05/2012 some categoryA 1156 1156 NA NA 3
#
# but you can't just include an expression such as index==index1 in the i position
# because i is evaluated before j
# bugger, does this force me to take a two-step approach? (a one-step alternative is sketched a little further down)
incidentsDT4 <- incidentsDT3[,list(index, index1=max(index), P1, P2, P3), by=list(week,category)]
head(incidentsDT4[index == index1, list(index, index1, P1, P2, P3), by = list(week, category)])
# and row count is 2381 matching result from mean approach
nrow(incidentsDT4[index == index1, list(index, index1, P1, P2, P3), by = list(week, category)])
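# as promised, a one-step alternative: data.table's .SD (the subset of data for each group)
# lets us keep just the highest-index row per week/category directly in j
incidentsDT6 <- incidentsDT3[, .SD[which.max(index)], by=list(week, category)]
nrow(incidentsDT6) # should also be 2381 rows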
#
# let's use up some more memory and create a cleaner data table
#
incidentsDT5 <- incidentsDT4[index == index1, list(P1, P2, P3), by = list(week, category)]
# should I make NA 0? Maybe not
#
# let's make some weekly histograms
#
hist(incidentsDT5[,list(SumP1=sum(P1)),by=week][,SumP1]) # oops - NA + a valid number = NA
hist(incidentsDT5[,list(SumP2=sum(P2)),by=week][,SumP2]) # ditto
hist(incidentsDT5[,list(SumP3=sum(P3)),by=week][,SumP3]) # this works because there's no NAs
hist(incidentsDT5[,list(SumP1=sum(P1, na.rm=T)),by=week][,SumP1]) # lots of zeros in here - might be sensible to model - does zero inflated regression apply here?
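# (if we did decide to set NA to 0, one sketch is data.table's set(), done here on a copy
# so the table used below keeps its NAs)
incidentsDT5zeros <- copy(incidentsDT5)
for (j in c("P1", "P2", "P3")) set(incidentsDT5zeros, which(is.na(incidentsDT5zeros[[j]])), j, 0)
head(incidentsDT5zeros)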
# so let's use na.rm to allow summation
# remove any 0 values
# and if we're doing this, then why not create some more clean data tables
WeeklySumP1=incidentsDT5[,list(SumP1=sum(P1, na.rm=T)),by=week][SumP1>0,list(days=as.integer(as.Date(week,"%d/%m/%Y")), week=as.Date(week,"%d/%m/%Y"),SumP1)]
WeeklySumP2=incidentsDT5[,list(SumP2=sum(P2, na.rm=T)),by=week][SumP2>0,list(days=as.numeric(as.Date(week,"%d/%m/%Y")), week=as.Date(week,"%d/%m/%Y"),SumP2)]
WeeklySumP3=incidentsDT5[,list(SumP3=sum(P3, na.rm=T)),by=week][SumP3>0,list(days=as.numeric(as.Date(week,"%d/%m/%Y")), week=as.Date(week,"%d/%m/%Y"),SumP3)]
hist(WeeklySumP1[,SumP1])
hist(WeeklySumP2[,SumP2])
hist(WeeklySumP3[,SumP3])
head(hist(incidentsDT5[,list(SumP1=sum(P1, na.rm=T)),by=week][,SumP1]))
# plot out the weekly P1, P2, P3 so we can see how they trend
qplot(week,SumP1,data=incidentsDT5[,list(SumP1=sum(P1, na.rm=T)),by=week][,list(week=as.Date(week,"%d/%m/%Y"),SumP1)])
# interesting - the plot above includes the zero-valued points, and we don't want to see them
qplot(week,SumP1,data=incidentsDT5[,list(SumP1=sum(P1, na.rm=T)),by=week][SumP1>0,list(week=as.Date(week,"%d/%m/%Y"),SumP1)])
qplot(week,SumP2,data=incidentsDT5[,list(SumP2=sum(P2, na.rm=T)),by=week][SumP2>0,list(week=as.Date(week,"%d/%m/%Y"),SumP2)])
qplot(week,SumP3,data=incidentsDT5[,list(SumP3=sum(P3, na.rm=T)),by=week][SumP3>0,list(week=as.Date(week,"%d/%m/%Y"),SumP3)])
#
# the P3 plot shows the distribution of points climbing in a roughly linear fashion
# what I think should be done is evaluate the distribution at 3 points in time
# fit a poisson (or whatever best fits all 3)
# then apply a linear contribution so the simulated counts grow appropriately over time
#
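# a sketch of that idea in one model (not what's done below): a Poisson regression of the
# weekly P3 count on time gives a rate lambda(t) = exp(a + b*days) that grows over time,
# which could then be used to simulate future weeks with rpois()
p3.glm <- glm(SumP3 ~ days, family = poisson, data = WeeklySumP3)
summary(p3.glm)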
#
# let's try some dead salmon views:)
# (make these data.tables a little easier... and note that vwReg wants them coerced to data.frames)
# so I will now rudely interrupt my code with a function
##############################################################################################
# Copyright 2012 Felix Schönbrodt
# All rights reserved.
#
# FreeBSD License
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met:
#
# 1. Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER "AS IS" AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
# The views and conclusions contained in the software and documentation
# are those of the authors and should not be interpreted as representing
# official policies, either expressed or implied, of the copyright
# holder.
# Version history:
# 0.1: original code
# 0.1.1: changed license to FreeBSD; re-established compatibility to ggplot2 (new version 0.9.2)
## Visually weighted regression / Watercolor plots
## Idea: Solomon Hsiang, with additional ideas from many blog commenters
# B = number bootstrapped smoothers
# shade: plot the shaded confidence region?
# shade.alpha: should the CI shading fade out at the edges? (by reducing alpha; 0 = no alpha decrease, 0.1 = medium alpha decrease, 0.5 = strong alpha decrease)
# spag: plot spaghetti lines?
# spag.color: color of spaghetti lines
# mweight: should the median smoother be visually weighted?
# show.lm: should the linear regression line be plotted?
# show.CI: should the 95% CI limits be plotted?
# show.median: should the median smoother be plotted?
# median.col: color of the median smoother
# shape: shape of points
# method: the fitting function for the spaghettis; default: loess
# bw = TRUE: define a default b&w-palette
# slices: number of slices in x and y direction for the shaded region. Higher numbers make a smoother plot, but it takes longer to draw. I wouldn't go beyond 500
# palette: provide a custom color palette for the watercolors
# ylim: restrict range of the watercoloring
# quantize: either "continuous", or "SD". In the latter case, we get three color regions for 1, 2, and 3 SD (an idea of John Mashey)
# add: if add == FALSE, a new ggplot is returned. If add == TRUE, only the elements are returned, which can be added to an existing ggplot (with the '+' operator)
# ...: further parameters passed to the fitting function, in the case of loess, for example, "span = .9", or "family = 'symmetric'"
vwReg <- function(formula, data, title="", B=1000, shade=TRUE, shade.alpha=.1, spag=FALSE, spag.color="darkblue", mweight=TRUE, show.lm=FALSE, show.median = TRUE, median.col = "white", shape = 21, show.CI=FALSE, method=loess, bw=FALSE, slices=200, palette=colorRampPalette(c("#FFEDA0", "#DD0000"), bias=2)(20), ylim=NULL, quantize = "continuous", add=FALSE, ...) {
  IV <- all.vars(formula)[2]
  DV <- all.vars(formula)[1]
  data <- na.omit(data[order(data[, IV]), c(IV, DV)])
  if (bw == TRUE) {
    palette <- colorRampPalette(c("#EEEEEE", "#999999", "#333333"), bias=2)(20)
  }
  print("Computing bootstrapped smoothers ...")
  newx <- data.frame(seq(min(data[, IV]), max(data[, IV]), length=slices))
  colnames(newx) <- IV
  l0.boot <- matrix(NA, nrow=nrow(newx), ncol=B)
  l0 <- method(formula, data)
  for (i in 1:B) {
    data2 <- data[sample(nrow(data), replace=TRUE), ]
    data2 <- data2[order(data2[, IV]), ]
    if (class(l0)=="loess") {
      m1 <- method(formula, data2, control = loess.control(surface = "i", statistics="a", trace.hat="a"), ...)
    } else {
      m1 <- method(formula, data2, ...)
    }
    l0.boot[, i] <- predict(m1, newdata=newx)
  }
  # compute median and CI limits of bootstrap
  library(plyr)
  library(reshape2)
  CI.boot <- adply(l0.boot, 1, function(x) quantile(x, prob=c(.025, .5, .975, pnorm(c(-3, -2, -1, 0, 1, 2, 3))), na.rm=TRUE))[, -1]
  colnames(CI.boot)[1:10] <- c("LL", "M", "UL", paste0("SD", 1:7))
  CI.boot$x <- newx[, 1]
  CI.boot$width <- CI.boot$UL - CI.boot$LL
  # scale the CI width to the range 0 to 1 and flip it (bigger numbers = narrower CI)
  CI.boot$w2 <- (CI.boot$width - min(CI.boot$width))
  CI.boot$w3 <- 1-(CI.boot$w2/max(CI.boot$w2))
  # convert bootstrapped spaghettis to long format
  b2 <- melt(l0.boot)
  b2$x <- newx[,1]
  colnames(b2) <- c("index", "B", "value", "x")
  library(ggplot2)
  library(RColorBrewer)
  # Construct ggplot
  # All plot elements are constructed as a list, so they can be added to an existing ggplot
  # if add == FALSE: provide the basic ggplot object
  p0 <- ggplot(data, aes_string(x=IV, y=DV)) + theme_bw()
  # initialize elements with NULL (if they are defined, they are overwritten with something meaningful)
  gg.tiles <- gg.poly <- gg.spag <- gg.median <- gg.CI1 <- gg.CI2 <- gg.lm <- gg.points <- gg.title <- NULL
  if (shade == TRUE) {
    quantize <- match.arg(quantize, c("continuous", "SD"))
    if (quantize == "continuous") {
      print("Computing density estimates for each vertical cut ...")
      flush.console()
      if (is.null(ylim)) {
        min_value <- min(min(l0.boot, na.rm=TRUE), min(data[, DV], na.rm=TRUE))
        max_value <- max(max(l0.boot, na.rm=TRUE), max(data[, DV], na.rm=TRUE))
        ylim <- c(min_value, max_value)
      }
      # vertical cross-sectional density estimate
      d2 <- ddply(b2[, c("x", "value")], .(x), function(df) {
        res <- data.frame(density(df$value, na.rm=TRUE, n=slices, from=ylim[1], to=ylim[2])[c("x", "y")])
        #res <- data.frame(density(df$value, na.rm=TRUE, n=slices)[c("x", "y")])
        colnames(res) <- c("y", "dens")
        return(res)
      }, .progress="text")
      maxdens <- max(d2$dens)
      mindens <- min(d2$dens)
      d2$dens.scaled <- (d2$dens - mindens)/maxdens
      ## Tile approach
      d2$alpha.factor <- d2$dens.scaled^shade.alpha
      gg.tiles <- list(geom_tile(data=d2, aes(x=x, y=y, fill=dens.scaled, alpha=alpha.factor)), scale_fill_gradientn("dens.scaled", colours=palette), scale_alpha_continuous(range=c(0.001, 1)))
    }
    if (quantize == "SD") {
      ## Polygon approach
      SDs <- melt(CI.boot[, c("x", paste0("SD", 1:7))], id.vars="x")
      count <- 0
      d3 <- data.frame()
      col <- c(1,2,3,3,2,1)
      for (i in 1:6) {
        seg1 <- SDs[SDs$variable == paste0("SD", i), ]
        seg2 <- SDs[SDs$variable == paste0("SD", i+1), ]
        seg <- rbind(seg1, seg2[nrow(seg2):1, ])
        seg$group <- count
        seg$col <- col[i]
        count <- count + 1
        d3 <- rbind(d3, seg)
      }
      gg.poly <- list(geom_polygon(data=d3, aes(x=x, y=value, color=NULL, fill=col, group=group)), scale_fill_gradientn("dens.scaled", colours=palette, values=seq(-1, 3, 1)))
    }
  }
  print("Build ggplot figure ...")
  flush.console()
  if (spag==TRUE) {
    gg.spag <- geom_path(data=b2, aes(x=x, y=value, group=B), size=0.7, alpha=10/B, color=spag.color)
  }
  if (show.median == TRUE) {
    if (mweight == TRUE) {
      gg.median <- geom_path(data=CI.boot, aes(x=x, y=M, alpha=w3^3), size=.6, linejoin="mitre", color=median.col)
    } else {
      gg.median <- geom_path(data=CI.boot, aes(x=x, y=M), size = 0.6, linejoin="mitre", color=median.col)
    }
  }
  # Confidence limits
  if (show.CI == TRUE) {
    gg.CI1 <- geom_path(data=CI.boot, aes(x=x, y=UL), size=1, color="red")
    gg.CI2 <- geom_path(data=CI.boot, aes(x=x, y=LL), size=1, color="red")
  }
  # plain linear regression line
  if (show.lm==TRUE) {gg.lm <- geom_smooth(method="lm", color="darkgreen", se=FALSE)}
  gg.points <- geom_point(data=data, aes_string(x=IV, y=DV), size=2, shape=shape, fill="tan", color="black")
  if (title != "") {
    gg.title <- ggtitle(title)
  }
  gg.elements <- list(gg.tiles, gg.poly, gg.spag, gg.median, gg.CI1, gg.CI2, gg.lm, gg.points, gg.title, theme(legend.position="none"))
  if (add == FALSE) {
    return(p0 + gg.elements)
  } else {
    return(gg.elements)
  }
}
#######################################################
vwReg(SumP1~days, as.data.frame(WeeklySumP1), span=1, spag=T, show.lm=T, palette=brewer.pal(9,"YlGnBu")) # I like these better
vwReg(SumP2~days, as.data.frame(WeeklySumP2), span=1, spag=T, show.lm=T, palette=brewer.pal(9,"YlGnBu"))
vwReg(SumP3~days, as.data.frame(WeeklySumP3), span=1, spag=T, show.lm=T, palette=brewer.pal(9,"YlGnBu"))
#
# those P3s are increasing over time but the shape of the distribution looks the same, so let's fit a linear model and display the equation on the plot
# I've copied this handy function from http://stackoverflow.com/questions/7549694/ggplot2-adding-regression-line-equation-and-r2-on-graph
#
lm_eqn = function(m) {
  l <- list(a = format(coef(m)[1], digits = 2),
            b = format(abs(coef(m)[2]), digits = 2),
            r2 = format(summary(m)$r.squared, digits = 3));
  if (coef(m)[2] >= 0) {
    eq <- substitute(italic(y) == a + b %.% italic(x)*","~~italic(r)^2~"="~r2,l)
  } else {
    eq <- substitute(italic(y) == a - b %.% italic(x)*","~~italic(r)^2~"="~r2,l)
  }
  as.character(as.expression(eq));
}
P3=incidentsDT5[,list(SumP3=sum(P3, na.rm=T)),by=week][SumP3>0,list(week=as.Date(week,"%d/%m/%Y"),SumP3)]
ggplot(P3, aes(x=week, y=SumP3)) +
geom_point() +
geom_smooth(method=lm) +
geom_text(x=as.integer(as.Date("2013-05-20")), y=70, label = lm_eqn(lm(SumP3 ~ week, P3)), parse=T)
#
# so I think approach is this
#
# create an empty plot
# iterate over some number of trials
# each trial is a sample with replacement going out some number of weeks eg 3*52
# but should we account for decreasing rate of P1s and P2s; and increasing rate of P3s?
# suggest safest to keep P1/P2 rate the same and allow for increasing P3 rate with linear model
# calculate cost each week (cost of P1s + cost of P2s + cost of P3s)
# add a line to a plot
#
# but first, lets plot historic data
#
mergedWeeklyIncidents <- merge(merge(WeeklySumP1, WeeklySumP2), WeeklySumP3)
mergedWeeklyIncidents$cost <- 100000*mergedWeeklyIncidents$SumP1 +
10000*mergedWeeklyIncidents$SumP2 +
1000*mergedWeeklyIncidents$SumP3
# cbind() and c() calls are easier to read (and debug) if we don't bury
# expressions inside them, so let's build some intermediate variables
projecting=3*52 # 3 years in week intervals
projectedWeeks = seq(max(mergedWeeklyIncidents$week), by="week", length.out=projecting)
trials=10
CostOfP1 = 100000
CostOfP2 = 10000
CostOfP3 = 1000
# x positions for the projected samples and the plot's x limit (these weren't defined above, so define them here)
xs <- nrow(mergedWeeklyIncidents) + (1:1000)
x2 <- nrow(mergedWeeklyIncidents) + 1000
plot(mergedWeeklyIncidents$cost, type="l", col=rgb(0,1,0), xlim=c(0,x2), xlab="week", ylab="Cost/week")
for(i in 1:trials) {
  P1Cost=CostOfP1*sample(incidentsDT5[,list(SumP1=sum(P1, na.rm=T)),by=week][,SumP1], 1000, replace=T)
  P2Cost=CostOfP2*sample(incidentsDT5[,list(SumP2=sum(P2, na.rm=T)),by=week][,SumP2], 1000, replace=T)
  P3Cost=CostOfP3*sample(incidentsDT5[,list(SumP3=sum(P3, na.rm=T)),by=week][,SumP3], 1000, replace=T)
  projectedWeeklyCost <- data.frame(cbind(x=xs, y=P1Cost+P2Cost+P3Cost))
  lines(projectedWeeklyCost, col=rgb(0,0,1, alpha=0.1))
}
# add an average line
avgWeeklyCost = CostOfP1*mean(incidentsDT5[,list(SumP1=sum(P1, na.rm=T)),by=week][,SumP1]) +
CostOfP2*mean(incidentsDT5[,list(SumP2=sum(P2, na.rm=T)),by=week][,SumP2]) +
CostOfP3*mean(incidentsDT5[,list(SumP3=sum(P3, na.rm=T)),by=week][,SumP3])
lines(data.frame(cbind(x=xs, y=rep(avgWeeklyCost,1000))), col=rgb(1,0,0))
#
# that worked but it's clear that the distributions of P1s, P2s and P3s are changing with
# time, the P1s and P2s are decreasing a bit but the P3s increasing significantly
# how about we fit the weekly sum of the P3s to a linear model and extrapolate out?
#
# actually, how about we subtract the linear average off and redo the distribution, then take our
# samples and add in the linear contribution
# this way, we should get a cleaner distribution and better bootstrapping for future values
#
# just to double check the validity of this, first I want to plot estimated values against
# original
t<-WeeklySumP3
t$est <- 0.01042*t$days-121.30926
ggplot(t, aes(days, SumP3)) + geom_point() + geom_point(data=t, aes(days, est))
# yep, lm was correct
# so now, I'm going to create a new data set with the linear contribution subtracted
# and baselined at the beginning
t$base <- t$SumP3 - (t$days-min(t$days))*0.01042
# check we're right
ggplot(t, aes(days, SumP3)) + geom_point() +
geom_point(data=t, aes(days, est, colour='red')) +
geom_point(data=t, aes(days, base, colour='blue'))
# weird, colours got mixed up - that's because colour='red' inside aes() is treated as a
# value to map (so it gets a default colour and a legend), not as a literal colour;
# setting colour outside aes() gives the intended colours
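# for example:
ggplot(t, aes(days, SumP3)) + geom_point() +
  geom_point(data=t, aes(days, est), colour='red') +
  geom_point(data=t, aes(days, base), colour='blue')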
# let's try fitting that adjusted distribution to something from which we can calculate values
library(MASS)
# we'll try a few... :)
fitdistr(t$SumP3, 'Poisson')
ggplot(data=data.frame(cbind(original=t$SumP3, predict=rpois(1000,37)))) +
  geom_histogram(aes(x=original), fill="red", alpha=0.2, binwidth=10) +
  geom_histogram(aes(x=predict), fill="blue", alpha=0.2, binwidth=10)
fitdistr(t$SumP3, 'weibull')
ggplot(data=data.frame(cbind(original=t$SumP3, predict=rweibull(1000, 3.16, 41.2)))) +
  geom_histogram(aes(x=original), fill="red", alpha=0.2, binwidth=10) +
  geom_histogram(aes(x=predict), fill="blue", alpha=0.2, binwidth=10)
fitdistr(t$SumP3, 'log-normal')
ggplot(data=data.frame(cbind(original=t$SumP3, predict=rlnorm(1000, meanlog=3.5, sdlog=0.42)))) +
  geom_histogram(aes(x=original), colour="red", alpha=0.2, binwidth=10) +
  geom_histogram(aes(x=predict), colour="blue", alpha=0.2, binwidth=10)
# nice, but perhaps better to just bootstrap from real data and apply the linear adjustment
# first, replot the historical data - note that I originally had this graph generated
# from a vector representation of the cost - this had the data sorted in reverse order
# lesson - always make sure you have x and y!
plot(mergedWeeklyIncidents$week, mergedWeeklyIncidents$cost, type="l", col=rgb(0,1,0), xlim=c(min(mergedWeeklyIncidents$week), max(projectedWeeks)), xlab="week", ylab="Cost/week")
# then project out a number of weeks using variable 'projecting'
for (trial in 1:trials) {
  projectedWeeklyCost = data.frame(week=projectedWeeks,
                                   P1Cost = CostOfP1*sample(WeeklySumP1$SumP1, length(projectedWeeks), replace=T),
                                   P2Cost = CostOfP2*sample(WeeklySumP2$SumP2, length(projectedWeeks), replace=T))
  # this next one is a little tricky, the sample gets a linear contribution
  projectedWeeklyCost$P3SampledBase <- sample(t$base, nrow(projectedWeeklyCost), replace=T)
  projectedWeeklyCost$P3LinearAdjusted <- projectedWeeklyCost$P3SampledBase + (projectedWeeklyCost$week-min(mergedWeeklyIncidents$week))*0.01042
  projectedWeeklyCost$P3Cost <- CostOfP3*projectedWeeklyCost$P3LinearAdjusted
  projectedWeeklyCost$TotalWeeklyCost <- projectedWeeklyCost$P1Cost + projectedWeeklyCost$P2Cost + projectedWeeklyCost$P3Cost
  lines(projectedWeeklyCost$week, projectedWeeklyCost$TotalWeeklyCost, col=rgb(0,0,1, alpha=0.2))
}
#
# that worked but I want to make this work with ggplot
#
library(scales)
p <- ggplot() + scale_y_continuous(labels=dollar, "Weekly Cost") + xlab("Year") + theme(text=element_text(size=20))
p <- p + geom_line(data=mergedWeeklyIncidents, colour="green", aes(x = week, y = cost)) +
xlim(min(mergedWeeklyIncidents$week),max(projectedWeeklyCost$week))
# then project out a number of weeks using variable 'projecting'
for (trial in 1:trials) {
  projectedWeeklyCost = data.frame(week=projectedWeeks,
                                   P1Cost = CostOfP1*sample(WeeklySumP1$SumP1, length(projectedWeeks), replace=T),
                                   P2Cost = CostOfP2*sample(WeeklySumP2$SumP2, length(projectedWeeks), replace=T))
  # this next one is a little tricky, the sample gets a linear contribution
  projectedWeeklyCost$P3SampledBase <- sample(t$base, nrow(projectedWeeklyCost), replace=T)
  projectedWeeklyCost$P3LinearAdjusted <- projectedWeeklyCost$P3SampledBase + (projectedWeeklyCost$week-min(mergedWeeklyIncidents$week))*0.01042
  projectedWeeklyCost$P3Cost <- CostOfP3*projectedWeeklyCost$P3LinearAdjusted
  projectedWeeklyCost$TotalWeeklyCost <- projectedWeeklyCost$P1Cost + projectedWeeklyCost$P2Cost + projectedWeeklyCost$P3Cost
  p <- p + geom_line(data=projectedWeeklyCost, colour="blue", alpha=0.1, linewidth=0.1, aes(x = week, y = as.integer(TotalWeeklyCost)))
}
print(p)
# and a histogram of the projected costs
qplot(as.numeric(TotalWeeklyCost), data=projectedWeeklyCost, geom="histogram", binwidth=20000) + scale_x_continuous(labels=dollar, "Weekly Cost")
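# and, just as a rough summary of the last trial, the spread of projected weekly costs
quantile(as.numeric(projectedWeeklyCost$TotalWeeklyCost), probs = c(0.05, 0.5, 0.95))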