Skip to content

Instantly share code, notes, and snippets.

@wetherc
Last active April 25, 2018 19:09
Show Gist options
  • Save wetherc/3a2645b8c82e88aeb2578b2f56025b1d to your computer and use it in GitHub Desktop.
Save wetherc/3a2645b8c82e88aeb2578b2f56025b1d to your computer and use it in GitHub Desktop.
create dummy rating dataset
rateFactors <- list(
credit = list(
"A" = 0.7,
"B" = 0.9,
"C" = 1.0,
"D" = 1.1,
"E" = 1.3
),
drvVehCount = list(
"1d1v" = 1.0,
"2d1v" = 1.1,
"3d1v" = 1.3,
"1d2v" = 0.9,
"2d2v" = 1.0,
"3d2v" = 1.1,
"1d3v" = 0.8,
"2d3v" = 0.9,
"3d3v" = 1.0,
"4d3v" = 1.05
),
points = list(
"0" = 1.0,
"2" = 1.1,
"3" = 1.5,
"4" = 1.8,
"5" = 2.5
),
pop = list(
"0" = 1.1,
"1" = 0.9
),
homeowner = list(
"0" = 0.8,
"1" = 1.0
),
singleCar = list(
"0" = 0.8,
"1" = 1.0
)
)
baseRate <- 214.60
grid <- cbind(
expand.grid(
sapply(
rateFactors,
function(x) {
unlist(names(x))
}
)
),
expand.grid(
sapply(rateFactors, unlist)
)
)
colnames(grid) <- c(
colnames(grid)[1:length(rateFactors)],
paste0(
colnames(grid)[1:length(rateFactors)],
"Factor"
)
)
grid$issuedPremium <- apply(
grid[, (length(rateFactors) + 1):ncol(grid)],
1,
function(x) {
rate <- prod(x, baseRate)
return(max((baseRate * 0.5), rate))
}
)
grid <- grid[order(grid$issuedPremium), ]
grid$probability <- dnorm(
grid$issuedPremium,
mean = mean(grid$issuedPremium),
sd = sd(grid$issuedPremium) / 2
)
set.seed(18356)
policies <- grid[
sample(
x = 1:nrow(grid),
size = 100000,
replace = TRUE,
prob = grid$probability
),
1:(ncol(grid) - 1)
]
policies$monthsRetained <- round(
rnorm(
n = nrow(policies),
mean = 12 * 7,
sd = 8
),
0
)
policies$earnedPremium <- round(
policies$issuedPremium / 6 * policies$monthsRetained,
0
)
policies$earnedCarYears <- round(
policies$monthsRetained / 12,
2
)
policies$exposureCount <- sample(
x = 0:6,
size = nrow(policies),
replace = TRUE,
prob = c(0.356, 0.15, 0.125, 0.1, 0.075, 0.075, 0.065)
)
policies$incurredLoss <- round(
policies$exposureCount * 1500 * rnorm(
n = nrow(policies),
mean = 1,
sd = 0.1
),
2
)
# glance at our loss ratio — we're running a little hot!
# (Let's assume a .68 permissible loss ratio)
sum(policies$incurredLoss) / sum(policies$earnedPremium)
policies$frequency <- policies$exposureCount / policies$earnedCarYears
policies$severity <- policies$incurredLoss / policies$exposureCount
# The following two commands remove any previously installed H2O packages for R.
if("package:h2o" %in% search()) {
detach("package:h2o", unload=TRUE)
}
if("h2o" %in% rownames(installed.packages())) {
remove.packages("h2o")
}
# Next, we download packages that H2O depends on.
pkgs <- c("RCurl","jsonlite")
for (pkg in pkgs) {
if(!(pkg %in% rownames(installed.packages()))) {
install.packages(pkg)
}
}
# Now we download, install and initialize the H2O package for R.
install.packages(
"h2o",
type="source",
repos="http://h2o-release.s3.amazonaws.com/h2o/rel-wolpert/8/R"
)
# Finally, let's load H2O and start up an H2O cluster
library(h2o)
h2o.init()
policies.hex <- as.h2o(policies)
################################################################################
#
# Split into train/test data
#
################################################################################
# Split into train, test data frames
#
# Outputs a list of H2OFrames
# in order specified by destination_frames
# parameter
split <- h2o.splitFrame(
policies.hex,
ratios = c(0.75),
destination_frames = c(
"rating_train_0.75",
"rating_test_0.25"
)
)
rating_train_0.75 <- split[[1]]
rating_test_0.25 <- split[[2]]
################################################################################
#
# Build Severity model
#
################################################################################
severity_model <- h2o.glm(
model_id = "severity_model",
x = colnames(rating_train_0.75)[!colnames(rating_train_0.75) %in% c(
"creditFactor",
"drvVehCountFactor",
"pointsFactor",
"popFactor",
"homeownerFactor",
"singleCarFactor",
"issuedPremium",
"monthsRetained",
"earnedPremium",
"earnedCarYears",
"exposureCount",
"incurredLoss",
"frequency",
"severity"
)],
y = "severity",
training_frame = rating_train_0.75,
validation_frame = rating_test_0.25,
nfolds = 5,
keep_cross_validation_predictions = FALSE,
keep_cross_validation_fold_assignment = FALSE,
family = "gamma",
missing_values_handling = "Skip",
intercept = TRUE,
link = "log"
)
################################################################################
#
# Build Frequency model
#
################################################################################
frequency_model <- h2o.glm(
model_id = "frequency_model",
x = colnames(rating_train_0.75)[!colnames(rating_train_0.75) %in% c(
"creditFactor",
"drvVehCountFactor",
"pointsFactor",
"popFactor",
"homeownerFactor",
"singleCarFactor",
"issuedPremium",
"monthsRetained",
"earnedPremium",
"earnedCarYears",
"exposureCount",
"incurredLoss",
"frequency",
"severity"
)],
y = "frequency",
training_frame = rating_train_0.75,
validation_frame = rating_test_0.25,
nfolds = 5,
keep_cross_validation_predictions = FALSE,
keep_cross_validation_fold_assignment = FALSE,
family = "poisson",
missing_values_handling = "Skip",
intercept = TRUE,
link = "log"
)
################################################################################
#
# get updated factors
#
################################################################################
updated_factors <- cbind(
as.data.frame(
exp(severity_model@model$coefficients)
),
as.data.frame(
exp(frequency_model@model$coefficients)
)
)
updated_factors$factor <- updated_factors[, 1] * updated_factors[, 2]
View(updated_factors)
################################################################################
#
# Explain why all factors converged on 1
#
################################################################################
out <- aggregate(
x = list(
"loss" = policies$incurredLoss,
"prem" = policies$earnedPremium),
by = list("points" = policies$points),
FUN = "sum")
out$lr <- out$loss/out$prem
plot(out$points, out$lr)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment