wetherc/createData.R

## createData.R
rateFactors <- list(
  credit = list(
    "A" = 0.7,
    "B" = 0.9,
    "C" = 1.0,
    "D" = 1.1,
    "E" = 1.3
  ),
  drvVehCount = list(
    "1d1v" = 1.0,
    "2d1v" = 1.1,
    "3d1v" = 1.3,
    "1d2v" = 0.9,
    "2d2v" = 1.0,
    "3d2v" = 1.1,
    "1d3v" = 0.8,
    "2d3v" = 0.9,
    "3d3v" = 1.0,
    "4d3v" = 1.05
  ),
  points = list(
    "0" = 1.0,
    "2" = 1.1,
    "3" = 1.5,
    "4" = 1.8,
    "5" = 2.5
  ),
  pop = list(
    "0" = 1.1,
    "1" = 0.9
  ),
  homeowner = list(
    "0" = 0.8,
    "1" = 1.0
  ),
  singleCar = list(
    "0" = 0.8,
    "1" = 1.0
  )
)

baseRate <- 214.60

grid <- cbind(
  expand.grid(
    sapply(
      rateFactors,
      function(x) {
        unlist(names(x))
      }
    )
  ),
  expand.grid(
    sapply(rateFactors, unlist)
  )
)
colnames(grid) <- c(
  colnames(grid)[1:length(rateFactors)],
  paste0(
    colnames(grid)[1:length(rateFactors)],
    "Factor"
  )
)

grid$issuedPremium <- apply(
  grid[, (length(rateFactors) + 1):ncol(grid)],
  1,
  function(x) {
    rate <- prod(x, baseRate)
    return(max((baseRate * 0.5), rate))
  }
)

grid <- grid[order(grid$issuedPremium), ]
grid$probability <- dnorm(
  grid$issuedPremium,
  mean = mean(grid$issuedPremium),
  sd = sd(grid$issuedPremium) / 2
)

set.seed(18356)
policies <- grid[
  sample(
    x = 1:nrow(grid),
    size = 100000,
    replace = TRUE,
    prob = grid$probability
  ),
  1:(ncol(grid) - 1)
  ]

policies$monthsRetained <- round(
  rnorm(
    n = nrow(policies),
    mean = 12 * 7,
    sd = 8
  ),
  0
)

policies$earnedPremium <- round(
  policies$issuedPremium / 6 * policies$monthsRetained,
  0
)

policies$earnedCarYears <- round(
  policies$monthsRetained / 12,
  2
)

policies$exposureCount <- sample(
  x = 0:6,
  size = nrow(policies),
  replace = TRUE,
  prob = c(0.356, 0.15, 0.125, 0.1, 0.075, 0.075, 0.065)
)
policies$incurredLoss <- round(
  policies$exposureCount * 1500 * rnorm(
    n = nrow(policies),
    mean = 1,
    sd = 0.1
  ),
  2
)

# glance at our loss ratio — we're running a little hot!
# (Let's assume a .68 permissible loss ratio)
sum(policies$incurredLoss) / sum(policies$earnedPremium)

policies$frequency <- policies$exposureCount / policies$earnedCarYears
policies$severity <- policies$incurredLoss / policies$exposureCount

## trainModel.R
# The following two commands remove any previously installed H2O packages for R.
if("package:h2o" %in% search()) {
  detach("package:h2o", unload=TRUE)
}
if("h2o" %in% rownames(installed.packages())) {
  remove.packages("h2o")
}

# Next, we download packages that H2O depends on.
pkgs <- c("RCurl","jsonlite")
for (pkg in pkgs) {
  if(!(pkg %in% rownames(installed.packages()))) {
    install.packages(pkg)
  }
}

# Now we download, install and initialize the H2O package for R.
install.packages(
  "h2o",
  type="source",
  repos="http://h2o-release.s3.amazonaws.com/h2o/rel-wolpert/8/R"
)

# Finally, let's load H2O and start up an H2O cluster
library(h2o)
h2o.init()

policies.hex <- as.h2o(policies)

################################################################################
#
# Split into train/test data
#
################################################################################

# Split into train, test data frames
#
# Outputs a list of H2OFrames
# in order specified by destination_frames
# parameter
split <- h2o.splitFrame(
  policies.hex,
  ratios = c(0.75),
  destination_frames = c(
    "rating_train_0.75",
    "rating_test_0.25"
  )
)

rating_train_0.75 <- split[[1]]
rating_test_0.25 <- split[[2]]


################################################################################
#
# Build Severity model
#
################################################################################

severity_model <- h2o.glm(
  model_id = "severity_model",
  x = colnames(rating_train_0.75)[!colnames(rating_train_0.75) %in% c(
    "creditFactor",
    "drvVehCountFactor",
    "pointsFactor",
    "popFactor",
    "homeownerFactor",
    "singleCarFactor",
    "issuedPremium",
    "monthsRetained",
    "earnedPremium",
    "earnedCarYears",
    "exposureCount",
    "incurredLoss",
    "frequency",
    "severity"
  )],
  y = "severity",
  training_frame = rating_train_0.75,
  validation_frame = rating_test_0.25,
  nfolds = 5,
  keep_cross_validation_predictions = FALSE,
  keep_cross_validation_fold_assignment = FALSE,
  family = "gamma",
  missing_values_handling = "Skip",
  intercept = TRUE,
  link = "log"
)

################################################################################
#
# Build Frequency model
#
################################################################################

frequency_model <- h2o.glm(
  model_id = "frequency_model",
  x = colnames(rating_train_0.75)[!colnames(rating_train_0.75) %in% c(
    "creditFactor",
    "drvVehCountFactor",
    "pointsFactor",
    "popFactor",
    "homeownerFactor",
    "singleCarFactor",
    "issuedPremium",
    "monthsRetained",
    "earnedPremium",
    "earnedCarYears",
    "exposureCount",
    "incurredLoss",
    "frequency",
    "severity"
  )],
  y = "frequency",
  training_frame = rating_train_0.75,
  validation_frame = rating_test_0.25,
  nfolds = 5,
  keep_cross_validation_predictions = FALSE,
  keep_cross_validation_fold_assignment = FALSE,
  family = "poisson",
  missing_values_handling = "Skip",
  intercept = TRUE,
  link = "log"
)

################################################################################
#
# get updated factors
#
################################################################################

updated_factors <- cbind(
  as.data.frame(
    exp(severity_model@model$coefficients)
  ),
  as.data.frame(
    exp(frequency_model@model$coefficients)
  )
)
updated_factors$factor <- updated_factors[, 1] * updated_factors[, 2]

View(updated_factors)

################################################################################
#
# Explain why all factors converged on 1
#
################################################################################

out <- aggregate(
  x = list(
    "loss" = policies$incurredLoss,
    "prem" = policies$earnedPremium),
  by = list("points" = policies$points),
  FUN = "sum")
out$lr <- out$loss/out$prem
plot(out$points, out$lr)
	rateFactors <- list(
	credit = list(
	"A" = 0.7,
	"B" = 0.9,
	"C" = 1.0,
	"D" = 1.1,
	"E" = 1.3
	),
	drvVehCount = list(
	"1d1v" = 1.0,
	"2d1v" = 1.1,
	"3d1v" = 1.3,
	"1d2v" = 0.9,
	"2d2v" = 1.0,
	"3d2v" = 1.1,
	"1d3v" = 0.8,
	"2d3v" = 0.9,
	"3d3v" = 1.0,
	"4d3v" = 1.05
	),
	points = list(
	"0" = 1.0,
	"2" = 1.1,
	"3" = 1.5,
	"4" = 1.8,
	"5" = 2.5
	),
	pop = list(
	"0" = 1.1,
	"1" = 0.9
	),
	homeowner = list(
	"0" = 0.8,
	"1" = 1.0
	),
	singleCar = list(
	"0" = 0.8,
	"1" = 1.0
	)
	)

	baseRate <- 214.60

	grid <- cbind(
	expand.grid(
	sapply(
	rateFactors,
	function(x) {
	unlist(names(x))
	}
	)
	),
	expand.grid(
	sapply(rateFactors, unlist)
	)
	)
	colnames(grid) <- c(
	colnames(grid)[1:length(rateFactors)],
	paste0(
	colnames(grid)[1:length(rateFactors)],
	"Factor"
	)
	)

	grid$issuedPremium <- apply(
	grid[, (length(rateFactors) + 1):ncol(grid)],
	1,
	function(x) {
	rate <- prod(x, baseRate)
	return(max((baseRate * 0.5), rate))
	}
	)

	grid <- grid[order(grid$issuedPremium), ]
	grid$probability <- dnorm(
	grid$issuedPremium,
	mean = mean(grid$issuedPremium),
	sd = sd(grid$issuedPremium) / 2
	)

	set.seed(18356)
	policies <- grid[
	sample(
	x = 1:nrow(grid),
	size = 100000,
	replace = TRUE,
	prob = grid$probability
	),
	1:(ncol(grid) - 1)
	]

	policies$monthsRetained <- round(
	rnorm(
	n = nrow(policies),
	mean = 12 * 7,
	sd = 8
	),
	0
	)

	policies$earnedPremium <- round(
	policies$issuedPremium / 6 * policies$monthsRetained,
	0
	)

	policies$earnedCarYears <- round(
	policies$monthsRetained / 12,
	2
	)

	policies$exposureCount <- sample(
	x = 0:6,
	size = nrow(policies),
	replace = TRUE,
	prob = c(0.356, 0.15, 0.125, 0.1, 0.075, 0.075, 0.065)
	)
	policies$incurredLoss <- round(
	policies$exposureCount * 1500 * rnorm(
	n = nrow(policies),
	mean = 1,
	sd = 0.1
	),
	2
	)

	# glance at our loss ratio — we're running a little hot!
	# (Let's assume a .68 permissible loss ratio)
	sum(policies$incurredLoss) / sum(policies$earnedPremium)

	policies$frequency <- policies$exposureCount / policies$earnedCarYears
	policies$severity <- policies$incurredLoss / policies$exposureCount
	# The following two commands remove any previously installed H2O packages for R.
	if("package:h2o" %in% search()) {
	detach("package:h2o", unload=TRUE)
	}
	if("h2o" %in% rownames(installed.packages())) {
	remove.packages("h2o")
	}

	# Next, we download packages that H2O depends on.
	pkgs <- c("RCurl","jsonlite")
	for (pkg in pkgs) {
	if(!(pkg %in% rownames(installed.packages()))) {
	install.packages(pkg)
	}
	}

	# Now we download, install and initialize the H2O package for R.
	install.packages(
	"h2o",
	type="source",
	repos="http://h2o-release.s3.amazonaws.com/h2o/rel-wolpert/8/R"
	)

	# Finally, let's load H2O and start up an H2O cluster
	library(h2o)
	h2o.init()

	policies.hex <- as.h2o(policies)

	################################################################################
	#
	# Split into train/test data
	#
	################################################################################

	# Split into train, test data frames
	#
	# Outputs a list of H2OFrames
	# in order specified by destination_frames
	# parameter
	split <- h2o.splitFrame(
	policies.hex,
	ratios = c(0.75),
	destination_frames = c(
	"rating_train_0.75",
	"rating_test_0.25"
	)
	)

	rating_train_0.75 <- split[[1]]
	rating_test_0.25 <- split[[2]]


	################################################################################
	#
	# Build Severity model
	#
	################################################################################

	severity_model <- h2o.glm(
	model_id = "severity_model",
	x = colnames(rating_train_0.75)[!colnames(rating_train_0.75) %in% c(
	"creditFactor",
	"drvVehCountFactor",
	"pointsFactor",
	"popFactor",
	"homeownerFactor",
	"singleCarFactor",
	"issuedPremium",
	"monthsRetained",
	"earnedPremium",
	"earnedCarYears",
	"exposureCount",
	"incurredLoss",
	"frequency",
	"severity"
	)],
	y = "severity",
	training_frame = rating_train_0.75,
	validation_frame = rating_test_0.25,
	nfolds = 5,
	keep_cross_validation_predictions = FALSE,
	keep_cross_validation_fold_assignment = FALSE,
	family = "gamma",
	missing_values_handling = "Skip",
	intercept = TRUE,
	link = "log"
	)

	################################################################################
	#
	# Build Frequency model
	#
	################################################################################

	frequency_model <- h2o.glm(
	model_id = "frequency_model",
	x = colnames(rating_train_0.75)[!colnames(rating_train_0.75) %in% c(
	"creditFactor",
	"drvVehCountFactor",
	"pointsFactor",
	"popFactor",
	"homeownerFactor",
	"singleCarFactor",
	"issuedPremium",
	"monthsRetained",
	"earnedPremium",
	"earnedCarYears",
	"exposureCount",
	"incurredLoss",
	"frequency",
	"severity"
	)],
	y = "frequency",
	training_frame = rating_train_0.75,
	validation_frame = rating_test_0.25,
	nfolds = 5,
	keep_cross_validation_predictions = FALSE,
	keep_cross_validation_fold_assignment = FALSE,
	family = "poisson",
	missing_values_handling = "Skip",
	intercept = TRUE,
	link = "log"
	)

	################################################################################
	#
	# get updated factors
	#
	################################################################################

	updated_factors <- cbind(
	as.data.frame(
	exp(severity_model@model$coefficients)
	),
	as.data.frame(
	exp(frequency_model@model$coefficients)
	)
	)
	updated_factors$factor <- updated_factors[, 1] * updated_factors[, 2]

	View(updated_factors)

	################################################################################
	#
	# Explain why all factors converged on 1
	#
	################################################################################

	out <- aggregate(
	x = list(
	"loss" = policies$incurredLoss,
	"prem" = policies$earnedPremium),
	by = list("points" = policies$points),
	FUN = "sum")
	out$lr <- out$loss/out$prem
	plot(out$points, out$lr)