@SimonCoulombe
Created April 25, 2019 14:32
# This is my program where I try to get the mlr package to accept base_margin
# as an option
library(xgboost)
library(insuranceData) # example dataset https://cran.r-project.org/web/packages/insuranceData/insuranceData.pdf
library(tidyverse)
#install.packages("mlr")
library(mlr)
set.seed(1234)
data(dataCar)
mydb <- dataCar %>% select(numclaims, exposure, veh_value, veh_body,
                           veh_age, gender, area, agecat)
label_var <- "numclaims"
offset_var <- "exposure"
feature_vars <- mydb %>%
  select(-one_of(c(label_var, offset_var))) %>%
  colnames()
# preparing data for xgboost (one-hot encoding of categorical (factor) data)
myformula <- paste0( "~", paste0( feature_vars, collapse = " + ") ) %>% as.formula()
dummyFier <- caret::dummyVars(myformula, data=mydb, fullRank = TRUE)
dummyVars.df <- predict(dummyFier,newdata = mydb)
mydb_dummy <- cbind(mydb %>% select(one_of(c(label_var, offset_var))),
                    dummyVars.df)
rm(myformula, dummyFier, dummyVars.df)
feature_vars_dummy <- mydb_dummy %>% select(-one_of(c(label_var, offset_var))) %>% colnames()
# create xgb.DMatrix for xgboost consumption
mydb_xgbmatrix <- xgb.DMatrix(
  data = mydb_dummy %>% select(one_of(feature_vars_dummy)) %>% as.matrix(),
  label = mydb_dummy %>% pull(label_var),
  missing = NA)
# base_margin: the base prediction xgboost boosts from; here log(exposure),
# i.e. the offset for the Poisson model (count:poisson works on the log scale)
setinfo(mydb_xgbmatrix, "base_margin", mydb %>% pull(offset_var) %>% log())
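# --- not part of the original gist: a quick sanity check with the native xgboost
# API that the offset is honoured (a sketch; the hyperparameters below are arbitrary).
# With objective = "count:poisson" the model works on the log scale, so the raw
# margin returned by predict(..., outputmargin = TRUE) should include log(exposure).
check_booster <- xgb.train(params = list(objective = "count:poisson",
                                         eval_metric = "poisson-nloglik",
                                         eta = 0.1, max_depth = 2),
                           data = mydb_xgbmatrix,
                           nrounds = 10)
head(predict(check_booster, mydb_xgbmatrix, outputmargin = TRUE))  # raw log-scale score
head(log(mydb %>% pull(offset_var)))                               # offset each row is boosted from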
# random monotonicity constraint, just to show how it can be used;
# the order of sens must match the column order of feature_vars_dummy in the DMatrix
myConstraint <- tibble(Variable = feature_vars_dummy) %>%
  mutate(sens = ifelse(Variable == "veh_age", -1, 0))
# cv folds
cv_folds <- rBayesianOptimization::KFold(mydb_dummy$numclaims,
                                         nfolds = 3,
                                         stratified = TRUE,
                                         seed = 0)
cv <- xgb.cv(params = list(
               booster = "gbtree",
               eta = 0.01,
               max_depth = 2,
               min_child_weight = 2,
               gamma = 0,
               subsample = 0.6,
               colsample_bytree = 0.6,
               objective = 'count:poisson',
               eval_metric = "poisson-nloglik"),
             data = mydb_xgbmatrix,
             nrounds = 50,
             folds = cv_folds,
             monotone_constraints = myConstraint$sens,
             prediction = FALSE,
             showsd = TRUE,
             early_stopping_rounds = 20,
             verbose = 0)
# best (lowest) mean test Poisson negative log-likelihood across the CV folds
cv$evaluation_log[, min(test_poisson_nloglik_mean)]
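# not in the original gist: with early_stopping_rounds set, xgb.cv should also expose
# the best round directly (assumption based on the xgboost early-stopping callback),
# which avoids scanning the evaluation log by hand.
cv$best_iteration
cv$evaluation_log[cv$best_iteration]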
# http://rstudio-pubs-static.s3.amazonaws.com/336732_52d1b0e682634b5eae42cf86e1fc2a98.html
trainTask <- makeRegrTask(data = mydb_dummy %>% select(one_of(c(label_var, feature_vars_dummy))),
                          target = "numclaims")
# "test" task: first 50 000 rows of the same data
testTask <- makeRegrTask(data = mydb_dummy %>% head(50000) %>% select(one_of(c(label_var, feature_vars_dummy))),
                         target = "numclaims")
# Create an xgboost regression learner with a Poisson objective that returns
# response predictions (expected claim counts).
# Model 1: no base_margin supplied.
set.seed(1)
xgb_learner <- makeLearner(
  cl = "regr.xgboost",
  predict.type = "response",
  par.vals = list(
    objective = "count:poisson",
    eval_metric = "poisson-nloglik",
    nrounds = 50
  )
)
xgb_model1 <- train(xgb_learner, task = trainTask)
result1 <- predict(xgb_model1, testTask)
set.seed(1)
# Model 2: constant base_margin of 1 passed through par.vals
xgb_learner <- makeLearner(
  cl = "regr.xgboost",
  predict.type = "response",
  par.vals = list(
    objective = "count:poisson",
    eval_metric = "poisson-nloglik",
    nrounds = 50,
    base_margin = rep(1, nrow(mydb_dummy))
  )
)
xgb_model2 <- train(xgb_learner, task = trainTask)
result2 <- predict(xgb_model2, testTask)
set.seed(1)
# Model 3: base_margin varying from near 0 up to 1 across the rows
xgb_learner <- makeLearner(
  cl = "regr.xgboost",
  predict.type = "response",
  par.vals = list(
    objective = "count:poisson",
    eval_metric = "poisson-nloglik",
    nrounds = 50,
    base_margin = seq_len(nrow(mydb_dummy)) / nrow(mydb_dummy)
  )
)
xgb_model3 <- train(xgb_learner, task = trainTask)
result3 <- predict(xgb_model3, testTask)
# compare total predicted claims from the three models with the observed total
# on the first 50 000 rows
sum(result1$data$response)
sum(result2$data$response)
sum(result3$data$response)
sum(mydb_dummy %>% head(50000) %>% pull(numclaims))
##
getParamSet("regr.xgboost")
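# not in the original gist: base_margin does not appear in regr.xgboost's parameter
# set, so one possible (untested) workaround is to register it as an untyped learner
# parameter with ParamHelpers and then pass it via setHyperPars. Whether mlr's
# regr.xgboost training code actually forwards it to the xgb.DMatrix is exactly the
# open question this gist is probing.
xgb_learner$par.set <- c(xgb_learner$par.set,
                         makeParamSet(makeUntypedLearnerParam(id = "base_margin")))
xgb_learner <- setHyperPars(xgb_learner, base_margin = log(mydb_dummy$exposure))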