# This is my script where I try to modify the mlr package so that it accepts
# base_margin as an option
library(xgboost)
library(insuranceData) # example dataset https://cran.r-project.org/web/packages/insuranceData/insuranceData.pdf
library(tidyverse)
#install.packages("mlr")
library(mlr)

set.seed(1234)
data(dataCar)
mydb <- dataCar %>% select(numclaims, exposure, veh_value, veh_body,
                           veh_age, gender, area, agecat)

label_var  <- "numclaims"
offset_var <- "exposure"
feature_vars <- mydb %>%
  select(-one_of(c(label_var, offset_var))) %>%
  colnames()

# preparing data for xgboost (one-hot encoding of categorical (factor) data)
myformula <- paste0("~", paste0(feature_vars, collapse = " + ")) %>% as.formula()
dummyFier <- caret::dummyVars(myformula, data = mydb, fullRank = TRUE)
dummyVars.df <- predict(dummyFier, newdata = mydb)
mydb_dummy <- cbind(mydb %>% select(one_of(c(label_var, offset_var))),
                    dummyVars.df)
rm(myformula, dummyFier, dummyVars.df)
feature_vars_dummy <- mydb_dummy %>% select(-one_of(c(label_var, offset_var))) %>% colnames()
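# quick check (not in the original gist): after one-hot encoding, every feature
# column should be numeric, otherwise as.matrix() below would coerce to character
stopifnot(all(sapply(mydb_dummy[feature_vars_dummy], is.numeric)))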
# create xgb.DMatrix for xgboost consumption
mydb_xgbmatrix <- xgb.DMatrix(
  data = mydb_dummy %>% select(one_of(feature_vars_dummy)) %>% as.matrix(),
  label = mydb_dummy %>% pull(label_var),
  missing = NA)

# base_margin is the base prediction xgboost will boost from; for a Poisson model
# the natural choice is log(exposure), i.e. the usual offset
setinfo(mydb_xgbmatrix, "base_margin", mydb %>% pull(offset_var) %>% log())
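# for intuition (sketch, not part of the original gist): base_margin plays the same
# role as the log(exposure) offset in a classical Poisson GLM, e.g. with stats::glm
glm_offset <- glm(numclaims ~ veh_value + veh_age + agecat,
                  family = poisson(link = "log"),
                  offset = log(exposure),
                  data   = mydb)
summary(glm_offset)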
# random monotone constraint, just to show how it can be used
myConstraint <- tibble(Variable = feature_vars_dummy) %>%
  mutate(sens = ifelse(Variable == "veh_age", -1, 0))
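# sanity check (sketch, not in the original gist): monotone_constraints are matched
# to features by position, so the constraint vector must follow the DMatrix column order
stopifnot(all(myConstraint$Variable == colnames(mydb_xgbmatrix)))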
# cv folds
cv_folds = rBayesianOptimization::KFold(mydb_dummy$numclaims,
                                        nfolds = 3,
                                        stratified = TRUE,
                                        seed = 0)
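# cv_folds is a list of 3 vectors of row indices; xgb.cv's `folds` argument treats
# each element as the held-out (test) rows of one fold (quick look, not in the gist)
str(cv_folds, max.level = 1)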
cv <- xgb.cv(params = list(
  booster = "gbtree",
  eta = 0.01,
  max_depth = 2,
  min_child_weight = 2,
  gamma = 0,
  subsample = 0.6,
  colsample_bytree = 0.6,
  objective = 'count:poisson',
  eval_metric = "poisson-nloglik"),
  data = mydb_xgbmatrix,
  nrounds = 50,
  folds = cv_folds,
  monotone_constraints = myConstraint$sens,
  prediction = FALSE,
  showsd = TRUE,
  early_stopping_rounds = 20,
  verbose = 0)

# best (lowest) mean test negative log-likelihood over the CV run
cv$evaluation_log[, min(test_poisson_nloglik_mean)]
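# with early stopping, xgb.cv also records the best iteration (sketch, not in the gist)
cv$best_iteration
cv$evaluation_log[cv$best_iteration]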
# http://rstudio-pubs-static.s3.amazonaws.com/336732_52d1b0e682634b5eae42cf86e1fc2a98.html
trainTask <- makeRegrTask(data = mydb_dummy %>% select(one_of(c(label_var, feature_vars_dummy))),
                          target = "numclaims")
testTask  <- makeRegrTask(data = mydb_dummy %>% head(50000) %>% select(one_of(c(label_var, feature_vars_dummy))),
                          target = "numclaims")

# Create an xgboost regression learner with a Poisson objective that predicts
# on the response scale
set.seed(1)
xgb_learner <- makeLearner(
  cl = "regr.xgboost",
  predict.type = "response",
  par.vals = list(
    objective = "count:poisson",
    eval_metric = "poisson-nloglik",
    nrounds = 50
  )
)
xgb_model1 <- train(xgb_learner, task = trainTask)
result1 <- predict(xgb_model1, testTask)

# same learner, but with a constant base_margin passed through par.vals
set.seed(1)
xgb_learner <- makeLearner(
  cl = "regr.xgboost",
  predict.type = "response",
  par.vals = list(
    objective = "count:poisson",
    eval_metric = "poisson-nloglik",
    nrounds = 50,
    base_margin = rep(1, nrow(mydb_dummy))
  )
)
xgb_model2 <- train(xgb_learner, task = trainTask)
result2 <- predict(xgb_model2, testTask)

# same learner again, with a varying base_margin
set.seed(1)
xgb_learner <- makeLearner(
  cl = "regr.xgboost",
  predict.type = "response",
  par.vals = list(
    objective = "count:poisson",
    eval_metric = "poisson-nloglik",
    nrounds = 50,
    base_margin = seq_len(nrow(mydb_dummy)) / nrow(mydb_dummy)
  )
)
xgb_model3 <- train(xgb_learner, task = trainTask)
result3 <- predict(xgb_model3, testTask)
# compare total predicted claims with the observed total on the first 50 000 rows
sum(result1$data$response)
sum(result2$data$response)
sum(result3$data$response)
sum(mydb_dummy %>% head(50000) %>% pull(numclaims))
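# for reference (sketch, not in the original gist): training directly with xgboost,
# outside mlr, does use the base_margin attached to mydb_xgbmatrix above
xgb_direct <- xgb.train(params = list(objective = "count:poisson",
                                      eval_metric = "poisson-nloglik",
                                      eta = 0.01,
                                      max_depth = 2),
                        data = mydb_xgbmatrix,
                        nrounds = 50)
sum(predict(xgb_direct, mydb_xgbmatrix)[1:50000])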
## inspect which hyperparameters mlr exposes for regr.xgboost
getParamSet("regr.xgboost")
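# the question behind this gist: is base_margin among the exposed parameters?
# (sketch, not in the original gist)
"base_margin" %in% names(getParamSet("regr.xgboost")$pars)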