Skip to content

Instantly share code, notes, and snippets.

@jtilly jtilly/xgb-missings.md
Last active Oct 11, 2018

Embed
What would you like to do?
Treatment of missing values with and without sparse matrices
library(xgboost)
library(dplyr)

# Booster settings shared by every experiment below.
# NOTE(review): the tiny min_child_weight and lambda = 0 presumably exist so
# that a split is still allowed on a three-row data set -- confirm.
params <- list(
  min_child_weight = 0.00001,
  lambda = 0
)
# Number of boosting rounds; 1 means a single tree is fitted per experiment.
nrounds <- 1

# sparse ---

# Case: the first observation of x is missing and encoded as NA_real_.

# tibble() replaces the deprecated data_frame() constructor.
dat <- tibble(
  y = c(0, 1, 1),
  x = c(NA_real_, 0, 1)
)

# Global option: keep rows that contain NA when the model frame is built
# (it is not restored afterwards).
options(na.action = na.pass)
model_matrix <- Matrix::sparse.model.matrix(y ~ x, data = dat)
dat_xgb <- xgb.DMatrix(model_matrix, label = dat$y, missing = NA_real_)
set.seed(12345)
# The objective is supplied inside `params` instead of through `...`, so the
# call does not depend on xgb.train() forwarding extra arguments to the booster.
xgb_mdl <- xgb.train(
  params = c(params, list(objective = "binary:logistic")),
  data = dat_xgb,
  nrounds = nrounds
)
# predleaf = TRUE returns the index of the leaf each row falls into.
pred_leaf <- predict(xgb_mdl, newdata = dat_xgb, predleaf = TRUE)
print(pred_leaf)
# Recorded output: the NA row (1) shares a leaf with the x = 0 row (2).
#>      [,1]
#> [1,]    1
#> [2,]    1
#> [3,]    2

# Case (sparse matrix): the missing observation of x is encoded as 0.

# tibble() replaces the deprecated data_frame() constructor.
dat <- tibble(
  y = c(0, 1, 1),
  x = c(0, 0, 1)
)

# Global option: keep rows that contain NA when the model frame is built
# (it is not restored afterwards).
options(na.action = na.pass)
model_matrix <- Matrix::sparse.model.matrix(y ~ x, data = dat)
dat_xgb <- xgb.DMatrix(model_matrix, label = dat$y, missing = NA_real_)
set.seed(12345)
# The objective is supplied inside `params` instead of through `...`, so the
# call does not depend on xgb.train() forwarding extra arguments to the booster.
xgb_mdl <- xgb.train(
  params = c(params, list(objective = "binary:logistic")),
  data = dat_xgb,
  nrounds = nrounds
)
# predleaf = TRUE returns the index of the leaf each row falls into.
pred_leaf <- predict(xgb_mdl, newdata = dat_xgb, predleaf = TRUE)
print(pred_leaf)
# Recorded output: both x = 0 rows (1 and 2) share a leaf; the x = 1 row (3)
# is alone in the other one.
#>      [,1]
#> [1,]    1
#> [2,]    1
#> [3,]    2

# dense ---

# Case: the first observation of x is missing and encoded as NA_real_.

# tibble() replaces the deprecated data_frame() constructor.
dat <- tibble(
  y = c(0, 1, 1),
  x = c(NA_real_, 0, 1)
)

# Global option: keep rows that contain NA when the model frame is built
# (it is not restored afterwards).
options(na.action = na.pass)
model_matrix <- model.matrix(y ~ x, data = dat)
dat_xgb <- xgb.DMatrix(model_matrix, label = dat$y, missing = NA_real_)
set.seed(12345)
# The objective is supplied inside `params` instead of through `...`, so the
# call does not depend on xgb.train() forwarding extra arguments to the booster.
xgb_mdl <- xgb.train(
  params = c(params, list(objective = "binary:logistic")),
  data = dat_xgb,
  nrounds = nrounds
)
# predleaf = TRUE returns the index of the leaf each row falls into.
pred_leaf <- predict(xgb_mdl, newdata = dat_xgb, predleaf = TRUE)
print(pred_leaf)
# Recorded output: the NA row (1) sits alone in its leaf, while the x = 0 and
# x = 1 rows (2 and 3) share the other one.
#>      [,1]
#> [1,]    2
#> [2,]    1
#> [3,]    1

# Case (dense matrix): the missing observation of x is encoded as 0.

# tibble() replaces the deprecated data_frame() constructor.
dat <- tibble(
  y = c(0, 1, 1),
  x = c(0, 0, 1)
)

# Global option: keep rows that contain NA when the model frame is built
# (it is not restored afterwards).
options(na.action = na.pass)
model_matrix <- model.matrix(y ~ x, data = dat)
dat_xgb <- xgb.DMatrix(model_matrix, label = dat$y, missing = NA_real_)
set.seed(12345)
# The objective is supplied inside `params` instead of through `...`, so the
# call does not depend on xgb.train() forwarding extra arguments to the booster.
xgb_mdl <- xgb.train(
  params = c(params, list(objective = "binary:logistic")),
  data = dat_xgb,
  nrounds = nrounds
)
# predleaf = TRUE returns the index of the leaf each row falls into.
pred_leaf <- predict(xgb_mdl, newdata = dat_xgb, predleaf = TRUE)
print(pred_leaf)
# Recorded output: both x = 0 rows (1 and 2) share a leaf; the x = 1 row (3)
# is alone in the other one.
#>      [,1]
#> [1,]    1
#> [2,]    1
#> [3,]    2
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.