Navigation Menu

Skip to content

Instantly share code, notes, and snippets.

@jtilly
Last active October 11, 2018 13:42
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save jtilly/9568a2a01863caea7d9f6cf67acf087b to your computer and use it in GitHub Desktop.
Save jtilly/9568a2a01863caea7d9f6cf67acf087b to your computer and use it in GitHub Desktop.
Treatment of missing values with and without sparse matrices
library(xgboost)
library(dplyr)

# Booster settings shared by every experiment below: a tiny minimum child
# weight and no L2 penalty.
# NOTE(review): presumably chosen so a split is possible on the three-row
# toy data sets -- confirm.
params <- list(
  min_child_weight = 0.00001,
  lambda = 0
)

# Number of boosting rounds used for every model.
nrounds <- 1

# sparse ----

# Case: the missing value is encoded as NA_real_ and the design matrix is
# sparse.

# `tibble()` replaces the deprecated `data_frame()` constructor.
dat <- tibble(
  y = c(0, 1, 1),
  x = c(NA_real_, 0, 1)
)

# Keep rows containing NA when the model frame is built; R's default
# na.action (na.omit) would drop the first row before it reaches xgboost.
options(na.action = na.pass)
model_matrix <- Matrix::sparse.model.matrix(y ~ x, data = dat)
dat_xgb <- xgb.DMatrix(model_matrix, label = dat$y, missing = NA_real_)
set.seed(12345)
xgb_mdl <- xgb.train(
  # The objective is a booster parameter, so it is merged into `params`
  # instead of being passed through `...`, which newer xgboost releases no
  # longer accept. `modifyList()` overrides rather than duplicates the key.
  params = modifyList(params, list(objective = "binary:logistic")),
  data = dat_xgb,
  nrounds = nrounds
)

# One column per boosting round; each entry is the leaf a row falls into.
pred_leaf <- predict(xgb_mdl, newdata = dat_xgb, predleaf = TRUE)
print(pred_leaf)
# Recorded output: the NA row (row 1) shares a leaf with the x = 0 row.
#>      [,1]
#> [1,]    1
#> [2,]    1
#> [3,]    2

# Case: the missing value is encoded as zero and the design matrix is sparse.

# `tibble()` replaces the deprecated `data_frame()` constructor.
dat <- tibble(
  y = c(0, 1, 1),
  x = c(0, 0, 1)
)

# Set for parity with the NA experiments; this data set contains no NA.
options(na.action = na.pass)
model_matrix <- Matrix::sparse.model.matrix(y ~ x, data = dat)
dat_xgb <- xgb.DMatrix(model_matrix, label = dat$y, missing = NA_real_)
set.seed(12345)
xgb_mdl <- xgb.train(
  # The objective is a booster parameter, so it is merged into `params`
  # instead of being passed through `...`, which newer xgboost releases no
  # longer accept. `modifyList()` overrides rather than duplicates the key.
  params = modifyList(params, list(objective = "binary:logistic")),
  data = dat_xgb,
  nrounds = nrounds
)

# One column per boosting round; each entry is the leaf a row falls into.
pred_leaf <- predict(xgb_mdl, newdata = dat_xgb, predleaf = TRUE)
print(pred_leaf)
# Recorded output: the two x = 0 rows share a leaf, the x = 1 row is alone.
#>      [,1]
#> [1,]    1
#> [2,]    1
#> [3,]    2

# dense ----

# Case: the missing value is encoded as NA_real_ and the design matrix is
# dense.

# `tibble()` replaces the deprecated `data_frame()` constructor.
dat <- tibble(
  y = c(0, 1, 1),
  x = c(NA_real_, 0, 1)
)

# Keep rows containing NA when the model frame is built; R's default
# na.action (na.omit) would drop the first row before it reaches xgboost.
options(na.action = na.pass)
model_matrix <- model.matrix(y ~ x, data = dat)
dat_xgb <- xgb.DMatrix(model_matrix, label = dat$y, missing = NA_real_)
set.seed(12345)
xgb_mdl <- xgb.train(
  # The objective is a booster parameter, so it is merged into `params`
  # instead of being passed through `...`, which newer xgboost releases no
  # longer accept. `modifyList()` overrides rather than duplicates the key.
  params = modifyList(params, list(objective = "binary:logistic")),
  data = dat_xgb,
  nrounds = nrounds
)

# One column per boosting round; each entry is the leaf a row falls into.
pred_leaf <- predict(xgb_mdl, newdata = dat_xgb, predleaf = TRUE)
print(pred_leaf)
# Recorded output: the NA row (row 1) gets its own leaf, while the x = 0 and
# x = 1 rows share the other one.
#>      [,1]
#> [1,]    2
#> [2,]    1
#> [3,]    1

# Case: the missing value is encoded as zero and the design matrix is dense.

# `tibble()` replaces the deprecated `data_frame()` constructor.
dat <- tibble(
  y = c(0, 1, 1),
  x = c(0, 0, 1)
)

# Set for parity with the NA experiments; this data set contains no NA.
options(na.action = na.pass)
model_matrix <- model.matrix(y ~ x, data = dat)
dat_xgb <- xgb.DMatrix(model_matrix, label = dat$y, missing = NA_real_)
set.seed(12345)
xgb_mdl <- xgb.train(
  # The objective is a booster parameter, so it is merged into `params`
  # instead of being passed through `...`, which newer xgboost releases no
  # longer accept. `modifyList()` overrides rather than duplicates the key.
  params = modifyList(params, list(objective = "binary:logistic")),
  data = dat_xgb,
  nrounds = nrounds
)

# One column per boosting round; each entry is the leaf a row falls into.
pred_leaf <- predict(xgb_mdl, newdata = dat_xgb, predleaf = TRUE)
print(pred_leaf)
# Recorded output: the two x = 0 rows share a leaf, the x = 1 row is alone.
#>      [,1]
#> [1,]    1
#> [2,]    1
#> [3,]    2
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment