dim(preds) vs. dim(predict(...)): what the hell is LightGBM doing?
library(lightgbm)
library(tidyverse)
rm(list = ls())
# Load the iris dataset shipped with R; keep two species (recoded to labels 0 and 1)
# and the first two rows of each, so we end up with 4 samples
data(iris)
iris = as_data_frame(iris) %>%
  mutate(Species = as.numeric(factor(Species)) - 1) %>%
  filter(Species < 2) %>%
  group_by(Species) %>%
  dplyr::slice(1:2) %>%
  ungroup()
x = as.matrix(iris %>% select(-Species))
y = iris %>% pull(Species)
dtrain <- lgb.Dataset(data = x, label = y)
custom_multiclass_obj = function(preds, dtrain) {
  labels = getinfo(dtrain, "label")
  # print preds
  print(data_frame(preds))
  # preds is a matrix with rows corresponding to samples and columns corresponding to classes
  preds = matrix(preds, nrow = length(labels))
  # to prevent overflow, normalize preds by row
  preds = preds - apply(preds, 1, max)
  prob = exp(preds) / rowSums(exp(preds))
  # compute gradient
  grad = prob
  grad[cbind(1:length(labels), labels + 1)] = grad[cbind(1:length(labels), labels + 1)] - 1
  # compute hessian (approximation)
  hess = 2 * prob * (1 - prob)
  return(list(grad = grad, hess = hess))
}
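# Quick standalone illustration (toy numbers, not LightGBM output) of the reshape
# above: R's matrix() fills column by column, so matrix(preds, nrow = length(labels))
# assumes the flat preds vector is grouped by class: all samples' class-0 scores
# first, then all class-1 scores.
toy = c(1, 2, 3, 4, 10, 20, 30, 40)  # 4 samples x 2 classes, grouped by class
matrix(toy, nrow = 4)
#      [,1] [,2]
# [1,]    1   10
# [2,]    2   20
# [3,]    3   30
# [4,]    4   40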
# define custom metric: the multiclass log-loss
custom_multiclass_metric = function(preds, dtrain) {
  labels = getinfo(dtrain, "label")
  preds = matrix(preds, nrow = length(labels))
  preds = preds - apply(preds, 1, max)
  prob = exp(preds) / rowSums(exp(preds))
  return(list(name = "error",
              value = -mean(log(prob[cbind(1:length(labels), labels + 1)])),
              higher_better = FALSE))
}
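# Quick standalone sanity check (toy arithmetic, not LightGBM output): with the
# all-zero init_score set below, the softmax puts probability
# exp(0) / (exp(0) + exp(0)) = 0.5 on each of the two classes, so the implied
# starting log-loss is -log(0.5), regardless of the labels.
-log(exp(0) / (exp(0) + exp(0)))  # 0.6931472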
# init_score must have length n_samples * num_class = 4 * 2 = 8
setinfo(dtrain, "init_score", rep(0, 8))
# Estimate model with nrounds = 2; the objective prints `preds` at the start of each
# round, so the second print-out shows the predictions at the beginning of round 2:
model1 <- lgb.train(list(),
                    dtrain,
                    nrounds = 2,
                    min_data = 1,
                    learning_rate = 1,
                    num_leaves = 2,
                    objective = custom_multiclass_obj,
                    eval = custom_multiclass_metric,
                    num_class = 2)
# # A tibble: 8 x 1
# preds
# <dbl>
# 1 0.333
# 2 0.333
# 3 -1.000
# 4 0.333
# 5 -0.333
# 6 -0.333
# 7 1.000
# 8 -0.333
# Estimate model with nrounds = 1, check out final predictions; because the learning_rate is equal to one, these
# should be identical to the predictions from above
model2 <- lgb.train(list(),
                    dtrain,
                    nrounds = 1,
                    min_data = 1,
                    learning_rate = 1,
                    num_leaves = 2,
                    objective = custom_multiclass_obj,
                    eval = custom_multiclass_metric,
                    num_class = 2)
print(data_frame(predict(model2, x)))
# # A tibble: 8 x 1
# `predict(model2, x)`
# <dbl>
# 1 0.333
# 2 -0.333
# 3 0.333
# 4 -0.333
# 5 -1.000
# 6 1.000
# 7 0.333
# 8 -0.333
# Note that the order is wrong! We need a reshape:
print(data_frame(reshape = predict(model2, x, reshape = TRUE) %>% as.vector()))
# # A tibble: 8 x 1
# reshape
# <dbl>
# 1 0.333
# 2 0.333
# 3 -1.000
# 4 0.333
# 5 -0.333
# 6 -0.333
# 7 1.000
# 8 -0.333
# Conclusion: the predictions we obtain from R's predict function and the vector called `preds`
# inside the custom objective function are stored in different orders! predict() interleaves
# the scores per sample (s1_c0, s1_c1, s2_c0, ...), whereas `preds` is grouped by class
# (s1_c0, s2_c0, ..., then s1_c1, s2_c1, ...).
# This also affects how we deal with base margins! Base margins follow the class-major
# ordering of predict(..., reshape = TRUE) flattened with as.vector(), as the check below shows.
# All of this is due to the lovely helper function RowFunctionFromDenseMatric in the C API,
# which appears to be applied inconsistently.
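# A small helper (my own sketch, not part of the LightGBM API) to convert the
# per-sample-interleaved output of predict() into the class-major order that the
# custom objective sees: matrix(..., byrow = TRUE) reads the flat vector one row
# per sample, and as.vector() flattens the result column-wise, class by class.
to_objective_order = function(flat_preds, num_class) {
  as.vector(matrix(flat_preds, ncol = num_class, byrow = TRUE))
}
to_objective_order(predict(model2, x), num_class = 2)
# [1]  0.333  0.333 -1.000  0.333 -0.333 -0.333  1.000 -0.333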
# Check base margins
# Estimate model with nrounds = 2, check out predictions at the beginning of round 2:
model1 <- lgb.train(list(),
                    dtrain,
                    nrounds = 2,
                    min_data = 1,
                    learning_rate = 1,
                    num_leaves = 2,
                    objective = custom_multiclass_obj,
                    eval = custom_multiclass_metric,
                    num_class = 2)
predict(model1, data = x, num_iteration = 1, reshape = TRUE) %>% as.vector()
# [1] 0.3333333 0.3333333 -1.0000000 0.3333333 -0.3333333 -0.3333333 1.0000000 -0.3333333
# round the round-1 predictions to three decimals and use them as the base margin
base_margin = c(0.333, 0.333, -1.000, 0.333, -0.333, -0.333, 1.000, -0.333)
setinfo(dtrain, "init_score", base_margin)
# Estimate model with nrounds = 1, check out final predictions; because the learning_rate is equal to one, these
# should be identical to the predictions from above
model2 <- lgb.train(list(),
                    dtrain,
                    nrounds = 1,
                    min_data = 1,
                    learning_rate = 1,
                    num_leaves = 2,
                    objective = custom_multiclass_obj,
                    eval = custom_multiclass_metric,
                    num_class = 2)
# The two vectors below should be (approximately) identical:
print(data_frame(predict(model1, x, reshape = TRUE) %>% as.vector()))
print(data_frame(predict(model2, x, reshape = TRUE) %>% as.vector() + base_margin))
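# Sanity check: if base margins are indeed interpreted in this class-major order,
# the two vectors agree up to the rounding of base_margin to three decimals.
all.equal(predict(model1, x, reshape = TRUE) %>% as.vector(),
          predict(model2, x, reshape = TRUE) %>% as.vector() + base_margin,
          tolerance = 1e-2)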