dim(preds) vs. dim(predict(...)): what the hell is LightGBM doing?
library(lightgbm)
library(tidyverse)
rm(list = ls())
# Load the iris dataset shipped with R; keep two species (recoded to labels 0 and 1)
# and the first two rows of each, so we end up with 4 samples
data(iris)
iris = as_data_frame(iris) %>%
  mutate(Species = as.numeric(factor(Species)) - 1) %>%
  filter(Species < 2) %>%
  group_by(Species) %>%
  dplyr::slice(1:2) %>%
  ungroup()
x = as.matrix(iris %>% select(-Species))
y = iris %>% pull(Species)
dtrain <- lgb.Dataset(data = x, label = y)
custom_multiclass_obj = function(preds, dtrain) {
  labels = getinfo(dtrain, "label")
  # print preds
  print(data_frame(preds))
  # preds is a matrix with rows corresponding to samples and columns corresponding to classes
  preds = matrix(preds, nrow = length(labels))
  # to prevent overflow, normalize preds by row
  preds = preds - apply(preds, 1, max)
  prob = exp(preds) / rowSums(exp(preds))
  # compute gradient
  grad = prob
  grad[cbind(1:length(labels), labels + 1)] = grad[cbind(1:length(labels), labels + 1)] - 1
  # compute hessian (approximation)
  hess = 2 * prob * (1 - prob)
  return(list(grad = grad, hess = hess))
}
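# Quick standalone illustration (toy numbers, not LightGBM output) of the reshape
# above: R's matrix() fills column by column, so matrix(preds, nrow = length(labels))
# assumes the flat preds vector is grouped by class: all samples' class-0 scores
# first, then all class-1 scores.
toy = c(1, 2, 3, 4, 10, 20, 30, 40)  # 4 samples x 2 classes, grouped by class
matrix(toy, nrow = 4)
#      [,1] [,2]
# [1,]    1   10
# [2,]    2   20
# [3,]    3   30
# [4,]    4   40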
# define custom metric: the multiclass log-loss
custom_multiclass_metric = function(preds, dtrain) {
  labels = getinfo(dtrain, "label")
  preds = matrix(preds, nrow = length(labels))
  preds = preds - apply(preds, 1, max)
  prob = exp(preds) / rowSums(exp(preds))
  return(list(name = "error",
              value = -mean(log(prob[cbind(1:length(labels), labels + 1)])),
              higher_better = FALSE))
}
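# Quick standalone sanity check (toy arithmetic, not LightGBM output): with the
# all-zero init_score set below, the softmax puts probability
# exp(0) / (exp(0) + exp(0)) = 0.5 on each of the two classes, so the implied
# starting log-loss is -log(0.5), regardless of the labels.
-log(exp(0) / (exp(0) + exp(0)))  # 0.6931472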
# init_score must have length n_samples * num_class = 4 * 2 = 8
setinfo(dtrain, "init_score", rep(0, 8))
# Estimate model with nrounds = 2; the objective prints `preds` at the start of each
# round, so the second print-out shows the predictions at the beginning of round 2:
model1 <- lgb.train(list(),
                    dtrain,
                    nrounds = 2,
                    min_data = 1,
                    learning_rate = 1,
                    num_leaves = 2,
                    objective = custom_multiclass_obj,
                    eval = custom_multiclass_metric,
                    num_class = 2)
# # A tibble: 8 x 1
# preds
# <dbl>
# 1 0.333
# 2 0.333
# 3 -1.000
# 4 0.333
# 5 -0.333
# 6 -0.333
# 7 1.000
# 8 -0.333
# Estimate model with nrounds = 1, check out final predictions; because the learning_rate is equal to one, these
# should be identical to the predictions from above
model2 <- lgb.train(list(),
                    dtrain,
                    nrounds = 1,
                    min_data = 1,
                    learning_rate = 1,
                    num_leaves = 2,
                    objective = custom_multiclass_obj,
                    eval = custom_multiclass_metric,
                    num_class = 2)
print(data_frame(predict(model2, x)))
# # A tibble: 8 x 1
# `predict(model2, x)`
# <dbl>
# 1 0.333
# 2 -0.333
# 3 0.333
# 4 -0.333
# 5 -1.000
# 6 1.000
# 7 0.333
# 8 -0.333
# Note that the order is wrong! We need a reshape:
print(data_frame(reshape = predict(model2, x, reshape = TRUE) %>% as.vector()))
# # A tibble: 8 x 1
# reshape
# <dbl>
# 1 0.333
# 2 0.333
# 3 -1.000
# 4 0.333
# 5 -0.333
# 6 -0.333
# 7 1.000
# 8 -0.333
# Conclusion: the predictions we obtain from R's predict function and the vector called `preds`
# inside the custom objective function are stored in different orders! predict() interleaves
# the scores per sample (s1_c0, s1_c1, s2_c0, ...), whereas `preds` is grouped by class
# (s1_c0, s2_c0, ..., then s1_c1, s2_c1, ...).
# This also affects how we deal with base margins! Base margins follow the class-major
# ordering of predict(..., reshape = TRUE) flattened with as.vector(), as the check below shows.
# All of this is due to the lovely helper function RowFunctionFromDenseMatric in the C API,
# which appears to be applied inconsistently.
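# A small helper (my own sketch, not part of the LightGBM API) to convert the
# per-sample-interleaved output of predict() into the class-major order that the
# custom objective sees: matrix(..., byrow = TRUE) reads the flat vector one row
# per sample, and as.vector() flattens the result column-wise, class by class.
to_objective_order = function(flat_preds, num_class) {
  as.vector(matrix(flat_preds, ncol = num_class, byrow = TRUE))
}
to_objective_order(predict(model2, x), num_class = 2)
# [1]  0.333  0.333 -1.000  0.333 -0.333 -0.333  1.000 -0.333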
# Check base margins
# Estimate model with nrounds = 2, check out predictions at the beginning of round 2:
model1 <- lgb.train(list(),
                    dtrain,
                    nrounds = 2,
                    min_data = 1,
                    learning_rate = 1,
                    num_leaves = 2,
                    objective = custom_multiclass_obj,
                    eval = custom_multiclass_metric,
                    num_class = 2)
predict(model1, data = x, num_iteration = 1, reshape = TRUE) %>% as.vector()
# [1] 0.3333333 0.3333333 -1.0000000 0.3333333 -0.3333333 -0.3333333 1.0000000 -0.3333333
# round the round-1 predictions to three decimals and use them as the base margin
base_margin = c(0.333, 0.333, -1.000, 0.333, -0.333, -0.333, 1.000, -0.333)
setinfo(dtrain, "init_score", base_margin)
# Estimate model with nrounds = 1, check out final predictions; because the learning_rate is equal to one, these
# should be identical to the predictions from above
model2 <- lgb.train(list(),
                    dtrain,
                    nrounds = 1,
                    min_data = 1,
                    learning_rate = 1,
                    num_leaves = 2,
                    objective = custom_multiclass_obj,
                    eval = custom_multiclass_metric,
                    num_class = 2)
# The two vectors below should be (approximately) identical:
print(data_frame(predict(model1, x, reshape = TRUE) %>% as.vector()))
print(data_frame(predict(model2, x, reshape = TRUE) %>% as.vector() + base_margin))
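# Sanity check: if base margins are indeed interpreted in this class-major order,
# the two vectors agree up to the rounding of base_margin to three decimals.
all.equal(predict(model1, x, reshape = TRUE) %>% as.vector(),
          predict(model2, x, reshape = TRUE) %>% as.vector() + base_margin,
          tolerance = 1e-2)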