Created June 6, 2017 07:29
Multiclass classification (softmax regression) via xgboost custom objective
import numpy as np
from sklearn import datasets
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import OneHotEncoder
import xgboost as xgb

def softmax(z):
    # Subtract the max for numerical stability (softmax is shift-invariant),
    # without modifying the caller's array in place.
    z = z - np.max(z)
    sm = (np.exp(z).T / np.sum(np.exp(z), axis=1)).T
    return sm
def softmaxobj(preds, dtrain):
    """Softmax objective.

    Args:
        preds: (N, K) array, N = #data, K = #classes.
        dtrain: DMatrix object with training data.

    Returns:
        grad: N*K array with gradient values.
        hess: N*K array with second-order gradient values.
    """
    # Label is a vector of class indices for each input example
    labels = dtrain.get_label()
    # When objective=softprob, preds has shape (N, K)
    labels = OneHotEncoder(sparse=False).fit_transform(labels.reshape(-1, 1))
    grad = preds - labels
    hess = 2.0 * preds * (1.0 - preds)
    # Return as 1-d vectors
    return grad.flatten(), hess.flatten()
# Iris dataset
iris = datasets.load_iris()
X, Ymc = iris.data, iris.target
Y = OneHotEncoder(sparse=False).fit_transform(Ymc.reshape(-1, 1))

"""xgboost softmax regression"""
dtrain = xgb.DMatrix(X, label=Ymc)
params = {'max_depth': 2, 'eta': 0.1, 'silent': 1,
          'objective': 'multi:softprob', 'num_class': len(np.unique(Ymc))}
# Fit
model = xgb.train(params, dtrain, 100)
# Evaluate
yhat = model.predict(dtrain)
yhat_labels = np.argmax(yhat, axis=1)
confusion_matrix(Ymc, yhat_labels)
"""xgboost softmax regression via custom loss""" | |
dtrain = xgb.DMatrix(X, label=Ymc) | |
params = {'max_depth': 2, 'eta': 0.1, 'silent': 1, | |
'objective': 'multi:softprob', 'num_class': len(np.unique(Ymc))} | |
# Fit | |
model = xgb.Booster(params, [dtrain]) | |
for _ in range(100): | |
pred = model.predict(dtrain) | |
g, h = softmaxobj(pred, dtrain) | |
model.boost(dtrain, g, h) | |
# Evalute | |
yhat1 = model.predict(dtrain) | |
yhat1_labels = np.argmax(yhat, axis=1) | |
print(confusion_matrix(Ymc, yhat1_labels)) | |
# Compare the two approaches | |
print(confusion_matrix(yhat_labels, yhat1_labels)) | |
np.sum((yhat-yhat1)**2) | |
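As a sanity check on the gradient formula used above (grad = preds - labels), here is a minimal sketch, not part of the original gist, that compares the analytic gradient of the softmax cross-entropy loss against a finite-difference estimate for a single example. All names here (xent, z, y, eps) are illustrative only.

import numpy as np

def xent(z, y):
    # softmax cross-entropy for one example; z = raw scores, y = true class index
    p = np.exp(z - np.max(z))
    p /= p.sum()
    return -np.log(p[y])

rng = np.random.RandomState(0)
z = rng.randn(3)
y = 1
p = np.exp(z - np.max(z))
p /= p.sum()
onehot = np.eye(3)[y]

analytic = p - onehot  # same formula as grad = preds - labels in softmaxobj
numeric = np.zeros_like(z)
eps = 1e-6
for k in range(3):
    zp, zm = z.copy(), z.copy()
    zp[k] += eps
    zm[k] -= eps
    numeric[k] = (xent(zp, y) - xent(zm, y)) / (2 * eps)

print(np.allclose(analytic, numeric, atol=1e-5))  # expected: True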
Here the Hessian happens to be diagonal, and therefore could be represented by a 1D array (per instance). What about the general case, where the Hessian is not diagonal?
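For context on what "diagonal" means here, a small numpy sketch (assuming p is one example's predicted probability vector, with made-up values): the per-example Hessian of the softmax cross-entropy with respect to the raw scores is diag(p) - p p^T, and only its diagonal, p * (1 - p), fits the one-value-per-prediction shape that softmaxobj returns (the gist scales it by 2.0 in the hess line above).

import numpy as np

p = np.array([0.7, 0.2, 0.1])                # one example's softmax output (made-up)
H_full = np.diag(p) - np.outer(p, p)         # full (K, K) Hessian for this example
H_diag = p * (1.0 - p)                       # the diagonal, which softmaxobj scales by 2.0
print(np.allclose(np.diag(H_full), H_diag))  # True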
It seems that something is off in the custom softmax objective functions that I have seen online. I tried to replicate the result of multi:softprob, but it does not work. Here is a reproducible example, similar to the one above:
import numpy as np
from sklearn import datasets
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
import lightgbm as lgb
from sklearn.metrics import log_loss
import xgboost as xgb
# Iris dataset
iris = datasets.load_iris()
X, Ymc = iris.data, iris.target
X_train, X_test, y_train, y_test = train_test_split(X, Ymc, test_size=0.30, random_state=2019)
def softmaxobj(preds, dtrain):
    labels = dtrain.get_label()
    labels = labels.reshape(-1, 1)
    labels = OneHotEncoder(sparse=False, categories='auto').fit_transform(labels)
    grad = preds - labels
    hess = 2.0 * preds * (1.0 - preds)
    return grad.flatten('F'), hess.flatten('F')
dtrain = xgb.DMatrix(X_train, label=y_train)
dvalid = xgb.DMatrix(X_test, label=y_test)
dtest = xgb.DMatrix(X_test)
eval_set=[(dvalid, 'eval')]
param = {'max_depth': '6', 'objective':'multi:softprob',
'tree_method':'hist', 'num_class': 3,
'eval_metric': 'mlogloss'}
bst = xgb.train(param, dtrain, num_boost_round=100,
verbose_eval=10, evals=eval_set)
preds=bst.predict(dtest)
print(log_loss(y_test, preds))
[0] eval-mlogloss:0.749971
[10] eval-mlogloss:0.111583
[20] eval-mlogloss:0.107522
[30] eval-mlogloss:0.116207
[40] eval-mlogloss:0.119493
[50] eval-mlogloss:0.125152
[60] eval-mlogloss:0.130457
[70] eval-mlogloss:0.134239
[80] eval-mlogloss:0.138156
[90] eval-mlogloss:0.141714
[99] eval-mlogloss:0.144838
0.14483839407071677
# train xgboost with the custom softmax objective
param = {'max_depth': '6', 'objective':'multi:softprob',
'tree_method':'hist', 'num_class': 3,
'eval_metric': 'mlogloss'}
bst_cust = xgb.train(param, dtrain, num_boost_round=100,
verbose_eval=10, evals=eval_set,
obj=softmaxobj)
preds_cust=bst_cust.predict(dtest)
print('custom objective {}'.format(log_loss(y_test, preds_cust)))
[0] eval-mlogloss:1.16279
[10] eval-mlogloss:2.63613
[20] eval-mlogloss:5.60595
[30] eval-mlogloss:10.83
[40] eval-mlogloss:15.2783
[50] eval-mlogloss:17.3254
[60] eval-mlogloss:18.8124
[70] eval-mlogloss:19.3257
[80] eval-mlogloss:18.956
[90] eval-mlogloss:17.9979
[99] eval-mlogloss:17.567
custom objective 16.74834724995825
Does anyone see anything wrong with my implementation? I was able to get the binary custom objective function to work with no issues.
I figured out the issue with my softmaxobj: it seems that converting grad.flatten() and hess.flatten() to grad.flatten('F') and hess.flatten('F') greatly impacts the results.
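To make concrete why the flatten order matters, here is a minimal illustration (not from the thread) of the two layouts for an (N, K) gradient array: .flatten() emits the array row by row (all K class gradients for example 0, then example 1, and so on), while .flatten('F') emits it column by column (the class-0 gradient for every example first). If the layout does not match what the installed xgboost version expects, each gradient is silently paired with the wrong class, which could produce divergence like the log above.

import numpy as np

grad = np.array([[0.1, 0.2, 0.3],
                 [0.4, 0.5, 0.6]])  # 2 examples, 3 classes
print(grad.flatten())               # [0.1 0.2 0.3 0.4 0.5 0.6]  row-major ('C')
print(grad.flatten('F'))            # [0.1 0.4 0.2 0.5 0.3 0.6]  column-major ('F')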
Also, setting the gradient and hessian to zero in softmaxobj,
grad = 0 * labels * (1.0 - preds)
hess = 0 * 2.0 * labels * preds * (1.0 - preds)
didn't change the overall result.