@ytsaig
Created June 6, 2017 07:29
Multiclass classification (softmax regression) via xgboost custom objective
import numpy as np
from sklearn import datasets
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import OneHotEncoder
import xgboost as xgb


def softmax(z):
    z -= np.max(z)
    sm = (np.exp(z).T / np.sum(np.exp(z), axis=1)).T
    return sm


def softmaxobj(preds, dtrain):
    """Softmax objective.

    Args:
        preds: (N, K) array, N = #data, K = #classes.
        dtrain: DMatrix object with training data.

    Returns:
        grad: N*K array with gradient values.
        hess: N*K array with second-order gradient values.
    """
    # Label is a vector of class indices for each input example
    labels = dtrain.get_label()
    # When objective=softprob, preds has shape (N, K)
    labels = OneHotEncoder(sparse=False).fit_transform(labels.reshape(-1, 1))
    # Gradient of the cross-entropy loss w.r.t. the logits: p - y
    grad = preds - labels
    # Diagonal approximation of the Hessian: 2 * p * (1 - p)
    hess = 2.0 * preds * (1.0 - preds)
    # Return as 1-d vectors
    return grad.flatten(), hess.flatten()


# Iris dataset
iris = datasets.load_iris()
X, Ymc = iris.data, iris.target
Y = OneHotEncoder(sparse=False).fit_transform(Ymc.reshape(-1, 1))
"""xgboost softmax regression"""
dtrain = xgb.DMatrix(X, label=Ymc)
params = {'max_depth': 2, 'eta': 0.1, 'silent': 1,
          'objective': 'multi:softprob', 'num_class': len(np.unique(Ymc))}
# Fit
model = xgb.train(params, dtrain, 100)
# Evaluate
yhat = model.predict(dtrain)
yhat_labels = np.argmax(yhat, axis=1)
print(confusion_matrix(Ymc, yhat_labels))
"""xgboost softmax regression via custom loss"""
dtrain = xgb.DMatrix(X, label=Ymc)
params = {'max_depth': 2, 'eta': 0.1, 'silent': 1,
          'objective': 'multi:softprob', 'num_class': len(np.unique(Ymc))}
# Fit
model = xgb.Booster(params, [dtrain])
for _ in range(100):
    pred = model.predict(dtrain)
    g, h = softmaxobj(pred, dtrain)
    model.boost(dtrain, g, h)
# Evaluate
yhat1 = model.predict(dtrain)
yhat1_labels = np.argmax(yhat1, axis=1)
print(confusion_matrix(Ymc, yhat1_labels))
# Compare the two approaches
print(confusion_matrix(yhat_labels, yhat1_labels))
print(np.sum((yhat - yhat1)**2))
@sergey-lebedev

sergey-lebedev commented Oct 10, 2018

Also, setting the gradient and Hessian to zero in softmaxobj,
grad = 0 * labels * (1.0 - preds)
hess = 0 * 2.0 * labels * preds * (1.0 - preds)
didn't change the overall result.
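
A quick way to sanity-check this kind of claim is to boost with all-zero gradients and Hessians and confirm the predictions never move off the base score, since each leaf weight -G/(H + lambda) is then 0. A minimal sketch along those lines (not from the gist; it follows the gist's Booster.boost(dtrain, grad, hess) usage, whose signature may differ in newer xgboost versions):

import numpy as np
import xgboost as xgb
from sklearn import datasets

iris = datasets.load_iris()
dtrain = xgb.DMatrix(iris.data, label=iris.target)
params = {'max_depth': 2, 'eta': 0.1, 'objective': 'multi:softprob', 'num_class': 3}
model = xgb.Booster(params, [dtrain])

# One gradient/Hessian value per (example, class) entry, all zero
zeros = np.zeros(iris.data.shape[0] * 3)
before = model.predict(dtrain)
for _ in range(10):
    model.boost(dtrain, zeros, zeros)  # zero grad/hess -> every leaf weight is 0
after = model.predict(dtrain)
print(np.allclose(before, after))  # expected: True, nothing was learned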

@chargeshivers

Here the Hessian happens to be diagonal, and therefore could be represented by a 1D array (per instance). What about the general case, where the Hessian is not diagonal?
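
For what it's worth, the xgboost custom-objective interface only accepts one gradient and one Hessian value per (row, class) entry, so in the general case you can only pass the diagonal of the per-instance Hessian and the off-diagonal terms are dropped. For softmax the full per-instance Hessian is actually diag(p) - p p^T, which is not diagonal; the gist keeps only its diagonal, scaled by 2, which is also what xgboost's built-in multiclass objective uses. A small illustrative sketch (the probabilities are made up):

import numpy as np

def full_softmax_hessian(p):
    # Full per-instance Hessian of cross-entropy w.r.t. the logits: diag(p) - p p^T
    return np.diag(p) - np.outer(p, p)

p = np.array([0.7, 0.2, 0.1])   # softmax output for a single example
H = full_softmax_hessian(p)
print(H)                        # off-diagonal entries are -p_i * p_j, dropped by the API
print(np.diag(H))               # p * (1 - p): the diagonal a custom objective can pass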

@rydevera3

rydevera3 commented May 14, 2019

It seems that something is off in the custom softmax objective functions I have seen online. I tried to replicate the result of multi:softprob but it does not work. Here is a reproducible example similar to the one above:

import numpy as np
from sklearn import datasets
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
import lightgbm as lgb
from sklearn.metrics import log_loss
import xgboost as xgb

# Iris dataset
iris = datasets.load_iris()
X, Ymc = iris.data, iris.target

X_train, X_test, y_train, y_test = train_test_split(X, Ymc, test_size=0.30, random_state=2019)

def softmaxobj(preds, dtrain):
    
    labels = dtrain.get_label()
    labels = labels.reshape(-1, 1)
    labels = OneHotEncoder(sparse=False, categories='auto').fit_transform(labels)
    
    grad = preds - labels
    hess = 2.0 * preds * (1.0 - preds)
    return grad.flatten('F'), hess.flatten('F')

dtrain = xgb.DMatrix(X_train, label=y_train)
dvalid = xgb.DMatrix(X_test, label=y_test)
dtest = xgb.DMatrix(X_test)
eval_set=[(dvalid, 'eval')]

param = {'max_depth': '6', 'objective':'multi:softprob', 
         'tree_method':'hist', 'num_class': 3,
         'eval_metric': 'mlogloss'}

bst = xgb.train(param, dtrain, num_boost_round=100, 
                verbose_eval=10, evals=eval_set)

preds=bst.predict(dtest)
print(log_loss(y_test, preds))

[0]	eval-mlogloss:0.749971
[10]	eval-mlogloss:0.111583
[20]	eval-mlogloss:0.107522
[30]	eval-mlogloss:0.116207
[40]	eval-mlogloss:0.119493
[50]	eval-mlogloss:0.125152
[60]	eval-mlogloss:0.130457
[70]	eval-mlogloss:0.134239
[80]	eval-mlogloss:0.138156
[90]	eval-mlogloss:0.141714
[99]	eval-mlogloss:0.144838
0.14483839407071677

# train xgboost with the custom loss function
param = {'max_depth': '6', 'objective':'multi:softprob', 
         'tree_method':'hist', 'num_class': 3,
         'eval_metric': 'mlogloss'}

bst_cust = xgb.train(param, dtrain, num_boost_round=100, 
                     verbose_eval=10, evals=eval_set,
                     obj=softmaxobj)

preds_cust=bst_cust.predict(dtest)
print('custom objective {}'.format(log_loss(y_test, preds_cust)))

[0]	eval-mlogloss:1.16279
[10]	eval-mlogloss:2.63613
[20]	eval-mlogloss:5.60595
[30]	eval-mlogloss:10.83
[40]	eval-mlogloss:15.2783
[50]	eval-mlogloss:17.3254
[60]	eval-mlogloss:18.8124
[70]	eval-mlogloss:19.3257
[80]	eval-mlogloss:18.956
[90]	eval-mlogloss:17.9979
[99]	eval-mlogloss:17.567
custom objective 16.74834724995825

Does anyone see anything wrong with my implementation? I was able to get the binary custom objective function to work with no issues.

I figured out the issue with my softmaxobj: it seems that changing grad.flatten() and hess.flatten() to grad.flatten('F') and hess.flatten('F') greatly impacts the results.
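
To illustrate why the flattening order matters so much: grad and hess carry one value per (example, class), and flatten() (row-major) versus flatten('F') (column-major) interleave those values differently, so if the layout does not match what the booster expects, every tree is fit to gradients from the wrong (example, class) pairs. A tiny sketch with made-up numbers:

import numpy as np

# N = 2 examples, K = 3 classes
grad = np.array([[0.1, 0.2, 0.3],
                 [0.4, 0.5, 0.6]])

print(grad.flatten())     # 'C' (row-major):    [0.1 0.2 0.3 0.4 0.5 0.6] -> example by example
print(grad.flatten('F'))  # 'F' (column-major): [0.1 0.4 0.2 0.5 0.3 0.6] -> class by class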
