Chandrak1907/xgboost.py

## xgboost.py
import numpy as np
from sklearn import datasets
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
# import lightgbm as lgb
from sklearn.metrics import log_loss
import xgboost as xgb

# Iris dataset: hold out a fixed-seed 20% split so runs are repeatable.
iris = datasets.load_iris()
X, y = iris.data, iris.target
seed = 111
test_size = 0.2
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=test_size, random_state=seed)


# Wrap the splits for XGBoost. `dvalid` and `dtest` are built from the same
# rows (X_test): `dvalid` carries labels so it can be monitored during
# training, `dtest` is label-free and is used only for prediction.
dtrain = xgb.DMatrix(X_train, label=y_train)
dvalid = xgb.DMatrix(X_test, label=y_test)
dtest = xgb.DMatrix(X_test)
eval_set = [(dvalid, 'eval')]


# Misclassification costs: row i holds the weights applied to the per-class
# gradient/hessian of a sample whose true class is i (zero on the diagonal).
COST_MATRIX = np.array([[0, 2, 4],
                        [2, 0, 2],
                        [4, 2, 0]])


def softmaxobj(preds, dtrain):
    """Cost-weighted multi-class objective for ``xgb.train(..., obj=...)``.

    Args:
        preds: Per-class scores for every row, shaped
            ``(n_samples, n_classes)`` or the same values flattened row-major.
        dtrain: Object whose ``get_label()`` returns integer class ids in
            ``[0, n_classes)``.

    Returns:
        Tuple ``(grad, hess)`` of 1-D arrays, each of length
        ``n_samples * n_classes`` (row-major).

    Raises:
        IndexError: If a label falls outside ``[0, n_classes)``.
    """
    costs = np.asarray(COST_MATRIX)
    num_class = costs.shape[0]

    class_ids = np.asarray(dtrain.get_label()).astype(int).ravel()
    # Encode against the full class set taken from the cost matrix, so a batch
    # that happens to miss a class still yields one column per class.
    onehot = np.eye(num_class)[class_ids]

    # NOTE(review): the formulas below treat `preds` as class probabilities.
    # Some XGBoost versions hand custom objectives raw margins instead; if
    # that applies to the installed version, apply a softmax here - confirm.
    scores = np.asarray(preds).reshape(-1, num_class)

    # Selecting the cost row of each true class equals onehot . COST_MATRIX.
    weights = costs[class_ids]

    grad = (scores - onehot) * weights
    hess = 2.0 * scores * (1.0 - scores) * weights
    return grad.flatten(), hess.flatten()

# Train XGBoost with the custom cost-weighted objective, then evaluate it.
from sklearn.metrics import accuracy_score

param = {
    'max_depth': 3,  # the maximum depth of each tree
    'eta': 0.3,  # the training step for each iteration
    'verbosity': 0,  # quiet logging (replaces the older `silent` flag)
    'objective': 'multi:softprob',  # multiclass objective configured on the booster
    'num_class': 3,  # the number of classes that exist in this dataset
}
# The original hard-coded 10 rounds in the call while this variable said 5;
# keep the 10 rounds that actually ran and make the variable the single source.
num_round = 10  # the number of training iterations

bst_cust = xgb.train(param, dtrain, num_boost_round=num_round,
                     verbose_eval=1, evals=eval_set,
                     obj=softmaxobj)

preds_cust = bst_cust.predict(dtest)
print('custom objective {}'.format(log_loss(y_test, preds_cust)))

# Pick the highest-scoring class of every row in one vectorised call.
best_preds = np.argmax(preds_cust, axis=1)
accuracy = accuracy_score(y_test, best_preds)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

print(confusion_matrix(y_test, best_preds))
	import numpy as np
	from sklearn import datasets
	from sklearn.preprocessing import OneHotEncoder
	from sklearn.model_selection import train_test_split
	from sklearn.metrics import confusion_matrix
	# import lightgbm as lgb
	from sklearn.metrics import log_loss
	import xgboost as xgb

	# Iris dataset: hold out a fixed-seed 20% split so runs are repeatable.
	iris = datasets.load_iris()
	X, y = iris.data, iris.target
	seed = 111
	test_size = 0.2
	X_train, X_test, y_train, y_test = train_test_split(
		X, y, test_size=test_size, random_state=seed)


	# Wrap the splits for XGBoost. `dvalid` and `dtest` are built from the same
	# rows (X_test): `dvalid` carries labels so it can be monitored during
	# training, `dtest` is label-free and is used only for prediction.
	dtrain = xgb.DMatrix(X_train, label=y_train)
	dvalid = xgb.DMatrix(X_test, label=y_test)
	dtest = xgb.DMatrix(X_test)
	eval_set = [(dvalid, 'eval')]


	# Misclassification costs: row i holds the weights applied to the per-class
	# gradient/hessian of a sample whose true class is i (zero on the diagonal).
	COST_MATRIX = np.array([[0, 2, 4],
	                        [2, 0, 2],
	                        [4, 2, 0]])


	def softmaxobj(preds, dtrain):
		"""Cost-weighted multi-class objective for ``xgb.train(..., obj=...)``.

		Args:
			preds: Per-class scores for every row, shaped
				``(n_samples, n_classes)`` or the same values flattened
				row-major.
			dtrain: Object whose ``get_label()`` returns integer class ids in
				``[0, n_classes)``.

		Returns:
			Tuple ``(grad, hess)`` of 1-D arrays, each of length
			``n_samples * n_classes`` (row-major).

		Raises:
			IndexError: If a label falls outside ``[0, n_classes)``.
		"""
		costs = np.asarray(COST_MATRIX)
		num_class = costs.shape[0]

		class_ids = np.asarray(dtrain.get_label()).astype(int).ravel()
		# Encode against the full class set taken from the cost matrix, so a
		# batch that happens to miss a class still yields one column per class.
		onehot = np.eye(num_class)[class_ids]

		# NOTE(review): the formulas below treat `preds` as class probabilities.
		# Some XGBoost versions hand custom objectives raw margins instead; if
		# that applies to the installed version, apply a softmax here - confirm.
		scores = np.asarray(preds).reshape(-1, num_class)

		# Selecting the cost row of each true class equals onehot . COST_MATRIX.
		weights = costs[class_ids]

		grad = (scores - onehot) * weights
		hess = 2.0 * scores * (1.0 - scores) * weights
		return grad.flatten(), hess.flatten()

	# Train XGBoost with the custom cost-weighted objective, then evaluate it.
	from sklearn.metrics import accuracy_score

	param = {
		'max_depth': 3,  # the maximum depth of each tree
		'eta': 0.3,  # the training step for each iteration
		'verbosity': 0,  # quiet logging (replaces the older `silent` flag)
		'objective': 'multi:softprob',  # multiclass objective configured on the booster
		'num_class': 3,  # the number of classes that exist in this dataset
	}
	# The original hard-coded 10 rounds in the call while this variable said 5;
	# keep the 10 rounds that actually ran and make the variable the single source.
	num_round = 10  # the number of training iterations

	bst_cust = xgb.train(param, dtrain, num_boost_round=num_round,
	                     verbose_eval=1, evals=eval_set,
	                     obj=softmaxobj)

	preds_cust = bst_cust.predict(dtest)
	print('custom objective {}'.format(log_loss(y_test, preds_cust)))

	# Pick the highest-scoring class of every row in one vectorised call.
	best_preds = np.argmax(preds_cust, axis=1)
	accuracy = accuracy_score(y_test, best_preds)
	print("Accuracy: %.2f%%" % (accuracy * 100.0))

	print(confusion_matrix(y_test, best_preds))