@Ekeany
Last active December 6, 2023 11:46
A numpy/pandas implementation of XGBoost
import numpy as np
import pandas as pd


class Node:
    '''
    A node object that is recursively called within itself to construct a regression tree. Based on Tianqi Chen's XGBoost,
    the internal gain used to find the optimal split value uses both the gradient and the hessian. The weighted quantile
    sketch and the optimal leaf values also follow Chen's description in "XGBoost: A Scalable Tree Boosting System". The only
    thing not implemented in this version is sparsity-aware fitting, i.e. the ability to handle NA values with a default direction.

    Inputs
    ------------------------------------------------------------------------------------------------------------------
    x: pandas dataframe of the training data
    gradient: first order derivative (gradient) of the loss function evaluated at the current predictions
    hessian: second order derivative of the loss function evaluated at the current predictions
    idxs: used to keep track of which samples belong to this node within the tree structure
    subsample_cols: layer-wise column subsampling that randomises the structure of the trees (complexity parameter)
    min_leaf: minimum number of samples a child must contain for a split to be considered (complexity parameter)
    min_child_weight: minimum sum of the hessian inside a node, a measure of purity (complexity parameter)
    depth: limits the number of layers in the tree
    lambda_: L2 regularisation term on the leaf weights. Increasing this value makes the model more conservative.
    gamma: also prevents overfitting and appears in the calculation of the gain (structure score).
           As it is subtracted from the gain, it effectively sets a minimum gain required to split a node.
    eps: used in the weighted quantile sketch ('approx' tree method); roughly translates to 1 / eps candidate bins

    Outputs
    --------------------------------------------------------------------------------------------------------------------
    A single tree object that will be used for gradient boosting.
    '''

    def __init__(self, x, gradient, hessian, idxs, subsample_cols=0.8, min_leaf=5, min_child_weight=1, depth=10, lambda_=1, gamma=1, eps=0.1):
        self.x, self.gradient, self.hessian = x, gradient, hessian
        self.idxs = idxs
        self.depth = depth
        self.min_leaf = min_leaf
        self.lambda_ = lambda_
        self.gamma = gamma
        self.min_child_weight = min_child_weight
        self.row_count = len(idxs)
        self.col_count = x.shape[1]
        self.subsample_cols = subsample_cols
        self.eps = eps
        # columns considered for splits at this node (layer-wise column subsampling)
        self.column_subsample = np.random.permutation(self.col_count)[:round(self.subsample_cols * self.col_count)]
        self.val = self.compute_gamma(self.gradient[self.idxs], self.hessian[self.idxs])
        self.score = float('-inf')
        self.find_varsplit()

    def compute_gamma(self, gradient, hessian):
        '''
        Calculates the optimal leaf value, equation (5) in "XGBoost: A Scalable Tree Boosting System"
        '''
        return -np.sum(gradient) / (np.sum(hessian) + self.lambda_)
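    # Worked example for compute_gamma (which, despite its name, returns the leaf weight w*, not the
    # gamma regularisation parameter): a node whose gradients sum to G = 4.0 and whose hessians sum
    # to H = 3.0 gets, with lambda_ = 1, the leaf weight w* = -G / (H + lambda_) = -4.0 / 4.0 = -1.0.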
    def find_varsplit(self):
        '''
        Scans through every column in the column subsample and calculates the best split point.
        The node is then split at that point and two new child nodes are created.
        Depth is the only parameter that changes, as we have added a new layer to the tree structure.
        If no split is better than the score initialised at the beginning, no further splits are made.
        '''
        for c in self.column_subsample:
            self.find_greedy_split(c)
        if self.is_leaf:
            return
        x = self.split_col
        lhs = np.nonzero(x <= self.split)[0]
        rhs = np.nonzero(x > self.split)[0]
        self.lhs = Node(x=self.x, gradient=self.gradient, hessian=self.hessian, idxs=self.idxs[lhs],
                        min_leaf=self.min_leaf, depth=self.depth - 1, lambda_=self.lambda_, gamma=self.gamma,
                        min_child_weight=self.min_child_weight, eps=self.eps, subsample_cols=self.subsample_cols)
        self.rhs = Node(x=self.x, gradient=self.gradient, hessian=self.hessian, idxs=self.idxs[rhs],
                        min_leaf=self.min_leaf, depth=self.depth - 1, lambda_=self.lambda_, gamma=self.gamma,
                        min_child_weight=self.min_child_weight, eps=self.eps, subsample_cols=self.subsample_cols)

    def find_greedy_split(self, var_idx):
        '''
        For a given feature, greedily calculates the gain at each possible split point.
        Globally updates the best score and split point if a better split is found.
        '''
        x = self.x.values[self.idxs, var_idx]
        hessian_ = self.hessian[self.idxs]

        for r in range(self.row_count):
            lhs = x <= x[r]
            rhs = x > x[r]

            lhs_indices = np.nonzero(x <= x[r])[0]
            rhs_indices = np.nonzero(x > x[r])[0]
            # skip splits that leave too few samples or too little hessian weight in a child
            if (rhs.sum() < self.min_leaf or lhs.sum() < self.min_leaf
                    or hessian_[lhs_indices].sum() < self.min_child_weight
                    or hessian_[rhs_indices].sum() < self.min_child_weight):
                continue

            curr_score = self.gain(lhs, rhs)
            if curr_score > self.score:
                self.var_idx = var_idx
                self.score = curr_score
                self.split = x[r]
    def weighted_quantile_sketch(self, var_idx):
        '''
        (XGBoost mini-version, Yiyang "Joe" Zeng)
        An approximation to the exact greedy approach, faster for bigger datasets where it is not feasible
        to calculate the gain at every split point. Uses equations (8) and (9) from "XGBoost: A Scalable Tree Boosting System".
        '''
        x = self.x.values[self.idxs, var_idx]
        hessian_ = self.hessian[self.idxs]
        df = pd.DataFrame({'feature': x, 'hess': hessian_})
        df.sort_values(by=['feature'], ascending=True, inplace=True)
        hess_sum = df['hess'].sum()
        # equation (8): the rank of a candidate is the hessian-weighted fraction of points strictly below it
        df['rank'] = df.apply(lambda row_: (1 / hess_sum) * sum(df[df['feature'] < row_['feature']]['hess']), axis=1)

        for row in range(df.shape[0] - 1):
            # look at the current rank and the next rank
            rk_sk_j, rk_sk_j_1 = df['rank'].iloc[row:row + 2]
            diff = abs(rk_sk_j - rk_sk_j_1)
            if diff >= self.eps:
                continue

            # candidate split point: the midpoint of the two adjacent feature values
            split_value = (df['feature'].iloc[row + 1] + df['feature'].iloc[row]) / 2
            lhs = x <= split_value
            rhs = x > split_value

            lhs_indices = np.nonzero(x <= split_value)[0]
            rhs_indices = np.nonzero(x > split_value)[0]
            if (rhs.sum() < self.min_leaf or lhs.sum() < self.min_leaf
                    or hessian_[lhs_indices].sum() < self.min_child_weight
                    or hessian_[rhs_indices].sum() < self.min_child_weight):
                continue

            curr_score = self.gain(lhs, rhs)
            if curr_score > self.score:
                self.var_idx = var_idx
                self.score = curr_score
                self.split = split_value
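    # Worked example for the rank used above: with sorted feature values [1, 2, 3] and hessians
    # [0.5, 0.25, 0.25] (hess_sum = 1.0), equation (8) gives rk(1) = 0.0, rk(2) = 0.5 and
    # rk(3) = 0.75, i.e. the hessian-weighted fraction of points strictly below each candidate.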
    def gain(self, lhs, rhs):
        '''
        Calculates the gain at a particular split point based on equation (7) from
        "XGBoost: A Scalable Tree Boosting System"
        '''
        gradient = self.gradient[self.idxs]
        hessian = self.hessian[self.idxs]

        lhs_gradient = gradient[lhs].sum()
        lhs_hessian = hessian[lhs].sum()

        rhs_gradient = gradient[rhs].sum()
        rhs_hessian = hessian[rhs].sum()

        gain = 0.5 * ((lhs_gradient ** 2 / (lhs_hessian + self.lambda_))
                      + (rhs_gradient ** 2 / (rhs_hessian + self.lambda_))
                      - ((lhs_gradient + rhs_gradient) ** 2 / (lhs_hessian + rhs_hessian + self.lambda_))) - self.gamma
        return gain
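    # Worked example for the gain above: if the left child has G_L = -4, H_L = 2 and the right
    # child has G_R = 4, H_R = 2, then with lambda_ = 1 and gamma = 1 the gain is
    # 0.5 * (16/3 + 16/3 - 0/5) - 1 ≈ 4.33, so this split would be kept; a split whose
    # structure-score improvement is smaller than gamma comes out negative and is rejected.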
    @property
    def split_col(self):
        '''
        Returns the values of the splitting feature for the samples in this node.
        '''
        return self.x.values[self.idxs, self.var_idx]

    @property
    def is_leaf(self):
        '''
        Checks if a node is a leaf: either no split improved on the initial score or the maximum depth was reached.
        '''
        return self.score == float('-inf') or self.depth <= 0

    def predict(self, x):
        return np.array([self.predict_row(xi) for xi in x])

    def predict_row(self, xi):
        if self.is_leaf:
            return self.val
        # walk down the tree until a leaf is reached
        node = self.lhs if xi[self.var_idx] <= self.split else self.rhs
        return node.predict_row(xi)

class XGBoostTree:
    '''
    Wrapper class that provides a scikit-learn style fit/predict interface to the recursive regression tree above.
    The hyperparameters of fit() are passed straight through to Node; see the Node docstring for their meaning.

    Outputs
    --------------------------------------------------------------------------------------------------------------------
    A single tree object that will be used for gradient boosting.
    '''

    def fit(self, x, gradient, hessian, subsample_cols=0.8, min_leaf=5, min_child_weight=1, depth=10, lambda_=1, gamma=1, eps=0.1):
        self.dtree = Node(x, gradient, hessian, np.arange(len(x)), subsample_cols, min_leaf, min_child_weight, depth, lambda_, gamma, eps)
        return self

    def predict(self, X):
        return self.dtree.predict(X.values)

class XGBoostClassifier:
    '''
    Full application of the XGBoost algorithm as described in "XGBoost: A Scalable Tree Boosting System",
    for binary classification with the log loss.

    The hyperparameters (subsample_cols, min_child_weight, depth, min_leaf, learning_rate, boosting_rounds,
    lambda_, gamma, eps) are supplied to fit() and passed on to each XGBoostTree; see the Node docstring
    above for their meaning.
    '''

    def __init__(self):
        self.estimators = []

    @staticmethod
    def sigmoid(x):
        return 1 / (1 + np.exp(-x))

    # first order gradient of the log loss with respect to the raw prediction (log-odds)
    def grad(self, preds, labels):
        preds = self.sigmoid(preds)
        return preds - labels

    # second order gradient (hessian) of the log loss with respect to the raw prediction
    def hess(self, preds, labels):
        preds = self.sigmoid(preds)
        return preds * (1 - preds)
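    # Derivation sketch for the two functions above: with raw score F and p = sigmoid(F), the
    # log loss is l = -[y*log(p) + (1 - y)*log(1 - p)]. Differentiating with respect to F gives
    # dl/dF = p - y (the gradient) and d2l/dF2 = p*(1 - p) (the hessian), which is exactly what
    # grad() and hess() return.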
    @staticmethod
    def log_odds(column):
        # log-odds of the positive class; currently unused, since fit() below starts from a constant score of 1
        binary_yes = np.count_nonzero(column == 1)
        binary_no = np.count_nonzero(column == 0)
        return np.log(binary_yes / binary_no)

    def fit(self, X, y, subsample_cols=0.8, min_child_weight=1, depth=5, min_leaf=5, learning_rate=0.4, boosting_rounds=5, lambda_=1.5, gamma=1, eps=0.1):
        self.X, self.y = X, y.values
        self.depth = depth
        self.subsample_cols = subsample_cols
        self.eps = eps
        self.min_child_weight = min_child_weight
        self.min_leaf = min_leaf
        self.learning_rate = learning_rate
        self.boosting_rounds = boosting_rounds
        self.lambda_ = lambda_
        self.gamma = gamma

        # initial raw prediction (log-odds space): a constant score of 1 for every sample
        self.base_pred = np.full((X.shape[0], 1), 1).flatten().astype('float64')

        for booster in range(self.boosting_rounds):
            Grad = self.grad(self.base_pred, self.y)
            Hess = self.hess(self.base_pred, self.y)
            boosting_tree = XGBoostTree().fit(self.X, Grad, Hess, depth=self.depth, min_leaf=self.min_leaf,
                                              lambda_=self.lambda_, gamma=self.gamma, eps=self.eps,
                                              min_child_weight=self.min_child_weight, subsample_cols=self.subsample_cols)
            self.base_pred += self.learning_rate * boosting_tree.predict(self.X)
            self.estimators.append(boosting_tree)

    def predict_proba(self, X):
        pred = np.zeros(X.shape[0])
        for estimator in self.estimators:
            pred += self.learning_rate * estimator.predict(X)
        return self.sigmoid(np.full((X.shape[0], 1), 1).flatten().astype('float64') + pred)

    def predict(self, X):
        pred = np.zeros(X.shape[0])
        for estimator in self.estimators:
            pred += self.learning_rate * estimator.predict(X)
        predicted_probas = self.sigmoid(np.full((X.shape[0], 1), 1).flatten().astype('float64') + pred)
        # threshold at the mean predicted probability rather than at 0.5
        preds = np.where(predicted_probas > np.mean(predicted_probas), 1, 0)
        return preds

class XGBoostRegressor:
    '''
    Full application of the XGBoost algorithm as described in "XGBoost: A Scalable Tree Boosting System",
    for regression with the squared error loss.

    The hyperparameters (subsample_cols, min_child_weight, depth, min_leaf, learning_rate, boosting_rounds,
    lambda_, gamma, eps) are supplied to fit() and passed on to each XGBoostTree; see the Node docstring
    above for their meaning.
    '''

    def __init__(self):
        self.estimators = []

    # first order gradient of the squared error
    @staticmethod
    def grad(preds, labels):
        return 2 * (preds - labels)

    # second order gradient (hessian) of the squared error
    @staticmethod
    def hess(preds, labels):
        '''
        The hessian of the squared error is a constant value of two; returns an array of twos.
        '''
        return np.full((preds.shape[0], 1), 2).flatten().astype('float64')
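    # Derivation sketch for the two functions above: with prediction F and target y, the squared
    # error is l = (F - y)**2, so dl/dF = 2*(F - y) and d2l/dF2 = 2, which is exactly what
    # grad() and hess() return.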
    def fit(self, X, y, subsample_cols=0.8, min_child_weight=1, depth=5, min_leaf=5, learning_rate=0.4, boosting_rounds=5, lambda_=1.5, gamma=1, eps=0.1):
        self.X, self.y = X, y.values
        self.depth = depth
        self.subsample_cols = subsample_cols
        self.eps = eps
        self.min_child_weight = min_child_weight
        self.min_leaf = min_leaf
        self.learning_rate = learning_rate
        self.boosting_rounds = boosting_rounds
        self.lambda_ = lambda_
        self.gamma = gamma

        # initial prediction: the mean of the training target for every sample
        self.base_pred = np.full((X.shape[0], 1), np.mean(y)).flatten().astype('float64')

        for booster in range(self.boosting_rounds):
            Grad = self.grad(self.base_pred, self.y)
            Hess = self.hess(self.base_pred, self.y)
            boosting_tree = XGBoostTree().fit(self.X, Grad, Hess, depth=self.depth, min_leaf=self.min_leaf,
                                              lambda_=self.lambda_, gamma=self.gamma, eps=self.eps,
                                              min_child_weight=self.min_child_weight, subsample_cols=self.subsample_cols)
            self.base_pred += self.learning_rate * boosting_tree.predict(self.X)
            self.estimators.append(boosting_tree)

    def predict(self, X):
        pred = np.zeros(X.shape[0])
        for estimator in self.estimators:
            pred += self.learning_rate * estimator.predict(X)
        # constant base prediction (mean of the training target) plus the boosted corrections
        return np.full((X.shape[0], 1), np.mean(self.y)).flatten().astype('float64') + pred
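
# A minimal usage sketch on synthetic data. The feature names ('f0', 'f1') and the data itself
# are illustrative placeholders; any numeric pandas DataFrame together with a matching pandas
# Series of 0/1 labels (classifier) or continuous targets (regressor) should work the same way.
if __name__ == '__main__':
    rng = np.random.default_rng(0)
    X_demo = pd.DataFrame({'f0': rng.normal(size=200), 'f1': rng.normal(size=200)})

    # binary classification: the label is 1 when the two features sum to a positive number
    y_clf = (X_demo['f0'] + X_demo['f1'] > 0).astype(int)
    clf = XGBoostClassifier()
    clf.fit(X_demo, y_clf, depth=3, boosting_rounds=10, learning_rate=0.3)
    print('train accuracy:', np.mean(clf.predict(X_demo) == y_clf.values))

    # regression: a noisy linear target
    y_reg = 3.0 * X_demo['f0'] - 2.0 * X_demo['f1'] + rng.normal(scale=0.1, size=200)
    reg = XGBoostRegressor()
    reg.fit(X_demo, y_reg, depth=3, boosting_rounds=10, learning_rate=0.3)
    print('train RMSE:', np.sqrt(np.mean((reg.predict(X_demo) - y_reg.values) ** 2)))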
@PutraResearch

Hello, I have a question: why, on line 35, is self.hessian = x? Is there any explanation? Thank you.
