"""
Simple implementation of SoftMax regression using gradient descent with a
quasi-optimal adaptive learning rate.
"""
# --> Import standard Python libraries.
import numpy as np
from scipy.special import softmax
from scipy.linalg import norm
from scipy.optimize import minimize_scalar
# --> Import sklearn utility functions.
from sklearn.base import BaseEstimator, ClassifierMixin
def SoftMax(x):
    """
    Protected SoftMax function to avoid overflow when exponentiating
    large numbers.
    """
    # --> Add the zero-logit column associated with the neglected (reference) class.
    x = np.insert(x, x.shape[1], 0, axis=1)
    # --> Row-wise max-normalization to avoid overflow.
    x -= np.max(x, axis=1, keepdims=True)
    return softmax(x, axis=1)
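
# Illustrative sanity check (not part of the original gist): with very large
# logits a naive exponential would overflow, whereas the max-shifted SoftMax
# above still returns finite probabilities that sum to one per row and appends
# a column for the neglected class. The helper name and input values are
# arbitrary.
def _demo_softmax_stability():
    logits = np.array([[1000.0, 0.0], [2.0, -1.0]])
    probas = SoftMax(logits)
    # One extra column is appended and every row sums to one.
    assert probas.shape == (2, 3)
    assert np.allclose(probas.sum(axis=1), 1.0)
    return probas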
def cross_entropy(W, X, y, epsilon=1e-10):
    """
    Categorical cross-entropy averaged over the training examples.
    """
    # --> Evaluate model.
    ypred = SoftMax(X @ W)
    # --> Compute the cross-entropy (epsilon guards against log(0)).
    return -np.mean(np.sum(y * np.log(ypred + epsilon), axis=1))
def cross_entropy_gradient(W, X, y):
    """
    Evaluate the gradient of the cross-entropy function with respect to W.
    """
    # --> Number of training examples.
    m = X.shape[0]
    # --> Evaluate model.
    ypred = SoftMax(X @ W)
    # --> Compute the gradient (the last class serves as the reference).
    gradient = X.T @ (ypred[:, :-1] - y[:, :-1]) / m
    return gradient
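
# Optional finite-difference check (not part of the original gist): compares
# the analytical gradient above with a central-difference approximation of
# cross_entropy. The helper name and step size h are arbitrary; the returned
# norm should be small (it is not exactly zero because of the epsilon term in
# cross_entropy).
def _check_gradient(W, X, y, h=1e-6):
    num_grad = np.zeros_like(W)
    for i in range(W.shape[0]):
        for j in range(W.shape[1]):
            Wp, Wm = W.copy(), W.copy()
            Wp[i, j] += h
            Wm[i, j] -= h
            num_grad[i, j] = (cross_entropy(Wp, X, y) - cross_entropy(Wm, X, y)) / (2*h)
    return norm(num_grad - cross_entropy_gradient(W, X, y))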
class SoftMax_regression(BaseEstimator, ClassifierMixin):
    """
    Implementation of the SoftMax regression. Minimization is performed
    using gradient descent with an adaptive learning rate obtained from an
    inexact line search. The unit term accounting for the bias is prepended
    to X inside `fit`.
    """

    def __init__(self, maxiter=100000, tol=1e-6):
        # --> Maximum number of iterations.
        self.maxiter = maxiter
        # --> Tolerance for the optimizer.
        self.tol = tol
    def fit(self, X, y):
        """
        Gradient descent with an inexact line search for the learning rate
        performed by Brent's method.

        INPUT
        -----
        X : numpy 2D array. Each row corresponds to one training example.
        y : numpy 2D array. One-hot encoded matrix of labels.

        OUTPUT
        ------
        self : The trained SoftMax regression model.
        """
        # --> Add the bias.
        X = np.insert(X, 0, 1, axis=1)
        # --> Number of examples and features.
        m, n = X.shape
        # --> Number of classes - 1.
        k = y.shape[1] - 1
        # --> Initialize the weights matrix.
        W = np.zeros((n, k))
        # --> Maximum value for the learning rate derived from the Lipschitz
        #     constant of the categorical cross-entropy function.
        amax = (k+1)*m / (k*norm(X))
        # --> Training using gradient descent and quasi-optimal stepsize.
        for _ in range(self.maxiter):
            # --> Compute the gradient.
            p = cross_entropy_gradient(W, X, y)
            # --> Check for convergence.
            if norm(p)**2 < self.tol:
                break
            # --> Compute a quasi-optimal stepsize using 1D minimization.
            res = minimize_scalar(
                lambda alpha: cross_entropy(W - alpha*p, X, y),
                bracket=(0, amax),
                method="brent",
                tol=0.1*self.tol*amax
            )
            # --> Worst-case scenario: Brent's method failed (or returned a
            #     negative stepsize). Fall back to half the inverse Lipschitz
            #     constant.
            if res["success"] and (res["x"] > 0):
                alpha = res["x"]
            else:
                alpha = 0.5*amax
            # --> Update the weights.
            W -= alpha*p
        # --> Final weights and biases.
        self.bias, self.W = W[0, :], W[1:, :]
        return self
    def predict_proba(self, X):
        # --> Class-membership probabilities for each example.
        return SoftMax(X @ self.W + self.bias)

    def predict(self, X):
        # --> Most probable class for each example.
        return self.predict_proba(X).argmax(axis=1)
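
# Minimal usage sketch (not part of the original gist): two overlapping
# Gaussian blobs in 2D with one-hot encoded labels. The data, seed and
# hyperparameters are arbitrary illustrative choices.
if __name__ == "__main__":
    rng = np.random.default_rng(0)
    # --> 100 samples per class, centred at (-1, -1) and (+1, +1).
    X = np.vstack([rng.normal(-1, 1, size=(100, 2)),
                   rng.normal(+1, 1, size=(100, 2))])
    labels = np.repeat([0, 1], 100)
    # --> One-hot encode the labels (the model drops the last column internally).
    y = np.eye(2)[labels]
    # --> Fit the model and report the training accuracy.
    model = SoftMax_regression(maxiter=10000).fit(X, y)
    accuracy = np.mean(model.predict(X) == labels)
    print(f"Training accuracy: {accuracy:.3f}")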