Simple implementation of SoftMax regression using gradient descent with quasi-optimal adaptive learning rate.
# --> Import standard Python libraries.
import numpy as np

from scipy.special import softmax
from scipy.linalg import norm
from scipy.optimize import minimize_scalar

# --> Import sklearn utility functions.
from sklearn.base import BaseEstimator, ClassifierMixin
def SoftMax(x):
    """
    Protected SoftMax function to avoid overflow due to
    exponentiating large numbers.
    """
    # --> Append a zero logit for the neglected (reference) class,
    #     whose weights are implicitly pinned to zero.
    x = np.insert(x, x.shape[1], 0, axis=1)

    # --> Max-normalization to avoid overflow (softmax is invariant
    #     to subtracting a constant from the logits).
    x -= np.max(x)

    return softmax(x, axis=1)
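# --> Illustrative example (an addition to the gist): a row of two large,
#     equal logits maps to probabilities over three classes, the implicit
#     reference class receiving logit 0, and no overflow occurs.
# >>> SoftMax(np.array([[1000.0, 1000.0]]))
# array([[0.5, 0.5, 0. ]])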
def cross_entropy(W, X, y, epsilon=1e-10):
    """
    Implementation of the categorical cross-entropy. The small
    epsilon guards against taking the log of zero.
    """
    # --> Evaluate model.
    ypred = SoftMax(X @ W)

    # --> Compute the cross-entropy.
    return -np.mean(y * np.log(ypred + epsilon))
def cross_entropy_gradient(W, X, y):
    """
    Evaluate the gradient of the cross-entropy function.
    """
    # --> Number of training examples.
    m = X.shape[0]

    # --> Evaluate model.
    ypred = SoftMax(X @ W)

    # --> Compute the gradient. The last column is dropped since the
    #     weights of the reference class are fixed to zero.
    gradient = X.T @ (ypred[:, :-1] - y[:, :-1]) / m

    return gradient
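# --> Illustrative sanity check (an addition, not part of the original gist):
#     the analytical gradient above can be verified against a central
#     finite-difference estimate of `cross_entropy`. Note the scaling
#     subtlety: `cross_entropy` averages over all m*(k+1) entries while the
#     gradient divides by m only, so the numerical estimate must be
#     multiplied by (k+1) to match.
def check_gradient(W, X, y, h=1e-6):
    """
    Return the norm of the difference between the analytical gradient
    and a central finite-difference estimate (should be close to zero).
    """
    k = W.shape[1]
    num = np.zeros_like(W)
    for idx in np.ndindex(*W.shape):
        # --> Perturb one entry of W at a time.
        dW = np.zeros_like(W)
        dW[idx] = h
        num[idx] = (cross_entropy(W + dW, X, y) - cross_entropy(W - dW, X, y)) / (2*h)
    return norm(cross_entropy_gradient(W, X, y) - (k+1)*num)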
class SoftMax_regression(BaseEstimator, ClassifierMixin):
    """
    Implementation of the SoftMax regression. Minimization is performed
    using gradient descent with an adaptive learning rate obtained from
    an inexact line search. Note that, for the sake of simplicity, a unit
    (bias) column is prepended to X internally by fit, so the first row
    of the weights matrix plays the role of the bias.
    """

    def __init__(self, maxiter=100000, tol=1e-6):
        # --> Maximum number of iterations.
        self.maxiter = maxiter

        # --> Tolerance for the optimizer.
        self.tol = tol
    def fit(self, X, y):
        """
        Implementation of the gradient descent with inexact line search
        for the learning rate performed by Brent's method.

        INPUT
        -----
        X : numpy 2D array. Each row corresponds to one training example.

        y : numpy 2D array. One-hot encoded vector of labels.

        OUTPUT
        ------
        self : The trained SoftMax regression model.
        """
        # --> Add the bias.
        X = np.insert(X, 0, 1, axis=1)

        # --> Number of examples and features.
        m, n = X.shape

        # --> Number of classes - 1.
        k = y.shape[1] - 1

        # --> Initialize the weights matrix.
        W = np.zeros((n, k))

        # --> Maximum value for the learning rate derived from the Lipschitz
        #     constant of the categorical cross-entropy function.
        amax = (k+1)*m / (k*norm(X))
        # --> Training using gradient descent and quasi-optimal stepsize.
        for _ in range(self.maxiter):
            # --> Compute the gradient.
            p = cross_entropy_gradient(W, X, y)

            # --> Check for convergence.
            if norm(p)**2 < self.tol:
                break

            # --> Compute a quasi-optimal stepsize using 1D minimization.
            res = minimize_scalar(
                lambda alpha: cross_entropy(W - alpha*p, X, y),
                bracket=(0, amax),
                method="brent",
                tol=0.1*self.tol*amax
            )

            # --> Worst-case scenario: Brent's method failed (or returned a
            #     negative stepsize). Fall back to half the maximum stepsize
            #     given by the Lipschitz constant.
            if res["success"] and (res["x"] > 0):
                alpha = res["x"]
            else:
                alpha = 0.5*amax

            # --> Update the weights.
            W -= alpha*p

        # --> Final weights and biases.
        self.bias, self.W = W[0, :], W[1:, :]

        return self
    def predict_proba(self, X):
        return SoftMax(X @ self.W + self.bias)

    def predict(self, X):
        return self.predict_proba(X).argmax(axis=1)
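A minimal usage sketch (an addition, not part of the original gist): fit expects one-hot encoded targets, so the snippet below wires the model to scikit-learn's OneHotEncoder on the Iris dataset. The variable names are illustrative.

# --> Usage sketch: train on Iris and report the training accuracy.
#     Assumes scikit-learn is installed.
if __name__ == "__main__":
    from sklearn.datasets import load_iris
    from sklearn.preprocessing import OneHotEncoder

    # --> Load a small multiclass dataset.
    X, labels = load_iris(return_X_y=True)

    # --> One-hot encode the targets (use sparse=False instead of
    #     sparse_output=False on scikit-learn < 1.2).
    y = OneHotEncoder(sparse_output=False).fit_transform(labels.reshape(-1, 1))

    # --> Train the model and evaluate it on the training set.
    clf = SoftMax_regression(maxiter=10000, tol=1e-8).fit(X, y)
    print("Training accuracy:", (clf.predict(X) == labels).mean())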