Simple implementation of SoftMax regression using gradient descent with quasi-optimal adaptive learning rate.
# --> Import standard Python libraries.
import numpy as np

from scipy.special import softmax
from scipy.linalg import norm
from scipy.optimize import minimize_scalar

# --> Import sklearn utility functions.
from sklearn.base import BaseEstimator, ClassifierMixin
def SoftMax(x):
    """
    Protected SoftMax function to avoid overflow due to
    exponentiating large numbers.
    """
    # --> Append a zero logit for the neglected (reference) class,
    #     whose weights are implicitly pinned to zero.
    x = np.insert(x, x.shape[1], 0, axis=1)

    # --> Max-normalization to avoid overflow (softmax is invariant
    #     to subtracting a constant from the logits).
    x -= np.max(x)

    return softmax(x, axis=1)
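# --> Illustrative example (an addition to the gist): a row of two large,
#     equal logits maps to probabilities over three classes, the implicit
#     reference class receiving logit 0, and no overflow occurs.
# >>> SoftMax(np.array([[1000.0, 1000.0]]))
# array([[0.5, 0.5, 0. ]])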
def cross_entropy(W, X, y, epsilon=1e-10):
    """
    Implementation of the categorical cross-entropy. The small
    epsilon guards against taking the log of zero.
    """
    # --> Evaluate model.
    ypred = SoftMax(X @ W)

    # --> Compute the cross-entropy.
    return -np.mean(y * np.log(ypred + epsilon))
def cross_entropy_gradient(W, X, y):
    """
    Evaluate the gradient of the cross-entropy function.
    """
    # --> Number of training examples.
    m = X.shape[0]

    # --> Evaluate model.
    ypred = SoftMax(X @ W)

    # --> Compute the gradient. The last column is dropped since the
    #     weights of the reference class are fixed to zero.
    gradient = X.T @ (ypred[:, :-1] - y[:, :-1]) / m

    return gradient
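# --> Illustrative sanity check (an addition, not part of the original gist):
#     the analytical gradient above can be verified against a central
#     finite-difference estimate of `cross_entropy`. Note the scaling
#     subtlety: `cross_entropy` averages over all m*(k+1) entries while the
#     gradient divides by m only, so the numerical estimate must be
#     multiplied by (k+1) to match.
def check_gradient(W, X, y, h=1e-6):
    """
    Return the norm of the difference between the analytical gradient
    and a central finite-difference estimate (should be close to zero).
    """
    k = W.shape[1]
    num = np.zeros_like(W)
    for idx in np.ndindex(*W.shape):
        # --> Perturb one entry of W at a time.
        dW = np.zeros_like(W)
        dW[idx] = h
        num[idx] = (cross_entropy(W + dW, X, y) - cross_entropy(W - dW, X, y)) / (2*h)
    return norm(cross_entropy_gradient(W, X, y) - (k+1)*num)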
class SoftMax_regression(BaseEstimator, ClassifierMixin):
    """
    Implementation of the SoftMax regression. Minimization is performed
    using gradient descent with an adaptive learning rate obtained from
    an inexact line search. Note that, for the sake of simplicity, a unit
    (bias) column is prepended to X internally by fit, so the first row
    of the weights matrix plays the role of the bias.
    """

    def __init__(self, maxiter=100000, tol=1e-6):
        # --> Maximum number of iterations.
        self.maxiter = maxiter

        # --> Tolerance for the optimizer.
        self.tol = tol
    def fit(self, X, y):
        """
        Implementation of the gradient descent with inexact line search
        for the learning rate performed by Brent's method.

        INPUT
        -----
        X : numpy 2D array. Each row corresponds to one training example.

        y : numpy 2D array. One-hot encoded vector of labels.

        OUTPUT
        ------
        self : The trained SoftMax regression model.
        """
        # --> Add the bias.
        X = np.insert(X, 0, 1, axis=1)

        # --> Number of examples and features.
        m, n = X.shape

        # --> Number of classes - 1.
        k = y.shape[1] - 1

        # --> Initialize the weights matrix.
        W = np.zeros((n, k))

        # --> Maximum value for the learning rate derived from the Lipschitz
        #     constant of the categorical cross-entropy function.
        amax = (k+1)*m / (k*norm(X))
        # --> Training using gradient descent and quasi-optimal stepsize.
        for _ in range(self.maxiter):
            # --> Compute the gradient.
            p = cross_entropy_gradient(W, X, y)

            # --> Check for convergence.
            if norm(p)**2 < self.tol:
                break

            # --> Compute a quasi-optimal stepsize using 1D minimization.
            res = minimize_scalar(
                lambda alpha: cross_entropy(W - alpha*p, X, y),
                bracket=(0, amax),
                method="brent",
                tol=0.1*self.tol*amax
            )

            # --> Worst-case scenario: Brent's method failed (or returned a
            #     negative stepsize). Fall back to half the maximum stepsize
            #     given by the Lipschitz constant.
            if res["success"] and (res["x"] > 0):
                alpha = res["x"]
            else:
                alpha = 0.5*amax

            # --> Update the weights.
            W -= alpha*p

        # --> Final weights and biases.
        self.bias, self.W = W[0, :], W[1:, :]

        return self
    def predict_proba(self, X):
        return SoftMax(X @ self.W + self.bias)

    def predict(self, X):
        return self.predict_proba(X).argmax(axis=1)
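A minimal usage sketch (an addition, not part of the original gist): fit expects one-hot encoded targets, so the snippet below wires the model to scikit-learn's OneHotEncoder on the Iris dataset. The variable names are illustrative.

# --> Usage sketch: train on Iris and report the training accuracy.
#     Assumes scikit-learn is installed.
if __name__ == "__main__":
    from sklearn.datasets import load_iris
    from sklearn.preprocessing import OneHotEncoder

    # --> Load a small multiclass dataset.
    X, labels = load_iris(return_X_y=True)

    # --> One-hot encode the targets (use sparse=False instead of
    #     sparse_output=False on scikit-learn < 1.2).
    y = OneHotEncoder(sparse_output=False).fit_transform(labels.reshape(-1, 1))

    # --> Train the model and evaluate it on the training set.
    clf = SoftMax_regression(maxiter=10000, tol=1e-8).fit(X, y)
    print("Training accuracy:", (clf.predict(X) == labels).mean())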