@JanSellner
Created May 2, 2021 20:34
Fisher's Linear Discriminant Analysis (LDA)
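The code below computes the classic Fisher discriminant directions: it maximizes the ratio of between-class to within-class scatter, which leads to the generalized eigenvalue problem (see the references linked in the docstring)

    J(w) = \frac{w^\top S_B\, w}{w^\top S_W\, w}, \qquad S_W^{-1} S_B\, w = \lambda w

The eigenvectors of S_W^{-1} S_B (computed with the pseudo-inverse of S_W for numerical stability) are the discriminant axes, and the corresponding eigenvalues measure how much class separation each axis captures.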
import numpy as np

def LDA(data: np.ndarray, labels: np.ndarray) -> tuple[np.ndarray, np.ndarray, np.ndarray]:
    """
    Performs Fisher's Linear Discriminant Analysis. See https://en.wikipedia.org/wiki/Linear_discriminant_analysis#Fisher's_linear_discriminant and http://www.facweb.iitkgp.ac.in/~sudeshna/courses/ml08/lda.pdf for descriptions of the method.

    >>> data = np.array([  # First class
    ...     [1, 2],
    ...     [1.5, 2.7],
    ...     [2.2, 2],
    ...     [2.5, 3],
    ...     [3.5, 3.1],
    ...     [4, 3.5],
    ...     # Second class
    ...     [2.5, 4.5],
    ...     [3.2, 4.8],
    ...     [5, 5.2],
    ...     [5.5, 5],
    ...     [5.9, 5.2],
    ...     [6.6, 6],
    ... ])
    >>> labels = np.array([0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1])
    >>> coeff, proj, latent = LDA(data, labels)
    >>> latent  # Only the first axis is relevant when using two classes
    array([42.4619955, 0. ])
    >>> latent / np.sum(latent)  # Explained discrimination
    array([1., 0.])
    >>> coeff[:, 0]  # First eigenvector
    array([ 0.24558339, -0.96937547])
    >>> proj[0, 0]  # Projection of the first data point
    -1.6931675538135342
    >>> np.matmul([[1, 2], [1.5, 1.5]], coeff[:, 0])  # The eigenvectors are stored in the columns of coeff (here only the first column is meaningful)
    array([-1.69316755, -1.08568813])

    Args:
        data: matrix containing all data points (observations in rows, variables in columns) [nObservations x nVariables].
        labels: vector with the class labels [nObservations].

    Returns:
        coeff: matrix with the eigenvectors as columns [nVariables x nVariables].
        proj: projection of the data points onto the new axes [nObservations x nVariables].
        latent: vector with the eigenvalues [nVariables].
    """
    assert data.shape[0] == len(labels), 'Each data point (= row of the data matrix) must have an associated label'
    assert len(np.unique(labels)) >= 2, 'At least two classes are required'

    # Gather data
    nFeatures = data.shape[1]
    withinScatter = np.zeros((nFeatures, nFeatures))
    betweenScatter = np.zeros((nFeatures, nFeatures))
    overallMean = np.mean(data, axis=0)

    # Iterate over all classes to calculate the combined scatter matrices
    for c in np.unique(labels):
        dataClass = data[labels == c, :]

        # Variance to the common mean value
        meanClass = np.mean(dataClass, axis=0)
        meanDiff = meanClass - overallMean
        betweenScatter = betweenScatter + dataClass.shape[0] * np.outer(meanDiff, meanDiff)

        # Variance in each class
        covClass = np.cov(dataClass, rowvar=False)
        withinScatter = withinScatter + covClass

    # Perform LDA
    scatter = np.matmul(np.linalg.pinv(withinScatter), betweenScatter)  # Using the pseudo-inverse matrix gives stabler results

    # Sort the eigenvalues descendingly (https://stackoverflow.com/questions/8092920/sort-eigenvalues-and-associated-eigenvectors-after-using-numpy-linalg-eig-in-pyt)
    eigVals, eigVecs = np.linalg.eig(scatter)
    idx = eigVals.argsort()[::-1]
    eigVals = eigVals[idx]
    eigVecs = eigVecs[:, idx]

    return eigVecs, np.matmul(data, eigVecs), eigVals
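
A minimal usage sketch of how the returned values might be used for classification; the query point and the nearest-projected-mean decision rule are illustrative choices, not part of the LDA function itself.

# Minimal usage sketch (illustrative only): assumes the LDA function and the
# "import numpy as np" from above are in scope.
data = np.array([[1, 2], [1.5, 2.7], [2.2, 2], [2.5, 3], [3.5, 3.1], [4, 3.5],      # first class
                 [2.5, 4.5], [3.2, 4.8], [5, 5.2], [5.5, 5], [5.9, 5.2], [6.6, 6]])  # second class
labels = np.array([0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1])

coeff, proj, latent = LDA(data, labels)
scores = proj[:, 0]  # 1-d scores on the first (and here only meaningful) discriminant axis

# Nearest-projected-mean rule: assign a new point to the class whose projected mean is closer
mean0 = scores[labels == 0].mean()
mean1 = scores[labels == 1].mean()

new_point = np.array([3.0, 4.0])  # hypothetical query point
score = new_point @ coeff[:, 0]   # project the query point onto the first discriminant axis
predicted = 0 if abs(score - mean0) < abs(score - mean1) else 1
print(f'score = {score:.3f}, predicted class = {predicted}')

Because the sign of an eigenvector is arbitrary, the rule compares distances to the projected class means rather than thresholding the raw score, which keeps it independent of the sign convention returned by np.linalg.eig.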