jacoblevine/correspondence_analysis.py

## correspondence_analysis.py
import numpy as np
def correspondence_analysis(x):
    """Correspondence Analysis on frequency table x
    as described in Härdle & Simar (2012) "Applied Multivariate Statistical Analysis"

    Parameters
    ----------
    x : np.ndarray
        n-by-m table of counts or frequencies of m categories in n samples

    Returns
    -------
    r : np.ndarray
        n-by-k array of row (sample) projections. k = min(n, m)
    s : np.ndarray
        m-by-k array of column (category) projections
    percent_explained : np.ndarray
        length k array of cumulative percent explained by each CA component
    """
    # Prepare chi-squared matrix
    a, b = x.sum(axis=1), x.sum(axis=0)  # row sums, column sums
    e = np.outer(a, b) / x.sum()  # expected value of each entry in x is the normalized outer product of row and column sums
    cc = (x - e) / e ** .5  # the chi-squared matrix

    # SVD of the chi-squared matrix
    # columns of g are eigenvectors of cc*cc.T
    # rows of d are eigenvectors of cc.T*cc
    g, l, d = np.linalg.svd(cc)
    k = len(l)
    r = l * g[:, :k] / np.sqrt(a).reshape(-1, 1)  # row projections
    s = l * d[:k].T / np.sqrt(b).reshape(-1, 1)  # column projections
    percent_explained = (l ** 2).cumsum() / (l ** 2).sum()
    return r, s, percent_explained
	import numpy as np
	def correspondence_analysis(x):
	"""Correspondence Analysis on frequency table x
	as described in Härdle & Simar (2012) "Applied Multivariate Statistical Analysis"

	Parameters
	----------
	x : np.ndarray
	n-by-m table of counts or frequencies of m categories in n samples

	Returns
	-------
	r : np.ndarray
	n-by-k array of row (sample) projections. k = min(n, m)
	s : np.ndarray
	m-by-k array of column (category) projections
	percent_explained : np.ndarray
	length k array of cumulative percent explained by each CA component
	"""
	# Prepare chi-squared matrix
	a, b = x.sum(axis=1), x.sum(axis=0) # row sums, column sums
	e = np.outer(a, b) / x.sum() # expected value of each entry in x is the normalized outer product of row and column sums
	cc = (x - e) / e ** .5 # the chi-squared matrix

	# SVD of the chi-squared matrix
	# columns of g are eigenvectors of cc*cc.T
	# rows of d are eigenvectors of cc.T*cc
	g, l, d = np.linalg.svd(cc)
	k = len(l)
	r = l * g[:, :k] / np.sqrt(a).reshape(-1, 1) # row projections
	s = l * d[:k].T / np.sqrt(b).reshape(-1, 1) # column projections
	percent_explained = (l 2).cumsum() / (l 2).sum()
	return r, s, percent_explained