function PCA(X)
## Y = PX
# required:
#- a matrix P such that the second-order correlations vanish,
#- i.e., Cov(Y) = D for some diagonal matrix D
# method:
#- project onto the orthonormal basis spanned by the eigenvectors
#- of Cov(X); the variances along these axes are the eigenvalues
# maths:
#- Aν_i = λ_i ν_i ... ν_i an eigenvector, λ_i the corresponding eigenvalue
#- AU = UD ... columns of U are the ν_i, diag(D) holds the λ_i
#- for symmetric matrices (e.g., covariance matrices), U' = inv(U)
#- thus A = UDU' ... the eigen-decomposition of a symmetric matrix
#- we zero-center the data, set A = Cov(X) = UDU' and P = U', and compute
#- Cov(Y) = Cov(U'X) = E[U'X(U'X)'] = U'E[XX']U = U'(UDU')U = D
#--- and we're done
# zero-center each variable (row)
X = X .- mean(X, 2)
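# get the eigenvalues D and the eigenvectors E (columns) of the covariance matrix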
D, E = eig(cov(X'))
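# project onto the eigenbasis: Cov(Y) = E'Cov(X)E = D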
Y = E'*X
# papers:
#- Shlens (2014), "A Tutorial on Principal Component Analysis": http://arxiv.org/pdf/1404.1100.pdf
end
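# A quick sanity check: a sketch, assuming the same pre-0.7 Julia dialect as the
# gist (eig, sqrtm, srand); all demo names below are made up. Correlated Gaussian
# data go in; the covariance of the projected data should come out
# (approximately) diagonal.
srand(1)                       # fix the RNG for reproducibility
Cdemo = [1.0 0.8; 0.8 1.0]     # a hypothetical 2x2 covariance matrix
Xdemo = sqrtm(Cdemo) * randn(2, 1000)
Ydemo = PCA(Xdemo)
cov(Ydemo')                    # off-diagonal entries ≈ 0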
function ICA(X)
## x = As
# required:
#- s = Wx ... a W such that we can (approximately) recover s [W = inv(A)]
# method:
#- assume the components of s are statistically independent
#- assume Cov(s) = I
# maths:
#- A = UDV' ... via a singular value decomposition
#- W = inv(A) = inv(UDV') = V inv(D) U'
#- Cov(x) = E[As(As)'] = A E[ss'] A' = A I A' = UDV'(UDV')'
#- = UDV'VDU' = UD^2U' = EDE' ... the eigen-decomposition of Cov(x), so E = U;
#- from here on D denotes the eigenvalues of Cov(x), the squares of A's singular values
#- thus W = VD^{-1/2}E', because Cov(Wx) = W Cov(x) W' = VD^{-1/2}E'(EDE')ED^{-1/2}V' = I
#- so W, the 'unmixing matrix', is determined up to the rotation V
#- let Xw = D^{-1/2}E'X, i.e., whiten the data (Cov(Xw) = I)
#- then s = VXw
#- new objective: find the V that minimizes the dependencies between the components of s
#- from information theory: V = argmin I[p(s)] ... where I[p(s)] is the multi-information
#- it is easy to show that I[p(s)] = ∑_k h[p(s_k)] - h[p(s)], i.e., the difference
#- between the sum of the marginal entropies and the joint entropy;
#- instead of minimizing this directly, however, we maximize kurtosis
#- (a fourth-order moment) below; note that kurtosis is not robust against outliers
# note:
#- reduces to PCA when the sources s are Gaussian
#- (for Gaussians, zero correlation already implies statistical independence,
#- so the rotation V is not identifiable)
d, m = size(X)
X = X .- mean(X, 2)
# get the eigenvalues (D) and eigenvectors (E) of the covariance matrix
D, E = eig(cov(X'))
D = diagm(D)
# whiten the data
Xw = sqrtm(inv(D)) * E' * X
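# at this point Cov(Xw) ≈ I up to sampling error, so only the rotation V is missing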
# maximize the kurtosis
V, s, u = svd( (repmat(sum(Xw .* Xw, 1), d, 1) .* Xw) * Xw' )
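# the decomposed matrix is ∑_i ||xw_i||^2 xw_i xw_i', a fourth-moment matrix of the
# whitened data; it is symmetric, so its left singular vectors are its eigenvectors,
# and projecting Xw onto them maximizes kurtosis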
# W is the inverse of A; the rows of the unmixing rotation are the eigenvectors, hence V'
W = V' * sqrtm(inv(D)) * E'
S = W * X
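# return the unmixing matrix and the recovered sources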
W, S
# papers:
#- Shlens (2014), "A Tutorial on Independent Component Analysis": http://arxiv.org/pdf/1404.2986.pdf
#- Hyvärinen & Oja (2000), "Independent Component Analysis: Algorithms and Applications":
#- http://www.stat.ucla.edu/~yuille/courses/Stat161-261-Spring14/HyvO00-icatut.pdf
end
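# A quick sanity check for ICA: a sketch under the same pre-0.7 Julia assumptions;
# the mixing matrix and all demo names are made up. Mix two independent uniform
# (sub-Gaussian, hence non-Gaussian) sources and try to recover them.
srand(1)
Strue = rand(2, 5000) .- 0.5   # independent, non-Gaussian sources
Amix = [1.0 0.5; 0.3 1.0]      # hypothetical mixing matrix
Wdemo, Shat = ICA(Amix * Strue)
cov(Shat')                     # ≈ I, i.e., the recovered sources are uncorrelated
# Shat matches Strue only up to permutation, sign, and scale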