agramfort/binPCA.py

## binPCA.py
"""
Author: Oliver Mitevski

References:
A Generalized Linear Model for Principal Component Analysis of Binary Data,
Andrew I. Schein; Lawrence K. Saul; Lyle H. Ungar

The code was translated and adapted from Jakob Verbeek's
"Hidden Markov models and mixtures for Binary PCA" implementation in MATLAB

"""
import sys
import numpy as np
from scipy import linalg
# from scipy import *
# from numpy import *
# from scipy.linalg import diagsvd, svd
# from scipy.sparse import linalg
# import cPickle, gzip
import time
import pylab as pl


def bpca(X, L=2, max_iters=30):
    X = np.asanyarray(X, dtype=np.float)
    N, D = X.shape

    # x = X
    X = 2*X - 1

    delta = np.random.permutation(N)
    Delta = X[delta[0], :] / 100
    U = 1e-4 * np.random.random( (N, L) )
    V = 1e-4 * np.random.random( (L, D) )
    #for c=1:C; Th(:,:,c) = U(:,:,c)*V(:,:,c) + ones(N,1)*Delta(c,:); end;

    Th = np.zeros((N, D))
    Th = np.dot(U, V) + Delta[None,:] # with broadcasting

    for iter in range(max_iters):
        print iter
        # Update U
        T = np.tanh(Th/2) / Th

        B = np.dot(V, (X - T*Delta[None,:]).T)

        for n in range(N):
            A = np.dot(V * T[n,:][None,:], V.T)
            U[n,:] = linalg.solve(A, B[:,n]).T

        Th = np.dot(U, V) + Delta[None,:] # with broadcasting

        Q = np.random.random(N)
        # normalize it
        # Q = np.sqrt(np.dot(Q, Q.conj())) # XXX weird Q is a float
        Q = linalg.norm(Q) # gives the same result

        # Update V
        T = np.tanh(Th/2) / Th
        U2 = np.c_[U, np.ones((N,1))]
        # U2 = U2 * np.tile(Q, (L+1, 1)).T
        U2 *= Q # XXX : equivalent as above if Q is a float
        B = np.dot(U2.T, X)

        Uo = np.c_[U, np.ones((N, 1))]
        for d in range(D):
            A = np.dot(U2.T * T[:, d][None,:], Uo)
            V2 = linalg.solve(A, B[:, d])
            Delta[d] = V2[-1]

            if L > 0:
                V[:, d] = V2[0:-1]

        Th = np.dot(U, V) + Delta[None,:] # with broadcasting

    print U.shape

    # plotM1(U[0:10000:1,0:2], labels[0:10000:10])

    U1, S, V = linalg.svd(U)
    Vh = V.T
    U1 = np.mat(U1[:, 0:L])
    # Vh = np.mat(Vh[0:L, :])
    Vh = Vh[0:L, :]
    codes = np.dot(U, Vh.T)

    return codes


def main():
    # Load the dataset
    from save_load import save_file, load_file
    try:
        input_data = load_file(sys.argv[1])
        sparse = False
    except:
        print 'loading sparse'
        sparse = True
        from scipy.io import mmio as mm
        input_data = mm.mmread(sys.argv[1])
        input_data = np.array(input_data.todense())

    N, D = input_data.shape
    print N, D

# make data binary
#     for i in range(N):
#          for j in range(D):
#              if input_data[i,j] > 0.0:
#                  input_data[i,j] = 1.0
#      save_file('news-10-stemmed.index2200/binDict', input_data)
#     print 'saved binary'

    X = input_data

    labels = load_file(sys.argv[2])

    start = time.clock()

    codes = bpca(X, L=2, max_iters = 20)

    print "Time for computing binary PCA took", time.clock()-start

    save_file(sys.argv[3], codes)


if __name__ == "__main__":
    n_samples, n_features = 10, 30
    np.random.seed(0)
    X = np.random.randn(n_samples, n_features)
    codes = bpca(X, L=2, max_iters=20)
    print codes
    # main()
	"""
	Author: Oliver Mitevski

	References:
	A Generalized Linear Model for Principal Component Analysis of Binary Data,
	Andrew I. Schein; Lawrence K. Saul; Lyle H. Ungar

	The code was translated and adapted from Jakob Verbeek's
	"Hidden Markov models and mixtures for Binary PCA" implementation in MATLAB

	"""
	import sys
	import numpy as np
	from scipy import linalg
	# from scipy import *
	# from numpy import *
	# from scipy.linalg import diagsvd, svd
	# from scipy.sparse import linalg
	# import cPickle, gzip
	import time
	import pylab as pl


	def bpca(X, L=2, max_iters=30):
	X = np.asanyarray(X, dtype=np.float)
	N, D = X.shape

	# x = X
	X = 2*X - 1

	delta = np.random.permutation(N)
	Delta = X[delta[0], :] / 100
	U = 1e-4 * np.random.random( (N, L) )
	V = 1e-4 * np.random.random( (L, D) )
	#for c=1:C; Th(:,:,c) = U(:,:,c)V(:,:,c) + ones(N,1)Delta(c,:); end;

	Th = np.zeros((N, D))
	Th = np.dot(U, V) + Delta[None,:] # with broadcasting

	for iter in range(max_iters):
	print iter
	# Update U
	T = np.tanh(Th/2) / Th

	B = np.dot(V, (X - T*Delta[None,:]).T)

	for n in range(N):
	A = np.dot(V * T[n,:][None,:], V.T)
	U[n,:] = linalg.solve(A, B[:,n]).T

	Th = np.dot(U, V) + Delta[None,:] # with broadcasting

	Q = np.random.random(N)
	# normalize it
	# Q = np.sqrt(np.dot(Q, Q.conj())) # XXX weird Q is a float
	Q = linalg.norm(Q) # gives the same result

	# Update V
	T = np.tanh(Th/2) / Th
	U2 = np.c_[U, np.ones((N,1))]
	# U2 = U2 * np.tile(Q, (L+1, 1)).T
	U2 *= Q # XXX : equivalent as above if Q is a float
	B = np.dot(U2.T, X)

	Uo = np.c_[U, np.ones((N, 1))]
	for d in range(D):
	A = np.dot(U2.T * T[:, d][None,:], Uo)
	V2 = linalg.solve(A, B[:, d])
	Delta[d] = V2[-1]

	if L > 0:
	V[:, d] = V2[0:-1]

	Th = np.dot(U, V) + Delta[None,:] # with broadcasting

	print U.shape

	# plotM1(U[0:10000:1,0:2], labels[0:10000:10])

	U1, S, V = linalg.svd(U)
	Vh = V.T
	U1 = np.mat(U1[:, 0:L])
	# Vh = np.mat(Vh[0:L, :])
	Vh = Vh[0:L, :]
	codes = np.dot(U, Vh.T)

	return codes


	def main():
	# Load the dataset
	from save_load import save_file, load_file
	try:
	input_data = load_file(sys.argv[1])
	sparse = False
	except:
	print 'loading sparse'
	sparse = True
	from scipy.io import mmio as mm
	input_data = mm.mmread(sys.argv[1])
	input_data = np.array(input_data.todense())

	N, D = input_data.shape
	print N, D

	# make data binary
	# for i in range(N):
	# for j in range(D):
	# if input_data[i,j] > 0.0:
	# input_data[i,j] = 1.0
	# save_file('news-10-stemmed.index2200/binDict', input_data)
	# print 'saved binary'

	X = input_data

	labels = load_file(sys.argv[2])

	start = time.clock()

	codes = bpca(X, L=2, max_iters = 20)

	print "Time for computing binary PCA took", time.clock()-start

	save_file(sys.argv[3], codes)


	if __name__ == "__main__":
	n_samples, n_features = 10, 30
	np.random.seed(0)
	X = np.random.randn(n_samples, n_features)
	codes = bpca(X, L=2, max_iters=20)
	print codes
	# main()