Skip to content

Instantly share code, notes, and snippets.

@agramfort
Forked from omitevski/binPCA.py
Created December 3, 2010 04:13
Show Gist options
  • Save agramfort/726574 to your computer and use it in GitHub Desktop.
Save agramfort/726574 to your computer and use it in GitHub Desktop.
"""
Author: Oliver Mitevski
References:
A Generalized Linear Model for Principal Component Analysis of Binary Data,
Andrew I. Schein; Lawrence K. Saul; Lyle H. Ungar
The code was translated and adapted from Jakob Verbeek's
"Hidden Markov models and mixtures for Binary PCA" implementation in MATLAB
"""
import sys
import numpy as np
from scipy import linalg
# from scipy import *
# from numpy import *
# from scipy.linalg import diagsvd, svd
# from scipy.sparse import linalg
# import cPickle, gzip
import time
import pylab as pl
def bpca(X, L=2, max_iters=30):
X = np.asanyarray(X, dtype=np.float)
N, D = X.shape
# x = X
X = 2*X - 1
delta = np.random.permutation(N)
Delta = X[delta[0], :] / 100
U = 1e-4 * np.random.random( (N, L) )
V = 1e-4 * np.random.random( (L, D) )
#for c=1:C; Th(:,:,c) = U(:,:,c)*V(:,:,c) + ones(N,1)*Delta(c,:); end;
Th = np.zeros((N, D))
Th = np.dot(U, V) + Delta[None,:] # with broadcasting
for iter in range(max_iters):
print iter
# Update U
T = np.tanh(Th/2) / Th
B = np.dot(V, (X - T*Delta[None,:]).T)
for n in range(N):
A = np.dot(V * T[n,:][None,:], V.T)
U[n,:] = linalg.solve(A, B[:,n]).T
Th = np.dot(U, V) + Delta[None,:] # with broadcasting
Q = np.random.random(N)
# normalize it
# Q = np.sqrt(np.dot(Q, Q.conj())) # XXX weird Q is a float
Q = linalg.norm(Q) # gives the same result
# Update V
T = np.tanh(Th/2) / Th
U2 = np.c_[U, np.ones((N,1))]
# U2 = U2 * np.tile(Q, (L+1, 1)).T
U2 *= Q # XXX : equivalent as above if Q is a float
B = np.dot(U2.T, X)
Uo = np.c_[U, np.ones((N, 1))]
for d in range(D):
A = np.dot(U2.T * T[:, d][None,:], Uo)
V2 = linalg.solve(A, B[:, d])
Delta[d] = V2[-1]
if L > 0:
V[:, d] = V2[0:-1]
Th = np.dot(U, V) + Delta[None,:] # with broadcasting
print U.shape
# plotM1(U[0:10000:1,0:2], labels[0:10000:10])
U1, S, V = linalg.svd(U)
Vh = V.T
U1 = np.mat(U1[:, 0:L])
# Vh = np.mat(Vh[0:L, :])
Vh = Vh[0:L, :]
codes = np.dot(U, Vh.T)
return codes
def main():
# Load the dataset
from save_load import save_file, load_file
try:
input_data = load_file(sys.argv[1])
sparse = False
except:
print 'loading sparse'
sparse = True
from scipy.io import mmio as mm
input_data = mm.mmread(sys.argv[1])
input_data = np.array(input_data.todense())
N, D = input_data.shape
print N, D
# make data binary
# for i in range(N):
# for j in range(D):
# if input_data[i,j] > 0.0:
# input_data[i,j] = 1.0
# save_file('news-10-stemmed.index2200/binDict', input_data)
# print 'saved binary'
X = input_data
labels = load_file(sys.argv[2])
start = time.clock()
codes = bpca(X, L=2, max_iters = 20)
print "Time for computing binary PCA took", time.clock()-start
save_file(sys.argv[3], codes)
if __name__ == "__main__":
n_samples, n_features = 10, 30
np.random.seed(0)
X = np.random.randn(n_samples, n_features)
codes = bpca(X, L=2, max_iters=20)
print codes
# main()
@omitevski
Copy link

It seems to work. I know what you mean, but it's not clear to me either how the transform could be done, I should go back to the paper. If this method turns to be non-parametric, then it's not so great. I kinda lost interest since it performs better for non-binary data than for binary.

@agramfort
Copy link
Author

good if it (sill) works. I wanted to use it to find latent components for a binary matrix. How did you evaluate the performance?

@omitevski
Copy link

just by visualization of the labels of the 20-newsgroups. They seemed kind of clustered, so it's working.

@agramfort
Copy link
Author

that's a fairly empiric validation :) I would have expected "it reproduces the results form the paper" :)

@omitevski
Copy link

maybe it should be tested on different datasets. The one I used (tf-idf feature vectors) may not be the most suitable. Nevertheless it didn't fail, so there's hope ;)

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment