-
-
Save agramfort/726574 to your computer and use it in GitHub Desktop.
""" | |
Author: Oliver Mitevski | |
References: | |
A Generalized Linear Model for Principal Component Analysis of Binary Data, | |
Andrew I. Schein; Lawrence K. Saul; Lyle H. Ungar | |
The code was translated and adapted from Jakob Verbeek's | |
"Hidden Markov models and mixtures for Binary PCA" implementation in MATLAB | |
""" | |
import sys | |
import numpy as np | |
from scipy import linalg | |
# from scipy import * | |
# from numpy import * | |
# from scipy.linalg import diagsvd, svd | |
# from scipy.sparse import linalg | |
# import cPickle, gzip | |
import time | |
import pylab as pl | |
def bpca(X, L=2, max_iters=30): | |
X = np.asanyarray(X, dtype=np.float) | |
N, D = X.shape | |
# x = X | |
X = 2*X - 1 | |
delta = np.random.permutation(N) | |
Delta = X[delta[0], :] / 100 | |
U = 1e-4 * np.random.random( (N, L) ) | |
V = 1e-4 * np.random.random( (L, D) ) | |
#for c=1:C; Th(:,:,c) = U(:,:,c)*V(:,:,c) + ones(N,1)*Delta(c,:); end; | |
Th = np.zeros((N, D)) | |
Th = np.dot(U, V) + Delta[None,:] # with broadcasting | |
for iter in range(max_iters): | |
print iter | |
# Update U | |
T = np.tanh(Th/2) / Th | |
B = np.dot(V, (X - T*Delta[None,:]).T) | |
for n in range(N): | |
A = np.dot(V * T[n,:][None,:], V.T) | |
U[n,:] = linalg.solve(A, B[:,n]).T | |
Th = np.dot(U, V) + Delta[None,:] # with broadcasting | |
Q = np.random.random(N) | |
# normalize it | |
# Q = np.sqrt(np.dot(Q, Q.conj())) # XXX weird Q is a float | |
Q = linalg.norm(Q) # gives the same result | |
# Update V | |
T = np.tanh(Th/2) / Th | |
U2 = np.c_[U, np.ones((N,1))] | |
# U2 = U2 * np.tile(Q, (L+1, 1)).T | |
U2 *= Q # XXX : equivalent as above if Q is a float | |
B = np.dot(U2.T, X) | |
Uo = np.c_[U, np.ones((N, 1))] | |
for d in range(D): | |
A = np.dot(U2.T * T[:, d][None,:], Uo) | |
V2 = linalg.solve(A, B[:, d]) | |
Delta[d] = V2[-1] | |
if L > 0: | |
V[:, d] = V2[0:-1] | |
Th = np.dot(U, V) + Delta[None,:] # with broadcasting | |
print U.shape | |
# plotM1(U[0:10000:1,0:2], labels[0:10000:10]) | |
U1, S, V = linalg.svd(U) | |
Vh = V.T | |
U1 = np.mat(U1[:, 0:L]) | |
# Vh = np.mat(Vh[0:L, :]) | |
Vh = Vh[0:L, :] | |
codes = np.dot(U, Vh.T) | |
return codes | |
def main(): | |
# Load the dataset | |
from save_load import save_file, load_file | |
try: | |
input_data = load_file(sys.argv[1]) | |
sparse = False | |
except: | |
print 'loading sparse' | |
sparse = True | |
from scipy.io import mmio as mm | |
input_data = mm.mmread(sys.argv[1]) | |
input_data = np.array(input_data.todense()) | |
N, D = input_data.shape | |
print N, D | |
# make data binary | |
# for i in range(N): | |
# for j in range(D): | |
# if input_data[i,j] > 0.0: | |
# input_data[i,j] = 1.0 | |
# save_file('news-10-stemmed.index2200/binDict', input_data) | |
# print 'saved binary' | |
X = input_data | |
labels = load_file(sys.argv[2]) | |
start = time.clock() | |
codes = bpca(X, L=2, max_iters = 20) | |
print "Time for computing binary PCA took", time.clock()-start | |
save_file(sys.argv[3], codes) | |
if __name__ == "__main__": | |
n_samples, n_features = 10, 30 | |
np.random.seed(0) | |
X = np.random.randn(n_samples, n_features) | |
codes = bpca(X, L=2, max_iters=20) | |
print codes | |
# main() |
great. Double check that I did not introduce any bug.
could you put this code in the same API as PCA in the scikit-learn? with fit and transform? It's not clear from your code what are the components estimated and how you would transform a new dataset ie. compute the PCA scores aka codes on a new X. Do you have access to something similar as the explained variance of each component?
It seems to work. I know what you mean, but it's not clear to me either how the transform could be done, I should go back to the paper. If this method turns to be non-parametric, then it's not so great. I kinda lost interest since it performs better for non-binary data than for binary.
good if it (sill) works. I wanted to use it to find latent components for a binary matrix. How did you evaluate the performance?
just by visualization of the labels of the 20-newsgroups. They seemed kind of clustered, so it's working.
that's a fairly empiric validation :) I would have expected "it reproduces the results form the paper" :)
maybe it should be tested on different datasets. The one I used (tf-idf feature vectors) may not be the most suitable. Nevertheless it didn't fail, so there's hope ;)
thanks perfect!
I'll give it a try soon.