-
-
Save agramfort/726574 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
Author: Oliver Mitevski | |
References: | |
A Generalized Linear Model for Principal Component Analysis of Binary Data, | |
Andrew I. Schein; Lawrence K. Saul; Lyle H. Ungar | |
The code was translated and adapted from Jakob Verbeek's | |
"Hidden Markov models and mixtures for Binary PCA" implementation in MATLAB | |
""" | |
import sys | |
import numpy as np | |
from scipy import linalg | |
# from scipy import * | |
# from numpy import * | |
# from scipy.linalg import diagsvd, svd | |
# from scipy.sparse import linalg | |
# import cPickle, gzip | |
import time | |
import pylab as pl | |
def bpca(X, L=2, max_iters=30): | |
X = np.asanyarray(X, dtype=np.float) | |
N, D = X.shape | |
# x = X | |
X = 2*X - 1 | |
delta = np.random.permutation(N) | |
Delta = X[delta[0], :] / 100 | |
U = 1e-4 * np.random.random( (N, L) ) | |
V = 1e-4 * np.random.random( (L, D) ) | |
#for c=1:C; Th(:,:,c) = U(:,:,c)*V(:,:,c) + ones(N,1)*Delta(c,:); end; | |
Th = np.zeros((N, D)) | |
Th = np.dot(U, V) + Delta[None,:] # with broadcasting | |
for iter in range(max_iters): | |
print iter | |
# Update U | |
T = np.tanh(Th/2) / Th | |
B = np.dot(V, (X - T*Delta[None,:]).T) | |
for n in range(N): | |
A = np.dot(V * T[n,:][None,:], V.T) | |
U[n,:] = linalg.solve(A, B[:,n]).T | |
Th = np.dot(U, V) + Delta[None,:] # with broadcasting | |
Q = np.random.random(N) | |
# normalize it | |
# Q = np.sqrt(np.dot(Q, Q.conj())) # XXX weird Q is a float | |
Q = linalg.norm(Q) # gives the same result | |
# Update V | |
T = np.tanh(Th/2) / Th | |
U2 = np.c_[U, np.ones((N,1))] | |
# U2 = U2 * np.tile(Q, (L+1, 1)).T | |
U2 *= Q # XXX : equivalent as above if Q is a float | |
B = np.dot(U2.T, X) | |
Uo = np.c_[U, np.ones((N, 1))] | |
for d in range(D): | |
A = np.dot(U2.T * T[:, d][None,:], Uo) | |
V2 = linalg.solve(A, B[:, d]) | |
Delta[d] = V2[-1] | |
if L > 0: | |
V[:, d] = V2[0:-1] | |
Th = np.dot(U, V) + Delta[None,:] # with broadcasting | |
print U.shape | |
# plotM1(U[0:10000:1,0:2], labels[0:10000:10]) | |
U1, S, V = linalg.svd(U) | |
Vh = V.T | |
U1 = np.mat(U1[:, 0:L]) | |
# Vh = np.mat(Vh[0:L, :]) | |
Vh = Vh[0:L, :] | |
codes = np.dot(U, Vh.T) | |
return codes | |
def main(): | |
# Load the dataset | |
from save_load import save_file, load_file | |
try: | |
input_data = load_file(sys.argv[1]) | |
sparse = False | |
except: | |
print 'loading sparse' | |
sparse = True | |
from scipy.io import mmio as mm | |
input_data = mm.mmread(sys.argv[1]) | |
input_data = np.array(input_data.todense()) | |
N, D = input_data.shape | |
print N, D | |
# make data binary | |
# for i in range(N): | |
# for j in range(D): | |
# if input_data[i,j] > 0.0: | |
# input_data[i,j] = 1.0 | |
# save_file('news-10-stemmed.index2200/binDict', input_data) | |
# print 'saved binary' | |
X = input_data | |
labels = load_file(sys.argv[2]) | |
start = time.clock() | |
codes = bpca(X, L=2, max_iters = 20) | |
print "Time for computing binary PCA took", time.clock()-start | |
save_file(sys.argv[3], codes) | |
if __name__ == "__main__": | |
n_samples, n_features = 10, 30 | |
np.random.seed(0) | |
X = np.random.randn(n_samples, n_features) | |
codes = bpca(X, L=2, max_iters=20) | |
print codes | |
# main() |
good if it (sill) works. I wanted to use it to find latent components for a binary matrix. How did you evaluate the performance?
just by visualization of the labels of the 20-newsgroups. They seemed kind of clustered, so it's working.
that's a fairly empiric validation :) I would have expected "it reproduces the results form the paper" :)
maybe it should be tested on different datasets. The one I used (tf-idf feature vectors) may not be the most suitable. Nevertheless it didn't fail, so there's hope ;)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
It seems to work. I know what you mean, but it's not clear to me either how the transform could be done, I should go back to the paper. If this method turns to be non-parametric, then it's not so great. I kinda lost interest since it performs better for non-binary data than for binary.