@tansey · Last active June 1, 2020
Quick and dirty binary matrix factorization via alternating logistic regression
import numpy as np
from sklearn.linear_model import LogisticRegression
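
# The code below calls ilogit() and cross_entropy(), which are not defined in
# this view of the gist. The following are minimal sketches, assuming the
# standard inverse-logit (sigmoid) and the mean binary cross-entropy computed
# over the observed (non-NaN) entries.
def ilogit(x):
    return 1. / (1. + np.exp(-x))

def cross_entropy(y, mu, eps=1e-12):
    # Mean binary cross-entropy, ignoring missing (NaN) entries
    mask = ~np.isnan(y)
    y, mu = y[mask], np.clip(mu[mask], eps, 1 - eps)
    return -np.mean(y * np.log(mu) + (1 - y) * np.log(1 - mu))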
def binary_mf(Y, nembeds, lam=None, lams=30, cv=5, max_steps=30, tol=1e-4, verbose=False):
    # If lams is an integer count, convert it to a log-spaced grid of penalties
    if lam is None and isinstance(lams, int):
        lams = np.exp(np.linspace(np.log(1e-2), np.log(1), lams))

    # If no lambda is provided, select one via K-fold cross-validation
    if lam is None:
        from sklearn.model_selection import KFold

        # Cross-validate over the observed (non-missing) entries
        cv_scores = np.zeros((len(lams), cv))
        indices = np.array([[i, j] for i, j in np.ndindex(Y.shape) if not np.isnan(Y[i, j])])
        kf = KFold(n_splits=cv, shuffle=True)
        for cv_idx, (train_index, test_index) in enumerate(kf.split(indices)):
            if verbose:
                print('Fold {}/{}'.format(cv_idx + 1, cv))
            for lam_idx, cur_lam in enumerate(lams):
                # Build the training data by masking out the held-out entries
                Y_train = np.copy(Y)
                for i, j in indices[test_index]:
                    Y_train[i, j] = np.nan

                # Fit the model at this penalty
                W, V = binary_mf(Y_train, nembeds, lam=cur_lam, verbose=verbose > 1)

                # Evaluate on the held-out indices
                Mu = ilogit(W.dot(V.T))
                Y_test = np.array([Y[i, j] for i, j in indices[test_index]])
                Mu_test = np.array([Mu[i, j] for i, j in indices[test_index]])
                cv_scores[lam_idx, cv_idx] = cross_entropy(Y_test, Mu_test)
                if verbose:
                    print('\tLam {}/{} ({:.4f}) loss: {:.6f}'.format(
                        lam_idx + 1, len(lams), cur_lam, cv_scores[lam_idx, cv_idx]))

        # Select the lambda with the lowest mean held-out loss, refit on all
        # the data, and return
        best_lam = lams[np.argmin(cv_scores.mean(axis=1))]
        if verbose:
            print('Best lam: {:.6f}'.format(best_lam))
        return binary_mf(Y, nembeds, lam=best_lam, verbose=verbose)

    # Initialize the embeddings
    W = np.random.normal(0, 1 / np.sqrt(nembeds), size=(Y.shape[0], nembeds))
    V = np.random.normal(0, 1 / np.sqrt(nembeds), size=(Y.shape[1], nembeds))

    # Set up the logistic regression solver; note that sklearn's C is the
    # *inverse* regularization strength, so larger lam means a weaker penalty
    clf = LogisticRegression(C=lam, fit_intercept=False, solver='lbfgs')

    # Measure the initial reconstruction loss on the observed entries
    prev_loss = cross_entropy(Y, ilogit(W.dot(V.T)))
    missing = np.isnan(Y)
    for step in range(max_steps):
        if verbose:
            print('Step {}/{}'.format(step + 1, max_steps))

        # W-step: hold V fixed and fit each row embedding via logistic regression
        for i in range(Y.shape[0]):
            clf.fit(V[~missing[i]], Y[i, ~missing[i]])
            W[i] = clf.coef_[0]

        # V-step: hold W fixed and fit each column embedding via logistic regression
        for i in range(Y.shape[1]):
            clf.fit(W[~missing[:, i]], Y[~missing[:, i], i])
            V[i] = clf.coef_[0]

        # Track progress and stop early once the improvement falls below tol
        loss = cross_entropy(Y, ilogit(W.dot(V.T)))
        if verbose:
            print('Loss: {:.6f}'.format(loss))
        if prev_loss - loss < tol:
            break
        prev_loss = loss
    return W, V
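
# ----------------------------------------------------------------------
# Example usage: a hypothetical smoke test, not part of the original gist.
# It simulates a low-rank binary matrix, hides 20% of the entries, and
# factorizes with a fixed penalty; the shapes, seed, and lam=1. here are
# illustrative assumptions. Note that LogisticRegression requires every
# row and column to have both classes among its observed entries.
if __name__ == '__main__':
    np.random.seed(0)
    n, m, k = 50, 40, 3
    W_true = np.random.normal(size=(n, k))
    V_true = np.random.normal(size=(m, k))
    P = ilogit(W_true.dot(V_true.T))
    Y_full = (np.random.uniform(size=(n, m)) < P).astype(float)

    # Hide 20% of the entries as missing
    hidden = np.random.uniform(size=Y_full.shape) < 0.2
    Y = Y_full.copy()
    Y[hidden] = np.nan

    # Fit with a fixed penalty (skips the cross-validation path)
    W, V = binary_mf(Y, nembeds=k, lam=1., verbose=True)

    # Check predictions on the hidden entries
    preds = ilogit(W.dot(V.T))[hidden] > 0.5
    print('Held-out accuracy: {:.3f}'.format(np.mean(preds == Y_full[hidden])))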