""" Poisson-loss Factorization Machines with Numba
Follows the vanilla FM model from:
Steffen Rendle (2012): Factorization Machines with libFM.
In: ACM Trans. Intell. Syst. Technol., 3(3), May.
http://doi.acm.org/10.1145/2168752.2168771
See also: https://github.com/coreylynch/pyFM
"""
# Author: Vlad Niculae <vlad@vene.ro>
# License: Simplified BSD
from __future__ import print_function
import numpy as np
from numba import jit
from scipy import sparse as sp
from sklearn.utils import check_random_state, check_array, check_X_y
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import OneHotEncoder
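
# Overview (an added note summarizing what the code below implements): a
# second-order factorization machine scores an example x as
#     fm(x) = w_0 + sum_i w_i x_i + sum_{i<j} <v_i, v_j> x_i x_j,
# and the pairwise term is evaluated in O(n_latent * nnz(x)) time via
# Rendle's identity
#     sum_{i<j} <v_i, v_j> x_i x_j
#         = 1/2 sum_d [ (sum_i v_id x_i)^2 - sum_i v_id^2 x_i^2 ],
# which is exactly the loop structure used in _fm_predict below. With the
# Poisson loss, the predicted count is exp(fm(x)).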
@jit("f8(f8[:], i4[:], f8[:], f8[:], f8[:,:], f8[:])", nopython=True)
def _fm_predict(row_data, row_indices, scaling, w, V, Vx):
"""Factorization machine forward pass"""
pred = 0
for i in range(row_indices.shape[0]):
pred += w[row_indices[i]] * row_data[i]
pred *= scaling[1] # pred = wscale * dot(x, w/wscale)
pred += scaling[0] # add bias
second_order = 0
for d in range(V.shape[1]):
Vx[d] = 0
for i in range(row_indices.shape[0]):
v_id = V[row_indices[i], d]
Vx[d] += v_id * row_data[i]
second_order -= (v_id ** 2) * (row_data[i] ** 2)
second_order += Vx[d] ** 2
Vx[d] *= scaling[2] # Vx = Vscale * dot(V/Vscale, x),
second_order *= 0.5 * scaling[2] ** 2
# second_order = 1/2 Vscale^2 * [dot(V/Vscale, x)^2 - (V/Vscale)^2 * x^2]
pred += second_order
return pred
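
# A minimal dense NumPy reference for the forward pass (an added sketch, not
# part of the original gist): handy for sanity-checking _fm_predict on a
# single row, using the same scaling = (bias, w_scale, V_scale) convention.
def _fm_predict_dense_reference(x, scaling, w, V):
    """Unoptimized FM prediction for a dense 1-d array x."""
    # first-degree part, rescaled by w_scale and shifted by the bias
    pred = scaling[0] + scaling[1] * np.dot(w, x)
    # second-degree part via the same linear-time identity as _fm_predict
    Vx = scaling[2] * np.dot(V.T, x)
    pred += 0.5 * (np.sum(Vx ** 2)
                   - scaling[2] ** 2 * np.dot(np.sum(V ** 2, axis=1), x ** 2))
    return pred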
@jit("f8[:](f8[:], i4[:], i4[:], f8[:], f8[:], f8[:,:], i4)")
def _fm_predict_batch_fast(X_data, X_indices, X_indptr, scaling, w, V,
n_samples):
res = np.zeros(n_samples, dtype=np.double)
for i in range(n_samples):
for k in range(X_indptr[i], X_indptr[i + 1]):
res[i] += w[X_indices[k]] * X_data[k]
res[i] *= scaling[1]
res[i] += scaling[0]
second_order = 0
for d in range(V.shape[1]):
vx = 0
for k in range(X_indptr[i], X_indptr[i + 1]):
v_id = V[X_indices[k], d]
vx += v_id * X_data[k]
second_order -= (v_id ** 2) * (X_data[i] ** 2)
second_order += vx ** 2
second_order *= 0.5 * scaling[2] ** 2
res[i] += second_order
return res

def _fm_predict_batch(X, scaling, w, V):
    """Factorization machine prediction over a batch (CSR matrix)."""
    return _fm_predict_batch_fast(X.data, X.indices, X.indptr, scaling, w, V,
                                  X.shape[0])

@jit('void(f8[:], i4[:], f8, f8[:], f8[:], f8[:, :], f8, f8, f8)',
     nopython=True)
def _fm_poisson_step(data, indices, y, scaling, w, V, learning_rate, reg_w,
                     reg_V):
    Vx = np.empty(V.shape[1], dtype=np.double)  # byproduct of _fm_predict
    z = _fm_predict(data, indices, scaling, w, V, Vx)
    # loss: exp(z) - y * z + log(y!), with log(y!) = ln gamma(y + 1)
    dloss_dz = np.exp(z) - y
    # update bias:
    scaling[0] -= learning_rate * dloss_dz
    # update w: gradw_i = x_i
    for k in range(indices.shape[0]):
        w[indices[k]] -= learning_rate * dloss_dz * data[k] / scaling[1]
    # update V: d indexes latent dims, i=indices[k] indexes nonzero features
    # gradv_id = x_i * sum_j(v_jd x_j) - v_id * x_i^2
    for d in range(V.shape[1]):
        for k in range(indices.shape[0]):
            grad_v = data[k] * Vx[d] / scaling[2]
            grad_v -= V[indices[k], d] * data[k] ** 2
            V[indices[k], d] -= learning_rate * dloss_dz * grad_v
    # L2-regularize by shrinking the scale factors, without touching the data.
    # Inspired by the scikit-learn stochastic gradient descent approach.
    # If reg_* is too big, do not scale to a negative value: clip at zero.
    scaling[1] *= max(0, 1.0 - learning_rate * reg_w)
    scaling[2] *= max(0, 1.0 - learning_rate * reg_V)
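
# Gradient bookkeeping behind _fm_poisson_step (an added summary of the
# comments above, not new math): writing the per-sample Poisson negative
# log-likelihood as L(z) = exp(z) - y * z + log(y!), with
#     z = w_0 + <w, x> + 1/2 sum_d [(sum_i v_id x_i)^2 - sum_i v_id^2 x_i^2],
# the chain rule gives
#     dL/dz    = exp(z) - y
#     dz/dw_0  = 1
#     dz/dw_i  = x_i
#     dz/dv_id = x_i * (sum_j v_jd x_j) - v_id * x_i^2,
# and each stored parameter is updated by -learning_rate * dL/dz * dz/dparam,
# with the w_scale / V_scale factors divided out because w and V are stored
# pre-divided by their scales.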
@jit("void(f8[:], i4[:], i4[:], f8[:], f8[:], f8[:], f8[:, :], f8, f8, f8)",
nopython=True)
def _fm_poisson_epoch(X_data, X_indices, X_indptr, y, scaling, w, V,
learning_rate, reg_w, reg_V):
n_samples = y.shape[0]
for i in range(n_samples):
_fm_poisson_step(X_data[X_indptr[i]:X_indptr[i + 1]],
X_indices[X_indptr[i]:X_indptr[i + 1]],
y[i], scaling, w, V, learning_rate, reg_w, reg_V)
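
# Note on the regularization trick above (added explanation): the L2 penalty
# is applied by shrinking the stored w_scale and V_scale once per sample,
# instead of shrinking every coefficient, so each step touches only the
# nonzero features of the current row. w and V always hold the parameters
# divided by their current scales, and the scales are multiplied back in at
# prediction time (scaling[1] and scaling[2] in _fm_predict); the training
# loop below resets a scale to 1 and folds it into the weights if it
# underflows.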

def factorization_machine_poisson(X, y, X_val=None, y_val=None, n_latent=3,
                                  reg_w=1., reg_V=1., init_std=0.001,
                                  warm_start=None, learning_rate=0.01,
                                  n_iter=10, verbose=False, random_state=None):
    """Learns a factorization machine model with Poisson loss by SGD.

    The response is modeled as

        y = exp(intercept + <w, x> + sum_i sum_{j>i} <x_i * v_i, x_j * v_j>),

    where <., .> is the dot product, w is a vector of length n_features
    capturing first-degree effects, and V is an array of shape
    (n_features, n_latent) whose rows v_i parametrize the second-degree
    interactions. The Poisson loss is appropriate for count and rate outputs.

    Parameters
    ----------
    X: array-like or sparse, shape: (n_samples, n_features)
        Training data points.
    y: array, shape: (n_samples,)
        Target labels.
    X_val: array-like or sparse, shape: (n_samples_val, n_features), optional
        Validation data points. Used only if verbose != False.
    y_val: array, shape: (n_samples_val,), optional
        Validation labels. Used only if verbose != False.
    n_latent: int,
        Latent dimensionality of the second-degree model. Default: 3
    reg_w: float,
        Regularization amount for first-degree weights. Default: 1
    reg_V: float,
        Regularization amount for second-degree weights. Default: 1
    init_std: float,
        Initial scale of the weight values. (Used only if warm_start is None.)
        Weights are initialized from a centered gaussian with scale init_std.
        Default: 0.001
    warm_start: tuple (intercept, w, V) or None,
        Initial setting for the weights.
    learning_rate: float,
        Step size for stochastic gradient descent. Default: 0.01
    n_iter: int,
        Maximum number of passes over the training data. Default: 10
    verbose: int,
        If > 0, prints statistics every `verbose` iterations. Default: False.
    random_state: int, RandomState or None,
        Seed for the random number generator.

    Returns
    -------
    scaling: tuple (intercept, w_scale, V_scale),
        Scales of the learned parameters.
    w: array, shape (n_features,):
        First-degree coefficients, divided by w_scale (see scaling above).
    V: array, shape (n_features, n_latent):
        Embedded second-degree interaction weights, divided by V_scale.
    """
    X, y = check_X_y(X, y, accept_sparse='csr', dtype=np.double)
    y = y.astype(np.double)
    if not sp.issparse(X):  # force sparse
        X = sp.csr_matrix(X)
    if X_val is not None:
        X_val, y_val = check_X_y(X_val, y_val, accept_sparse='csr',
                                 dtype=np.double)
        if not sp.issparse(X_val):
            X_val = sp.csr_matrix(X_val)
    n_samples, n_features = X.shape
    rng = check_random_state(random_state)
    reg_w *= 0.5
    reg_V *= 0.5 * n_latent  # Unsure about the correct scaling

    # initialize
    if warm_start is not None:
        bias, w, V = warm_start
        scaling = np.array([bias, 1, 1], dtype=np.double)
    else:
        w = rng.randn(n_features)
        V = rng.randn(n_features, n_latent)
        scaling = np.array([0, init_std, init_std], dtype=np.double)

    for it in range(n_iter):
        if verbose and it % verbose == 0:
            pred_tr = _fm_predict_batch(X, scaling, w, V)
            pred_te = _fm_predict_batch(X_val, scaling, w, V)
            np.exp(pred_tr, out=pred_tr)
            np.exp(pred_te, out=pred_te)
            dloss = np.mean(y - pred_tr)
            print(
                "{:>3}: dloss={:<8.4f} MSE tr={:<4.2f}, valid={:4.2f}".format(
                    it,
                    dloss,
                    mean_squared_error(y, pred_tr),
                    mean_squared_error(y_val, pred_te)),
                end=" ")
        _fm_poisson_epoch(X.data, X.indices, X.indptr, y, scaling, w, V,
                          learning_rate, reg_w, reg_V)
        # rescale if needed
        if scaling[1] <= 1e-9:
            w *= scaling[1]
            scaling[1] = 1.0
        if scaling[2] <= 1e-9:
            V *= scaling[2]
            scaling[2] = 1.0
        if verbose and it % verbose == 0:
            print("w_0={:<6.2f} ||w||={:<6.2f} ||V||={:<6.2f}".format(
                scaling[0],
                np.linalg.norm(w * scaling[1]),
                np.linalg.norm(V.ravel() * scaling[2])))
    return scaling, w, V
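
# Convenience sketch (an added helper, not part of the original gist): turn
# the (scaling, w, V) triple returned above into Poisson-mean predictions,
# i.e. expected counts exp(fm(x)), for new data.
def fm_poisson_predict(X, scaling, w, V):
    """Predict expected counts for a dense or sparse data matrix X."""
    X = sp.csr_matrix(X, dtype=np.double)  # accepts dense or sparse input
    return np.exp(_fm_predict_batch(X, scaling, w, V))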

def array_to_fm_format(X):
    """Converts a dense or sparse array X to factorization machine format.

    If x[i, j] is represented (if X is sparse) or not nan (if X is dense),
    the output will have a row:

        [one_hot(i), one_hot(j)] -> x[i, j]

    Parameters
    ----------
    X: array-like or sparse
        Input array.

    Returns
    -------
    X_one_hot: sparse array, shape (n_x_entries, X.shape[0] + X.shape[1])
        Indices of non-empty values in X, in factorization machine format.
    y: array, shape (n_x_entries,)
        Non-empty values in X.
    """
    X = check_array(X, accept_sparse='coo', force_all_finite=False)
    n_rows, n_cols = X.shape
    encoder = OneHotEncoder(n_values=[n_rows, n_cols])
    if sp.issparse(X):
        y = X.data
        X_ix = np.column_stack([X.row, X.col])
    else:
        ix = np.isfinite(X)
        X_ix = np.column_stack(np.where(ix))
        y = X[ix].ravel()
    X_oh = encoder.fit_transform(X_ix)
    return X_oh, y
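
# Tiny worked example (added illustration): for a dense 2x2 input with one
# missing value,
#     X = [[1., nan],
#          [0., 2. ]],
# the finite entries are (0, 0) -> 1, (1, 0) -> 0 and (1, 1) -> 2, so
# array_to_fm_format returns a sparse X_one_hot with rows
#     [1, 0, 1, 0]   # row 0, col 0
#     [0, 1, 1, 0]   # row 1, col 0
#     [0, 1, 0, 1]   # row 1, col 1
# (row one-hot concatenated with column one-hot) and y = [1., 0., 2.].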

if __name__ == '__main__':
    from scipy.stats.distributions import poisson
    from sklearn.cross_validation import train_test_split
    from time import time

    X_dense = poisson.rvs(mu=3, size=(500, 600), random_state=0)
    X, y = array_to_fm_format(X_dense)
    X, X_val, y, y_val = train_test_split(X, y, random_state=0)
    for reg in (0, 1e-5):
        print("Regularization: {}".format(reg))
        t0 = time()
        w_0, w, V = factorization_machine_poisson(
            X, y, X_val, y_val, n_latent=50,
            reg_w=reg, reg_V=reg,
            init_std=0.01, learning_rate=5e-3, n_iter=31, verbose=10,
            random_state=0)
        print("Done in {}".format(time() - t0))