Skip to content

Instantly share code, notes, and snippets.

@jhemann
Last active September 21, 2016 12:30
Show Gist options
  • Star 5 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save jhemann/5584536 to your computer and use it in GitHub Desktop.
Save jhemann/5584536 to your computer and use it in GitHub Desktop.
Code to evaluate speed improvements using numba versus cython on a matrix factorization problem used in the setting of a recommendation system.
In [1]: import mf_rec_test as mfrt

In [2]: mfrt.run(method='python', num_factors=8, num_users=30, num_items=60)
Recommendation test time in minutes: 2.9021
Factorization RMSE: 0.087

In [3]: mfrt.run(method='cython', num_factors=8, num_users=30, num_items=60)
Recommendation test time in minutes: 0.0028
Factorization RMSE: 0.081

In [4]: mfrt.run(method='numba', num_factors=8, num_users=30, num_items=60)
Recommendation test time in minutes: 0.0652
Factorization RMSE: 0.088

In [5]: mfrt.run(method='numba', num_factors=8, num_users=30, num_items=60)
Recommendation test time in minutes: 0.0031
Factorization RMSE: 0.084
# -*- coding: utf-8 -*-
#!/usr/bin/python
'''
Adapted from example code by Albert Au Yeung (2010)
http://www.quuxlabs.com/blog/2010/09/matrix-factorization-a-simple-tutorial-and-implementation-in-python/#source-code
An implementation of matrix factorization...
@INPUT:
R : a matrix to be factorized, dimension N x M
P : an initial matrix of dimension N x K
Q : an initial matrix of dimension M x K
K : the number of latent features
steps : the maximum number of steps to perform the optimisation
alpha : the learning rate
beta : the regularization parameter
@OUTPUT:
the final matrices P and Q, steps taken and error
'''
import numpy
def matrix_factorization(R, P, Q, K, steps=5000, alpha=0.0002, beta=0.02):
half_beta = beta / 2.0
for step in xrange(steps):
for i in xrange(len(R)):
for j in xrange(len(R[i])):
if R[i,j] > 0:
eij = R[i,j] - numpy.dot(P[i,:],Q[j,:])
for k in xrange(K):
P[i,k] += alpha * (2 * eij * Q[j,k] - beta * P[i,k])
Q[j,k] += alpha * (2 * eij * P[i,k] - beta * Q[j,k])
e = 0
for i in xrange(len(R)):
for j in xrange(len(R[i])):
if R[i,j] > 0:
e += (R[i,j] - numpy.dot(P[i,:],Q[j,:]))**2
for k in xrange(K):
e += half_beta * (P[i,k]*P[i,k] + Q[j,k]*Q[j,k])
if e < 0.001:
break
return P, Q, step, e
def matrix_factorization_original(R, P, Q, K, steps=5000, alpha=0.0002, beta=0.02):
Q = Q.T
for step in xrange(steps):
for i in xrange(len(R)):
for j in xrange(len(R[i])):
if R[i][j] > 0:
eij = R[i][j] - numpy.dot(P[i,:],Q[:,j])
for k in xrange(K):
P[i][k] = P[i][k] + alpha * (2 * eij * Q[k][j] - beta * P[i][k])
Q[k][j] = Q[k][j] + alpha * (2 * eij * P[i][k] - beta * Q[k][j])
e = 0
for i in xrange(len(R)):
for j in xrange(len(R[i])):
if R[i][j] > 0:
e = e + pow(R[i][j] - numpy.dot(P[i,:],Q[:,j]), 2)
for k in xrange(K):
e = e + (beta/2) * ( pow(P[i][k],2) + pow(Q[k][j],2) )
if e < 0.001:
break
return P, Q.T, step, e
###############################################################################
if __name__ == "__main__":
R = [
[5,3,0,1],
[4,0,0,1],
[1,1,0,5],
[1,0,0,4],
[0,1,5,4],
]
R = numpy.array(R)
N = len(R)
M = len(R[0])
K = 2
P = numpy.random.rand(N,K)
Q = numpy.random.rand(M,K)
nP, nQ = matrix_factorization(R, P, Q, K)
nR = numpy.dot(nP, nQ.T)
print nR
# -*- coding: utf-8 -*-
#!/usr/bin/python
'''
Adapted from example code by Albert Au Yeung (2010)
http://www.quuxlabs.com/blog/2010/09/matrix-factorization-a-simple-tutorial-and-implementation-in-python/#source-code
An implementation of matrix factorization...
@INPUT:
R : a matrix to be factorized, dimension N x M
P : an initial matrix of dimension N x K
Q : an initial matrix of dimension M x K
K : the number of latent features
steps : the maximum number of steps to perform the optimisation
alpha : the learning rate
beta : the regularization parameter
@OUTPUT:
the final matrices P and Q, steps taken and error
'''
from __future__ import division
cimport cython
cimport numpy as np
@cython.boundscheck(False)
@cython.wraparound(False)
def matrix_factorization(np.ndarray[double, ndim=2] R,
np.ndarray[double, ndim=2] P,
np.ndarray[double, ndim=2] Q,
int K,
int steps=5000,
double alpha=0.0002,
double beta=0.02):
cdef double half_beta = beta / 2.0
cdef int N = R.shape[0]
cdef int M = R.shape[1]
cdef double eij = 0.0
cdef double e = 0.0
cdef double two_eij = 0.0
cdef int step = 0
cdef int i = 0
cdef int j = 0
cdef int k = 0
cdef double temp = 0.0
for step in xrange(steps):
for i in xrange(N):
for j in xrange(M):
if R[i,j] > 0:
#Convert the line
# eij = R[i,j] - numpy.dot(P[i,:],Q[j,:])
#in mf.py to avoid using numpy such that numba can optimize
#the calculation (once execution is moved into C, e.g. via
#numpy, numba cannot optimize the corresponding code). This
#code is now equivalent to mf_numba.py for comparisons
eij = R[i,j]
for p in xrange(K):
eij -= P[i,p] * Q[j,p]
for k in xrange(K):
P[i,k] += alpha * (2 * eij * Q[j,k] - beta * P[i,k])
Q[j,k] += alpha * (2 * eij * P[i,k] - beta * Q[j,k])
e = 0.0
for i in xrange(N):
for j in xrange(M):
if R[i,j] > 0:
temp = R[i,j]
for p in xrange(K):
temp -= P[i,p] * Q[j,p]
e = e + temp * temp
for k in xrange(K):
e += half_beta * (P[i,k]*P[i,k] + Q[j,k]*Q[j,k])
if e < 0.001:
break
return P, Q, step, e
if __name__ == "__main__":
R = [
[5,3,0,1],
[4,0,0,1],
[1,1,0,5],
[1,0,0,4],
[0,1,5,4],
]
R = np.array(R)
N = len(R)
M = len(R[0])
K = 2
P = np.random.rand(N,K)
Q = np.random.rand(M,K)
nP, nQ = matrix_factorization(R, P, Q, K)
nR = np.dot(nP, nQ.T)
print nR
# -*- coding: utf-8 -*-
#!/usr/bin/python
'''
Adapted from example code by Albert Au Yeung (2010)
http://www.quuxlabs.com/blog/2010/09/matrix-factorization-a-simple-tutorial-and-implementation-in-python/#source-code
An implementation of matrix factorization...
@INPUT:
R : a matrix to be factorized, dimension N x M
P : an initial matrix of dimension N x K
Q : an initial matrix of dimension M x K
K : the number of latent features
steps : the maximum number of steps to perform the optimisation
alpha : the learning rate
beta : the regularization parameter
@OUTPUT:
the final matrices P and Q, steps taken and error
'''
from operator import mul
from itertools import imap, izip, starmap
import numpy as np
from numba import typeof, double, int_
from numba.decorators import autojit, jit
@autojit(locals={'step': int_, 'e': double, 'err': double})
def matrix_factorization(R, P, Q, K):
steps = 5000
alpha = 0.0002
beta = 0.02
half_beta = beta / 2.0
N, M = R.shape
e = 0.0
for step in xrange(steps):
for i in xrange(N):
for j in xrange(M):
if R[i,j] > 0:
#Convert the line
# eij = R[i,j] - numpy.dot(P[i,:],Q[j,:])
#in mf.py to avoid using numpy such that numba can optimize
#the calculation (once execution is moved into C, e.g. via
#numpy, numba cannot optimize the corresponding code)
eij = R[i,j]
for p in xrange(K):
eij -= P[i,p] * Q[j,p]
for k in xrange(K):
P[i,k] += alpha * (2 * eij * Q[j,k] - beta * P[i,k])
Q[j,k] += alpha * (2 * eij * P[i,k] - beta * Q[j,k])
e = 0.0
for i in xrange(N):
for j in xrange(M):
if R[i,j] > 0:
temp = R[i,j]
for p in xrange(K):
temp -= P[i,p] * Q[j,p]
e = e + temp * temp
for k in xrange(K):
e += half_beta * (P[i,k]*P[i,k] + Q[j,k]*Q[j,k])
if e < 0.001:
break
return P, Q, step, e
@jit(argtypes=[double[:,:], double[:,:], double[:,:], int_],
restype=typeof((double[:,:], double[:,:], int_, double)),
locals={'step': int_, 'e': double, 'err': double})
def matrix_factorization2(R, P, Q, K):
steps = 5000
alpha = 0.0002
beta = 0.02
half_beta = beta / 2.0
Q = Q.T
N, M = R.shape
dot = np.dot
for step in xrange(steps):
for i in xrange(N):
for j in xrange(M):
if R[i,j] > 0:
temp = double(dot(P[i,:], Q[:,j]))
#temp = sum(map(mul, P[i,:], Q[:,j]))
#temp = sum(imap(mul, P[i,:], Q[:,j]))
#temp = sum(starmap(mul, izip(P[i,:], Q[:,j])))
#temp = 0
#for p in xrange(M):
# temp = temp + P[i,p] * Q[p,j]
eij = R[i,j] - temp
for k in xrange(K):
P[i,k] += alpha * (2 * eij * Q[k,j] - beta * P[i,k])
Q[k,j] += alpha * (2 * eij * P[i,k] - beta * Q[k,j])
e = 0.0
for i in xrange(N):
for j in xrange(M):
if R[i,j] > 0:
temp = double(dot(P[i,:], Q[:,j]))
#temp = sum(map(mul, P[i,:], Q[:,j]))
#temp = sum(imap(mul, P[i,:], Q[:,j]))
#temp = sum(starmap(mul, izip(P[i,:], Q[:,j])))
#temp = 0
#for p in xrange(M):
# temp = temp + P[i,p] * Q[p,j]
e = e + (R[i,j] - temp)**2
for k in xrange(K):
e = e + half_beta * (P[i,k]**2 + Q[k,j]**2)
if e < 0.001:
break
return P, Q.T, step, e
import shelve
import time
import numpy as np
import matplotlib.pyplot as plt
def make_ratings_data(num_users, num_items, seed=8675309):
'''Returns a num_users x num_items array of fake ratings, where the ratings
can be on a 1-5 star scale, with half-stars allowed. Assume a value of 0
indicates that the associated user has not rated the associated item. Like
most ratings data sets, the array will be sparse
'''
valid_ratings = np.arange(1, 5.5, 0.5)
valid_ratings = np.insert(valid_ratings, 0, 0)
#Make ~80% of the data be "missing", with the other 20% having a uniform
#distribution over the 1-5 star rating scale
probs = [0.20 / 9] * 9
probs.insert(0, 0.80)
pseudo_RNG = np.random.RandomState(seed)
ratings_matrix = pseudo_RNG.choice(valid_ratings, replace=True,
size=(num_users, num_items),
p=probs)
return ratings_matrix
def run(hist=False, method='cython', seed=1234567, num_users=100,
num_items=200, num_factors=8):
start_time = time.time()
R = make_ratings_data(num_users, num_items, seed=None)
N, M = R.shape
K = num_factors
pseudo_RNG = np.random.RandomState(seed)
P = pseudo_RNG.rand(N, K)
Q = pseudo_RNG.rand(M, K)
if method == 'cython':
import mf_cython
matrix_factorization = mf_cython.matrix_factorization
if method == 'numba':
import mf_numba
matrix_factorization = mf_numba.matrix_factorization
if method == 'python':
import mf
matrix_factorization = mf.matrix_factorization
nP, nQ, steps, error = matrix_factorization(R, P, Q, K)
print 'Convergence in %i steps' % steps
nR = np.dot(nP, nQ.T)
R_actual = np.ma.masked_equal(R, 0)
missing_mask = np.ma.getmaskarray(R_actual)
nR_actual = np.ma.masked_array(nR, mask=missing_mask)
fit_error = nR_actual - R_actual
fit_error_filled = fit_error.filled(-999)
actual_ratings = np.where(fit_error_filled > -999)
fit_diffs = np.asarray(fit_error[actual_ratings])
fit_RMSE = np.sqrt(np.sum(fit_diffs**2) / fit_diffs.size)
if hist:
plt.hist(fit_diffs, bins=20)
plt.xlabel = 'Rating Error (ratings on 1-5, 1/2 point scale)'
plt.ylabel = 'Number of Consumers'
plt.show()
#Save the output for later if we want to analyze results without re-running
#the code...
s = shelve.open('mf_rec_test_results_%s.db' % method)
s['nP'] = nP
s['nQ'] = nQ
s['nR'] = nR
s['R_actual'] = R_actual
s['nR_actual'] = nR_actual
s['fit_error'] = fit_error
s.close()
print 'Recommendation test time in minutes: %.4f' \
% ((time.time() - start_time) / 60.0)
print 'Factorization RMSE: %.3f' % fit_RMSE
return None
Display the source blob
Display the rendered blob
Raw
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
# -*- coding: utf-8 -*-
"""
Created on Sun Apr 01 23:49:22 2012
@author: JoshuaHemann
"""
# Run with:
#
# vs_compiler
# python setup_mf_cython.py build_ext --inplace
#
# to compile Cython extensions in-place (useful during development)
from distutils.core import setup
from Cython.Distutils import build_ext
from Cython.Build import cythonize
import numpy as np
setup(cmdclass={'build_ext': build_ext},
include_dirs=[np.get_include()],
ext_modules=cythonize('mf_cython.pyx', annotate=True))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment