jhemann/MomentOfScience_blog_post.ipynb

## example_session.md

      
    Raw
  

              example_session.md
            
          
    In [1]: import mf_rec_test as mfrt

In [2]: mfrt.run(method='python', num_factors=8, num_users=30, num_items=60)
Recommendation test time in minutes: 2.9021
Factorization RMSE: 0.087

In [3]: mfrt.run(method='cython', num_factors=8, num_users=30, num_items=60)
Recommendation test time in minutes: 0.0028
Factorization RMSE: 0.081

In [4]: mfrt.run(method='numba', num_factors=8, num_users=30, num_items=60)
Recommendation test time in minutes: 0.0652
Factorization RMSE: 0.088

In [5]: mfrt.run(method='numba', num_factors=8, num_users=30, num_items=60)
Recommendation test time in minutes: 0.0031
Factorization RMSE: 0.084

  
## mf.py
# -*- coding: utf-8 -*-
#!/usr/bin/python

'''
Adapted from example code by Albert Au Yeung (2010)
http://www.quuxlabs.com/blog/2010/09/matrix-factorization-a-simple-tutorial-and-implementation-in-python/#source-code

An implementation of matrix factorization...

@INPUT:
    R     : a matrix to be factorized, dimension N x M
    P     : an initial matrix of dimension N x K
    Q     : an initial matrix of dimension M x K
    K     : the number of latent features
    steps : the maximum number of steps to perform the optimisation
    alpha : the learning rate
    beta  : the regularization parameter
@OUTPUT:
    the final matrices P and Q, steps taken and error

'''
import numpy


def matrix_factorization(R, P, Q, K, steps=5000, alpha=0.0002, beta=0.02):
    half_beta = beta / 2.0
    for step in xrange(steps):
        for i in xrange(len(R)):
            for j in xrange(len(R[i])):
                if R[i,j] > 0:
                    eij = R[i,j] - numpy.dot(P[i,:],Q[j,:])
                    for k in xrange(K):
                        P[i,k] += alpha * (2 * eij * Q[j,k] - beta * P[i,k])
                        Q[j,k] += alpha * (2 * eij * P[i,k] - beta * Q[j,k])
        e = 0
        for i in xrange(len(R)):
            for j in xrange(len(R[i])):
                if R[i,j] > 0:
                    e += (R[i,j] - numpy.dot(P[i,:],Q[j,:]))**2
                    for k in xrange(K):
                        e += half_beta * (P[i,k]*P[i,k] + Q[j,k]*Q[j,k])
        if e < 0.001:
            break
    return P, Q, step, e


def matrix_factorization_original(R, P, Q, K, steps=5000, alpha=0.0002, beta=0.02):
    Q = Q.T
    for step in xrange(steps):
        for i in xrange(len(R)):
            for j in xrange(len(R[i])):
                if R[i][j] > 0:
                    eij = R[i][j] - numpy.dot(P[i,:],Q[:,j])
                    for k in xrange(K):
                        P[i][k] = P[i][k] + alpha * (2 * eij * Q[k][j] - beta * P[i][k])
                        Q[k][j] = Q[k][j] + alpha * (2 * eij * P[i][k] - beta * Q[k][j])
        e = 0
        for i in xrange(len(R)):
            for j in xrange(len(R[i])):
                if R[i][j] > 0:
                    e = e + pow(R[i][j] - numpy.dot(P[i,:],Q[:,j]), 2)
                    for k in xrange(K):
                        e = e + (beta/2) * ( pow(P[i][k],2) + pow(Q[k][j],2) )
        if e < 0.001:
            break
    return P, Q.T, step, e

###############################################################################

if __name__ == "__main__":
    R = [
         [5,3,0,1],
         [4,0,0,1],
         [1,1,0,5],
         [1,0,0,4],
         [0,1,5,4],
        ]

    R = numpy.array(R)

    N = len(R)
    M = len(R[0])
    K = 2

    P = numpy.random.rand(N,K)
    Q = numpy.random.rand(M,K)

    nP, nQ = matrix_factorization(R, P, Q, K)
    nR = numpy.dot(nP, nQ.T)
    print nR

## mf_cython.pyx
# -*- coding: utf-8 -*-
#!/usr/bin/python

'''
Adapted from example code by Albert Au Yeung (2010)
http://www.quuxlabs.com/blog/2010/09/matrix-factorization-a-simple-tutorial-and-implementation-in-python/#source-code

An implementation of matrix factorization...

@INPUT:
    R     : a matrix to be factorized, dimension N x M
    P     : an initial matrix of dimension N x K
    Q     : an initial matrix of dimension M x K
    K     : the number of latent features
    steps : the maximum number of steps to perform the optimisation
    alpha : the learning rate
    beta  : the regularization parameter
@OUTPUT:
    the final matrices P and Q, steps taken and error

'''
from __future__ import division
cimport cython
cimport numpy as np


@cython.boundscheck(False)
@cython.wraparound(False)
def matrix_factorization(np.ndarray[double, ndim=2] R,
                         np.ndarray[double, ndim=2] P,
                         np.ndarray[double, ndim=2] Q,
                         int K,
                         int steps=5000,
                         double alpha=0.0002,
                         double beta=0.02):

    cdef double half_beta = beta / 2.0
    cdef int N = R.shape[0]
    cdef int M = R.shape[1]
    cdef double eij = 0.0
    cdef double e = 0.0
    cdef double two_eij = 0.0
    cdef int step = 0
    cdef int i = 0
    cdef int j = 0
    cdef int k = 0
    cdef double temp = 0.0

    for step in xrange(steps):
        for i in xrange(N):
            for j in xrange(M):
                if R[i,j] > 0:

                    #Convert the line
                    #    eij = R[i,j] - numpy.dot(P[i,:],Q[j,:])
                    #in mf.py to avoid using numpy such that numba can optimize
                    #the calculation (once execution is moved into C, e.g. via
                    #numpy, numba cannot optimize the corresponding code). This
                    #code is now equivalent to mf_numba.py for comparisons
                    eij = R[i,j]
                    for p in xrange(K):
                        eij -= P[i,p] * Q[j,p]

                    for k in xrange(K):
                        P[i,k] += alpha * (2 * eij * Q[j,k] - beta * P[i,k])
                        Q[j,k] += alpha * (2 * eij * P[i,k] - beta * Q[j,k])
        e = 0.0
        for i in xrange(N):
            for j in xrange(M):
                if R[i,j] > 0:

                    temp = R[i,j]
                    for p in xrange(K):
                        temp -= P[i,p] * Q[j,p]

                    e = e + temp * temp
                    for k in xrange(K):
                        e += half_beta * (P[i,k]*P[i,k] + Q[j,k]*Q[j,k])

        if e < 0.001:
            break

    return P, Q, step, e


if __name__ == "__main__":
    R = [
         [5,3,0,1],
         [4,0,0,1],
         [1,1,0,5],
         [1,0,0,4],
         [0,1,5,4],
        ]

    R = np.array(R)

    N = len(R)
    M = len(R[0])
    K = 2

    P = np.random.rand(N,K)
    Q = np.random.rand(M,K)

    nP, nQ = matrix_factorization(R, P, Q, K)
    nR = np.dot(nP, nQ.T)
    print nR

## mf_numba.py
# -*- coding: utf-8 -*-
#!/usr/bin/python

'''
Adapted from example code by Albert Au Yeung (2010)
http://www.quuxlabs.com/blog/2010/09/matrix-factorization-a-simple-tutorial-and-implementation-in-python/#source-code

An implementation of matrix factorization...

@INPUT:
    R     : a matrix to be factorized, dimension N x M
    P     : an initial matrix of dimension N x K
    Q     : an initial matrix of dimension M x K
    K     : the number of latent features
    steps : the maximum number of steps to perform the optimisation
    alpha : the learning rate
    beta  : the regularization parameter
@OUTPUT:
    the final matrices P and Q, steps taken and error

'''
from operator import mul
from itertools import imap, izip, starmap
import numpy as np
from numba import typeof, double, int_
from numba.decorators import autojit, jit


@autojit(locals={'step': int_, 'e': double, 'err': double})
def matrix_factorization(R, P, Q, K):
    steps = 5000
    alpha = 0.0002
    beta = 0.02
    half_beta = beta / 2.0
    N, M = R.shape
    e = 0.0

    for step in xrange(steps):
        for i in xrange(N):
            for j in xrange(M):
                if R[i,j] > 0:

                    #Convert the line
                    #    eij = R[i,j] - numpy.dot(P[i,:],Q[j,:])
                    #in mf.py to avoid using numpy such that numba can optimize
                    #the calculation (once execution is moved into C, e.g. via
                    #numpy, numba cannot optimize the corresponding code)
                    eij = R[i,j]
                    for p in xrange(K):
                        eij -= P[i,p] * Q[j,p]

                    for k in xrange(K):
                        P[i,k] += alpha * (2 * eij * Q[j,k] - beta * P[i,k])
                        Q[j,k] += alpha * (2 * eij * P[i,k] - beta * Q[j,k])

        e = 0.0
        for i in xrange(N):
            for j in xrange(M):
                if R[i,j] > 0:

                    temp = R[i,j]
                    for p in xrange(K):
                        temp -= P[i,p] * Q[j,p]

                    e = e + temp * temp
                    for k in xrange(K):
                        e += half_beta * (P[i,k]*P[i,k] + Q[j,k]*Q[j,k])

        if e < 0.001:
            break

    return P, Q, step, e


@jit(argtypes=[double[:,:], double[:,:], double[:,:], int_],
     restype=typeof((double[:,:], double[:,:], int_, double)),
     locals={'step': int_, 'e': double, 'err': double})
def matrix_factorization2(R, P, Q, K):
    steps = 5000
    alpha = 0.0002
    beta = 0.02
    half_beta = beta / 2.0
    Q = Q.T
    N, M = R.shape
    dot = np.dot
    for step in xrange(steps):
        for i in xrange(N):
            for j in xrange(M):
                if R[i,j] > 0:
                    temp = double(dot(P[i,:], Q[:,j]))
                    #temp = sum(map(mul, P[i,:], Q[:,j]))
                    #temp = sum(imap(mul, P[i,:], Q[:,j]))
                    #temp = sum(starmap(mul, izip(P[i,:], Q[:,j])))
                    #temp = 0
                    #for p in xrange(M):
                    #    temp = temp + P[i,p] * Q[p,j]
                    eij = R[i,j] - temp
                    for k in xrange(K):
                        P[i,k] += alpha * (2 * eij * Q[k,j] - beta * P[i,k])
                        Q[k,j] += alpha * (2 * eij * P[i,k] - beta * Q[k,j])
        e = 0.0
        for i in xrange(N):
            for j in xrange(M):
                if R[i,j] > 0:
                    temp = double(dot(P[i,:], Q[:,j]))
                    #temp = sum(map(mul, P[i,:], Q[:,j]))
                    #temp = sum(imap(mul, P[i,:], Q[:,j]))
                    #temp = sum(starmap(mul, izip(P[i,:], Q[:,j])))
                    #temp = 0
                    #for p in xrange(M):
                    #    temp = temp + P[i,p] * Q[p,j]
                    e = e + (R[i,j] - temp)**2
                    for k in xrange(K):
                        e = e + half_beta * (P[i,k]**2 + Q[k,j]**2)

        if e < 0.001:
            break

    return P, Q.T, step, e

## mf_rec_test.py
import shelve
import time
import numpy as np
import matplotlib.pyplot as plt


def make_ratings_data(num_users, num_items, seed=8675309):
    '''Returns a num_users x num_items array of fake ratings, where the ratings
    can be on a 1-5 star scale, with half-stars allowed. Assume a value of 0
    indicates that the associated user has not rated the associated item. Like
    most ratings data sets, the array will be sparse

    '''
    valid_ratings = np.arange(1, 5.5, 0.5)
    valid_ratings = np.insert(valid_ratings, 0, 0)
    #Make ~80% of the data be "missing", with the other 20% having a uniform
    #distribution over the 1-5 star rating scale
    probs = [0.20 / 9] * 9
    probs.insert(0, 0.80)
    pseudo_RNG = np.random.RandomState(seed)
    ratings_matrix = pseudo_RNG.choice(valid_ratings, replace=True,
                                       size=(num_users, num_items),
                                       p=probs)
    return ratings_matrix


def run(hist=False, method='cython', seed=1234567, num_users=100,
        num_items=200, num_factors=8):
    start_time = time.time()
    R = make_ratings_data(num_users, num_items, seed=None)
    N, M = R.shape
    K = num_factors

    pseudo_RNG = np.random.RandomState(seed)
    P = pseudo_RNG.rand(N, K)
    Q = pseudo_RNG.rand(M, K)

    if method == 'cython':
        import mf_cython
        matrix_factorization = mf_cython.matrix_factorization
    if method == 'numba':
        import mf_numba
        matrix_factorization = mf_numba.matrix_factorization
    if method == 'python':
        import mf
        matrix_factorization = mf.matrix_factorization

    nP, nQ, steps, error = matrix_factorization(R, P, Q, K)
    print 'Convergence in %i steps' % steps
    nR = np.dot(nP, nQ.T)

    R_actual = np.ma.masked_equal(R, 0)
    missing_mask = np.ma.getmaskarray(R_actual)
    nR_actual = np.ma.masked_array(nR, mask=missing_mask)
    fit_error = nR_actual - R_actual
    fit_error_filled = fit_error.filled(-999)
    actual_ratings = np.where(fit_error_filled > -999)
    fit_diffs = np.asarray(fit_error[actual_ratings])
    fit_RMSE = np.sqrt(np.sum(fit_diffs**2) / fit_diffs.size)

    if hist:
        plt.hist(fit_diffs, bins=20)
        plt.xlabel = 'Rating Error (ratings on 1-5, 1/2 point scale)'
        plt.ylabel = 'Number of Consumers'
        plt.show()

    #Save the output for later if we want to analyze results without re-running
    #the code...
    s = shelve.open('mf_rec_test_results_%s.db' % method)
    s['nP'] = nP
    s['nQ'] = nQ
    s['nR'] = nR
    s['R_actual'] = R_actual
    s['nR_actual'] = nR_actual
    s['fit_error'] = fit_error
    s.close()
    print 'Recommendation test time in minutes: %.4f' \
           % ((time.time() - start_time) / 60.0)
    print 'Factorization RMSE: %.3f' % fit_RMSE

    return None

## MomentOfScience_blog_post.ipynb

      
Display the source blob

    
Display the rendered blob

    
    Raw
  

              MomentOfScience_blog_post.ipynb
            
          
        Loading

      Sorry, something went wrong. Reload?
      Sorry, we cannot display this file.
      Sorry, this file is invalid so it cannot be displayed.
      
          Viewer requires iframe.
      
    
## setup_mf_cython.py
# -*- coding: utf-8 -*-
"""
Created on Sun Apr 01 23:49:22 2012

@author: JoshuaHemann
"""

# Run with:
#
#    vs_compiler
#    python setup_mf_cython.py build_ext --inplace
#
# to compile Cython extensions in-place (useful during development)

from distutils.core import setup
from Cython.Distutils import build_ext
from Cython.Build import cythonize
import numpy as np


setup(cmdclass={'build_ext': build_ext},
      include_dirs=[np.get_include()],
      ext_modules=cythonize('mf_cython.pyx', annotate=True))
	# -- coding: utf-8 --
	#!/usr/bin/python

	'''
	Adapted from example code by Albert Au Yeung (2010)
	http://www.quuxlabs.com/blog/2010/09/matrix-factorization-a-simple-tutorial-and-implementation-in-python/#source-code

	An implementation of matrix factorization...

	@INPUT:
	R : a matrix to be factorized, dimension N x M
	P : an initial matrix of dimension N x K
	Q : an initial matrix of dimension M x K
	K : the number of latent features
	steps : the maximum number of steps to perform the optimisation
	alpha : the learning rate
	beta : the regularization parameter
	@OUTPUT:
	the final matrices P and Q, steps taken and error

	'''
	import numpy


	def matrix_factorization(R, P, Q, K, steps=5000, alpha=0.0002, beta=0.02):
	half_beta = beta / 2.0
	for step in xrange(steps):
	for i in xrange(len(R)):
	for j in xrange(len(R[i])):
	if R[i,j] > 0:
	eij = R[i,j] - numpy.dot(P[i,:],Q[j,:])
	for k in xrange(K):
	P[i,k] += alpha * (2 * eij * Q[j,k] - beta * P[i,k])
	Q[j,k] += alpha * (2 * eij * P[i,k] - beta * Q[j,k])
	e = 0
	for i in xrange(len(R)):
	for j in xrange(len(R[i])):
	if R[i,j] > 0:
	e += (R[i,j] - numpy.dot(P[i,:],Q[j,:]))**2
	for k in xrange(K):
	e += half_beta * (P[i,k]P[i,k] + Q[j,k]Q[j,k])
	if e < 0.001:
	break
	return P, Q, step, e


	def matrix_factorization_original(R, P, Q, K, steps=5000, alpha=0.0002, beta=0.02):
	Q = Q.T
	for step in xrange(steps):
	for i in xrange(len(R)):
	for j in xrange(len(R[i])):
	if R[i][j] > 0:
	eij = R[i][j] - numpy.dot(P[i,:],Q[:,j])
	for k in xrange(K):
	P[i][k] = P[i][k] + alpha * (2 * eij * Q[k][j] - beta * P[i][k])
	Q[k][j] = Q[k][j] + alpha * (2 * eij * P[i][k] - beta * Q[k][j])
	e = 0
	for i in xrange(len(R)):
	for j in xrange(len(R[i])):
	if R[i][j] > 0:
	e = e + pow(R[i][j] - numpy.dot(P[i,:],Q[:,j]), 2)
	for k in xrange(K):
	e = e + (beta/2) * ( pow(P[i][k],2) + pow(Q[k][j],2) )
	if e < 0.001:
	break
	return P, Q.T, step, e

	###############################################################################

	if __name__ == "__main__":
	R = [
	[5,3,0,1],
	[4,0,0,1],
	[1,1,0,5],
	[1,0,0,4],
	[0,1,5,4],
	]

	R = numpy.array(R)

	N = len(R)
	M = len(R[0])
	K = 2

	P = numpy.random.rand(N,K)
	Q = numpy.random.rand(M,K)

	nP, nQ = matrix_factorization(R, P, Q, K)
	nR = numpy.dot(nP, nQ.T)
	print nR
	import shelve
	import time
	import numpy as np
	import matplotlib.pyplot as plt


	def make_ratings_data(num_users, num_items, seed=8675309):
	'''Returns a num_users x num_items array of fake ratings, where the ratings
	can be on a 1-5 star scale, with half-stars allowed. Assume a value of 0
	indicates that the associated user has not rated the associated item. Like
	most ratings data sets, the array will be sparse

	'''
	valid_ratings = np.arange(1, 5.5, 0.5)
	valid_ratings = np.insert(valid_ratings, 0, 0)
	#Make ~80% of the data be "missing", with the other 20% having a uniform
	#distribution over the 1-5 star rating scale
	probs = [0.20 / 9] * 9
	probs.insert(0, 0.80)
	pseudo_RNG = np.random.RandomState(seed)
	ratings_matrix = pseudo_RNG.choice(valid_ratings, replace=True,
	size=(num_users, num_items),
	p=probs)
	return ratings_matrix


	def run(hist=False, method='cython', seed=1234567, num_users=100,
	num_items=200, num_factors=8):
	start_time = time.time()
	R = make_ratings_data(num_users, num_items, seed=None)
	N, M = R.shape
	K = num_factors

	pseudo_RNG = np.random.RandomState(seed)
	P = pseudo_RNG.rand(N, K)
	Q = pseudo_RNG.rand(M, K)

	if method == 'cython':
	import mf_cython
	matrix_factorization = mf_cython.matrix_factorization
	if method == 'numba':
	import mf_numba
	matrix_factorization = mf_numba.matrix_factorization
	if method == 'python':
	import mf
	matrix_factorization = mf.matrix_factorization

	nP, nQ, steps, error = matrix_factorization(R, P, Q, K)
	print 'Convergence in %i steps' % steps
	nR = np.dot(nP, nQ.T)

	R_actual = np.ma.masked_equal(R, 0)
	missing_mask = np.ma.getmaskarray(R_actual)
	nR_actual = np.ma.masked_array(nR, mask=missing_mask)
	fit_error = nR_actual - R_actual
	fit_error_filled = fit_error.filled(-999)
	actual_ratings = np.where(fit_error_filled > -999)
	fit_diffs = np.asarray(fit_error[actual_ratings])
	fit_RMSE = np.sqrt(np.sum(fit_diffs**2) / fit_diffs.size)

	if hist:
	plt.hist(fit_diffs, bins=20)
	plt.xlabel = 'Rating Error (ratings on 1-5, 1/2 point scale)'
	plt.ylabel = 'Number of Consumers'
	plt.show()

	#Save the output for later if we want to analyze results without re-running
	#the code...
	s = shelve.open('mf_rec_test_results_%s.db' % method)
	s['nP'] = nP
	s['nQ'] = nQ
	s['nR'] = nR
	s['R_actual'] = R_actual
	s['nR_actual'] = nR_actual
	s['fit_error'] = fit_error
	s.close()
	print 'Recommendation test time in minutes: %.4f' \
	% ((time.time() - start_time) / 60.0)
	print 'Factorization RMSE: %.3f' % fit_RMSE

	return None
	# -- coding: utf-8 --
	"""
	Created on Sun Apr 01 23:49:22 2012

	@author: JoshuaHemann
	"""

	# Run with:
	#
	# vs_compiler
	# python setup_mf_cython.py build_ext --inplace
	#
	# to compile Cython extensions in-place (useful during development)

	from distutils.core import setup
	from Cython.Distutils import build_ext
	from Cython.Build import cythonize
	import numpy as np


	setup(cmdclass={'build_ext': build_ext},
	include_dirs=[np.get_include()],
	ext_modules=cythonize('mf_cython.pyx', annotate=True))