@jhemann
Last active September 21, 2016 12:30
Code to evaluate speed improvements using numba versus cython on a matrix factorization problem used in the setting of a recommendation system.
In [1]: import mf_rec_test as mfrt

In [2]: mfrt.run(method='python', num_factors=8, num_users=30, num_items=60)
Recommendation test time in minutes: 2.9021
Factorization RMSE: 0.087

In [3]: mfrt.run(method='cython', num_factors=8, num_users=30, num_items=60)
Recommendation test time in minutes: 0.0028
Factorization RMSE: 0.081

In [4]: mfrt.run(method='numba', num_factors=8, num_users=30, num_items=60)
Recommendation test time in minutes: 0.0652
Factorization RMSE: 0.088

In [5]: mfrt.run(method='numba', num_factors=8, num_users=30, num_items=60)
Recommendation test time in minutes: 0.0031
Factorization RMSE: 0.084

mf.py
#!/usr/bin/python
# -*- coding: utf-8 -*-
'''
Adapted from example code by Albert Au Yeung (2010)
http://www.quuxlabs.com/blog/2010/09/matrix-factorization-a-simple-tutorial-and-implementation-in-python/#source-code

An implementation of matrix factorization...

@INPUT:
    R     : a matrix to be factorized, dimension N x M
    P     : an initial matrix of dimension N x K
    Q     : an initial matrix of dimension M x K
    K     : the number of latent features
    steps : the maximum number of steps to perform the optimisation
    alpha : the learning rate
    beta  : the regularization parameter
@OUTPUT:
    the final matrices P and Q, steps taken and error
'''
import numpy


def matrix_factorization(R, P, Q, K, steps=5000, alpha=0.0002, beta=0.02):
    half_beta = beta / 2.0
    for step in xrange(steps):
        for i in xrange(len(R)):
            for j in xrange(len(R[i])):
                if R[i,j] > 0:
                    eij = R[i,j] - numpy.dot(P[i,:], Q[j,:])
                    for k in xrange(K):
                        P[i,k] += alpha * (2 * eij * Q[j,k] - beta * P[i,k])
                        Q[j,k] += alpha * (2 * eij * P[i,k] - beta * Q[j,k])
        e = 0
        for i in xrange(len(R)):
            for j in xrange(len(R[i])):
                if R[i,j] > 0:
                    e += (R[i,j] - numpy.dot(P[i,:], Q[j,:]))**2
                    for k in xrange(K):
                        e += half_beta * (P[i,k]*P[i,k] + Q[j,k]*Q[j,k])
        if e < 0.001:
            break
    return P, Q, step, e

def matrix_factorization_original(R, P, Q, K, steps=5000, alpha=0.0002, beta=0.02):
    Q = Q.T
    for step in xrange(steps):
        for i in xrange(len(R)):
            for j in xrange(len(R[i])):
                if R[i][j] > 0:
                    eij = R[i][j] - numpy.dot(P[i,:], Q[:,j])
                    for k in xrange(K):
                        P[i][k] = P[i][k] + alpha * (2 * eij * Q[k][j] - beta * P[i][k])
                        Q[k][j] = Q[k][j] + alpha * (2 * eij * P[i][k] - beta * Q[k][j])
        e = 0
        for i in xrange(len(R)):
            for j in xrange(len(R[i])):
                if R[i][j] > 0:
                    e = e + pow(R[i][j] - numpy.dot(P[i,:], Q[:,j]), 2)
                    for k in xrange(K):
                        e = e + (beta/2) * (pow(P[i][k], 2) + pow(Q[k][j], 2))
        if e < 0.001:
            break
    return P, Q.T, step, e

###############################################################################
if __name__ == "__main__":
    R = [
         [5,3,0,1],
         [4,0,0,1],
         [1,1,0,5],
         [1,0,0,4],
         [0,1,5,4],
        ]
    R = numpy.array(R)
    N = len(R)
    M = len(R[0])
    K = 2
    P = numpy.random.rand(N, K)
    Q = numpy.random.rand(M, K)
    # matrix_factorization returns four values, so unpack all of them
    nP, nQ, steps, error = matrix_factorization(R, P, Q, K)
    nR = numpy.dot(nP, nQ.T)
    print nR
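
For reference, the nested loops in matrix_factorization above implement the regularized gradient-descent updates from the quuxlabs tutorial. Written out in LaTeX (a restatement of what the code computes, using the symbols from the docstring), for each observed rating r_ij:

    e_{ij} = r_{ij} - \sum_{k=1}^{K} p_{ik} q_{jk}

    p_{ik} \leftarrow p_{ik} + \alpha \left( 2\, e_{ij}\, q_{jk} - \beta\, p_{ik} \right), \qquad
    q_{jk} \leftarrow q_{jk} + \alpha \left( 2\, e_{ij}\, p_{ik} - \beta\, q_{jk} \right)

and the stopping criterion accumulates

    e = \sum_{(i,j)\,:\,r_{ij} > 0} \left[ e_{ij}^{2} + \frac{\beta}{2} \sum_{k=1}^{K} \left( p_{ik}^{2} + q_{jk}^{2} \right) \right]

breaking out of the step loop once e < 0.001.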

mf_cython.pyx
#!/usr/bin/python
# -*- coding: utf-8 -*-
'''
Adapted from example code by Albert Au Yeung (2010)
http://www.quuxlabs.com/blog/2010/09/matrix-factorization-a-simple-tutorial-and-implementation-in-python/#source-code

An implementation of matrix factorization...

@INPUT:
    R     : a matrix to be factorized, dimension N x M
    P     : an initial matrix of dimension N x K
    Q     : an initial matrix of dimension M x K
    K     : the number of latent features
    steps : the maximum number of steps to perform the optimisation
    alpha : the learning rate
    beta  : the regularization parameter
@OUTPUT:
    the final matrices P and Q, steps taken and error
'''
from __future__ import division

import numpy as np        # Python-level numpy, needed by the __main__ demo below

cimport cython
cimport numpy as np

@cython.boundscheck(False)
@cython.wraparound(False)
def matrix_factorization(np.ndarray[double, ndim=2] R,
                         np.ndarray[double, ndim=2] P,
                         np.ndarray[double, ndim=2] Q,
                         int K,
                         int steps=5000,
                         double alpha=0.0002,
                         double beta=0.02):
    cdef double half_beta = beta / 2.0
    cdef int N = R.shape[0]
    cdef int M = R.shape[1]
    cdef double eij = 0.0
    cdef double e = 0.0
    cdef double two_eij = 0.0
    cdef int step = 0
    cdef int i = 0
    cdef int j = 0
    cdef int k = 0
    cdef double temp = 0.0
    for step in xrange(steps):
        for i in xrange(N):
            for j in xrange(M):
                if R[i,j] > 0:
                    #Convert the line
                    #    eij = R[i,j] - numpy.dot(P[i,:],Q[j,:])
                    #in mf.py to avoid using numpy such that numba can optimize
                    #the calculation (once execution is moved into C, e.g. via
                    #numpy, numba cannot optimize the corresponding code). This
                    #code is now equivalent to mf_numba.py for comparisons
                    eij = R[i,j]
                    for p in xrange(K):
                        eij -= P[i,p] * Q[j,p]
                    for k in xrange(K):
                        P[i,k] += alpha * (2 * eij * Q[j,k] - beta * P[i,k])
                        Q[j,k] += alpha * (2 * eij * P[i,k] - beta * Q[j,k])
        e = 0.0
        for i in xrange(N):
            for j in xrange(M):
                if R[i,j] > 0:
                    temp = R[i,j]
                    for p in xrange(K):
                        temp -= P[i,p] * Q[j,p]
                    e = e + temp * temp
                    for k in xrange(K):
                        e += half_beta * (P[i,k]*P[i,k] + Q[j,k]*Q[j,k])
        if e < 0.001:
            break
    return P, Q, step, e

if __name__ == "__main__":
    R = [
         [5,3,0,1],
         [4,0,0,1],
         [1,1,0,5],
         [1,0,0,4],
         [0,1,5,4],
        ]
    # The typed signature above expects double arrays, so force a float dtype
    R = np.array(R, dtype=np.float64)
    N = len(R)
    M = len(R[0])
    K = 2
    P = np.random.rand(N, K)
    Q = np.random.rand(M, K)
    nP, nQ, steps, error = matrix_factorization(R, P, Q, K)
    nR = np.dot(nP, nQ.T)
    print nR

mf_numba.py
#!/usr/bin/python
# -*- coding: utf-8 -*-
'''
Adapted from example code by Albert Au Yeung (2010)
http://www.quuxlabs.com/blog/2010/09/matrix-factorization-a-simple-tutorial-and-implementation-in-python/#source-code

An implementation of matrix factorization...

@INPUT:
    R     : a matrix to be factorized, dimension N x M
    P     : an initial matrix of dimension N x K
    Q     : an initial matrix of dimension M x K
    K     : the number of latent features
    steps : the maximum number of steps to perform the optimisation
    alpha : the learning rate
    beta  : the regularization parameter
@OUTPUT:
    the final matrices P and Q, steps taken and error
'''
from operator import mul
from itertools import imap, izip, starmap

import numpy as np

from numba import typeof, double, int_
from numba.decorators import autojit, jit

@autojit(locals={'step': int_, 'e': double, 'err': double})
def matrix_factorization(R, P, Q, K):
    steps = 5000
    alpha = 0.0002
    beta = 0.02
    half_beta = beta / 2.0
    N, M = R.shape
    e = 0.0
    for step in xrange(steps):
        for i in xrange(N):
            for j in xrange(M):
                if R[i,j] > 0:
                    #Convert the line
                    #    eij = R[i,j] - numpy.dot(P[i,:],Q[j,:])
                    #in mf.py to avoid using numpy such that numba can optimize
                    #the calculation (once execution is moved into C, e.g. via
                    #numpy, numba cannot optimize the corresponding code)
                    eij = R[i,j]
                    for p in xrange(K):
                        eij -= P[i,p] * Q[j,p]
                    for k in xrange(K):
                        P[i,k] += alpha * (2 * eij * Q[j,k] - beta * P[i,k])
                        Q[j,k] += alpha * (2 * eij * P[i,k] - beta * Q[j,k])
        e = 0.0
        for i in xrange(N):
            for j in xrange(M):
                if R[i,j] > 0:
                    temp = R[i,j]
                    for p in xrange(K):
                        temp -= P[i,p] * Q[j,p]
                    e = e + temp * temp
                    for k in xrange(K):
                        e += half_beta * (P[i,k]*P[i,k] + Q[j,k]*Q[j,k])
        if e < 0.001:
            break
    return P, Q, step, e

@jit(argtypes=[double[:,:], double[:,:], double[:,:], int_],
     restype=typeof((double[:,:], double[:,:], int_, double)),
     locals={'step': int_, 'e': double, 'err': double})
def matrix_factorization2(R, P, Q, K):
    steps = 5000
    alpha = 0.0002
    beta = 0.02
    half_beta = beta / 2.0
    Q = Q.T
    N, M = R.shape
    dot = np.dot
    for step in xrange(steps):
        for i in xrange(N):
            for j in xrange(M):
                if R[i,j] > 0:
                    temp = double(dot(P[i,:], Q[:,j]))
                    #temp = sum(map(mul, P[i,:], Q[:,j]))
                    #temp = sum(imap(mul, P[i,:], Q[:,j]))
                    #temp = sum(starmap(mul, izip(P[i,:], Q[:,j])))
                    #temp = 0
                    #for p in xrange(M):
                    #    temp = temp + P[i,p] * Q[p,j]
                    eij = R[i,j] - temp
                    for k in xrange(K):
                        P[i,k] += alpha * (2 * eij * Q[k,j] - beta * P[i,k])
                        Q[k,j] += alpha * (2 * eij * P[i,k] - beta * Q[k,j])
        e = 0.0
        for i in xrange(N):
            for j in xrange(M):
                if R[i,j] > 0:
                    temp = double(dot(P[i,:], Q[:,j]))
                    #temp = sum(map(mul, P[i,:], Q[:,j]))
                    #temp = sum(imap(mul, P[i,:], Q[:,j]))
                    #temp = sum(starmap(mul, izip(P[i,:], Q[:,j])))
                    #temp = 0
                    #for p in xrange(M):
                    #    temp = temp + P[i,p] * Q[p,j]
                    e = e + (R[i,j] - temp)**2
                    for k in xrange(K):
                        e = e + half_beta * (P[i,k]**2 + Q[k,j]**2)
        if e < 0.001:
            break
    return P, Q.T, step, e
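
A minimal usage sketch (my own addition, not part of the gist) to illustrate the warm-up behavior discussed in the post: the first call to the autojit-decorated function pays the JIT-compilation cost, and later calls reuse the compiled version, which is why the second 'numba' timing above is so much faster.

    # Hypothetical quick check of mf_numba.matrix_factorization; the file and
    # function names come from the gist, the toy data mirrors the mf.py demo.
    import numpy as np
    import mf_numba

    R = np.array([[5, 3, 0, 1],
                  [4, 0, 0, 1],
                  [1, 1, 0, 5],
                  [1, 0, 0, 4],
                  [0, 1, 5, 4]], dtype=np.float64)
    P = np.random.rand(R.shape[0], 2)
    Q = np.random.rand(R.shape[1], 2)

    # First call: numba/LLVM compile the specialized function (one-time cost)
    nP, nQ, steps, err = mf_numba.matrix_factorization(R, P, Q, 2)
    # Second call: the compiled version is reused (fast)
    nP, nQ, steps, err = mf_numba.matrix_factorization(R, P, Q, 2)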

mf_rec_test.py
import shelve
import time

import numpy as np
import matplotlib.pyplot as plt


def make_ratings_data(num_users, num_items, seed=8675309):
    '''Returns a num_users x num_items array of fake ratings, where the ratings
    can be on a 1-5 star scale, with half-stars allowed. Assume a value of 0
    indicates that the associated user has not rated the associated item. Like
    most ratings data sets, the array will be sparse
    '''
    valid_ratings = np.arange(1, 5.5, 0.5)
    valid_ratings = np.insert(valid_ratings, 0, 0)
    #Make ~80% of the data be "missing", with the other 20% having a uniform
    #distribution over the 1-5 star rating scale
    probs = [0.20 / 9] * 9
    probs.insert(0, 0.80)
    pseudo_RNG = np.random.RandomState(seed)
    ratings_matrix = pseudo_RNG.choice(valid_ratings, replace=True,
                                       size=(num_users, num_items),
                                       p=probs)
    return ratings_matrix

def run(hist=False, method='cython', seed=1234567, num_users=100,
        num_items=200, num_factors=8):
    start_time = time.time()
    R = make_ratings_data(num_users, num_items, seed=None)
    N, M = R.shape
    K = num_factors
    pseudo_RNG = np.random.RandomState(seed)
    P = pseudo_RNG.rand(N, K)
    Q = pseudo_RNG.rand(M, K)
    if method == 'cython':
        import mf_cython
        matrix_factorization = mf_cython.matrix_factorization
    if method == 'numba':
        import mf_numba
        matrix_factorization = mf_numba.matrix_factorization
    if method == 'python':
        import mf
        matrix_factorization = mf.matrix_factorization
    nP, nQ, steps, error = matrix_factorization(R, P, Q, K)
    print 'Convergence in %i steps' % steps
    nR = np.dot(nP, nQ.T)
    R_actual = np.ma.masked_equal(R, 0)
    missing_mask = np.ma.getmaskarray(R_actual)
    nR_actual = np.ma.masked_array(nR, mask=missing_mask)
    fit_error = nR_actual - R_actual
    fit_error_filled = fit_error.filled(-999)
    actual_ratings = np.where(fit_error_filled > -999)
    fit_diffs = np.asarray(fit_error[actual_ratings])
    fit_RMSE = np.sqrt(np.sum(fit_diffs**2) / fit_diffs.size)
    if hist:
        plt.hist(fit_diffs, bins=20)
        # xlabel/ylabel are functions; call them (rather than assigning to
        # them) to actually label the plot
        plt.xlabel('Rating Error (ratings on 1-5, 1/2 point scale)')
        plt.ylabel('Number of Consumers')
        plt.show()
    #Save the output for later if we want to analyze results without re-running
    #the code...
    s = shelve.open('mf_rec_test_results_%s.db' % method)
    s['nP'] = nP
    s['nQ'] = nQ
    s['nR'] = nR
    s['R_actual'] = R_actual
    s['nR_actual'] = nR_actual
    s['fit_error'] = fit_error
    s.close()
    print 'Recommendation test time in minutes: %.4f' \
        % ((time.time() - start_time) / 60.0)
    print 'Factorization RMSE: %.3f' % fit_RMSE
    return None
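
A small sketch (my addition, not part of the gist) of how the shelved results could be pulled back up later; the filename pattern and keys match what run() writes above:

    # Reload previously saved factorization results (assumes run(method='numba')
    # has been executed in this directory at some point)
    import shelve

    s = shelve.open('mf_rec_test_results_numba.db')
    nR = s['nR']                # reconstructed ratings matrix, dot(nP, nQ.T)
    R_actual = s['R_actual']    # observed ratings, masked where no rating exists
    fit_error = s['fit_error']  # reconstruction error on the observed ratings
    s.close()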

MomentOfScience_blog_post.ipynb
{
"metadata": {
"name": "MomentOfScience_LLVM"
},
"nbformat": 3,
"nbformat_minor": 0,
"worksheets": [
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## A Moment of Science: LLVM may mean the best of both worlds is possible for analytic computing\n",
"\n",
"I got my start in programming with C++ (not counting hacking DOS video games like Duke Nukem in a hex editor). One summer in college I had an undergraduate grant with a professor who had me write a neural network algorithm using back propagation. The next summer this same professor had me recode it all in Java, as it was the hot new language. So from the start, my programming needs have been in the context of analytics, often exploratory in nature, which is why I quickly gravitated from [compiled languages](http://en.wikipedia.org/wiki/Compiled_language) like C++ and Java (statically compiled) towards [dynamic languages](http://en.wikipedia.org/wiki/Dynamic_programming_language) like MATLAB, R and Python (runtime compiled).\n",
"\n",
"The trade-off in using dynamic languages over compiled ones is about speed. Dynamic languages better enable the [exploratory programming](http://en.wikipedia.org/wiki/Exploratory_programming) needed when tackling new analytical problems, which leads to faster development of solutions. The trade-off is that this flexibility means that compilers do not have enough information about the code and data to optimize for run-time execution speed. In many settings, increasing computer time for saved human time is a good trade-off, but of course sometimes execution speed is critical, even in exploratory work (i.e., it is harder to iteratively refine a modeling approach when code takes many hours to run). Wouldn't it be great if there was a way to have the flexibility and expressiveness afforded by dynamic languages but with more of the execution performance of compiled languages? \n",
"\n",
"This _wouldn't it be great if_ wish has been around for a while, but three events centering on the [LLVM compiler](http://en.wikipedia.org/wiki/LLVM) over the past year make this wish a lot closer to reality:\n",
"\n",
"1. [NVIDIA moving from an in-house compiler for their GPUs to LLVM](http://www.theregister.co.uk/2011/12/16/nvidia_llvm_cuda_app_dev/)\n",
"2. The emergence of and excitement around the [Julia language](http://julialang.org/) for technical computing\n",
"3. The emergence of and excitement around the [numba project](http://numba.pydata.org/) for compiling Python code to the LLVM\n",
"\n",
"While this seems like an unrelated mix of events, the common thread is that important tools in the modern technical computing stack are moving towards using LLVM. So, I figured it was about time I became more familiar with the topic and the rest of this post shows some of my recent foray. \n",
"\n",
"## An example to dive in to: matrix factorization and recommendation systems\n",
"\n",
"[Matrix factorization](http://en.wikipedia.org/wiki/Matrix_decomposition) methods serve as the backbone of many statistical algorithms. Two particular approaches -- [singular value decomposition](http://en.wikipedia.org/wiki/Singular_value_decomposition) (SVD) and [non-negative matrix factorization](http://en.wikipedia.org/wiki/Non-negative_matrix_factorization) -- gained broader exposure with the 2009 [Netflix Prize](http://www.netflixprize.com/) for improving movie recommendations. These are both computationally expensive algorithms, the kind that you want implemented in a compiled language. But, there is [a nice tutorial on using matrix factorization for recommendation systems](http://www.quuxlabs.com/blog/2010/09/matrix-factorization-a-simple-tutorial-and-implementation-in-python/) that codes the algorithm in Python, making the algorithm easy to follow and test. I'll be using a slightly modified version of the code used in this tutorial as the basis for my explorations. \n",
"\n",
"Mathematically, what we are doing is taking an _N_ by _M_ matrix **R** of ratings by _N_ people over _M_ items and decomposing it into two constituent matrices, **P** and **Q** (I'll stick with the notation used in the tutorial). **P** will be an _N_ by _K_ matrix that we can think of as _N_ people's preference weightings over _K_ \"hidden\" (or latent) features of the items. **Q** will be a _K_ by _M_ matrix that we can think of as the \"loadings\" on the _K_ features by each of these _M_ items. The algorithm seeks **P** and **Q** such that the difference between **R** and **PQ** is minimized (in general, this solution is non-unique). \n",
"\n",
"In what follows I'll compare algorithm speed across three scenarios:\n",
"\n",
"1. Python\n",
"2. C\n",
"3. Python on the LLVM using numba\n",
"\n",
"## Baseline code\n",
"\n",
"Here is the code that will serve as the baseline for our testing (again, a slightly modified version of what is [hosted on the tutorial](http://www.quuxlabs.com/wp-content/uploads/2010/09/mf.py_.txt)). We'll save this code as a file called `mf.py`: "
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# -*- coding: utf-8 -*-\n",
"#!/usr/bin/python\n",
"\n",
"'''\n",
"Adapted from example code by Albert Au Yeung (2010)\n",
"http://www.quuxlabs.com/blog/2010/09/matrix-factorization-a-simple-tutorial-and-implementation-in-python/#source-code\n",
"\n",
"An implementation of matrix factorization...\n",
"\n",
"@INPUT:\n",
" R : a matrix to be factorized, dimension N x M\n",
" P : an initial matrix of dimension N x K\n",
" Q : an initial matrix of dimension M x K\n",
" K : the number of latent features\n",
" steps : the maximum number of steps to perform the optimisation\n",
" alpha : the learning rate\n",
" beta : the regularization parameter\n",
"@OUTPUT:\n",
" the final matrices P and Q, steps taken and error\n",
"\n",
"'''\n",
"import numpy \n",
"\n",
"\n",
"def matrix_factorization(R, P, Q, K, steps=5000, alpha=0.0002, beta=0.02):\n",
" half_beta = beta / 2.0\n",
" for step in xrange(steps):\n",
" for i in xrange(len(R)):\n",
" for j in xrange(len(R[i])):\n",
" if R[i,j] > 0:\n",
" eij = R[i,j] - numpy.dot(P[i,:],Q[j,:])\n",
" for k in xrange(K):\n",
" P[i,k] += alpha * (2 * eij * Q[j,k] - beta * P[i,k])\n",
" Q[j,k] += alpha * (2 * eij * P[i,k] - beta * Q[j,k])\n",
" e = 0\n",
" for i in xrange(len(R)):\n",
" for j in xrange(len(R[i])):\n",
" if R[i,j] > 0:\n",
" e += (R[i,j] - numpy.dot(P[i,:],Q[j,:]))**2\n",
" for k in xrange(K):\n",
" e += half_beta * (P[i,k]*P[i,k] + Q[j,k]*Q[j,k])\n",
" if e < 0.001:\n",
" break\n",
" return P, Q, step, e"
],
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Right away, we can see this will probably run very slowly in a language like Python compared to C. We have deeply nested `for` loops and such looping is almost always a bottleneck in dynamically typed languages (relative to doing the exact same looping in a compiled language in which the compiler knows about the variable types and can optimize operations). \n",
"\n",
"In two parts of the code there is an important conditional statement to call out with respect to how the algorithm is adapted to this rating recommendation problem:\n",
"\n",
" if R[i,j] > 0:\n",
"\n",
"In the **R** matrix of peoples' ratings on items, R[i,j] = 0 is a sentinel meaning that person _i_ has not rated item _j_. In many settings (like Netflix's inventory of thousands of movies), most elements of **R** will be 0 because most people have only interacted with a small subset of the items. The algorithm above takes this into account when assessing the factorization error by ignoring the differences between the estimated **R** and the actual **R** where the actual ratings are unknown. In this way, the algorithm tries to find **P** and **Q** such that the _known_ ratings are recovered as accurately as possible. Estimates for the unobserved ratings, i.e., the predicted ratings, can then be taken from **PQ** wherever **R** was zero. We assume that ratings are non-negative, e.g. 1-5 stars.\n",
"\n",
"One final thing deserves mention before we run the code: we have to pick _K_, the number of latent features. But if the features are latent, how do we know how many of them there are? There is no easy answer in most settings. Some amount of experimentation and domain expertise (to interpret the factors) is needed to confidently pick _K_, but this touches on an important issue that pervades statistical modeling: estimation error due to model _selection_ is often harder to assess and correct than error due to model _fitting_, but much more is written about the latter. For this example we'll use 8 factors.\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"##Performance testing\n",
"\n",
"OK, finally we are on to testing performance. In support of this testing I am going to define two variants of the baseline code, and finally a piece of code to easily call all three variants.\n",
"\n",
"The first variant we'll save to a file called `mf_cython.pyx`. We'll take advantage of a [Cython](http://www.cython.org/), which essentially lets us add some type information to our Python code above and then machine-generate analogous C code that be compiled and execute (much) more quickly. I won't post this code here but will link to it an the end.\n",
"\n",
"The second variant we'll save to a file called `mf_numba.py` and will use numba to add type information and generate \"intermediate\" code that can be [JIT compiled](http://en.wikipedia.org/wiki/Just-in-time_compilation) by LLVM. One of the nice things about numba is that you can do this through a simple [function decorator](http://en.wikipedia.org/wiki/Python_syntax_and_semantics#Decorators). For example, instead of our `matrix_factorization` method beginning as\n",
"\n",
" def matrix_factorization(R, P, Q, K):\n",
"\n",
"we import `autojit` from numba and then add this decorator. It auto-majically annotates the variables types in the code and prepares it for JIT compilation:\n",
"\n",
" from numba.decorators import autojit\n",
" \n",
" @autojit\n",
" def matrix_factorization(R, P, Q, K):\n",
"\n",
"At this point, we have our baseline Python implementation of matrix factorization for recommendation problems, along with two variants that use C and LLVM for improving execution speed. Finally, we need a bit of code to drive the test, which I have in a file called `mf_rec_test.py`. This code creates some random ratings data for a desired number of people and items. The `run` method then calls one of the three matrix factorization variants we have and measures execution time along with the error in recovering the known ratings. Again, I won't post the code here but will link to it at the end."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"With everything in place we can drive the test across the three approaches an compare speed. Again, 8 factors will be used in the solution, and we'll pick a relatively small matrix of 30 people having ratings over 60 items...\n",
"\n",
" In [1]: import mf_rec_test as mfrt\n",
" \n",
" In [2]: mfrt.run(method='python', num_factors=8, num_users=30, num_items=60)\n",
" Recommendation test time in minutes: 2.9021\n",
" Factorization RMSE: 0.087\n",
"\n",
" In [3]: mfrt.run(method='cython', num_factors=8, num_users=30, num_items=60)\n",
" Recommendation test time in minutes: 0.0028\n",
" Factorization RMSE: 0.081\n",
"\n",
"\n",
"So, the Python code takes almost 3 minutes whereas the (machine-generated) C code executes in *one thousandth of the time*. For languages like Python and R much of the mathematical and statistical methods are actually calling underlying C and Fortran code, so you can get reasonably good execution speed to go along with the development productivity you get. But, as this example shows, there are cases where highly iterative algorithms can suffer horribly. How does the numba-ized code fair?\n",
"\n",
" In [4]: mfrt.run(method='numba', num_factors=8, num_users=30, num_items=60)\n",
" Recommendation test time in minutes: 0.0652\n",
" Factorization RMSE: 0.088\n",
"\n",
"Pretty impressive: for code that we basically added a one-line decorator to, we sped things up by a factor of 44. Now, watch what happens if we re-execute the exact same call:\n",
"\n",
" In [5]: mfrt.run(method='numba', num_factors=8, num_users=30, num_items=60)\n",
" Recommendation test time in minutes: 0.0031\n",
" Factorization RMSE: 0.084\n",
"\n",
"The numba-ized code now executes in the same time as the C code, speeding up again by a factor of 21. How so? The first time the numba-ized code is run it gets translated to byte code that LLVM can JIT compile. The next time we run the code, the results from the JIT compilation can be used and we get another big speed up."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Why LLVM matters\n",
"\n",
"Writing this post certainly pushed me past my comfort zone. I have spent most of my focus over the years on applying analytics to business problems and not on lower-level computing issues like compilers and memory management. But, as in many areas of computing these days, more data, more complex questions, and expectations of real-time results means that issues of the past have come back to the fore. For example, mobile developers have to deal with limited screen space and be hyper-focused on keeping memory consumption down, just as people doing _any_ kind of computing had to in the 1980s. It basically means we have to head the advice of [Peter Norvig](http://en.wikipedia.org/wiki/Peter_Norvig) to [remember that there is a \"computer\" in \"computer science\"](http://norvig.com/21-days.html). To do even exploratory analytical work nowadays means having to develop and maintain some level of maturity around technical computing. And then of course, there is the rest of being a _data scientist_, like keeping up with the evolutions in analytic methods, software implementations of said methods, domain knowledge for usefully applying said methods, techniques for visualizing results, and best practices for conveying these methods and their results to technical and non-technical audiences. Sigh...\n",
"\n",
"As overwhelming as this collection of issues feels sometimes, it motivates my excitement about projects like LLVM and numba: They provide a single, consistent abstraction layer on which I can maintain reasonably high performance code in languages that are very flexible and efficient for me to develop in. I can target execution against a single core on a CPU, multiple cores, or even completely different hardware like [NVIDIA's GPUs](http://nvidianews.nvidia.com/Releases/GPU-Accelerated-Computing-Reaches-Next-Generation-of-Programmers-With-Python-Support-of-NVIDIA-CUDA-950.aspx) that have thousands of cores, all through LLVM and numba. If I were to continue developing this matrix factorization algorithm for a recommendation system I would now have a way of iteratively testing approaches on much larger, more realistic data sets rather than waiting nearly 3 minutes every time I tested against a toy data set. This means I would be more likely to _actually_ explore and evaluate solutions, and that is what is most exciting."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Acknowledgements and More Reading\n",
"\n",
"* The community of folks on the [numba users discussion group on Google Groups](https://groups.google.com/a/continuum.io/forum/#!forum/numba-users) were very helpful in working through this example code and assessing behavior. Further, the many people who develop projects like LLVM, llvm-py, and numba in the first place\n",
"* A [GitHub Gist](https://gist.github.com/jhemann/5584536) containing the code discussed here, as well as the complete post as an [IPython Notebook](http://ipython.org/notebook.html) (which can be viewed [here](http://nbviewer.ipython.org/5584536/MomentOfScience_blog_post.ipynb))\n",
"* Travis Oliphant gave a talk this year at [PyCon](https://us.pycon.org/2013/about/what-is-pycon/) covering the vision for numba and LLVM ([slides](https://speakerdeck.com/pyconslides/numba-a-dynamic-python-compiler-\n",
" for-science-by-travis-e-oliphant-jon-riehl-mark-florisson-and-siu-kwan-lam))\n",
"* [A great tutorial on LLVM](http://www.aosabook.org/en/llvm.html)\n",
"\n",
"[Back to FICO Labs]( http://ficolabsblog.fico.com/2013/05/a-moment-of-science-llvm-may-mean-the-best-of-both-worlds-is-possible-for-analytic-computing.html)"
]
}
],
"metadata": {}
}
]
}

setup_mf_cython.py
# -*- coding: utf-8 -*-
"""
Created on Sun Apr 01 23:49:22 2012

@author: JoshuaHemann
"""
# Run with:
#
#   vs_compiler
#   python setup_mf_cython.py build_ext --inplace
#
# to compile the Cython extension in-place (useful during development)
from distutils.core import setup

from Cython.Distutils import build_ext
from Cython.Build import cythonize

import numpy as np

setup(cmdclass={'build_ext': build_ext},
      include_dirs=[np.get_include()],
      ext_modules=cythonize('mf_cython.pyx', annotate=True))
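
As an alternative during quick experiments, pyximport can compile the .pyx on import without running the setup script; a minimal sketch (my addition, assuming Cython and the NumPy headers are installed):

    # Build-on-import alternative to setup_mf_cython.py (hypothetical
    # convenience, not part of the original gist)
    import numpy as np
    import pyximport

    # Pass the NumPy include dir so the `cimport numpy` in mf_cython.pyx compiles
    pyximport.install(setup_args={'include_dirs': np.get_include()})

    import mf_cython  # compiled transparently on first import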