Created
September 16, 2014 02:22
-
-
Save davclark/cf1a168f21387502a126 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"metadata": { | |
"name": "", | |
"signature": "sha256:0f5ae6b41bc13ffbee8db316c23b12f9cb6b20c36f5d7fa3e0ebdc00c6a592b3" | |
}, | |
"nbformat": 3, | |
"nbformat_minor": 0, | |
"worksheets": [ | |
{ | |
"cells": [ | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"Here is some mathematical notation with the formula for computing a naive bayes probability, and how this would be converted with the use of logs:\n", | |
"\n", | |
"$$P(Y =y) \\prod_{j=1}^d P(X_j =x_j|Y =y)$$\n", | |
"\n", | |
"Taking the log, we get:\n", | |
"\n", | |
"$$ \\log P(Y =y) \\prod_{j=1}^d P(X_j =x_j|Y =y)$$\n", | |
"$$= \\log P(Y =y) + \\sum_{j=1}^d \\log P(X_j =x_j|Y =y)$$\n", | |
"\n", | |
"But some people find that hard to read, or figure out how to convert it to code." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"# We'll be using numpy later, so we'll just use log from there\n", | |
"# (You can also import it from `math`)\n", | |
"import numpy as np" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 1 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"# From Dan's last slide in 3.8: Spam Classification III\n", | |
"\n", | |
"# We can represent data in many ways, here's a way that keeps our \n", | |
"# labels very clear\n", | |
"\n", | |
"# Prior on Y\n", | |
"PY = {'spam': 0.4, \n", | |
" 'ham': 0.6}\n", | |
"\n", | |
"# Conditional probabilities of X given Y\n", | |
"PX = {'spam': {'dear': 0.0013, \n", | |
" 'sir': 0.0023, \n", | |
" ',': 0.0220,\n", | |
" 'first': 0.0018,\n", | |
" 'I': 0.0062,\n", | |
" 'must': 0.0034,\n", | |
" 'solicit': 0.0007},\n", | |
" 'ham': {'dear': 0.0009, \n", | |
" 'sir': 0.0004, \n", | |
" ',': 0.0241,\n", | |
" 'first': 0.0023,\n", | |
" 'I': 0.0119,\n", | |
" 'must': 0.0028,\n", | |
" 'solicit': 0.0002} }\n", | |
" \n", | |
"def naive_bayes(xs, y):\n", | |
" '''compute log likelihood for spam classifier\n", | |
" \n", | |
" xs : list of str\n", | |
" e.g., ['dear', 'sir', ',', 'first', 'I', 'must', 'solicit']\n", | |
" y : str\n", | |
" 'ham' or 'spam'?\n", | |
"\n", | |
" return : float\n", | |
" log likelihood\n", | |
" '''\n", | |
" # Note - if you want to compute a product, there's no equivalent\n", | |
" # standard function to `sum`. You could use `np.prod`, though.\n", | |
" return np.log(PY[y]) + sum(np.log(PX[y][x]) for x in xs)" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 2 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"# I don't feel like typing these again, so I cheat:\n", | |
"tokens = PX['spam'].keys()\n", | |
"\n", | |
"# How fast is this?\n", | |
"%timeit naive_bayes(tokens, 'ham')" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": [ | |
"10000 loops, best of 3: 18.6 \u00b5s per loop\n" | |
] | |
} | |
], | |
"prompt_number": 3 | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## More efficient data structures?" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"import pandas as pd\n", | |
"\n", | |
"PXdf = pd.DataFrame(PX)\n", | |
"# Here we compute the same sum as above, but using vectorized operations\n", | |
"# ...except it turns out to be MUCH slower than the dictionary approach\n", | |
"# above\n", | |
"%timeit np.log(PY['ham']) + np.log(PXdf.loc[tokens, 'ham']).sum()" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": [ | |
"1000 loops, best of 3: 475 \u00b5s per loop\n" | |
] | |
} | |
], | |
"prompt_number": 4 | |
} | |
], | |
"metadata": {} | |
} | |
] | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment