Created
May 27, 2019 15:08
-
-
Save ilyarudyak/ffcc2ee6bf61f37bb953c6733dc97cb5 to your computer and use it in GitHub Desktop.
Naive Bayes example based on Coursera NLP course (and Information Extraction book).
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": 22, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"The autoreload extension is already loaded. To reload it, use:\n", | |
" %reload_ext autoreload\n" | |
] | |
} | |
], | |
"source": [ | |
"from NaiveBayes import NaiveBayes\n", | |
"\n", | |
"%load_ext autoreload\n", | |
"%autoreload 2" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## toy example" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"We use the toy data from the Coursera lectures, but we follow the IR book (available online), p. 261 (the page following the algorithm)."
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 23, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"nb = NaiveBayes()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 24, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"['Chinese', 'Beijing', 'Chinese'] pos\n", | |
"['Chinese', 'Chinese', 'Shanghai'] pos\n", | |
"['Chinese', 'Macao'] pos\n", | |
"['Tokyo', 'Japan', 'Chinese'] neg\n" | |
] | |
} | |
], | |
"source": [ | |
"toy_split = nb.getToySplit()\n", | |
"for example in toy_split.train:\n", | |
" print(example.words, example.klass)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 25, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"nb.train(toy_split)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 26, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"{'Beijing', 'Chinese', 'Japan', 'Macao', 'Shanghai', 'Tokyo'}" | |
] | |
}, | |
"execution_count": 26, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"nb.vocabulary" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 27, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"(4, 3, 1)" | |
] | |
}, | |
"execution_count": 27, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"nb.docsCount, nb.docsPos, nb.docsNeg" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 28, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"defaultdict(int,\n", | |
" {'Chinese': 5,\n", | |
" 'Beijing': 1,\n", | |
" 'Shanghai': 1,\n", | |
" 'Macao': 1,\n", | |
" 'Japan': 0,\n", | |
" 'Tokyo': 0})" | |
] | |
}, | |
"execution_count": 28, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"nb.wordsPos" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 29, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"defaultdict(int,\n", | |
" {'Tokyo': 1,\n", | |
" 'Japan': 1,\n", | |
" 'Chinese': 1,\n", | |
" 'Shanghai': 0,\n", | |
" 'Macao': 0,\n", | |
" 'Beijing': 0})" | |
] | |
}, | |
"execution_count": 29, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"nb.wordsNeg" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 30, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"(0.75, 0.25)" | |
] | |
}, | |
"execution_count": 30, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"nb.priorPos, nb.priorNeg" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 31, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"defaultdict(float,\n", | |
" {'Japan': 0.07142857142857142,\n", | |
" 'Shanghai': 0.14285714285714285,\n", | |
" 'Macao': 0.14285714285714285,\n", | |
" 'Tokyo': 0.07142857142857142,\n", | |
" 'Beijing': 0.14285714285714285,\n", | |
" 'Chinese': 0.42857142857142855})" | |
] | |
}, | |
"execution_count": 31, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"nb.condProbsPos" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 32, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"defaultdict(float,\n", | |
" {'Japan': 0.2222222222222222,\n", | |
" 'Shanghai': 0.1111111111111111,\n", | |
" 'Macao': 0.1111111111111111,\n", | |
" 'Tokyo': 0.2222222222222222,\n", | |
" 'Beijing': 0.1111111111111111,\n", | |
" 'Chinese': 0.2222222222222222})" | |
] | |
}, | |
"execution_count": 32, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"nb.condProbsNeg" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"Let's compare these probabilities with those computed in the book."
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 47, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
" P(Chinese|pos) book:0.4286 computed:0.4286\n", | |
" P(Tokyo|pos) book:0.0714 computed:0.0714\n", | |
" P(Japan|pos) book:0.0714 computed:0.0714\n", | |
"\n" | |
] | |
} | |
], | |
"source": [
"# Compare book values (p(w|pos) with add-1 smoothing: counts over 8+6=14) against computed ones.\n",
"# Note: a single concatenated f-string is used — the earlier version passed two arguments to\n",
"# print() via a stray comma, which misaligned the output with the default ' ' separator.\n",
"print(f' P(Chinese|pos) book:{3/7:.4f} computed:{nb.condProbsPos[\"Chinese\"]:.4f}\\n'\n",
"      f' P(Tokyo|pos) book:{1/14:.4f} computed:{nb.condProbsPos[\"Tokyo\"]:.4f}\\n'\n",
"      f' P(Japan|pos) book:{1/14:.4f} computed:{nb.condProbsPos[\"Japan\"]:.4f}')"
]
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 49, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
" P(Chinese|neg) book:0.2222 computed:0.2222\n", | |
" P(Tokyo|neg) book:0.2222 computed:0.2222\n", | |
" P(Japan|neg) book:0.2222 computed:0.2222\n", | |
"\n" | |
] | |
} | |
], | |
"source": [
"# Compare book values (p(w|neg) with add-1 smoothing: counts over 3+6=9) against computed ones.\n",
"# Single concatenated f-string — the earlier version had a stray comma that split the call into\n",
"# two print() arguments and misaligned the third line.\n",
"print(f' P(Chinese|neg) book:{2/9:.4f} computed:{nb.condProbsNeg[\"Chinese\"]:.4f}\\n'\n",
"      f' P(Tokyo|neg) book:{2/9:.4f} computed:{nb.condProbsNeg[\"Tokyo\"]:.4f}\\n'\n",
"      f' P(Japan|neg) book:{2/9:.4f} computed:{nb.condProbsNeg[\"Japan\"]:.4f}')"
]
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"Now let's compute the predicted class for our test example."
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 50, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"['Chinese', 'Chinese', 'Chinese', 'Tokyo', 'Japan']" | |
] | |
}, | |
"execution_count": 50, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"words = 'Chinese Chinese Chinese Tokyo Japan'.split()\n", | |
"words" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 51, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"'pos'" | |
] | |
}, | |
"execution_count": 51, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"nb.classify(words)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"Let's also compare our computed probabilities for the test example." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 52, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"pos_prob, neg_prob = nb.priorPos, nb.priorNeg\n", | |
"for word in words:\n", | |
" pos_prob *= nb.condProbsPos[word]\n", | |
" neg_prob *= nb.condProbsNeg[word]" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 56, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"P(pos|words) book:0.000301 computed:0.000301\n", | |
"P(neg|words) book:0.000135 computed:0.000135\n" | |
] | |
} | |
], | |
"source": [ | |
"print(f'P(pos|words) book:{(3/4)*((3/7)**3)*(1/14)*(1/14):.6f} computed:{pos_prob:.6f}')\n", | |
"print(f'P(neg|words) book:{(1/4)*((2/9)**3)*(2/9)*(2/9):.6f} computed:{neg_prob:.6f}')" | |
] | |
}
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.6.8" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 2 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment