@ilyarudyak
Created May 27, 2019 15:08
Naive Bayes example based on the Coursera NLP course (and the Information Retrieval book).
{
"cells": [
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"The autoreload extension is already loaded. To reload it, use:\n",
" %reload_ext autoreload\n"
]
}
],
"source": [
"from NaiveBayes import NaiveBayes\n",
"\n",
"%load_ext autoreload\n",
"%autoreload 2"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## toy example"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"We use toy data from coursera lectures. But we follow it using IR book (for online reading), p. 261 (next page to algorithm)."
]
},
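{
"cell_type": "markdown",
"metadata": {},
"source": [
"The `NaiveBayes` class itself lives outside this gist. For reference, here is a minimal sketch of the multinomial model with add-one (Laplace) smoothing that its `train` step presumably implements; the names mirror the attributes inspected below, but the real class may differ."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Hypothetical sketch -- not the imported class, just the computation it presumably performs.\n",
"from collections import defaultdict\n",
"\n",
"def train_multinomial_nb(train):\n",
"    \"\"\"train: list of examples with .words (token list) and .klass ('pos'/'neg').\"\"\"\n",
"    vocabulary = {w for ex in train for w in ex.words}\n",
"    docs_pos = sum(1 for ex in train if ex.klass == 'pos')\n",
"    prior_pos = docs_pos / len(train)\n",
"    counts = {'pos': defaultdict(int), 'neg': defaultdict(int)}\n",
"    for ex in train:\n",
"        for w in ex.words:\n",
"            counts[ex.klass][w] += 1\n",
"    cond_probs = {}\n",
"    for klass in ('pos', 'neg'):\n",
"        total = sum(counts[klass].values())\n",
"        # add-one smoothing: every vocabulary word gets a nonzero probability\n",
"        cond_probs[klass] = {w: (counts[klass][w] + 1) / (total + len(vocabulary))\n",
"                             for w in vocabulary}\n",
"    return prior_pos, cond_probs"
]
},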
{
"cell_type": "code",
"execution_count": 23,
"metadata": {},
"outputs": [],
"source": [
"nb = NaiveBayes()"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"['Chinese', 'Beijing', 'Chinese'] pos\n",
"['Chinese', 'Chinese', 'Shanghai'] pos\n",
"['Chinese', 'Macao'] pos\n",
"['Tokyo', 'Japan', 'Chinese'] neg\n"
]
}
],
"source": [
"toy_split = nb.getToySplit()\n",
"for example in toy_split.train:\n",
" print(example.words, example.klass)"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {},
"outputs": [],
"source": [
"nb.train(toy_split)"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'Beijing', 'Chinese', 'Japan', 'Macao', 'Shanghai', 'Tokyo'}"
]
},
"execution_count": 26,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"nb.vocabulary"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(4, 3, 1)"
]
},
"execution_count": 27,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"nb.docsCount, nb.docsPos, nb.docsNeg"
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"defaultdict(int,\n",
" {'Chinese': 5,\n",
" 'Beijing': 1,\n",
" 'Shanghai': 1,\n",
" 'Macao': 1,\n",
" 'Japan': 0,\n",
" 'Tokyo': 0})"
]
},
"execution_count": 28,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"nb.wordsPos"
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"defaultdict(int,\n",
" {'Tokyo': 1,\n",
" 'Japan': 1,\n",
" 'Chinese': 1,\n",
" 'Shanghai': 0,\n",
" 'Macao': 0,\n",
" 'Beijing': 0})"
]
},
"execution_count": 29,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"nb.wordsNeg"
]
},
{
"cell_type": "code",
"execution_count": 30,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(0.75, 0.25)"
]
},
"execution_count": 30,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"nb.priorPos, nb.priorNeg"
]
},
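{
"cell_type": "markdown",
"metadata": {},
"source": [
"The priors are just document fractions: $P(pos) = 3/4$, $P(neg) = 1/4$. The conditional probabilities below come from add-one smoothing,\n",
"\n",
"$$\\hat{P}(w \\mid c) = \\frac{\\mathrm{count}(w, c) + 1}{\\sum_{w'} \\mathrm{count}(w', c) + |V|},$$\n",
"\n",
"so with 8 positive tokens, 3 negative tokens and $|V| = 6$ we get, for example, $\\hat{P}(\\mathrm{Chinese} \\mid pos) = (5+1)/(8+6) = 3/7 \\approx 0.4286$ and $\\hat{P}(\\mathrm{Chinese} \\mid neg) = (1+1)/(3+6) = 2/9 \\approx 0.2222$."
]
},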
{
"cell_type": "code",
"execution_count": 31,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"defaultdict(float,\n",
" {'Japan': 0.07142857142857142,\n",
" 'Shanghai': 0.14285714285714285,\n",
" 'Macao': 0.14285714285714285,\n",
" 'Tokyo': 0.07142857142857142,\n",
" 'Beijing': 0.14285714285714285,\n",
" 'Chinese': 0.42857142857142855})"
]
},
"execution_count": 31,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"nb.condProbsPos"
]
},
{
"cell_type": "code",
"execution_count": 32,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"defaultdict(float,\n",
" {'Japan': 0.2222222222222222,\n",
" 'Shanghai': 0.1111111111111111,\n",
" 'Macao': 0.1111111111111111,\n",
" 'Tokyo': 0.2222222222222222,\n",
" 'Beijing': 0.1111111111111111,\n",
" 'Chinese': 0.2222222222222222})"
]
},
"execution_count": 32,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"nb.condProbsNeg"
]
},
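{
"cell_type": "markdown",
"metadata": {},
"source": [
"As a quick sanity check, each smoothed distribution should sum to 1 over the vocabulary:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# both sums should be 1.0: (6+2+2+2+1+1)/14 for pos and (2+2+2+1+1+1)/9 for neg\n",
"sum(nb.condProbsPos.values()), sum(nb.condProbsNeg.values())"
]
},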
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Let's compare these probabilities with computed in the book."
]
},
{
"cell_type": "code",
"execution_count": 47,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" P(Chinese|pos) book:0.4286 computed:0.4286\n",
" P(Tokyo|pos) book:0.0714 computed:0.0714\n",
" P(Japan|pos) book:0.0714 computed:0.0714\n",
"\n"
]
}
],
"source": [
"print(f' P(Chinese|pos) book:{3/7:.4f} computed:{nb.condProbsPos[\"Chinese\"]:.4f}\\n'\n",
" f' P(Tokyo|pos) book:{1/14:.4f} computed:{nb.condProbsPos[\"Tokyo\"]:.4f}\\n',\n",
" f'P(Japan|pos) book:{1/14:.4f} computed:{nb.condProbsPos[\"Japan\"]:.4f}\\n')"
]
},
{
"cell_type": "code",
"execution_count": 49,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" P(Chinese|neg) book:0.2222 computed:0.2222\n",
" P(Tokyo|neg) book:0.2222 computed:0.2222\n",
" P(Japan|neg) book:0.2222 computed:0.2222\n",
"\n"
]
}
],
"source": [
"print(f' P(Chinese|neg) book:{2/9:.4f} computed:{nb.condProbsNeg[\"Chinese\"]:.4f}\\n'\n",
" f' P(Tokyo|neg) book:{2/9:.4f} computed:{nb.condProbsNeg[\"Tokyo\"]:.4f}\\n',\n",
" f'P(Japan|neg) book:{2/9:.4f} computed:{nb.condProbsNeg[\"Japan\"]:.4f}\\n')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Now let's compute predicted class for our testing example."
]
},
{
"cell_type": "code",
"execution_count": 50,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['Chinese', 'Chinese', 'Chinese', 'Tokyo', 'Japan']"
]
},
"execution_count": 50,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"words = 'Chinese Chinese Chinese Tokyo Japan'.split()\n",
"words"
]
},
{
"cell_type": "code",
"execution_count": 51,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'pos'"
]
},
"execution_count": 51,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"nb.classify(words)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Let's also compare our computed probabilities for the test example."
]
},
{
"cell_type": "code",
"execution_count": 52,
"metadata": {},
"outputs": [],
"source": [
"pos_prob, neg_prob = nb.priorPos, nb.priorNeg\n",
"for word in words:\n",
" pos_prob *= nb.condProbsPos[word]\n",
" neg_prob *= nb.condProbsNeg[word]"
]
},
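{
"cell_type": "markdown",
"metadata": {},
"source": [
"These raw products are fine for a five-word document, but they underflow quickly for longer ones, so a real classifier would normally work in log space. A minimal sketch of the same computation using the `nb` attributes above:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import math\n",
"\n",
"# same decision as above, but with sums of log probabilities instead of products\n",
"log_pos = math.log(nb.priorPos) + sum(math.log(nb.condProbsPos[w]) for w in words)\n",
"log_neg = math.log(nb.priorNeg) + sum(math.log(nb.condProbsNeg[w]) for w in words)\n",
"'pos' if log_pos > log_neg else 'neg'"
]
},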
{
"cell_type": "code",
"execution_count": 56,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"P(pos|words) book:0.000301 computed:0.000301\n",
"P(neg|words) book:0.000135 computed:0.000135\n"
]
}
],
"source": [
"print(f'P(pos|words) book:{(3/4)*((3/7)**3)*(1/14)*(1/14):.6f} computed:{pos_prob:.6f}')\n",
"print(f'P(neg|words) book:{(1/4)*((2/9)**3)*(2/9)*(2/9):.6f} computed:{neg_prob:.6f}')"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.8"
}
},
"nbformat": 4,
"nbformat_minor": 2
}