Created
May 27, 2019 15:08
-
-
Save ilyarudyak/ffcc2ee6bf61f37bb953c6733dc97cb5 to your computer and use it in GitHub Desktop.
Naive Bayes example based on Coursera NLP course (and Information Extraction book).
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": 22, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"The autoreload extension is already loaded. To reload it, use:\n", | |
" %reload_ext autoreload\n" | |
] | |
} | |
], | |
"source": [ | |
"from NaiveBayes import NaiveBayes\n", | |
"\n", | |
"%load_ext autoreload\n", | |
"%autoreload 2" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## toy example" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"We use the toy data from the Coursera lectures, but we follow the IR book (available online), p. 261 (the page following the algorithm)."
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 23, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"nb = NaiveBayes()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 24, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"['Chinese', 'Beijing', 'Chinese'] pos\n", | |
"['Chinese', 'Chinese', 'Shanghai'] pos\n", | |
"['Chinese', 'Macao'] pos\n", | |
"['Tokyo', 'Japan', 'Chinese'] neg\n" | |
] | |
} | |
], | |
"source": [ | |
"toy_split = nb.getToySplit()\n", | |
"for example in toy_split.train:\n", | |
" print(example.words, example.klass)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 25, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"nb.train(toy_split)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 26, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"{'Beijing', 'Chinese', 'Japan', 'Macao', 'Shanghai', 'Tokyo'}" | |
] | |
}, | |
"execution_count": 26, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"nb.vocabulary" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 27, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"(4, 3, 1)" | |
] | |
}, | |
"execution_count": 27, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"nb.docsCount, nb.docsPos, nb.docsNeg" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 28, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"defaultdict(int,\n", | |
" {'Chinese': 5,\n", | |
" 'Beijing': 1,\n", | |
" 'Shanghai': 1,\n", | |
" 'Macao': 1,\n", | |
" 'Japan': 0,\n", | |
" 'Tokyo': 0})" | |
] | |
}, | |
"execution_count": 28, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"nb.wordsPos" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 29, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"defaultdict(int,\n", | |
" {'Tokyo': 1,\n", | |
" 'Japan': 1,\n", | |
" 'Chinese': 1,\n", | |
" 'Shanghai': 0,\n", | |
" 'Macao': 0,\n", | |
" 'Beijing': 0})" | |
] | |
}, | |
"execution_count": 29, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"nb.wordsNeg" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 30, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"(0.75, 0.25)" | |
] | |
}, | |
"execution_count": 30, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"nb.priorPos, nb.priorNeg" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 31, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"defaultdict(float,\n", | |
" {'Japan': 0.07142857142857142,\n", | |
" 'Shanghai': 0.14285714285714285,\n", | |
" 'Macao': 0.14285714285714285,\n", | |
" 'Tokyo': 0.07142857142857142,\n", | |
" 'Beijing': 0.14285714285714285,\n", | |
" 'Chinese': 0.42857142857142855})" | |
] | |
}, | |
"execution_count": 31, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"nb.condProbsPos" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 32, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"defaultdict(float,\n", | |
" {'Japan': 0.2222222222222222,\n", | |
" 'Shanghai': 0.1111111111111111,\n", | |
" 'Macao': 0.1111111111111111,\n", | |
" 'Tokyo': 0.2222222222222222,\n", | |
" 'Beijing': 0.1111111111111111,\n", | |
" 'Chinese': 0.2222222222222222})" | |
] | |
}, | |
"execution_count": 32, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"nb.condProbsNeg" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"Let's compare these probabilities with those computed in the book."
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 47, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
" P(Chinese|pos) book:0.4286 computed:0.4286\n", | |
" P(Tokyo|pos) book:0.0714 computed:0.0714\n", | |
" P(Japan|pos) book:0.0714 computed:0.0714\n", | |
"\n" | |
] | |
} | |
], | |
"source": [
"# Compare book values (p(w|pos) with add-1 smoothing: counts over 8+6=14) against computed ones.\n",
"# Note: a single concatenated f-string is used — the earlier version passed two arguments to\n",
"# print() via a stray comma, which misaligned the output with the default ' ' separator.\n",
"print(f' P(Chinese|pos) book:{3/7:.4f} computed:{nb.condProbsPos[\"Chinese\"]:.4f}\\n'\n",
"      f' P(Tokyo|pos) book:{1/14:.4f} computed:{nb.condProbsPos[\"Tokyo\"]:.4f}\\n'\n",
"      f' P(Japan|pos) book:{1/14:.4f} computed:{nb.condProbsPos[\"Japan\"]:.4f}')"
]
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 49, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
" P(Chinese|neg) book:0.2222 computed:0.2222\n", | |
" P(Tokyo|neg) book:0.2222 computed:0.2222\n", | |
" P(Japan|neg) book:0.2222 computed:0.2222\n", | |
"\n" | |
] | |
} | |
], | |
"source": [
"# Compare book values (p(w|neg) with add-1 smoothing: counts over 3+6=9) against computed ones.\n",
"# Single concatenated f-string — the earlier version had a stray comma that split the call into\n",
"# two print() arguments and misaligned the third line.\n",
"print(f' P(Chinese|neg) book:{2/9:.4f} computed:{nb.condProbsNeg[\"Chinese\"]:.4f}\\n'\n",
"      f' P(Tokyo|neg) book:{2/9:.4f} computed:{nb.condProbsNeg[\"Tokyo\"]:.4f}\\n'\n",
"      f' P(Japan|neg) book:{2/9:.4f} computed:{nb.condProbsNeg[\"Japan\"]:.4f}')"
]
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"Now let's compute the predicted class for our test example."
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 50, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"['Chinese', 'Chinese', 'Chinese', 'Tokyo', 'Japan']" | |
] | |
}, | |
"execution_count": 50, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"words = 'Chinese Chinese Chinese Tokyo Japan'.split()\n", | |
"words" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 51, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"'pos'" | |
] | |
}, | |
"execution_count": 51, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"nb.classify(words)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"Let's also compare our computed probabilities for the test example." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 52, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"pos_prob, neg_prob = nb.priorPos, nb.priorNeg\n", | |
"for word in words:\n", | |
" pos_prob *= nb.condProbsPos[word]\n", | |
" neg_prob *= nb.condProbsNeg[word]" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 56, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"P(pos|words) book:0.000301 computed:0.000301\n", | |
"P(neg|words) book:0.000135 computed:0.000135\n" | |
] | |
} | |
], | |
"source": [ | |
"print(f'P(pos|words) book:{(3/4)*((3/7)**3)*(1/14)*(1/14):.6f} computed:{pos_prob:.6f}')\n", | |
"print(f'P(neg|words) book:{(1/4)*((2/9)**3)*(2/9)*(2/9):.6f} computed:{neg_prob:.6f}')" | |
] | |
}
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.6.8" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 2 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment