lowener/bernoulli.ipynb

## bernoulli.ipynb
{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "5dea07cd",
   "metadata": {
    "jp-MarkdownHeadingCollapsed": true,
    "tags": []
   },
   "source": [
    "# Bernoulli + CountVectorizer\n",
    "\n",
    "\n",
    "In the Bernoulli variant, the feature vector is binarized. That's why using a CountVectorizer transformer with `binary=True` is useful: you're more interested in the presence of a word than in its frequency."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 241,
   "id": "33c0cb40",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 44.4 ms, sys: 12.1 ms, total: 56.5 ms\n",
      "Wall time: 56.5 ms\n",
      "CPU times: user 14.9 ms, sys: 19.6 ms, total: 34.5 ms\n",
      "Wall time: 34.2 ms\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "0.8568723201751709"
      ]
     },
     "execution_count": 241,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "vec = CountVectorizer(stop_words='english', binary=True, ngram_range=(1,3))\n",
    "\n",
    "x_train = vec.fit_transform(X_train_text)\n",
    "x_test = vec.transform(X_test_text)\n",
    "\n",
    "bnb = BernoulliNB()\n",
    "%time bnb.fit(x_train, y_train)\n",
    "%time bnb.score(x_test, y_test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 247,
   "id": "b735ce40",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 293 ms, sys: 72.1 ms, total: 365 ms\n",
      "Wall time: 365 ms\n",
      "CPU times: user 141 ms, sys: 90.9 ms, total: 232 ms\n",
      "Wall time: 231 ms\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "0.8568817764310402"
      ]
     },
     "execution_count": 247,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "vec = CountVectorizer(stop_words='english', binary=True, ngram_range=(1,3))\n",
    "x_train = vec.fit_transform(X_train_text)\n",
    "x_test = vec.transform(X_test_text)\n",
    "x_train_np, x_test_np = x_train.get(), x_test.get()\n",
    "y_train_np, y_test_np = y_train.to_numpy(), y_test.to_numpy()\n",
    "\n",
    "bnb = BernoulliNB_sk()\n",
    "%time bnb.fit(x_train_np, y_train_np)\n",
    "%time bnb.score(x_test_np, y_test_np)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.13"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
	{
	"cells": [
	{
	"cell_type": "markdown",
	"id": "5dea07cd",
	"metadata": {
	"jp-MarkdownHeadingCollapsed": true,
	"tags": []
	},
	"source": [
	"# Bernoulli + CountVectorizer\n",
	"\n",
	"\n",
	"In the Bernoulli variant, the feature vector is binarized. That's why using a CountVectorizer transformer with `binary=True` is useful: you're more interested in the presence of a word than in its frequency."
	]
	},
	{
	"cell_type": "code",
	"execution_count": 241,
	"id": "33c0cb40",
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"CPU times: user 44.4 ms, sys: 12.1 ms, total: 56.5 ms\n",
	"Wall time: 56.5 ms\n",
	"CPU times: user 14.9 ms, sys: 19.6 ms, total: 34.5 ms\n",
	"Wall time: 34.2 ms\n"
	]
	},
	{
	"data": {
	"text/plain": [
	"0.8568723201751709"
	]
	},
	"execution_count": 241,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"vec = CountVectorizer(stop_words='english', binary=True, ngram_range=(1,3))\n",
	"\n",
	"x_train = vec.fit_transform(X_train_text)\n",
	"x_test = vec.transform(X_test_text)\n",
	"\n",
	"bnb = BernoulliNB()\n",
	"%time bnb.fit(x_train, y_train)\n",
	"%time bnb.score(x_test, y_test)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 247,
	"id": "b735ce40",
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"CPU times: user 293 ms, sys: 72.1 ms, total: 365 ms\n",
	"Wall time: 365 ms\n",
	"CPU times: user 141 ms, sys: 90.9 ms, total: 232 ms\n",
	"Wall time: 231 ms\n"
	]
	},
	{
	"data": {
	"text/plain": [
	"0.8568817764310402"
	]
	},
	"execution_count": 247,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"vec = CountVectorizer(stop_words='english', binary=True, ngram_range=(1,3))\n",
	"x_train = vec.fit_transform(X_train_text)\n",
	"x_test = vec.transform(X_test_text)\n",
	"x_train_np, x_test_np = x_train.get(), x_test.get()\n",
	"y_train_np, y_test_np = y_train.to_numpy(), y_test.to_numpy()\n",
	"\n",
	"bnb = BernoulliNB_sk()\n",
	"%time bnb.fit(x_train_np, y_train_np)\n",
	"%time bnb.score(x_test_np, y_test_np)"
	]
	}
	],
	"metadata": {
	"kernelspec": {
	"display_name": "Python 3 (ipykernel)",
	"language": "python",
	"name": "python3"
	},
	"language_info": {
	"codemirror_mode": {
	"name": "ipython",
	"version": 3
	},
	"file_extension": ".py",
	"mimetype": "text/x-python",
	"name": "python",
	"nbconvert_exporter": "python",
	"pygments_lexer": "ipython3",
	"version": "3.8.13"
	}
	},
	"nbformat": 4,
	"nbformat_minor": 5
	}