Skip to content

Instantly share code, notes, and snippets.

@lowener
Last active July 19, 2022 12:22
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save lowener/ab24fac96245f112f3742c749ce1fdaf to your computer and use it in GitHub Desktop.
Save lowener/ab24fac96245f112f3742c749ce1fdaf to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "markdown",
"id": "92903d9d",
"metadata": {
"tags": []
},
"source": [
"# Gaussian NB\n",
"\n",
"Transform the text through a TF-IDF vectorizer and iterate through the dataset to do multiple partial fits of Gaussian naive Bayes."
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "1079a683",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 12.3 s, sys: 2.23 s, total: 14.5 s\n",
"Wall time: 22 s\n",
"0.8769999742507935\n",
"0.8840000033378601\n",
"0.878083348274231\n",
"0.8805833458900452\n",
"0.8756666779518127\n",
"0.8796666860580444\n",
"0.8786666393280029\n",
"0.8777499794960022\n",
"0.8823529481887817\n",
"CPU times: user 4.36 s, sys: 2.74 s, total: 7.1 s\n",
"Wall time: 22.8 s\n"
]
}
],
"source": [
"vec = TfidfVectorizer(stop_words='english', ngram_range=(1,3), min_df=5)\n",
"x_train = vec.fit_transform(X_train_text)\n",
"x_test = vec.transform(X_test_text)\n",
"\n",
"def dataset_traversal(X, Y, partial_function):\n",
" chunk_size = 12000\n",
" classes = cp.unique(Y)\n",
" lower = 0\n",
" for upper in iter(range(chunk_size, X.shape[0], chunk_size)):\n",
" partial_function(X[lower:upper], Y[lower:upper], classes)\n",
" lower = upper\n",
" partial_function(X[upper:], Y[upper:], classes)\n",
"\n",
"mnb = GaussianNB()\n",
"%time dataset_traversal(x_train,\\\n",
" y_train,\\\n",
" lambda x,y, c: mnb.partial_fit(x, y, c))\n",
"\n",
"%time dataset_traversal(x_test,\\\n",
" y_test,\\\n",
" lambda x, y, c: print(mnb.score(x, y)))\n"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "a0c9ccc9",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 2min 47s, sys: 1min 29s, total: 4min 17s\n",
"Wall time: 4min 17s\n",
"0.885\n",
"0.8736\n",
"0.8802\n",
"0.8828\n",
"0.8836\n",
"0.8738\n",
"0.8806\n",
"0.881\n",
"0.8832\n",
"0.8784\n",
"0.8714\n",
"0.879\n",
"0.8754\n",
"0.8782\n",
"0.8816\n",
"0.8844\n",
"0.875\n",
"0.8764\n",
"0.877\n",
"0.8864\n",
"0.8796\n",
"0.8842975206611571\n",
"CPU times: user 3min 8s, sys: 2min 7s, total: 5min 16s\n",
"Wall time: 5min 16s\n"
]
}
],
"source": [
"vec = TfidfVectorizer(stop_words='english', ngram_range=(1,3), min_df=5)\n",
"x_train = vec.fit_transform(X_train_text)\n",
"x_test = vec.transform(X_test_text)\n",
"x_train_np, x_test_np = x_train.get(), x_test.get()\n",
"y_train_np, y_test_np = y_train.to_numpy(), y_test.to_numpy()\n",
"\n",
"def dataset_traversal(X, Y, partial_function):\n",
" chunk_size = 5000\n",
" classes = np.unique(Y)\n",
" lower = 0\n",
" for upper in iter(range(chunk_size, X.shape[0], chunk_size)):\n",
" partial_function(X[lower:upper], Y[lower:upper], classes)\n",
" lower = upper\n",
" partial_function(X[upper:], Y[upper:], classes)\n",
"\n",
"mnb = GaussianNB_sk()\n",
"%time dataset_traversal(x_train_np,\\\n",
" y_train_np,\\\n",
" lambda x, y, c: mnb.partial_fit(x.toarray(), y, c))\n",
"\n",
"%time dataset_traversal(x_test_np,\\\n",
" y_test_np,\\\n",
" lambda x, y, c: print(mnb.score(x.toarray(), y)))\n"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.13"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment