Last active
April 4, 2023 01:38
-
-
Save MaxHalford/6e6b2e9fd2740bf0a0408ff1c226ae95 to your computer and use it in GitHub Desktop.
Online logistic regression: River/scikit-learn/Vowpal Wabbit/PyTorch
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"attachments": {}, | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"# Showdown: online logistic regression with River, scikit-learn, Vowpal Wabbit and PyTorch" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"%load_ext watermark" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 2, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Python implementation: CPython\n", | |
"Python version : 3.10.8\n", | |
"IPython version : 8.12.0\n", | |
"\n", | |
"river : 0.15.0\n", | |
"scikit-learn: 1.2.2\n", | |
"torch : 2.0.0\n", | |
"vowpalwabbit: 9.8.0\n", | |
"\n", | |
"Compiler : Clang 14.0.0 (clang-1400.0.29.102)\n", | |
"OS : Darwin\n", | |
"Release : 22.2.0\n", | |
"Machine : arm64\n", | |
"Processor : arm\n", | |
"CPU cores : 8\n", | |
"Architecture: 64bit\n", | |
"\n" | |
] | |
} | |
], | |
"source": [ | |
"%watermark -p river,scikit-learn,torch,vowpalwabbit -mv" | |
] | |
}, | |
{ | |
"attachments": {}, | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## Models" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 19, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Registry of model factories: display name -> zero-argument callable that builds a fresh model.\n", | |
"models = {}" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 20, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"import river.linear_model\n", | |
"import river.optim\n", | |
"\n", | |
"def _build_river_model():\n", | |
"    \"\"\"Create River's logistic regression: SGD with lr=0.01, intercept learning rate 0, no L2 penalty.\"\"\"\n", | |
"    sgd = river.optim.SGD(lr=0.01)\n", | |
"    return river.linear_model.LogisticRegression(optimizer=sgd, intercept_lr=0, l2=0.0)\n", | |
"\n", | |
"models['River'] = _build_river_model" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 22, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"from vowpalwabbit import pyvw\n", | |
"import river.base\n", | |
"\n", | |
"class VW2RiverClassifier(river.base.Classifier):\n", | |
"    \"\"\"Adapter exposing a Vowpal Wabbit workspace through River's classifier methods.\"\"\"\n", | |
"\n", | |
"    def __init__(self, *args, **kwargs):\n", | |
"        # Every argument is handed straight to the Vowpal Wabbit workspace.\n", | |
"        self.vw = pyvw.Workspace(*args, **kwargs)\n", | |
"\n", | |
"    def _format_x(self, x):\n", | |
"        \"\"\"Render a feature dict as space-separated `name:value` pairs.\"\"\"\n", | |
"        pairs = [f\"{k}:{v}\" for k, v in x.items()]\n", | |
"        return \" \".join(pairs)\n", | |
"\n", | |
"    def learn_one(self, x, y):\n", | |
"        \"\"\"Train the workspace on a single labelled example and return the adapter.\"\"\"\n", | |
"        # Map the {False, True} label onto {-1, 1}.\n", | |
"        label = 2 * int(y) - 1\n", | |
"        features = self._format_x(x)\n", | |
"        self.vw.learn(f\"{label} | {features}\")\n", | |
"        return self\n", | |
"\n", | |
"    def predict_proba_one(self, x):\n", | |
"        \"\"\"Return {True: p, False: 1 - p}, where p is the workspace's prediction for x.\"\"\"\n", | |
"        p_true = self.vw.predict(\"| \" + self._format_x(x))\n", | |
"        return {True: p_true, False: 1.0 - p_true}\n", | |
"\n", | |
"def _build_vw_model():\n", | |
"    \"\"\"Create the Vowpal Wabbit adapter with the options used for this comparison.\"\"\"\n", | |
"    vw_options = dict(\n", | |
"        sgd=True,\n", | |
"        learning_rate=0.01,\n", | |
"        loss_function=\"logistic\",\n", | |
"        link=\"logistic\",\n", | |
"        adaptive=False,\n", | |
"        normalized=False,\n", | |
"        invariant=False,\n", | |
"        noconstant=True,\n", | |
"        l2=0.0,\n", | |
"        l1=0.0,\n", | |
"        power_t=0,\n", | |
"        quiet=True,\n", | |
"    )\n", | |
"    return VW2RiverClassifier(**vw_options)\n", | |
"\n", | |
"models['Vowpal Wabbit'] = _build_vw_model" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 23, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"import sklearn.base\n", | |
"import sklearn.exceptions\n", | |
"import sklearn.linear_model\n", | |
"\n", | |
"class SK2RiverClassifier(river.base.Classifier):\n", | |
"    \"\"\"Adapter exposing an incremental scikit-learn classifier through River's classifier methods.\n", | |
"\n", | |
"    Parameters\n", | |
"    ----------\n", | |
"    estimator\n", | |
"        scikit-learn classifier; the adapter calls its `partial_fit` and `predict_proba`.\n", | |
"    classes\n", | |
"        Labels forwarded to every `partial_fit` call.\n", | |
"    \"\"\"\n", | |
"\n", | |
"    def __init__(self, estimator: sklearn.base.ClassifierMixin, classes: list):\n", | |
"        self.estimator = estimator\n", | |
"        self.classes = classes\n", | |
"\n", | |
"    def learn_one(self, x, y):\n", | |
"        \"\"\"Update the estimator with a single example and return the adapter.\"\"\"\n", | |
"        self.estimator.partial_fit(X=[list(x.values())], y=[y], classes=self.classes)\n", | |
"        return self\n", | |
"\n", | |
"    def predict_proba_one(self, x):\n", | |
"        \"\"\"Return a {label: probability} dict for a single example.\n", | |
"\n", | |
"        If the estimator raises NotFittedError, a uniform distribution over `classes` is returned.\n", | |
"        \"\"\"\n", | |
"        try:\n", | |
"            y_pred = self.estimator.predict_proba([list(x.values())])[0]\n", | |
"        except sklearn.exceptions.NotFittedError:\n", | |
"            return {c: 1 / len(self.classes) for c in self.classes}\n", | |
"        # predict_proba columns follow `estimator.classes_` (sorted labels), which need not match\n", | |
"        # the order in which the caller listed `classes`; label the columns from the estimator.\n", | |
"        labels = self.estimator.classes_.tolist()\n", | |
"        return dict(zip(labels, y_pred))\n", | |
"\n", | |
"def _build_sklearn_model():\n", | |
"    \"\"\"Create the scikit-learn adapter around an SGDClassifier with log loss and a constant rate.\"\"\"\n", | |
"    sgd_classifier = sklearn.linear_model.SGDClassifier(\n", | |
"        loss='log_loss',\n", | |
"        penalty=None,\n", | |
"        fit_intercept=False,\n", | |
"        learning_rate='constant',\n", | |
"        eta0=0.01,\n", | |
"        max_iter=1,\n", | |
"    )\n", | |
"    return SK2RiverClassifier(estimator=sgd_classifier, classes=[False, True])\n", | |
"\n", | |
"models['scikit-learn'] = _build_sklearn_model" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 24, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"import torch\n", | |
"\n", | |
"class TorchLogisticRegression(torch.nn.Module):\n", | |
"    \"\"\"Bias-free linear layer with zero-initialised weights that outputs one raw logit per class.\n", | |
"\n", | |
"    Parameters\n", | |
"    ----------\n", | |
"    n_features\n", | |
"        Number of input features.\n", | |
"    n_classes\n", | |
"        Number of output logits.\n", | |
"    \"\"\"\n", | |
"\n", | |
"    def __init__(self, n_features: int, n_classes: int = 2):\n", | |
"        super().__init__()\n", | |
"        self.linear = torch.nn.Linear(n_features, n_classes, bias=False)\n", | |
"        # Start from all-zero weights.\n", | |
"        self.linear.weight.data.zero_()\n", | |
"\n", | |
"    def forward(self, x):\n", | |
"        # Return unnormalised logits: CrossEntropyLoss applies log-softmax itself, so squashing\n", | |
"        # the scores with a sigmoid first would distort both the loss and its gradients.\n", | |
"        return self.linear(x)\n", | |
"\n", | |
"class Torch2RiverClassifier(river.base.Classifier):\n", | |
"    \"\"\"Adapter exposing a PyTorch module through River's classifier methods.\n", | |
"\n", | |
"    Parameters\n", | |
"    ----------\n", | |
"    model\n", | |
"        Module that maps a one-row float32 batch to one logit per entry of `classes`.\n", | |
"    classes\n", | |
"        Labels, listed in the same order as the model's output columns.\n", | |
"    \"\"\"\n", | |
"\n", | |
"    def __init__(self, model: torch.nn.Module, classes: list):\n", | |
"        self.model = model\n", | |
"        self.classes = classes\n", | |
"        self.criterion = torch.nn.CrossEntropyLoss()\n", | |
"        self.optimizer = torch.optim.SGD(self.model.parameters(), lr=0.01)\n", | |
"\n", | |
"    def learn_one(self, x: dict, y):\n", | |
"        \"\"\"Take one SGD step on a single example and return the adapter.\"\"\"\n", | |
"        features = torch.tensor([list(x.values())], dtype=torch.float32)\n", | |
"        target = torch.tensor([y], dtype=torch.long)\n", | |
"        # Forward pass on raw logits.\n", | |
"        loss = self.criterion(self.model(features), target)\n", | |
"        # Backward pass and parameter update.\n", | |
"        self.model.zero_grad(set_to_none=True)\n", | |
"        loss.backward()\n", | |
"        self.optimizer.step()\n", | |
"        return self\n", | |
"\n", | |
"    def predict_proba_one(self, x: dict):\n", | |
"        \"\"\"Return a {label: probability} dict; the probabilities are a softmax over the logits.\"\"\"\n", | |
"        features = torch.tensor([list(x.values())], dtype=torch.float32)\n", | |
"        # Inference only: no autograd graph is needed.\n", | |
"        with torch.no_grad():\n", | |
"            probabilities = torch.softmax(self.model(features), dim=1)[0]\n", | |
"        return dict(zip(self.classes, probabilities.tolist()))\n", | |
"\n", | |
"def _build_torch_model(compiled: bool = False):\n", | |
"    \"\"\"Create the PyTorch adapter, optionally wrapping the network with torch.compile.\n", | |
"\n", | |
"    The network is built for 10 input features, so it only fits examples with 10 values.\n", | |
"    \"\"\"\n", | |
"    net = TorchLogisticRegression(n_features=10)\n", | |
"    if compiled:\n", | |
"        net = torch.compile(net)\n", | |
"    return Torch2RiverClassifier(model=net, classes=[False, True])\n", | |
"\n", | |
"models['PyTorch'] = lambda: _build_torch_model()\n", | |
"# The benchmark section looks up this key, so it has to be registered here.\n", | |
"models['PyTorch (compiled)'] = lambda: _build_torch_model(compiled=True)" | |
] | |
}, | |
{ | |
"attachments": {}, | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## Metrics check" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 27, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"ename": "AttributeError", | |
"evalue": "'dict' object has no attribute 'make_classification'", | |
"output_type": "error", | |
"traceback": [ | |
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", | |
"\u001b[0;31mAttributeError\u001b[0m Traceback (most recent call last)", | |
"Cell \u001b[0;32mIn[27], line 7\u001b[0m\n\u001b[1;32m 4\u001b[0m datasets \u001b[39m=\u001b[39m {}\n\u001b[1;32m 6\u001b[0m \u001b[39mfor\u001b[39;00m p \u001b[39min\u001b[39;00m (\u001b[39m10\u001b[39m, \u001b[39m100\u001b[39m, \u001b[39m1000\u001b[39m):\n\u001b[0;32m----> 7\u001b[0m X, Y \u001b[39m=\u001b[39m datasets\u001b[39m.\u001b[39;49mmake_classification(\n\u001b[1;32m 8\u001b[0m n_samples\u001b[39m=\u001b[39m\u001b[39m10_000\u001b[39m,\n\u001b[1;32m 9\u001b[0m n_features\u001b[39m=\u001b[39m\u001b[39m10\u001b[39m,\n\u001b[1;32m 10\u001b[0m n_informative\u001b[39m=\u001b[39m\u001b[39m5\u001b[39m,\n\u001b[1;32m 11\u001b[0m n_redundant\u001b[39m=\u001b[39m\u001b[39m5\u001b[39m,\n\u001b[1;32m 12\u001b[0m n_classes\u001b[39m=\u001b[39m\u001b[39m2\u001b[39m,\n\u001b[1;32m 13\u001b[0m random_state\u001b[39m=\u001b[39m\u001b[39m42\u001b[39m,\n\u001b[1;32m 14\u001b[0m )\n\u001b[1;32m 15\u001b[0m X \u001b[39m=\u001b[39m preprocessing\u001b[39m.\u001b[39mscale(X)\n\u001b[1;32m 16\u001b[0m X \u001b[39m=\u001b[39m [\u001b[39mdict\u001b[39m(\u001b[39mzip\u001b[39m(\u001b[39mrange\u001b[39m(X\u001b[39m.\u001b[39mshape[\u001b[39m1\u001b[39m]), x)) \u001b[39mfor\u001b[39;00m x \u001b[39min\u001b[39;00m X]\n", | |
"\u001b[0;31mAttributeError\u001b[0m: 'dict' object has no attribute 'make_classification'" | |
] | |
} | |
], | |
"source": [ | |
"from sklearn import datasets\n", | |
"from sklearn import preprocessing\n", | |
"\n", | |
"# One synthetic binary classification dataset per feature count, keyed by that count.\n", | |
"data = {}\n", | |
"\n", | |
"for p in (10, 100, 1000):\n", | |
"    features, labels = datasets.make_classification(\n", | |
"        n_samples=10_000,\n", | |
"        # Presumably `p` is the feature count: it was hard-coded to 10, which made the three\n", | |
"        # datasets identical.\n", | |
"        n_features=p,\n", | |
"        n_informative=5,\n", | |
"        n_redundant=5,\n", | |
"        n_classes=2,\n", | |
"        random_state=42,\n", | |
"    )\n", | |
"    features = preprocessing.scale(features)\n", | |
"    # River-style examples: one {column index: value} dict per row.\n", | |
"    data[p] = ([dict(enumerate(row)) for row in features], labels)\n", | |
"\n", | |
"# Later cells read X and Y; bind them explicitly to the 10-feature dataset (the PyTorch model is\n", | |
"# built with n_features=10) instead of relying on whatever the last loop iteration left behind.\n", | |
"X, Y = data[10]" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 26, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"River Accuracy: 77.72%\n", | |
"Vowpal Wabbit Accuracy: 77.73%\n", | |
"scikit-learn Accuracy: 77.72%\n", | |
"PyTorch Accuracy: 77.11%\n" | |
] | |
} | |
], | |
"source": [ | |
"from river import metrics\n", | |
"\n", | |
"# For every registered model: predict each example before learning from it, and track accuracy.\n", | |
"for model_name, model_init in models.items():\n", | |
"    model = model_init()\n", | |
"    metric = metrics.Accuracy()\n", | |
"    for x, y in zip(X, Y):\n", | |
"        y_pred = model.predict_one(x)\n", | |
"        metric.update(y, y_pred)\n", | |
"        model.learn_one(x, y)\n", | |
"    print(f\"{model_name:<15} {metric}\")" | |
] | |
}, | |
{ | |
"attachments": {}, | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## Benchmark" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 67, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"90.5 ms ± 609 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)\n" | |
] | |
} | |
], | |
"source": [ | |
"%%timeit\n", | |
"\n", | |
"# Times building a fresh River model plus one predict-then-learn pass over every (x, y) in X, Y.\n", | |
"model = models['River']()\n", | |
"for x, y in zip(X, Y):\n", | |
" p = model.predict_one(x)\n", | |
" model.learn_one(x, y)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 68, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"1.44 s ± 23.5 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n" | |
] | |
} | |
], | |
"source": [ | |
"%%timeit\n", | |
"\n", | |
"# Times building a fresh scikit-learn model plus one predict-then-learn pass over every (x, y) in X, Y.\n", | |
"model = models['scikit-learn']()\n", | |
"for x, y in zip(X, Y):\n", | |
" p = model.predict_one(x)\n", | |
" model.learn_one(x, y)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 69, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"281 ms ± 1.91 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n" | |
] | |
} | |
], | |
"source": [ | |
"%%timeit\n", | |
"\n", | |
"# Times building a fresh Vowpal Wabbit model plus one predict-then-learn pass over every (x, y) in X, Y.\n", | |
"model = models['Vowpal Wabbit']()\n", | |
"for x, y in zip(X, Y):\n", | |
" p = model.predict_one(x)\n", | |
" model.learn_one(x, y)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 14, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"833 ms ± 9.28 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n" | |
] | |
} | |
], | |
"source": [ | |
"%%timeit\n", | |
"\n", | |
"# Times building a fresh PyTorch model plus one predict-then-learn pass over every (x, y) in X, Y.\n", | |
"# The prediction runs under torch.no_grad(); the learning step does not.\n", | |
"model = models['PyTorch']()\n", | |
"for x, y in zip(X, Y):\n", | |
" with torch.no_grad():\n", | |
" p = model.predict_one(x)\n", | |
" model.learn_one(x, y)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 15, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"1 s ± 6.98 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n" | |
] | |
} | |
], | |
"source": [ | |
"%%timeit\n", | |
"\n", | |
"# Times the same predict-then-learn pass for the model registered as 'PyTorch (compiled)'.\n", | |
"# Requires a 'PyTorch (compiled)' entry in models.\n", | |
"model = models['PyTorch (compiled)']()\n", | |
"for x, y in zip(X, Y):\n", | |
" p = model.predict_one(x)\n", | |
" model.learn_one(x, y)" | |
] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3.11.0 ('.venv': venv)", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.10.8" | |
}, | |
"orig_nbformat": 4, | |
"vscode": { | |
"interpreter": { | |
"hash": "14b46bd212fa4dd89e3980db6ba7efbb9fe535833e1e483b914b71733e0a56d2" | |
} | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 2 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment