Skip to content

Instantly share code, notes, and snippets.

@MaxHalford
Last active April 4, 2023 01:38
Show Gist options
  • Save MaxHalford/6e6b2e9fd2740bf0a0408ff1c226ae95 to your computer and use it in GitHub Desktop.
Save MaxHalford/6e6b2e9fd2740bf0a0408ff1c226ae95 to your computer and use it in GitHub Desktop.
Online logistic regression: River/scikit-learn/Vowpal Wabbit/PyTorch
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"# Showdown"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"%load_ext watermark"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Python implementation: CPython\n",
"Python version : 3.10.8\n",
"IPython version : 8.12.0\n",
"\n",
"river : 0.15.0\n",
"scikit-learn: 1.2.2\n",
"torch : 2.0.0\n",
"vowpalwabbit: 9.8.0\n",
"\n",
"Compiler : Clang 14.0.0 (clang-1400.0.29.102)\n",
"OS : Darwin\n",
"Release : 22.2.0\n",
"Machine : arm64\n",
"Processor : arm\n",
"CPU cores : 8\n",
"Architecture: 64bit\n",
"\n"
]
}
],
"source": [
"%watermark -p river,scikit-learn,torch,vowpalwabbit -mv"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"## Models"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [],
"source": [
"models = {}"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [],
"source": [
"import river.linear_model\n",
"import river.optim\n",
"\n",
"models['River'] = lambda: river.linear_model.LogisticRegression(\n",
" optimizer=river.optim.SGD(lr=0.01),\n",
" intercept_lr=0,\n",
" l2=0.0\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [],
"source": [
"from vowpalwabbit import pyvw\n",
"import river.base\n",
"\n",
"class VW2RiverClassifier(river.base.Classifier):\n",
" def __init__(self, *args, **kwargs):\n",
" self.vw = pyvw.Workspace(*args, **kwargs)\n",
"\n",
" def _format_x(self, x):\n",
" return \" \".join(f\"{k}:{v}\" for k, v in x.items())\n",
"\n",
" def learn_one(self, x, y):\n",
"\n",
" # Convert {False, True} to {-1, 1}\n",
" y = int(y)\n",
" y_vw = 2 * y - 1\n",
"\n",
" ex = self._format_x(x)\n",
" ex = f\"{y_vw} | {ex}\"\n",
" self.vw.learn(ex)\n",
" return self\n",
"\n",
" def predict_proba_one(self, x):\n",
" ex = \"| \" + self._format_x(x)\n",
" y_pred = self.vw.predict(ex)\n",
" return {True: y_pred, False: 1.0 - y_pred}\n",
"\n",
"models['Vowpal Wabbit'] = lambda: VW2RiverClassifier(\n",
" sgd=True,\n",
" learning_rate=0.01,\n",
" loss_function=\"logistic\",\n",
" link=\"logistic\",\n",
" adaptive=False,\n",
" normalized=False,\n",
" invariant=False,\n",
" noconstant=True,\n",
" l2=0.0,\n",
" l1=0.0,\n",
" power_t=0,\n",
" quiet=True,\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {},
"outputs": [],
"source": [
"import sklearn.base\n",
"import sklearn.exceptions\n",
"import sklearn.linear_model\n",
"\n",
"class SK2RiverClassifier(river.base.Classifier):\n",
"\n",
" def __init__(self, estimator: sklearn.base.ClassifierMixin, classes: list):\n",
" self.estimator = estimator\n",
" self.classes = classes\n",
" \n",
" def learn_one(self, x, y):\n",
" self.estimator.partial_fit(X=[list(x.values())], y=[y], classes=self.classes)\n",
" return self\n",
"\n",
" def predict_proba_one(self, x):\n",
" try:\n",
" y_pred = self.estimator.predict_proba([list(x.values())])[0]\n",
" return {self.classes[i]: p for i, p in enumerate(y_pred)}\n",
" except sklearn.exceptions.NotFittedError:\n",
" return {c: 1 / len(self.classes) for c in self.classes}\n",
"\n",
"models['scikit-learn'] = lambda: SK2RiverClassifier(\n",
" estimator=sklearn.linear_model.SGDClassifier(\n",
" loss='log_loss',\n",
" penalty=None,\n",
" fit_intercept=False,\n",
" learning_rate='constant',\n",
" eta0=0.01,\n",
" max_iter=1,\n",
" ),\n",
" classes=[False, True]\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {},
"outputs": [],
"source": [
"import torch\n",
"\n",
"class TorchLogisticRegression(torch.nn.Module):\n",
" def __init__(self, n_features: int, n_classes: int = 2):\n",
" super().__init__()\n",
" self.linear = torch.nn.Linear(n_features, n_classes, bias=False)\n",
" self.linear.weight.data.zero_()\n",
"\n",
" def forward(self, x):\n",
" y = self.linear(x)\n",
" return torch.sigmoid(y)\n",
"\n",
"class Torch2RiverClassifier(river.base.Classifier):\n",
"\n",
" def __init__(self, model: torch.nn.Module, classes: list):\n",
" self.model = model\n",
" self.classes = classes\n",
" self.criterion = torch.nn.CrossEntropyLoss()\n",
" self.optimizer = torch.optim.SGD(self.model.parameters(), lr=0.01)\n",
"\n",
" def learn_one(self, x: dict, y):\n",
" x = torch.tensor([list(x.values())], dtype=torch.float32)\n",
" y = torch.tensor([y], dtype=torch.long)\n",
" # forward pass\n",
" y_pred = self.model(x)\n",
" loss = self.criterion(y_pred, y)\n",
" # backward pass and optimization\n",
" self.model.zero_grad(set_to_none=True)\n",
" loss.backward()\n",
" self.optimizer.step()\n",
" return self\n",
" \n",
" def predict_proba_one(self, x: dict):\n",
" x = torch.tensor([list(x.values())], dtype=torch.float32)\n",
" y_pred = self.model(x)\n",
" return dict(zip(self.classes, y_pred.detach().numpy()[0]))\n",
"\n",
"models['PyTorch'] = lambda: Torch2RiverClassifier(\n",
" model=TorchLogisticRegression(n_features=10),\n",
" classes=[False, True]\n",
")"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"## Metrics check"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {},
"outputs": [
{
"ename": "AttributeError",
"evalue": "'dict' object has no attribute 'make_classification'",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mAttributeError\u001b[0m Traceback (most recent call last)",
"Cell \u001b[0;32mIn[27], line 7\u001b[0m\n\u001b[1;32m 4\u001b[0m datasets \u001b[39m=\u001b[39m {}\n\u001b[1;32m 6\u001b[0m \u001b[39mfor\u001b[39;00m p \u001b[39min\u001b[39;00m (\u001b[39m10\u001b[39m, \u001b[39m100\u001b[39m, \u001b[39m1000\u001b[39m):\n\u001b[0;32m----> 7\u001b[0m X, Y \u001b[39m=\u001b[39m datasets\u001b[39m.\u001b[39;49mmake_classification(\n\u001b[1;32m 8\u001b[0m n_samples\u001b[39m=\u001b[39m\u001b[39m10_000\u001b[39m,\n\u001b[1;32m 9\u001b[0m n_features\u001b[39m=\u001b[39m\u001b[39m10\u001b[39m,\n\u001b[1;32m 10\u001b[0m n_informative\u001b[39m=\u001b[39m\u001b[39m5\u001b[39m,\n\u001b[1;32m 11\u001b[0m n_redundant\u001b[39m=\u001b[39m\u001b[39m5\u001b[39m,\n\u001b[1;32m 12\u001b[0m n_classes\u001b[39m=\u001b[39m\u001b[39m2\u001b[39m,\n\u001b[1;32m 13\u001b[0m random_state\u001b[39m=\u001b[39m\u001b[39m42\u001b[39m,\n\u001b[1;32m 14\u001b[0m )\n\u001b[1;32m 15\u001b[0m X \u001b[39m=\u001b[39m preprocessing\u001b[39m.\u001b[39mscale(X)\n\u001b[1;32m 16\u001b[0m X \u001b[39m=\u001b[39m [\u001b[39mdict\u001b[39m(\u001b[39mzip\u001b[39m(\u001b[39mrange\u001b[39m(X\u001b[39m.\u001b[39mshape[\u001b[39m1\u001b[39m]), x)) \u001b[39mfor\u001b[39;00m x \u001b[39min\u001b[39;00m X]\n",
"\u001b[0;31mAttributeError\u001b[0m: 'dict' object has no attribute 'make_classification'"
]
}
],
"source": [
"from sklearn import datasets\n",
"from sklearn import preprocessing\n",
"\n",
"data = {}\n",
"\n",
"for p in (10, 100, 1000):\n",
" X, Y = datasets.make_classification(\n",
" n_samples=10_000,\n",
" n_features=10,\n",
" n_informative=5,\n",
" n_redundant=5,\n",
" n_classes=2,\n",
" random_state=42,\n",
" )\n",
" X = preprocessing.scale(X)\n",
" X = [dict(zip(range(X.shape[1]), x)) for x in X]\n",
" data[p] = (X, Y)"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"River Accuracy: 77.72%\n",
"Vowpal Wabbit Accuracy: 77.73%\n",
"scikit-learn Accuracy: 77.72%\n",
"PyTorch Accuracy: 77.11%\n"
]
}
],
"source": [
"from river import metrics\n",
"\n",
"for model_name, model_init in models.items():\n",
" model = model_init()\n",
" metric = metrics.Accuracy()\n",
" for x, y in zip(X, Y):\n",
" p = model.predict_one(x)\n",
" model.learn_one(x, y)\n",
" metric.update(y, p)\n",
" print(f\"{model_name:<15} {metric}\")"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"## Benchmark"
]
},
{
"cell_type": "code",
"execution_count": 67,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"90.5 ms ± 609 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)\n"
]
}
],
"source": [
"%%timeit\n",
"\n",
"model = models['River']()\n",
"for x, y in zip(X, Y):\n",
" p = model.predict_one(x)\n",
" model.learn_one(x, y)"
]
},
{
"cell_type": "code",
"execution_count": 68,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"1.44 s ± 23.5 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n"
]
}
],
"source": [
"%%timeit\n",
"\n",
"model = models['scikit-learn']()\n",
"for x, y in zip(X, Y):\n",
" p = model.predict_one(x)\n",
" model.learn_one(x, y)"
]
},
{
"cell_type": "code",
"execution_count": 69,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"281 ms ± 1.91 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n"
]
}
],
"source": [
"%%timeit\n",
"\n",
"model = models['Vowpal Wabbit']()\n",
"for x, y in zip(X, Y):\n",
" p = model.predict_one(x)\n",
" model.learn_one(x, y)"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"833 ms ± 9.28 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n"
]
}
],
"source": [
"%%timeit\n",
"\n",
"model = models['PyTorch']()\n",
"for x, y in zip(X, Y):\n",
" with torch.no_grad():\n",
" p = model.predict_one(x)\n",
" model.learn_one(x, y)"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"1 s ± 6.98 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n"
]
}
],
"source": [
"%%timeit\n",
"\n",
"model = models['PyTorch (compiled)']()\n",
"for x, y in zip(X, Y):\n",
" p = model.predict_one(x)\n",
" model.learn_one(x, y)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3.11.0 ('.venv': venv)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.8"
},
"orig_nbformat": 4,
"vscode": {
"interpreter": {
"hash": "14b46bd212fa4dd89e3980db6ba7efbb9fe535833e1e483b914b71733e0a56d2"
}
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment