creme mini-batch performance
@MaxHalford, June 9, 2020
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"%load_ext autoreload\n",
"%autoreload 2"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Linear models in mini-batches"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {},
"outputs": [],
"source": [
"import collections\n",
"import math\n",
"import numbers\n",
"import typing\n",
"\n",
"import numpy as np\n",
"\n",
"from creme import base\n",
"from creme import optim\n",
"from creme import utils\n",
"\n",
"\n",
"class GLM:\n",
"\n",
" def __init__(self, optimizer, loss, l2, intercept, intercept_lr, clip_gradient, initializer):\n",
" self.optimizer = optimizer\n",
" self.loss = loss\n",
" self.l2 = l2\n",
" self.intercept = intercept\n",
" self.intercept_lr = (\n",
" optim.schedulers.Constant(intercept_lr)\n",
" if isinstance(intercept_lr, numbers.Number) else\n",
" intercept_lr\n",
" )\n",
" self.clip_gradient = clip_gradient\n",
" self.weights = collections.defaultdict(initializer)\n",
" self.initializer = initializer\n",
" \n",
" def _fit(self, x, y, w, get_grad):\n",
" \n",
" # Some optimizers need to do something before a prediction is made\n",
" self.weights = self.optimizer.update_before_pred(w=self.weights)\n",
"\n",
" # Calculate the gradient\n",
" gradient, loss_gradient = get_grad(x, y, w)\n",
"\n",
" # Update the intercept\n",
" self.intercept -= self.intercept_lr.get(self.optimizer.n_iterations) * loss_gradient\n",
"\n",
" # Update the weights\n",
" self.weights = self.optimizer.update_after_pred(w=self.weights, g=gradient)\n",
"\n",
" return self\n",
" \n",
" # Single instance methods\n",
" \n",
" def _raw_dot_one(self, x):\n",
" return utils.math.dot(self.weights, x) + self.intercept\n",
" \n",
" def _eval_gradient_one(self, x, y, w):\n",
"\n",
" loss_gradient = self.loss.gradient(y_true=y, y_pred=self._raw_dot_one(x))\n",
" loss_gradient *= w\n",
" loss_gradient = np.clip(loss_gradient, -self.clip_gradient, self.clip_gradient)\n",
"\n",
" return (\n",
" {\n",
" i: xi * loss_gradient + 2. * self.l2 * self.weights.get(i, 0)\n",
" for i, xi in x.items()\n",
" },\n",
" loss_gradient\n",
" )\n",
"\n",
" def fit_one(self, x, y, w=1.):\n",
" return self._fit(x, y, w, get_grad=self._eval_gradient_one)\n",
" \n",
" # Mini-batch methods\n",
" \n",
" def _raw_dot_many(self, X):\n",
" weights = np.array([self.weights[c] for c in X.columns])\n",
" return X.values @ weights + self.intercept\n",
" \n",
" def _eval_gradient_many(self, X, y, w):\n",
" \n",
" loss_gradient = self.loss.gradient(y_true=y.values, y_pred=self._raw_dot_many(X))\n",
" loss_gradient *= w\n",
" loss_gradient = np.clip(loss_gradient, -self.clip_gradient, self.clip_gradient)\n",
" \n",
" # At this point we have a feature matrix X of shape (n, p). The loss gradient is a vector of\n",
" # length p. We want to multiply each of X's by the corresponding value in the loss gradient.\n",
" # When this is all done, we collapse X by computing the average of each column, thereby\n",
" # obtaining the mean gradient of the batch. From thereon, the code reduces to the single\n",
" # instance case.\n",
" gradient = np.einsum('ij,i->ij', x.values, loss_gradient).mean(axis=0)\n",
" \n",
" return dict(zip(X.columns, gradient)), loss_gradient.mean() \n",
" \n",
" def fit_many(self, X, y, w=1):\n",
" return self._fit(x, y, w, get_grad=self._eval_gradient_many) \n",
" \n",
"\n",
"class LinearRegression(GLM, base.Regressor):\n",
"\n",
" def __init__(\n",
" self,\n",
" optimizer: optim.Optimizer = None,\n",
" loss: optim.losses.RegressionLoss = None,\n",
" l2=.0,\n",
" intercept=0.,\n",
" intercept_lr: typing.Union[optim.schedulers.Scheduler, float] = .01,\n",
" clip_gradient=1e+12,\n",
" initializer: optim.initializers.Initializer = None\n",
" ):\n",
" super().__init__(\n",
" optimizer=(\n",
" optim.SGD(optim.schedulers.InverseScaling(.01, .25))\n",
" if optimizer is None else\n",
" optimizer\n",
" ),\n",
" loss=optim.losses.Squared() if loss is None else loss,\n",
" intercept=intercept,\n",
" intercept_lr=intercept_lr,\n",
" l2=l2,\n",
" clip_gradient=clip_gradient,\n",
" initializer=initializer if initializer else optim.initializers.Zeros()\n",
" )\n",
"\n",
" def predict_one(self, x):\n",
" return self.loss.mean_func(self._raw_dot_one(x))\n",
" \n",
" def predict_many(self, X):\n",
" return self.loss.mean_func(self._raw_dot_many(x))\n",
" \n",
" \n",
"class Squared:\n",
" \n",
" def gradient(self, y_true, y_pred):\n",
" return y_pred - y_true\n",
" \n",
" def mean_func(self, y):\n",
" return y\n",
" \n",
" \n",
"class Log:\n",
" \n",
" def gradient(self, y_true, y_pred):\n",
" y_true = y_true * 2 - 1\n",
" z = y_pred * y_true\n",
" return -y_true / (np.exp(z) + 1.)\n",
" \n",
" def mean_func(self, y):\n",
" return 1 / (1 + np.exp(-y))"
]
},
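{
"cell_type": "markdown",
"metadata": {},
"source": [
"As a quick sanity check, the mini-batch gradient obtained with `np.einsum` should coincide with the average of the single instance gradients. The cell below verifies this on a small batch of random data; the feature names and values are arbitrary."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"\n",
"# Arbitrary random batch, only used for this check\n",
"rng = np.random.RandomState(42)\n",
"X_check = pd.DataFrame(rng.randn(8, 3), columns=['a', 'b', 'c'])\n",
"y_check = pd.Series(rng.randn(8))\n",
"\n",
"model = LinearRegression(loss=Squared(), optimizer=optim.SGD(0.01))\n",
"\n",
"# Gradient over the whole batch, computed with np.einsum\n",
"batch_grad, _ = model._eval_gradient_many(X_check, y_check, w=1)\n",
"\n",
"# Average of the per-instance gradients\n",
"mean_grad = {c: 0. for c in X_check.columns}\n",
"for (_, xi), yi in zip(X_check.iterrows(), y_check):\n",
"    g, _ = model._eval_gradient_one(dict(xi), yi, w=1.)\n",
"    for c, v in g.items():\n",
"        mean_grad[c] += v / len(X_check)\n",
"\n",
"np.allclose(\n",
"    [batch_grad[c] for c in X_check.columns],\n",
"    [mean_grad[c] for c in X_check.columns]\n",
")"
]
},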
{
"cell_type": "code",
"execution_count": 85,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([4, 4, 4])"
]
},
"execution_count": 85,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"np.maximum([1, 2, 3], 4)"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"#weights = np.array([sgd.weights[c] for c in x.columns])\n",
"#weights += 1\n",
"#y_pred = x.values @ weights + sgd.intercept\n",
"#loss_gradient = sgd.loss.gradient(y_true=y.values, y_pred=y_pred)\n",
"#gradient = np.einsum('ij,i->ij', x.values, loss_gradient)\n",
"#gradient = gradient.mean(axis=0)\n",
"#gradient"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Single-instance"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"scikit-learn."
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"jupyter": {
"source_hidden": true
}
},
"outputs": [
{
"data": {
"text/plain": [
"MAE: 5.766969"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from creme import metrics\n",
"from sklearn import datasets\n",
"from sklearn import linear_model\n",
"from sklearn import preprocessing\n",
"\n",
"X, Y = datasets.load_boston(return_X_y=True)\n",
"X = preprocessing.scale(X)\n",
"\n",
"sgd = linear_model.SGDRegressor(\n",
" learning_rate='constant',\n",
" eta0=0.01,\n",
" penalty='none'\n",
")\n",
"mae = metrics.MAE()\n",
"\n",
"for i, (x, y) in enumerate(zip(X, Y)):\n",
" \n",
" if i > 0:\n",
" y_pred = sgd.predict([x])\n",
" mae.update(y, y_pred)\n",
" \n",
" sgd.partial_fit([x], [y])\n",
" \n",
"mae"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"creme."
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"jupyter": {
"source_hidden": true
}
},
"outputs": [
{
"data": {
"text/plain": [
"MAE: 5.766969"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from creme import optim\n",
"\n",
"sgd = LinearRegression(\n",
" loss=Squared(),\n",
" optimizer=optim.SGD(0.01),\n",
" l2=0.,\n",
" intercept_lr=0.01\n",
")\n",
"mae = metrics.MAE()\n",
"\n",
"for i, (x, y) in enumerate(zip(X, Y)):\n",
" \n",
" x = dict(enumerate(x))\n",
" \n",
" if i > 0:\n",
" y_pred = sgd.predict_one(x)\n",
" mae.update(y, y_pred)\n",
" \n",
" sgd.fit_one(x, y)\n",
" \n",
"mae"
]
},
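{
"cell_type": "markdown",
"metadata": {},
"source": [
"Both implementations reach the exact same MAE, which is expected: with a constant learning rate, no penalty, and the same instance order, the two models perform identical updates."
]
},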
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Single instance with classification"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"scikit-learn."
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"LogLoss: 0.078291"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import pandas as pd\n",
"\n",
"X = pd.read_csv('/Users/mhalford/creme_data/CreditCard/creditcard.csv')[:30000]\n",
"Y = X.pop('Class')\n",
"X = preprocessing.scale(X)\n",
"classes = np.unique(Y)\n",
"\n",
"sgd = linear_model.SGDClassifier(\n",
" learning_rate='constant',\n",
" loss='log',\n",
" eta0=0.01,\n",
" penalty='none'\n",
")\n",
"metric = metrics.LogLoss()\n",
"\n",
"for i, (x, y) in enumerate(zip(X, Y)):\n",
" \n",
" if i > 0:\n",
" y_pred = sgd.predict([x])\n",
" metric.update(y, y_pred)\n",
" \n",
" sgd.partial_fit([x], [y], classes=classes)\n",
" \n",
"metric"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"creme."
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"LogLoss: 0.078291"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from creme import linear_model\n",
"\n",
"sgd = linear_model.LogisticRegression(\n",
" #loss=Log(),\n",
" optimizer=optim.SGD(0.01),\n",
" l2=0.,\n",
" intercept_lr=0.01\n",
")\n",
"metric = metrics.LogLoss()\n",
"\n",
"for i, (x, y) in enumerate(zip(X, Y)):\n",
" \n",
" x = dict(enumerate(x))\n",
" \n",
" if i > 0:\n",
" y_pred = sgd.predict_one(x)\n",
" metric.update(y, y_pred)\n",
" \n",
" sgd.fit_one(x, y)\n",
" \n",
"metric"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"LogLoss: 0.078291"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from creme import linear_model\n",
"\n",
"sgd = linear_model.LogisticRegression(\n",
" loss=Log(),\n",
" optimizer=optim.SGD(0.01),\n",
" l2=0.,\n",
" intercept_lr=0.01\n",
")\n",
"metric = metrics.LogLoss()\n",
"\n",
"for i, (x, y) in enumerate(zip(X, Y)):\n",
" \n",
" x = dict(enumerate(x))\n",
" \n",
" if i > 0:\n",
" y_pred = sgd.predict_one(x)\n",
" metric.update(y, y_pred)\n",
" \n",
" sgd.fit_one(x, y)\n",
" \n",
"metric"
]
},
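{
"cell_type": "markdown",
"metadata": {},
"source": [
"The hand-rolled `Log` loss gives the same LogLoss as creme's built-in logistic loss, which confirms the two are equivalent."
]
},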
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Mini-batches"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"scikit-learn."
]
},
{
"cell_type": "code",
"execution_count": 34,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"MAE: 185,603.285604"
]
},
"execution_count": 34,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from sklearn import linear_model\n",
"\n",
"X, Y = datasets.fetch_california_housing(return_X_y=True)\n",
"X = preprocessing.scale(X)\n",
"\n",
"batch_size = 1\n",
"n_splits = len(X) // batch_size\n",
"\n",
"sgd = linear_model.SGDRegressor(\n",
" learning_rate='constant',\n",
" eta0=0.01,\n",
" penalty='none',\n",
" shuffle=False,\n",
" max_iter=1\n",
")\n",
"mae = metrics.MAE()\n",
"\n",
"for i, (x, y) in enumerate(zip(np.array_split(X, n_splits), np.array_split(Y, n_splits))):\n",
" \n",
" if i > 0:\n",
" y_pred = sgd.predict(x)\n",
" for yt, yp in zip(y, y_pred):\n",
" mae.update(yt, yp)\n",
" \n",
" sgd.partial_fit(x, y)\n",
" \n",
"mae"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"creme."
]
},
{
"cell_type": "code",
"execution_count": 36,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"MAE: 185,603.285604"
]
},
"execution_count": 36,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import pandas as pd\n",
"\n",
"X, Y = datasets.fetch_california_housing(return_X_y=True)\n",
"X = preprocessing.scale(X)\n",
"\n",
"sgd = LinearRegression(\n",
" loss=Squared(),\n",
" optimizer=optim.SGD(0.01),\n",
" l2=0.,\n",
" intercept_lr=0.01\n",
")\n",
"mae = metrics.MAE()\n",
"\n",
"for i, (x, y) in enumerate(zip(np.array_split(X, n_splits), np.array_split(Y, n_splits))):\n",
" \n",
" x = pd.DataFrame(x)\n",
" y = pd.Series(y)\n",
" \n",
" if i > 0:\n",
" y_pred = sgd.predict_many(x)\n",
" for yt, yp in zip(y, y_pred):\n",
" mae.update(yt, yp)\n",
" \n",
" sgd.fit_many(x, y)\n",
" \n",
"mae"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Mini-batches with classification"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"scikit-learn."
]
},
{
"cell_type": "code",
"execution_count": 32,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"LogLoss: 0.006645"
]
},
"execution_count": 32,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from sklearn import utils\n",
"from sklearn import linear_model\n",
"\n",
"X = pd.read_csv('/Users/mhalford/creme_data/CreditCard/creditcard.csv')\n",
"Y = X.pop('Class')\n",
"X = preprocessing.scale(X)\n",
"classes = np.unique(Y)\n",
"\n",
"batch_size = 1\n",
"n_splits = len(X) // batch_size\n",
"\n",
"sgd = linear_model.SGDClassifier(\n",
" learning_rate='constant',\n",
" eta0=0.01,\n",
" penalty='none',\n",
" loss='log',\n",
" shuffle=False,\n",
" max_iter=1\n",
")\n",
"metric = metrics.LogLoss()\n",
"\n",
"for i, (x, y) in enumerate(zip(np.array_split(X, n_splits), np.array_split(Y, n_splits))):\n",
" \n",
" if i > 0:\n",
" y_pred = sgd.predict_proba(x)[:, 1]\n",
" for yt, yp in zip(y, y_pred):\n",
" metric.update(yt, yp)\n",
" \n",
" sgd.partial_fit(x, y, classes=classes)\n",
" \n",
"metric"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"creme."
]
},
{
"cell_type": "code",
"execution_count": 33,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"LogLoss: 0.006645"
]
},
"execution_count": 33,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"sgd = LinearRegression(\n",
" loss=Log(),\n",
" optimizer=optim.SGD(0.01),\n",
" l2=0.,\n",
" intercept_lr=0.01\n",
")\n",
"metric = metrics.LogLoss()\n",
"\n",
"\n",
"for i, (x, y) in enumerate(zip(np.array_split(X, n_splits), np.array_split(Y, n_splits))):\n",
" \n",
" x = pd.DataFrame(x)\n",
" \n",
" if i > 0:\n",
" y_pred = sgd.predict_many(x)\n",
" for yt, yp in zip(y, y_pred):\n",
" metric.update(yt, yp)\n",
" \n",
" sgd.fit_many(x, y)\n",
" \n",
"metric"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Speed comparison"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"import pandas as pd\n",
"from sklearn import preprocessing\n",
"\n",
"X = pd.read_csv('/Users/mhalford/creme_data/CreditCard/creditcard.csv')\n",
"Y = X.pop('Class')\n",
"X[:] = preprocessing.scale(X)\n",
"classes = np.unique(Y)\n",
"\n",
"batch_size = 512\n",
"n_batches = len(X) // batch_size"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"344 ms ± 29.4 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n",
"8.55 ms ± 142 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n"
]
}
],
"source": [
"from creme import linear_model\n",
"\n",
"model = linear_model.LogisticRegression()\n",
"\n",
"%timeit for x_batch, y_batch in zip(np.array_split(X, n_batches), np.array_split(Y, n_batches)): model.fit_many(x_batch, y_batch)\n",
"%timeit model.predict_proba_many(X)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"745 ms ± 41.1 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n",
"13.3 ms ± 470 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n"
]
}
],
"source": [
"from sklearn import linear_model\n",
"\n",
"model = linear_model.SGDClassifier(loss='log')\n",
"\n",
"%timeit for x_batch, y_batch in zip(np.array_split(X, n_batches), np.array_split(Y, n_batches)): model.partial_fit(x_batch, y_batch, classes=classes)\n",
"%timeit model.predict_proba(X)"
]
},
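{
"cell_type": "markdown",
"metadata": {},
"source": [
"On this run, the mini-batch implementation in creme is roughly twice as fast as scikit-learn, both for fitting (344 ms vs. 745 ms per pass over the data) and for predicting (8.55 ms vs. 13.3 ms)."
]
},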
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"ename": "AttributeError",
"evalue": "'Pipeline' object has no attribute 'predict_proba_many'",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mAttributeError\u001b[0m Traceback (most recent call last)",
"\u001b[0;32m<timed exec>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n",
"\u001b[0;31mAttributeError\u001b[0m: 'Pipeline' object has no attribute 'predict_proba_many'"
]
}
],
"source": [
"%%time\n",
"\n",
"from creme import compose\n",
"from creme import linear_model\n",
"from creme import preprocessing\n",
"\n",
"model = compose.Pipeline(\n",
" preprocessing.StandardScaler(),\n",
" linear_model.LogisticRegression()\n",
")\n",
"\n",
"for x_batch, y_batch in zip(np.array_split(X, n_batches), np.array_split(Y, n_batches)):\n",
" y_pred = model.predict_proba_many(x_batch)\n",
" model.fit_many(x_batch, y_batch)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.7"
}
},
"nbformat": 4,
"nbformat_minor": 4
}