MaxHalford/linear-models.ipynb

## linear-models.ipynb
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "%load_ext autoreload\n",
    "%autoreload 2"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Linear models in mini-batches"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [],
   "source": [
    "import collections\n",
    "import math\n",
    "import numbers\n",
    "import typing\n",
    "\n",
    "import numpy as np\n",
    "\n",
    "from creme import base\n",
    "from creme import optim\n",
    "from creme import utils\n",
    "\n",
    "\n",
    "class GLM:\n",
    "\n",
    "    def __init__(self, optimizer, loss, l2, intercept, intercept_lr, clip_gradient, initializer):\n",
    "        self.optimizer = optimizer\n",
    "        self.loss = loss\n",
    "        self.l2 = l2\n",
    "        self.intercept = intercept\n",
    "        self.intercept_lr = (\n",
    "            optim.schedulers.Constant(intercept_lr)\n",
    "            if isinstance(intercept_lr, numbers.Number) else\n",
    "            intercept_lr\n",
    "        )\n",
    "        self.clip_gradient = clip_gradient\n",
    "        self.weights = collections.defaultdict(initializer)\n",
    "        self.initializer = initializer\n",
    "    \n",
    "    def _fit(self, x, y, w, get_grad):\n",
    "        \n",
    "        # Some optimizers need to do something before a prediction is made\n",
    "        self.weights = self.optimizer.update_before_pred(w=self.weights)\n",
    "\n",
    "        # Calculate the gradient\n",
    "        gradient, loss_gradient = get_grad(x, y, w)\n",
    "\n",
    "        # Update the intercept\n",
    "        self.intercept -= self.intercept_lr.get(self.optimizer.n_iterations) * loss_gradient\n",
    "\n",
    "        # Update the weights\n",
    "        self.weights = self.optimizer.update_after_pred(w=self.weights, g=gradient)\n",
    "\n",
    "        return self\n",
    "    \n",
    "    # Single instance methods\n",
    "    \n",
    "    def _raw_dot_one(self, x):\n",
    "        return utils.math.dot(self.weights, x) + self.intercept\n",
    "    \n",
    "    def _eval_gradient_one(self, x, y, w):\n",
    "\n",
    "        loss_gradient = self.loss.gradient(y_true=y, y_pred=self._raw_dot_one(x))\n",
    "        loss_gradient *= w\n",
    "        loss_gradient = np.clip(loss_gradient, -self.clip_gradient, self.clip_gradient)\n",
    "\n",
    "        return (\n",
    "            {\n",
    "                i: xi * loss_gradient + 2. * self.l2 * self.weights.get(i, 0)\n",
    "                for i, xi in x.items()\n",
    "            },\n",
    "            loss_gradient\n",
    "        )\n",
    "\n",
    "    def fit_one(self, x, y, w=1.):\n",
    "        return self._fit(x, y, w, get_grad=self._eval_gradient_one)\n",
    "    \n",
    "    # Mini-batch methods\n",
    "    \n",
    "    def _raw_dot_many(self, X):\n",
    "        weights = np.array([self.weights[c] for c in X.columns])\n",
    "        return X.values @ weights + self.intercept\n",
    "    \n",
    "    def _eval_gradient_many(self, X, y, w):\n",
    "        \n",
    "        loss_gradient = self.loss.gradient(y_true=y.values, y_pred=self._raw_dot_many(X))\n",
    "        loss_gradient *= w\n",
    "        loss_gradient = np.clip(loss_gradient, -self.clip_gradient, self.clip_gradient)\n",
    "        \n",
    "        # At this point we have a feature matrix X of shape (n, p). The loss gradient is a vector of\n",
    "        # length p. We want to multiply each of X's by the corresponding value in the loss gradient.\n",
    "        # When this is all done, we collapse X by computing the average of each column, thereby\n",
    "        # obtaining the mean gradient of the batch. From thereon, the code reduces to the single\n",
    "        # instance case.\n",
    "        gradient = np.einsum('ij,i->ij', x.values, loss_gradient).mean(axis=0)\n",
    "        \n",
    "        return dict(zip(X.columns, gradient)), loss_gradient.mean() \n",
    "    \n",
    "    def fit_many(self, X, y, w=1):\n",
    "        return self._fit(x, y, w, get_grad=self._eval_gradient_many) \n",
    "        \n",
    "\n",
    "class LinearRegression(GLM, base.Regressor):\n",
    "\n",
    "    def __init__(\n",
    "        self,\n",
    "        optimizer: optim.Optimizer = None,\n",
    "        loss: optim.losses.RegressionLoss = None,\n",
    "        l2=.0,\n",
    "        intercept=0.,\n",
    "        intercept_lr: typing.Union[optim.schedulers.Scheduler, float] = .01,\n",
    "        clip_gradient=1e+12,\n",
    "        initializer: optim.initializers.Initializer = None\n",
    "    ):\n",
    "        super().__init__(\n",
    "            optimizer=(\n",
    "                optim.SGD(optim.schedulers.InverseScaling(.01, .25))\n",
    "                if optimizer is None else\n",
    "                optimizer\n",
    "            ),\n",
    "            loss=optim.losses.Squared() if loss is None else loss,\n",
    "            intercept=intercept,\n",
    "            intercept_lr=intercept_lr,\n",
    "            l2=l2,\n",
    "            clip_gradient=clip_gradient,\n",
    "            initializer=initializer if initializer else optim.initializers.Zeros()\n",
    "        )\n",
    "\n",
    "    def predict_one(self, x):\n",
    "        return self.loss.mean_func(self._raw_dot_one(x))\n",
    "    \n",
    "    def predict_many(self, X):\n",
    "        return self.loss.mean_func(self._raw_dot_many(x))\n",
    "    \n",
    "    \n",
    "class Squared:\n",
    "    \n",
    "    def gradient(self, y_true, y_pred):\n",
    "        return y_pred - y_true\n",
    "    \n",
    "    def mean_func(self, y):\n",
    "        return y\n",
    "    \n",
    "    \n",
    "class Log:\n",
    "    \n",
    "    def gradient(self, y_true, y_pred):\n",
    "        y_true = y_true * 2 - 1\n",
    "        z = y_pred * y_true\n",
    "        return -y_true / (np.exp(z) + 1.)\n",
    "    \n",
    "    def mean_func(self, y):\n",
    "        return 1 / (1 + np.exp(-y))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 85,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([4, 4, 4])"
      ]
     },
     "execution_count": 85,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "np.maximum([1, 2, 3], 4)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "#weights = np.array([sgd.weights[c] for c in x.columns])\n",
    "#weights += 1\n",
    "#y_pred = x.values @ weights + sgd.intercept\n",
    "#loss_gradient = sgd.loss.gradient(y_true=y.values, y_pred=y_pred)\n",
    "#gradient = np.einsum('ij,i->ij', x.values, loss_gradient)\n",
    "#gradient = gradient.mean(axis=0)\n",
    "#gradient"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Single-instance"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "scikit-learn."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "jupyter": {
     "source_hidden": true
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "MAE: 5.766969"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from creme import metrics\n",
    "from sklearn import datasets\n",
    "from sklearn import linear_model\n",
    "from sklearn import preprocessing\n",
    "\n",
    "X, Y = datasets.load_boston(return_X_y=True)\n",
    "X = preprocessing.scale(X)\n",
    "\n",
    "sgd = linear_model.SGDRegressor(\n",
    "    learning_rate='constant',\n",
    "    eta0=0.01,\n",
    "    penalty='none'\n",
    ")\n",
    "mae = metrics.MAE()\n",
    "\n",
    "for i, (x, y) in enumerate(zip(X, Y)):\n",
    "    \n",
    "    if i > 0:\n",
    "        y_pred = sgd.predict([x])\n",
    "        mae.update(y, y_pred)\n",
    "        \n",
    "    sgd.partial_fit([x], [y])\n",
    "    \n",
    "mae"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "creme."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "jupyter": {
     "source_hidden": true
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "MAE: 5.766969"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from creme import optim\n",
    "\n",
    "sgd = LinearRegression(\n",
    "    loss=Squared(),\n",
    "    optimizer=optim.SGD(0.01),\n",
    "    l2=0.,\n",
    "    intercept_lr=0.01\n",
    ")\n",
    "mae = metrics.MAE()\n",
    "\n",
    "for i, (x, y) in enumerate(zip(X, Y)):\n",
    "    \n",
    "    x = dict(enumerate(x))\n",
    "    \n",
    "    if i > 0:\n",
    "        y_pred = sgd.predict_one(x)\n",
    "        mae.update(y, y_pred)\n",
    "        \n",
    "    sgd.fit_one(x, y)\n",
    "    \n",
    "mae"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Single instance with classification"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "scikit-learn."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "LogLoss: 0.078291"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import pandas as pd\n",
    "\n",
    "X = pd.read_csv('/Users/mhalford/creme_data/CreditCard/creditcard.csv')[:30000]\n",
    "Y = X.pop('Class')\n",
    "X = preprocessing.scale(X)\n",
    "classes = np.unique(Y)\n",
    "\n",
    "sgd = linear_model.SGDClassifier(\n",
    "    learning_rate='constant',\n",
    "    loss='log',\n",
    "    eta0=0.01,\n",
    "    penalty='none'\n",
    ")\n",
    "metric = metrics.LogLoss()\n",
    "\n",
    "for i, (x, y) in enumerate(zip(X, Y)):\n",
    "    \n",
    "    if i > 0:\n",
    "        y_pred = sgd.predict([x])\n",
    "        metric.update(y, y_pred)\n",
    "        \n",
    "    sgd.partial_fit([x], [y], classes=classes)\n",
    "    \n",
    "metric"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "creme."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "LogLoss: 0.078291"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from creme import linear_model\n",
    "\n",
    "sgd = linear_model.LogisticRegression(\n",
    "    #loss=Log(),\n",
    "    optimizer=optim.SGD(0.01),\n",
    "    l2=0.,\n",
    "    intercept_lr=0.01\n",
    ")\n",
    "metric = metrics.LogLoss()\n",
    "\n",
    "for i, (x, y) in enumerate(zip(X, Y)):\n",
    "    \n",
    "    x = dict(enumerate(x))\n",
    "    \n",
    "    if i > 0:\n",
    "        y_pred = sgd.predict_one(x)\n",
    "        metric.update(y, y_pred)\n",
    "        \n",
    "    sgd.fit_one(x, y)\n",
    "    \n",
    "metric"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "LogLoss: 0.078291"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from creme import linear_model\n",
    "\n",
    "sgd = linear_model.LogisticRegression(\n",
    "    loss=Log(),\n",
    "    optimizer=optim.SGD(0.01),\n",
    "    l2=0.,\n",
    "    intercept_lr=0.01\n",
    ")\n",
    "metric = metrics.LogLoss()\n",
    "\n",
    "for i, (x, y) in enumerate(zip(X, Y)):\n",
    "    \n",
    "    x = dict(enumerate(x))\n",
    "    \n",
    "    if i > 0:\n",
    "        y_pred = sgd.predict_one(x)\n",
    "        metric.update(y, y_pred)\n",
    "        \n",
    "    sgd.fit_one(x, y)\n",
    "    \n",
    "metric"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Mini-batches"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "scikit-learn."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "MAE: 185,603.285604"
      ]
     },
     "execution_count": 34,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from sklearn import linear_model\n",
    "\n",
    "X, Y = datasets.fetch_california_housing(return_X_y=True)\n",
    "X = preprocessing.scale(X)\n",
    "\n",
    "batch_size = 1\n",
    "n_splits = len(X) // batch_size\n",
    "\n",
    "sgd = linear_model.SGDRegressor(\n",
    "    learning_rate='constant',\n",
    "    eta0=0.01,\n",
    "    penalty='none',\n",
    "    shuffle=False,\n",
    "    max_iter=1\n",
    ")\n",
    "mae = metrics.MAE()\n",
    "\n",
    "for i, (x, y) in enumerate(zip(np.array_split(X, n_splits), np.array_split(Y, n_splits))):\n",
    "    \n",
    "    if i > 0:\n",
    "        y_pred = sgd.predict(x)\n",
    "        for yt, yp in zip(y, y_pred):\n",
    "            mae.update(yt, yp)\n",
    "        \n",
    "    sgd.partial_fit(x, y)\n",
    "    \n",
    "mae"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "creme."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "MAE: 185,603.285604"
      ]
     },
     "execution_count": 36,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import pandas as pd\n",
    "\n",
    "X, Y = datasets.fetch_california_housing(return_X_y=True)\n",
    "X = preprocessing.scale(X)\n",
    "\n",
    "sgd = LinearRegression(\n",
    "    loss=Squared(),\n",
    "    optimizer=optim.SGD(0.01),\n",
    "    l2=0.,\n",
    "    intercept_lr=0.01\n",
    ")\n",
    "mae = metrics.MAE()\n",
    "\n",
    "for i, (x, y) in enumerate(zip(np.array_split(X, n_splits), np.array_split(Y, n_splits))):\n",
    "    \n",
    "    x = pd.DataFrame(x)\n",
    "    y = pd.Series(y)\n",
    "    \n",
    "    if i > 0:\n",
    "        y_pred = sgd.predict_many(x)\n",
    "        for yt, yp in zip(y, y_pred):\n",
    "            mae.update(yt, yp)\n",
    "        \n",
    "    sgd.fit_many(x, y)\n",
    "    \n",
    "mae"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Mini-batches with classification"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "scikit-learn."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "LogLoss: 0.006645"
      ]
     },
     "execution_count": 32,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from sklearn import utils\n",
    "from sklearn import linear_model\n",
    "\n",
    "X = pd.read_csv('/Users/mhalford/creme_data/CreditCard/creditcard.csv')\n",
    "Y = X.pop('Class')\n",
    "X = preprocessing.scale(X)\n",
    "classes = np.unique(Y)\n",
    "\n",
    "batch_size = 1\n",
    "n_splits = len(X) // batch_size\n",
    "\n",
    "sgd = linear_model.SGDClassifier(\n",
    "    learning_rate='constant',\n",
    "    eta0=0.01,\n",
    "    penalty='none',\n",
    "    loss='log',\n",
    "    shuffle=False,\n",
    "    max_iter=1\n",
    ")\n",
    "metric = metrics.LogLoss()\n",
    "\n",
    "for i, (x, y) in enumerate(zip(np.array_split(X, n_splits), np.array_split(Y, n_splits))):\n",
    "    \n",
    "    if i > 0:\n",
    "        y_pred = sgd.predict_proba(x)[:, 1]\n",
    "        for yt, yp in zip(y, y_pred):\n",
    "            metric.update(yt, yp)\n",
    "        \n",
    "    sgd.partial_fit(x, y, classes=classes)\n",
    "    \n",
    "metric"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "creme."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "LogLoss: 0.006645"
      ]
     },
     "execution_count": 33,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "sgd = LinearRegression(\n",
    "    loss=Log(),\n",
    "    optimizer=optim.SGD(0.01),\n",
    "    l2=0.,\n",
    "    intercept_lr=0.01\n",
    ")\n",
    "metric = metrics.LogLoss()\n",
    "\n",
    "\n",
    "for i, (x, y) in enumerate(zip(np.array_split(X, n_splits), np.array_split(Y, n_splits))):\n",
    "    \n",
    "    x = pd.DataFrame(x)\n",
    "    \n",
    "    if i > 0:\n",
    "        y_pred = sgd.predict_many(x)\n",
    "        for yt, yp in zip(y, y_pred):\n",
    "            metric.update(yt, yp)\n",
    "        \n",
    "    sgd.fit_many(x, y)\n",
    "    \n",
    "metric"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Speed comparison"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "from sklearn import preprocessing\n",
    "\n",
    "X = pd.read_csv('/Users/mhalford/creme_data/CreditCard/creditcard.csv')\n",
    "Y = X.pop('Class')\n",
    "X[:] = preprocessing.scale(X)\n",
    "classes = np.unique(Y)\n",
    "\n",
    "batch_size = 512\n",
    "n_batches = len(X) // batch_size"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "344 ms ± 29.4 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n",
      "8.55 ms ± 142 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n"
     ]
    }
   ],
   "source": [
    "from creme import linear_model\n",
    "\n",
    "model = linear_model.LogisticRegression()\n",
    "\n",
    "%timeit for x_batch, y_batch in zip(np.array_split(X, n_batches), np.array_split(Y, n_batches)): model.fit_many(x_batch, y_batch)\n",
    "%timeit model.predict_proba_many(X)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "745 ms ± 41.1 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n",
      "13.3 ms ± 470 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n"
     ]
    }
   ],
   "source": [
    "from sklearn import linear_model\n",
    "\n",
    "model = linear_model.SGDClassifier(loss='log')\n",
    "\n",
    "%timeit for x_batch, y_batch in zip(np.array_split(X, n_batches), np.array_split(Y, n_batches)): model.partial_fit(x_batch, y_batch, classes=classes)\n",
    "%timeit model.predict_proba(X)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "ename": "AttributeError",
     "evalue": "'Pipeline' object has no attribute 'predict_proba_many'",
     "output_type": "error",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mAttributeError\u001b[0m                            Traceback (most recent call last)",
      "\u001b[0;32m<timed exec>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n",
      "\u001b[0;31mAttributeError\u001b[0m: 'Pipeline' object has no attribute 'predict_proba_many'"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "\n",
    "from creme import compose\n",
    "from creme import linear_model\n",
    "from creme import preprocessing\n",
    "\n",
    "model = compose.Pipeline(\n",
    "    preprocessing.StandardScaler(),\n",
    "    linear_model.LogisticRegression()\n",
    ")\n",
    "\n",
    "for x_batch, y_batch in zip(np.array_split(X, n_batches), np.array_split(Y, n_batches)):\n",
    "    y_pred = model.predict_proba_many(x_batch)\n",
    "    model.fit_many(x_batch, y_batch)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}

## standard-scaling.ipynb

      
Display the source blob

    
Display the rendered blob

    
    Raw
  

              standard-scaling.ipynb
            
          
      Sorry, something went wrong. Reload?
      Sorry, we cannot display this file.
      Sorry, this file is invalid so it cannot be displayed.
      
          Viewer requires iframe.
	{
	"cells": [
	{
	"cell_type": "code",
	"execution_count": 1,
	"metadata": {},
	"outputs": [],
	"source": [
	"%load_ext autoreload\n",
	"%autoreload 2"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"# Linear models in mini-batches"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 26,
	"metadata": {},
	"outputs": [],
	"source": [
	"import collections\n",
	"import math\n",
	"import numbers\n",
	"import typing\n",
	"\n",
	"import numpy as np\n",
	"\n",
	"from creme import base\n",
	"from creme import optim\n",
	"from creme import utils\n",
	"\n",
	"\n",
	"class GLM:\n",
	"\n",
	" def __init__(self, optimizer, loss, l2, intercept, intercept_lr, clip_gradient, initializer):\n",
	" self.optimizer = optimizer\n",
	" self.loss = loss\n",
	" self.l2 = l2\n",
	" self.intercept = intercept\n",
	" self.intercept_lr = (\n",
	" optim.schedulers.Constant(intercept_lr)\n",
	" if isinstance(intercept_lr, numbers.Number) else\n",
	" intercept_lr\n",
	" )\n",
	" self.clip_gradient = clip_gradient\n",
	" self.weights = collections.defaultdict(initializer)\n",
	" self.initializer = initializer\n",
	" \n",
	" def _fit(self, x, y, w, get_grad):\n",
	" \n",
	" # Some optimizers need to do something before a prediction is made\n",
	" self.weights = self.optimizer.update_before_pred(w=self.weights)\n",
	"\n",
	" # Calculate the gradient\n",
	" gradient, loss_gradient = get_grad(x, y, w)\n",
	"\n",
	" # Update the intercept\n",
	" self.intercept -= self.intercept_lr.get(self.optimizer.n_iterations) * loss_gradient\n",
	"\n",
	" # Update the weights\n",
	" self.weights = self.optimizer.update_after_pred(w=self.weights, g=gradient)\n",
	"\n",
	" return self\n",
	" \n",
	" # Single instance methods\n",
	" \n",
	" def _raw_dot_one(self, x):\n",
	" return utils.math.dot(self.weights, x) + self.intercept\n",
	" \n",
	" def _eval_gradient_one(self, x, y, w):\n",
	"\n",
	" loss_gradient = self.loss.gradient(y_true=y, y_pred=self._raw_dot_one(x))\n",
	" loss_gradient *= w\n",
	" loss_gradient = np.clip(loss_gradient, -self.clip_gradient, self.clip_gradient)\n",
	"\n",
	" return (\n",
	" {\n",
	" i: xi * loss_gradient + 2. * self.l2 * self.weights.get(i, 0)\n",
	" for i, xi in x.items()\n",
	" },\n",
	" loss_gradient\n",
	" )\n",
	"\n",
	" def fit_one(self, x, y, w=1.):\n",
	" return self._fit(x, y, w, get_grad=self._eval_gradient_one)\n",
	" \n",
	" # Mini-batch methods\n",
	" \n",
	" def _raw_dot_many(self, X):\n",
	" weights = np.array([self.weights[c] for c in X.columns])\n",
	" return X.values @ weights + self.intercept\n",
	" \n",
	" def _eval_gradient_many(self, X, y, w):\n",
	" \n",
	" loss_gradient = self.loss.gradient(y_true=y.values, y_pred=self._raw_dot_many(X))\n",
	" loss_gradient *= w\n",
	" loss_gradient = np.clip(loss_gradient, -self.clip_gradient, self.clip_gradient)\n",
	" \n",
	" # At this point we have a feature matrix X of shape (n, p). The loss gradient is a vector of\n",
	" # length p. We want to multiply each of X's by the corresponding value in the loss gradient.\n",
	" # When this is all done, we collapse X by computing the average of each column, thereby\n",
	" # obtaining the mean gradient of the batch. From thereon, the code reduces to the single\n",
	" # instance case.\n",
	" gradient = np.einsum('ij,i->ij', x.values, loss_gradient).mean(axis=0)\n",
	" \n",
	" return dict(zip(X.columns, gradient)), loss_gradient.mean() \n",
	" \n",
	" def fit_many(self, X, y, w=1):\n",
	" return self._fit(x, y, w, get_grad=self._eval_gradient_many) \n",
	" \n",
	"\n",
	"class LinearRegression(GLM, base.Regressor):\n",
	"\n",
	" def __init__(\n",
	" self,\n",
	" optimizer: optim.Optimizer = None,\n",
	" loss: optim.losses.RegressionLoss = None,\n",
	" l2=.0,\n",
	" intercept=0.,\n",
	" intercept_lr: typing.Union[optim.schedulers.Scheduler, float] = .01,\n",
	" clip_gradient=1e+12,\n",
	" initializer: optim.initializers.Initializer = None\n",
	" ):\n",
	" super().__init__(\n",
	" optimizer=(\n",
	" optim.SGD(optim.schedulers.InverseScaling(.01, .25))\n",
	" if optimizer is None else\n",
	" optimizer\n",
	" ),\n",
	" loss=optim.losses.Squared() if loss is None else loss,\n",
	" intercept=intercept,\n",
	" intercept_lr=intercept_lr,\n",
	" l2=l2,\n",
	" clip_gradient=clip_gradient,\n",
	" initializer=initializer if initializer else optim.initializers.Zeros()\n",
	" )\n",
	"\n",
	" def predict_one(self, x):\n",
	" return self.loss.mean_func(self._raw_dot_one(x))\n",
	" \n",
	" def predict_many(self, X):\n",
	" return self.loss.mean_func(self._raw_dot_many(x))\n",
	" \n",
	" \n",
	"class Squared:\n",
	" \n",
	" def gradient(self, y_true, y_pred):\n",
	" return y_pred - y_true\n",
	" \n",
	" def mean_func(self, y):\n",
	" return y\n",
	" \n",
	" \n",
	"class Log:\n",
	" \n",
	" def gradient(self, y_true, y_pred):\n",
	" y_true = y_true * 2 - 1\n",
	" z = y_pred * y_true\n",
	" return -y_true / (np.exp(z) + 1.)\n",
	" \n",
	" def mean_func(self, y):\n",
	" return 1 / (1 + np.exp(-y))"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 85,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"array([4, 4, 4])"
	]
	},
	"execution_count": 85,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"np.maximum([1, 2, 3], 4)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 3,
	"metadata": {},
	"outputs": [],
	"source": [
	"#weights = np.array([sgd.weights[c] for c in x.columns])\n",
	"#weights += 1\n",
	"#y_pred = x.values @ weights + sgd.intercept\n",
	"#loss_gradient = sgd.loss.gradient(y_true=y.values, y_pred=y_pred)\n",
	"#gradient = np.einsum('ij,i->ij', x.values, loss_gradient)\n",
	"#gradient = gradient.mean(axis=0)\n",
	"#gradient"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"## Single-instance"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"scikit-learn."
	]
	},
	{
	"cell_type": "code",
	"execution_count": 4,
	"metadata": {
	"jupyter": {
	"source_hidden": true
	}
	},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"MAE: 5.766969"
	]
	},
	"execution_count": 4,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"from creme import metrics\n",
	"from sklearn import datasets\n",
	"from sklearn import linear_model\n",
	"from sklearn import preprocessing\n",
	"\n",
	"X, Y = datasets.load_boston(return_X_y=True)\n",
	"X = preprocessing.scale(X)\n",
	"\n",
	"sgd = linear_model.SGDRegressor(\n",
	" learning_rate='constant',\n",
	" eta0=0.01,\n",
	" penalty='none'\n",
	")\n",
	"mae = metrics.MAE()\n",
	"\n",
	"for i, (x, y) in enumerate(zip(X, Y)):\n",
	" \n",
	" if i > 0:\n",
	" y_pred = sgd.predict([x])\n",
	" mae.update(y, y_pred)\n",
	" \n",
	" sgd.partial_fit([x], [y])\n",
	" \n",
	"mae"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"creme."
	]
	},
	{
	"cell_type": "code",
	"execution_count": 5,
	"metadata": {
	"jupyter": {
	"source_hidden": true
	}
	},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"MAE: 5.766969"
	]
	},
	"execution_count": 5,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"from creme import optim\n",
	"\n",
	"sgd = LinearRegression(\n",
	" loss=Squared(),\n",
	" optimizer=optim.SGD(0.01),\n",
	" l2=0.,\n",
	" intercept_lr=0.01\n",
	")\n",
	"mae = metrics.MAE()\n",
	"\n",
	"for i, (x, y) in enumerate(zip(X, Y)):\n",
	" \n",
	" x = dict(enumerate(x))\n",
	" \n",
	" if i > 0:\n",
	" y_pred = sgd.predict_one(x)\n",
	" mae.update(y, y_pred)\n",
	" \n",
	" sgd.fit_one(x, y)\n",
	" \n",
	"mae"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"## Single instance with classification"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"scikit-learn."
	]
	},
	{
	"cell_type": "code",
	"execution_count": 6,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"LogLoss: 0.078291"
	]
	},
	"execution_count": 6,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"import pandas as pd\n",
	"\n",
	"X = pd.read_csv('/Users/mhalford/creme_data/CreditCard/creditcard.csv')[:30000]\n",
	"Y = X.pop('Class')\n",
	"X = preprocessing.scale(X)\n",
	"classes = np.unique(Y)\n",
	"\n",
	"sgd = linear_model.SGDClassifier(\n",
	" learning_rate='constant',\n",
	" loss='log',\n",
	" eta0=0.01,\n",
	" penalty='none'\n",
	")\n",
	"metric = metrics.LogLoss()\n",
	"\n",
	"for i, (x, y) in enumerate(zip(X, Y)):\n",
	" \n",
	" if i > 0:\n",
	" y_pred = sgd.predict([x])\n",
	" metric.update(y, y_pred)\n",
	" \n",
	" sgd.partial_fit([x], [y], classes=classes)\n",
	" \n",
	"metric"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"creme."
	]
	},
	{
	"cell_type": "code",
	"execution_count": 7,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"LogLoss: 0.078291"
	]
	},
	"execution_count": 7,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"from creme import linear_model\n",
	"\n",
	"sgd = linear_model.LogisticRegression(\n",
	" #loss=Log(),\n",
	" optimizer=optim.SGD(0.01),\n",
	" l2=0.,\n",
	" intercept_lr=0.01\n",
	")\n",
	"metric = metrics.LogLoss()\n",
	"\n",
	"for i, (x, y) in enumerate(zip(X, Y)):\n",
	" \n",
	" x = dict(enumerate(x))\n",
	" \n",
	" if i > 0:\n",
	" y_pred = sgd.predict_one(x)\n",
	" metric.update(y, y_pred)\n",
	" \n",
	" sgd.fit_one(x, y)\n",
	" \n",
	"metric"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 8,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"LogLoss: 0.078291"
	]
	},
	"execution_count": 8,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"from creme import linear_model\n",
	"\n",
	"sgd = linear_model.LogisticRegression(\n",
	" loss=Log(),\n",
	" optimizer=optim.SGD(0.01),\n",
	" l2=0.,\n",
	" intercept_lr=0.01\n",
	")\n",
	"metric = metrics.LogLoss()\n",
	"\n",
	"for i, (x, y) in enumerate(zip(X, Y)):\n",
	" \n",
	" x = dict(enumerate(x))\n",
	" \n",
	" if i > 0:\n",
	" y_pred = sgd.predict_one(x)\n",
	" metric.update(y, y_pred)\n",
	" \n",
	" sgd.fit_one(x, y)\n",
	" \n",
	"metric"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"## Mini-batches"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"scikit-learn."
	]
	},
	{
	"cell_type": "code",
	"execution_count": 34,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"MAE: 185,603.285604"
	]
	},
	"execution_count": 34,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"from sklearn import linear_model\n",
	"\n",
	"X, Y = datasets.fetch_california_housing(return_X_y=True)\n",
	"X = preprocessing.scale(X)\n",
	"\n",
	"batch_size = 1\n",
	"n_splits = len(X) // batch_size\n",
	"\n",
	"sgd = linear_model.SGDRegressor(\n",
	" learning_rate='constant',\n",
	" eta0=0.01,\n",
	" penalty='none',\n",
	" shuffle=False,\n",
	" max_iter=1\n",
	")\n",
	"mae = metrics.MAE()\n",
	"\n",
	"for i, (x, y) in enumerate(zip(np.array_split(X, n_splits), np.array_split(Y, n_splits))):\n",
	" \n",
	" if i > 0:\n",
	" y_pred = sgd.predict(x)\n",
	" for yt, yp in zip(y, y_pred):\n",
	" mae.update(yt, yp)\n",
	" \n",
	" sgd.partial_fit(x, y)\n",
	" \n",
	"mae"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"creme."
	]
	},
	{
	"cell_type": "code",
	"execution_count": 36,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"MAE: 185,603.285604"
	]
	},
	"execution_count": 36,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"import pandas as pd\n",
	"\n",
	"X, Y = datasets.fetch_california_housing(return_X_y=True)\n",
	"X = preprocessing.scale(X)\n",
	"\n",
	"sgd = LinearRegression(\n",
	" loss=Squared(),\n",
	" optimizer=optim.SGD(0.01),\n",
	" l2=0.,\n",
	" intercept_lr=0.01\n",
	")\n",
	"mae = metrics.MAE()\n",
	"\n",
	"for i, (x, y) in enumerate(zip(np.array_split(X, n_splits), np.array_split(Y, n_splits))):\n",
	" \n",
	" x = pd.DataFrame(x)\n",
	" y = pd.Series(y)\n",
	" \n",
	" if i > 0:\n",
	" y_pred = sgd.predict_many(x)\n",
	" for yt, yp in zip(y, y_pred):\n",
	" mae.update(yt, yp)\n",
	" \n",
	" sgd.fit_many(x, y)\n",
	" \n",
	"mae"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"## Mini-batches with classification"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"scikit-learn."
	]
	},
	{
	"cell_type": "code",
	"execution_count": 32,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"LogLoss: 0.006645"
	]
	},
	"execution_count": 32,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"from sklearn import utils\n",
	"from sklearn import linear_model\n",
	"\n",
	"X = pd.read_csv('/Users/mhalford/creme_data/CreditCard/creditcard.csv')\n",
	"Y = X.pop('Class')\n",
	"X = preprocessing.scale(X)\n",
	"classes = np.unique(Y)\n",
	"\n",
	"batch_size = 1\n",
	"n_splits = len(X) // batch_size\n",
	"\n",
	"sgd = linear_model.SGDClassifier(\n",
	" learning_rate='constant',\n",
	" eta0=0.01,\n",
	" penalty='none',\n",
	" loss='log',\n",
	" shuffle=False,\n",
	" max_iter=1\n",
	")\n",
	"metric = metrics.LogLoss()\n",
	"\n",
	"for i, (x, y) in enumerate(zip(np.array_split(X, n_splits), np.array_split(Y, n_splits))):\n",
	" \n",
	" if i > 0:\n",
	" y_pred = sgd.predict_proba(x)[:, 1]\n",
	" for yt, yp in zip(y, y_pred):\n",
	" metric.update(yt, yp)\n",
	" \n",
	" sgd.partial_fit(x, y, classes=classes)\n",
	" \n",
	"metric"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"creme."
	]
	},
	{
	"cell_type": "code",
	"execution_count": 33,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"LogLoss: 0.006645"
	]
	},
	"execution_count": 33,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"sgd = LinearRegression(\n",
	" loss=Log(),\n",
	" optimizer=optim.SGD(0.01),\n",
	" l2=0.,\n",
	" intercept_lr=0.01\n",
	")\n",
	"metric = metrics.LogLoss()\n",
	"\n",
	"\n",
	"for i, (x, y) in enumerate(zip(np.array_split(X, n_splits), np.array_split(Y, n_splits))):\n",
	" \n",
	" x = pd.DataFrame(x)\n",
	" \n",
	" if i > 0:\n",
	" y_pred = sgd.predict_many(x)\n",
	" for yt, yp in zip(y, y_pred):\n",
	" metric.update(yt, yp)\n",
	" \n",
	" sgd.fit_many(x, y)\n",
	" \n",
	"metric"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"## Speed comparison"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 2,
	"metadata": {},
	"outputs": [],
	"source": [
	"import numpy as np\n",
	"import pandas as pd\n",
	"from sklearn import preprocessing\n",
	"\n",
	"X = pd.read_csv('/Users/mhalford/creme_data/CreditCard/creditcard.csv')\n",
	"Y = X.pop('Class')\n",
	"X[:] = preprocessing.scale(X)\n",
	"classes = np.unique(Y)\n",
	"\n",
	"batch_size = 512\n",
	"n_batches = len(X) // batch_size"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 3,
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"344 ms ± 29.4 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n",
	"8.55 ms ± 142 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n"
	]
	}
	],
	"source": [
	"from creme import linear_model\n",
	"\n",
	"model = linear_model.LogisticRegression()\n",
	"\n",
	"%timeit for x_batch, y_batch in zip(np.array_split(X, n_batches), np.array_split(Y, n_batches)): model.fit_many(x_batch, y_batch)\n",
	"%timeit model.predict_proba_many(X)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 4,
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"745 ms ± 41.1 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n",
	"13.3 ms ± 470 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n"
	]
	}
	],
	"source": [
	"from sklearn import linear_model\n",
	"\n",
	"model = linear_model.SGDClassifier(loss='log')\n",
	"\n",
	"%timeit for x_batch, y_batch in zip(np.array_split(X, n_batches), np.array_split(Y, n_batches)): model.partial_fit(x_batch, y_batch, classes=classes)\n",
	"%timeit model.predict_proba(X)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 9,
	"metadata": {},
	"outputs": [
	{
	"ename": "AttributeError",
	"evalue": "'Pipeline' object has no attribute 'predict_proba_many'",
	"output_type": "error",
	"traceback": [
	"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
	"\u001b[0;31mAttributeError\u001b[0m Traceback (most recent call last)",
	"\u001b[0;32m<timed exec>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n",
	"\u001b[0;31mAttributeError\u001b[0m: 'Pipeline' object has no attribute 'predict_proba_many'"
	]
	}
	],
	"source": [
	"%%time\n",
	"\n",
	"from creme import compose\n",
	"from creme import linear_model\n",
	"from creme import preprocessing\n",
	"\n",
	"model = compose.Pipeline(\n",
	" preprocessing.StandardScaler(),\n",
	" linear_model.LogisticRegression()\n",
	")\n",
	"\n",
	"for x_batch, y_batch in zip(np.array_split(X, n_batches), np.array_split(Y, n_batches)):\n",
	" y_pred = model.predict_proba_many(x_batch)\n",
	" model.fit_many(x_batch, y_batch)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": []
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": []
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": []
	}
	],
	"metadata": {
	"kernelspec": {
	"display_name": "Python 3",
	"language": "python",
	"name": "python3"
	},
	"language_info": {
	"codemirror_mode": {
	"name": "ipython",
	"version": 3
	},
	"file_extension": ".py",
	"mimetype": "text/x-python",
	"name": "python",
	"nbconvert_exporter": "python",
	"pygments_lexer": "ipython3",
	"version": "3.7.7"
	}
	},
	"nbformat": 4,
	"nbformat_minor": 4
	}