Skip to content

Instantly share code, notes, and snippets.

@MaxHalford
Created March 31, 2020 10:15
Show Gist options
  • Save MaxHalford/47cd83f7cb8e23d2db5616ba9b177ea9 to your computer and use it in GitHub Desktop.
Save MaxHalford/47cd83f7cb8e23d2db5616ba9b177ea9 to your computer and use it in GitHub Desktop.
Improving scikit-learn's single prediction speed
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Speeding up scikit-learn for single predictions"
]
},
{
"cell_type": "code",
"execution_count": 44,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'0.22.2.post1'"
]
},
"execution_count": 44,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import sklearn\n",
"\n",
"sklearn.__version__"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Linear regression"
]
},
{
"cell_type": "code",
"execution_count": 31,
"metadata": {},
"outputs": [],
"source": [
"from sklearn import datasets\n",
"\n",
"# Boston dataset as a feature matrix X and a target vector y.\n",
"boston = datasets.load_boston()\n",
"X, y = boston.data, boston.target"
]
},
{
"cell_type": "code",
"execution_count": 32,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"44.4 µs ± 3.7 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)\n"
]
}
],
"source": [
"from sklearn import linear_model\n",
"\n",
"# Reference timing: scikit-learn's predict on a batch holding a single row.\n",
"lin_reg = linear_model.LinearRegression().fit(X, y)\n",
"%timeit lin_reg.predict(X[[0]])[0]"
]
},
{
"cell_type": "code",
"execution_count": 34,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"1.38 µs ± 31.7 ns per loop (mean ± std. dev. of 7 runs, 1000000 loops each)\n"
]
}
],
"source": [
"import numpy as np\n",
"\n",
"class BarebonesLinearRegression(linear_model.LinearRegression):\n",
"    \"\"\"LinearRegression extended with a direct prediction path for one sample.\"\"\"\n",
"\n",
"    def predict_single(self, x):\n",
"        \"\"\"Return the prediction for a single feature vector x.\n",
"\n",
"        Computes dot(coef_, x) + intercept_ straight from the fitted\n",
"        attributes, without going through predict.\n",
"        \"\"\"\n",
"        weighted_sum = np.dot(self.coef_, x)\n",
"        return weighted_sum + self.intercept_\n",
"\n",
"\n",
"bb_lin_reg = BarebonesLinearRegression().fit(X, y)\n",
"%timeit bb_lin_reg.predict_single(X[0])"
]
},
{
"cell_type": "code",
"execution_count": 35,
"metadata": {},
"outputs": [],
"source": [
"# Compare with a tolerance: the two code paths may differ by floating-point rounding.\n",
"for xi in X:\n",
"    assert np.isclose(lin_reg.predict([xi])[0], bb_lin_reg.predict_single(xi))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Logistic regression"
]
},
{
"cell_type": "code",
"execution_count": 137,
"metadata": {},
"outputs": [],
"source": [
"from sklearn import datasets\n",
"from sklearn import preprocessing\n",
"\n",
"# Load the digits data and standardise the features.\n",
"digits_X, y = datasets.load_digits(return_X_y=True)\n",
"X = preprocessing.scale(digits_X)"
]
},
{
"cell_type": "code",
"execution_count": 138,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"71.3 µs ± 3.59 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)\n"
]
}
],
"source": [
"# Reference timing: predict_proba on a batch holding a single row.\n",
"log_reg = linear_model.LogisticRegression().fit(X, y)\n",
"%timeit log_reg.predict_proba(X[[0]])[0]"
]
},
{
"cell_type": "code",
"execution_count": 145,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" "
]
},
{
"data": {
"text/plain": [
" 4400004 function calls in 4.414 seconds\n",
"\n",
" Ordered by: internal time\n",
" List reduced from 34 to 10 due to restriction <10>\n",
"\n",
" ncalls tottime percall cumtime percall filename:lineno(function)\n",
" 100000 1.068 0.000 3.504 0.000 _logsumexp.py:9(logsumexp)\n",
" 200000 0.530 0.000 0.530 0.000 {method 'reduce' of 'numpy.ufunc' objects}\n",
" 300000 0.315 0.000 1.376 0.000 {built-in method numpy.core._multiarray_umath.implement_array_function}\n",
" 100000 0.257 0.000 3.761 0.000 _logsumexp.py:132(softmax)\n",
" 100000 0.254 0.000 4.316 0.000 <ipython-input-145-3bbdfc107533>:5(predict_proba_single)\n",
" 200000 0.245 0.000 0.578 0.000 _ufunc_config.py:39(seterr)\n",
" 100000 0.232 0.000 0.383 0.000 _util.py:200(_asarray_validated)\n",
" 200000 0.226 0.000 0.868 0.000 fromnumeric.py:73(_wrapreduction)\n",
" 200000 0.201 0.000 0.222 0.000 _ufunc_config.py:139(geterr)\n",
" 100000 0.095 0.000 0.520 0.000 fromnumeric.py:2092(sum)"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"from scipy import special\n",
"\n",
"class BarebonesLogisticRegression(linear_model.LogisticRegression):\n",
"    \"\"\"LogisticRegression extended with a direct probability path for one sample.\"\"\"\n",
"\n",
"    def predict_proba_single(self, x):\n",
"        \"\"\"Return class probabilities for a single feature vector x.\n",
"\n",
"        The per-class scores dot(coef_, x) + intercept_ are passed through\n",
"        scipy's softmax.\n",
"        NOTE(review): assumes a multinomial (softmax) model - confirm before\n",
"        using with binary or one-vs-rest fits.\n",
"        \"\"\"\n",
"        scores = np.dot(self.coef_, x) + self.intercept_\n",
"        return special.softmax(scores)\n",
"\n",
"\n",
"bb_log_reg = BarebonesLogisticRegression().fit(X, y)\n",
"# Profile repeated calls to see where the time goes.\n",
"%prun -l 10 [bb_log_reg.predict_proba_single(X[0]) for _ in range(100000)]"
]
},
{
"cell_type": "code",
"execution_count": 147,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"14.7 µs ± 682 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)\n"
]
}
],
"source": [
"def custom_softmax(x):\n",
"    \"\"\"Return the softmax of an array of scores.\n",
"\n",
"    The maximum score is subtracted before exponentiating so that np.exp\n",
"    cannot overflow; the shift cancels out in the final ratio.\n",
"    \"\"\"\n",
"    # np.max reduces inside NumPy instead of iterating element by element\n",
"    # like the builtin max does.\n",
"    exps = np.exp(x - np.max(x))\n",
"    return exps / np.sum(exps)\n",
"\n",
"\n",
"class BarebonesLogisticRegression(linear_model.LogisticRegression):\n",
"    \"\"\"LogisticRegression extended with a direct probability path for one sample.\"\"\"\n",
"\n",
"    def predict_proba_single(self, x):\n",
"        \"\"\"Return class probabilities for a single feature vector x.\n",
"\n",
"        The per-class scores dot(coef_, x) + intercept_ are passed through\n",
"        custom_softmax.\n",
"        NOTE(review): assumes a multinomial (softmax) model - confirm before\n",
"        using with binary or one-vs-rest fits.\n",
"        \"\"\"\n",
"        return custom_softmax(np.dot(self.coef_, x) + self.intercept_)\n",
"\n",
"\n",
"bb_log_reg = BarebonesLogisticRegression()\n",
"bb_log_reg.fit(X, y)\n",
"%timeit bb_log_reg.predict_proba_single(X[0])"
]
},
{
"cell_type": "code",
"execution_count": 148,
"metadata": {},
"outputs": [],
"source": [
"# Every sample must get the same probabilities from both code paths.\n",
"for xi in X:\n",
"    expected = log_reg.predict_proba([xi])[0]\n",
"    actual = bb_log_reg.predict_proba_single(xi)\n",
"    assert np.allclose(expected, actual)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Standard scaling"
]
},
{
"cell_type": "code",
"execution_count": 36,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"43 µs ± 1.07 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)\n"
]
}
],
"source": [
"from sklearn import preprocessing\n",
"\n",
"# Reference timing: transform on a batch holding a single row.\n",
"scaler = preprocessing.StandardScaler().fit(X)\n",
"%timeit scaler.transform(X[[0]])[0]"
]
},
{
"cell_type": "code",
"execution_count": 37,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"1.7 µs ± 34.6 ns per loop (mean ± std. dev. of 7 runs, 1000000 loops each)\n"
]
}
],
"source": [
"class BarebonesStandardScaler(preprocessing.StandardScaler):\n",
"    \"\"\"StandardScaler extended with a direct transform path for one sample.\"\"\"\n",
"\n",
"    def transform_single(self, x):\n",
"        \"\"\"Return the standardised version of a single feature vector x.\n",
"\n",
"        Uses the fitted scale_ rather than var_ ** .5: scale_ is already\n",
"        computed at fit time and is 1 for constant features, so a zero\n",
"        variance does not turn into a division by zero. The with_mean and\n",
"        with_std settings are honoured; when both are off x is returned as is.\n",
"        \"\"\"\n",
"        if self.with_mean:\n",
"            x = x - self.mean_\n",
"        if self.with_std:\n",
"            x = x / self.scale_\n",
"        return x\n",
"\n",
"\n",
"bb_scaler = BarebonesStandardScaler()\n",
"bb_scaler.fit(X)\n",
"%timeit bb_scaler.transform_single(X[0])"
]
},
{
"cell_type": "code",
"execution_count": 41,
"metadata": {},
"outputs": [],
"source": [
"# Compare with a tolerance rather than bit-for-bit equality.\n",
"for xi in X:\n",
"    assert np.allclose(scaler.transform([xi])[0], bb_scaler.transform_single(xi))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Pipeline"
]
},
{
"cell_type": "code",
"execution_count": 48,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"97.6 µs ± 4.19 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)\n"
]
}
],
"source": [
"from sklearn import pipeline\n",
"\n",
"# Build the pipeline from fresh estimators: fitting a pipeline fits its steps in\n",
"# place, which would otherwise overwrite the already-fitted scaler and lin_reg.\n",
"pp = pipeline.Pipeline([\n",
"    ('scaler', preprocessing.StandardScaler()),\n",
"    ('lin_reg', linear_model.LinearRegression()),\n",
"])\n",
"pp.fit(X, y)\n",
"%timeit pp.predict(X[[0]])[0]"
]
},
{
"cell_type": "code",
"execution_count": 56,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"3.96 µs ± 184 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)\n"
]
}
],
"source": [
"class BarebonesPipeline(pipeline.Pipeline):\n",
"    \"\"\"Pipeline whose steps expose *_single methods operating on one sample.\"\"\"\n",
"\n",
"    def _transform_single(self, x):\n",
"        \"\"\"Run x through every step except the last and return the result.\"\"\"\n",
"        for _, transformer in self.steps[:-1]:\n",
"            # Pipeline accepts None / 'passthrough' as a no-op step.\n",
"            if transformer is None or transformer == 'passthrough':\n",
"                continue\n",
"            x = transformer.transform_single(x)\n",
"        return x\n",
"\n",
"    def predict_single(self, x):\n",
"        \"\"\"Return the final step's predict_single applied to the transformed x.\"\"\"\n",
"        return self.steps[-1][1].predict_single(self._transform_single(x))\n",
"\n",
"    def predict_proba_single(self, x):\n",
"        \"\"\"Return the final step's predict_proba_single applied to the transformed x.\"\"\"\n",
"        return self.steps[-1][1].predict_proba_single(self._transform_single(x))\n",
"\n",
"\n",
"# Fresh step instances: fitting the pipeline fits its steps in place, which would\n",
"# otherwise overwrite the already-fitted bb_scaler and bb_lin_reg.\n",
"bb_pp = BarebonesPipeline([\n",
"    ('bb_scaler', BarebonesStandardScaler()),\n",
"    ('bb_lin_reg', BarebonesLinearRegression()),\n",
"])\n",
"bb_pp.fit(X, y)\n",
"%timeit bb_pp.predict_single(X[0])"
]
},
{
"cell_type": "code",
"execution_count": 57,
"metadata": {},
"outputs": [],
"source": [
"# Compare with a tolerance: the two code paths may differ by floating-point rounding.\n",
"for xi in X:\n",
"    assert np.isclose(pp.predict([xi])[0], bb_pp.predict_single(xi))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Decision tree"
]
},
{
"cell_type": "code",
"execution_count": 128,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"49 µs ± 1.41 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)\n"
]
}
],
"source": [
"from sklearn import tree\n",
"\n",
"X, y = datasets.fetch_california_housing(return_X_y=True)\n",
"# Fix random_state: scikit-learn permutes the features at each split, so two fits\n",
"# are only guaranteed to produce the same tree when the seed is fixed.\n",
"dtree = tree.DecisionTreeRegressor(max_depth=7, random_state=0)\n",
"dtree.fit(X, y)\n",
"%timeit dtree.predict(X[[0]])[0]"
]
},
{
"cell_type": "code",
"execution_count": 130,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"42.9 µs ± 1.34 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)\n"
]
}
],
"source": [
"class BarebonesDecisionTreeRegressor(tree.DecisionTreeRegressor):\n",
"    \"\"\"DecisionTreeRegressor extended with a direct prediction path for one sample.\"\"\"\n",
"\n",
"    def predict_single(self, x):\n",
"        \"\"\"Return the prediction for a single feature vector x.\n",
"\n",
"        Looks up the leaf that x falls into and returns the value stored\n",
"        there (first output only).\n",
"\n",
"        Raises\n",
"        ------\n",
"        ValueError\n",
"            If x does not have the number of features the tree was fitted on.\n",
"        \"\"\"\n",
"        # The low-level tree works on a 2-D float32 ndarray. Preparing that row\n",
"        # here and calling tree_.apply directly skips the input validation that\n",
"        # self.apply runs on every call.\n",
"        row = np.asarray(x, dtype=np.float32).reshape(1, -1)\n",
"        # Cheap guard, since the usual validation is bypassed.\n",
"        if row.shape[1] != self.tree_.n_features:\n",
"            raise ValueError('x has %d features, expected %d'\n",
"                             % (row.shape[1], self.tree_.n_features))\n",
"        leaf_idx = self.tree_.apply(row)[0]\n",
"        return self.tree_.value[leaf_idx, 0, 0]\n",
"\n",
"\n",
"# Fix random_state so that repeated fits on the same data give the same tree.\n",
"bb_dtree = BarebonesDecisionTreeRegressor(max_depth=7, random_state=0)\n",
"bb_dtree.fit(X, y)\n",
"%timeit bb_dtree.predict_single(X[0])"
]
},
{
"cell_type": "code",
"execution_count": 135,
"metadata": {},
"outputs": [],
"source": [
"# Every sample must get the same prediction from both code paths.\n",
"for xi in X:\n",
"    reference = dtree.predict([xi])[0]\n",
"    shortcut = bb_dtree.predict_single(xi)\n",
"    assert np.isclose(reference, shortcut)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.4"
}
},
"nbformat": 4,
"nbformat_minor": 4
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment