Skip to content

Instantly share code, notes, and snippets.

@stsievert
Last active August 1, 2018 18:35
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save stsievert/c675b3a237a60efbd01dcb112e29115b to your computer and use it in GitHub Desktop.
Save stsievert/c675b3a237a60efbd01dcb112e29115b to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Incremental Model Selection\n"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"from sklearn.linear_model import SGDClassifier\n",
"from dask_ml.datasets import make_classification\n",
"from dask_ml.model_selection._incremental import fit\n",
"from sklearn.datasets import fetch_20newsgroups_vectorized, load_digits\n",
"import dask.array as da\n",
"from sklearn.neural_network import MLPClassifier\n",
"from dask_ml.model_selection._incremental import fit\n",
"from dask_ml.model_selection import train_test_split\n",
"from sklearn.model_selection import ParameterSampler\n",
"import dask.array as da\n",
"import dask\n",
"from dask.distributed import Client\n",
"import seaborn as sns"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"from model_selection_algs import stop_on_plateau"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<table style=\"border: 2px solid white;\">\n",
"<tr>\n",
"<td style=\"vertical-align: top; border: 0px solid white\">\n",
"<h3>Client</h3>\n",
"<ul>\n",
" <li><b>Scheduler: </b>tcp://127.0.0.1:56024\n",
" <li><b>Dashboard: </b><a href='http://127.0.0.1:8787/status' target='_blank'>http://127.0.0.1:8787/status</a>\n",
"</ul>\n",
"</td>\n",
"<td style=\"vertical-align: top; border: 0px solid white\">\n",
"<h3>Cluster</h3>\n",
"<ul>\n",
" <li><b>Workers: </b>8</li>\n",
" <li><b>Cores: </b>8</li>\n",
" <li><b>Memory: </b>17.18 GB</li>\n",
"</ul>\n",
"</td>\n",
"</tr>\n",
"</table>"
],
"text/plain": [
"<Client: scheduler='tcp://127.0.0.1:56024' processes=8 cores=8>"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"client = Client(processes=True)\n",
"client"
]
},
{
"cell_type": "code",
"execution_count": 137,
"metadata": {},
"outputs": [],
"source": [
"data = load_digits()\n",
"_X = data.data\n",
"_y = data.target"
]
},
{
"cell_type": "code",
"execution_count": 143,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(dask.array<array, shape=(1797, 64), dtype=float64, chunksize=(1198, 64)>,\n",
" dask.array<array, shape=(1797,), dtype=int64, chunksize=(1198,)>)"
]
},
"execution_count": 143,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"n, d = _X.shape\n",
"X = da.from_array(_X, chunks=(n // 10, d))\n",
"y = da.from_array(_y, chunks=n // 10)\n",
"X, y"
]
},
{
"cell_type": "code",
"execution_count": 144,
"metadata": {},
"outputs": [],
"source": [
"model = MLPClassifier()\n",
"params = {'activation': ['logistic', 'tanh', 'relu'],\n",
" 'solver': ['adam', 'sgd'],\n",
" 'alpha': np.logspace(-6, -2, num=1000),\n",
" 'batch_size': 2**np.arange(5, 8 + 1),\n",
" 'learning_rate': ['constant', 'invscaling', 'adaptive'],\n",
" 'learning_rate_init': np.logspace(-4, -1, num=1000),\n",
" 'power_t': np.linspace(0.1, 0.75),\n",
" 'momentum': np.linspace(0, 1, num=1000),\n",
" 'beta_1': np.linspace(0.8, 0.99),\n",
" 'beta_2': 1 - np.logspace(-5, -2)}\n",
"classes = da.unique(y).compute()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Successive halving"
]
},
{
"cell_type": "code",
"execution_count": 145,
"metadata": {},
"outputs": [],
"source": [
"all_history = {}"
]
},
{
"cell_type": "code",
"execution_count": 158,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[(81.0, 3.0), (34.0, 9.0), (15.0, 27.0), (8.0, 81.0), (5.0, 243.0)]"
]
},
"execution_count": 158,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import math\n",
"R = 81 * 3\n",
"eta = 3.0\n",
"s_max = math.floor(math.log(R, eta))\n",
"B = (s_max + 1) * R\n",
"# for s in [...]:\n",
"s = np.arange(s_max + 1)\n",
"s = s_max - s # order brackets from most exploratory (largest s) to least\n",
"n = np.ceil(B / R * eta**s / (s + 1))\n",
"r = np.floor(R * eta**-s)\n",
"list(zip(n, r))"
]
},
{
"cell_type": "code",
"execution_count": 147,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0 4.0 27.0\n",
"1 6.0 9.0\n",
"2 12.0 3.0\n",
"3 27.0 1.0\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"distributed.nanny - WARNING - Restarting worker\n",
"distributed.nanny - WARNING - Restarting worker\n",
"distributed.nanny - WARNING - Restarting worker\n",
"distributed.nanny - WARNING - Restarting worker\n",
"distributed.nanny - WARNING - Restarting worker\n",
"distributed.nanny - WARNING - Restarting worker\n",
"distributed.nanny - WARNING - Restarting worker\n",
"distributed.nanny - WARNING - Restarting worker\n"
]
}
],
"source": [
"from sklearn.model_selection import ParameterSampler\n",
"from model_selection_algs import SHA\n",
"import math\n",
"\n",
"def test_sha(s=0, eta=3, n=81, r=5, repeat=0):\n",
" X_train, X_test, y_train, y_test = train_test_split(X, y)\n",
" \n",
" alg = SHA(n, r)\n",
" params_list = list(ParameterSampler(params, n))\n",
"\n",
" info, models, history = fit(model, params_list, X_train, y_train, X_test, y_test,\n",
" alg.fit, {'classes': classes})\n",
" history = [{'alg': f'sh-{s}', 'repeat': repeat, 'bracket': s, **h} for h in history]\n",
" return history\n",
"\n",
"for s, (ni, ri) in enumerate(reversed(list(zip(n, r)))):\n",
" print(s, ni, ri)\n",
" all_history[f'sh-{s}'] = [test_sha(repeat=repeat, n=ni, r=ri, s=s) for repeat in range(10)]"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
"from toolz import partial\n",
"\n",
"def test_rand(i=0, n=5, r=81):\n",
" X_train, X_test, y_train, y_test = train_test_split(X, y)\n",
" classes = da.unique(y).compute()\n",
"\n",
" params_list = list(ParameterSampler(params, n))\n",
"\n",
" fn = partial(stop_on_plateau, max_iter=r)\n",
" info, models, history = fit(model, params_list, X_train, y_train, X_test, y_test, fn,\n",
" {'classes': classes})\n",
" history = [{'alg': 'random sampling', 'repeat': i, 'n': n, 'r': r, **h}\n",
" for h in history]\n",
" return history"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(3613, 243)"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from toolz import concat\n",
"from pprint import pprint\n",
"\n",
"def get_total_calls(history):\n",
" total_calls = {}\n",
" for item in history:\n",
" if total_calls.get(item['model_id'], 0) < item['partial_fit_calls']:\n",
" total_calls[item['model_id']] = item['partial_fit_calls']\n",
" return sum(total_calls.values()), max(total_calls.values())\n",
"\n",
"max_calls = 0\n",
"total_calls = 0\n",
"for key, runs in all_history.items():\n",
" run = runs[0]\n",
" calls, most_calls = get_total_calls(run)\n",
" if most_calls > max_calls:\n",
" max_calls = most_calls\n",
" total_calls += calls\n",
" \n",
"total_calls, max_calls"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
"r = max_calls\n",
"n = int(total_calls / max_calls)\n",
"all_history['random'] = [test_rand(n=n, r=r, i=i) for i in range(20)]\n",
"all_history['random-2n'] = [test_rand(n=2*n, r=r, i=i) for i in range(20)]"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"dict_keys(['sh-0', 'sh-1', 'sh-2', 'sh-3', 'sh-4', 'random', 'random-2n'])"
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"all_history.keys()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Notes from the profile dashboard for all of Hyperband:\n",
"\n",
"* partial_fit: 163s (84%)\n",
" * copy: 15s (9% of fit)\n",
"* score: 30s (15%)\n",
"\n",
"Random sample profile:\n",
"\n",
"* partial_fit: 174s (91%)\n",
" * copy: 14s (8% of fit)\n",
"* score: 16s (8%)"
]
},
{
"cell_type": "code",
"execution_count": 47,
"metadata": {},
"outputs": [],
"source": [
"from pprint import pprint\n",
"out = []\n",
"for alg, hist in all_history.items():\n",
" for repeat, batch in enumerate(hist):\n",
" best_score = -np.inf\n",
" calls = {}\n",
" skips = 0\n",
" for k, item in enumerate(batch):\n",
" if 'score' not in item:\n",
" continue\n",
" calls[item['model_id']] = item['partial_fit_calls']\n",
" total_calls = sum(calls.values())\n",
" best_score = item['score'] if item['score'] > best_score else best_score\n",
" if 'random' in alg or alg == 'sh-0':\n",
" adaptive = False\n",
" bracket = 0\n",
" else:\n",
" adaptive = True\n",
" bracket = int(alg.split('-')[-1])\n",
"\n",
" out += [{'best_score': best_score,\n",
" 'total_partial_fit_calls': total_calls,\n",
" 'alg': alg,\n",
" 'epoch': total_calls / n,\n",
" 'adaptive': adaptive,\n",
" 'bracket': bracket,\n",
" 'repeat': repeat}]"
]
},
{
"cell_type": "code",
"execution_count": 48,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd"
]
},
{
"cell_type": "code",
"execution_count": 50,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>bracket</th>\n",
" <th>alg</th>\n",
" <th>score</th>\n",
" <th>model_id</th>\n",
" <th>partial_fit_calls</th>\n",
" <th>i</th>\n",
" <th>adaptive</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>0.0</td>\n",
" <td>sh-0</td>\n",
" <td>0.0</td>\n",
" <td>2</td>\n",
" <td>0</td>\n",
" <td>0.1</td>\n",
" <td>0.1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>0.0</td>\n",
" <td>sh-0</td>\n",
" <td>0.0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0.1</td>\n",
" <td>0.1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>0.0</td>\n",
" <td>sh-0</td>\n",
" <td>0.0</td>\n",
" <td>3</td>\n",
" <td>0</td>\n",
" <td>0.1</td>\n",
" <td>0.1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>0.0</td>\n",
" <td>sh-0</td>\n",
" <td>0.0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0.1</td>\n",
" <td>0.1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>0.0</td>\n",
" <td>sh-0</td>\n",
" <td>0.0</td>\n",
" <td>4</td>\n",
" <td>0</td>\n",
" <td>0.1</td>\n",
" <td>0.1</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" bracket alg score model_id partial_fit_calls i adaptive\n",
"0 0.0 sh-0 0.0 2 0 0.1 0.1\n",
"1 0.0 sh-0 0.0 1 0 0.1 0.1\n",
"2 0.0 sh-0 0.0 3 0 0.1 0.1\n",
"3 0.0 sh-0 0.0 0 0 0.1 0.1\n",
"4 0.0 sh-0 0.0 4 0 0.1 0.1"
]
},
"execution_count": 50,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import altair as alt\n",
"hist = pd.DataFrame(sum(sum(all_history.values(), []), []),\n",
" columns=['bracket', 'alg', 'score', 'model_id',\n",
" 'partial_fit_calls', 'i', 'adaptive'])\n",
"hist = hist.fillna(0.1)\n",
"hist.head()"
]
},
{
"cell_type": "code",
"execution_count": 51,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>adaptive</th>\n",
" <th>alg</th>\n",
" <th>best_score</th>\n",
" <th>bracket</th>\n",
" <th>epoch</th>\n",
" <th>repeat</th>\n",
" <th>total_partial_fit_calls</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>False</td>\n",
" <td>sh-0</td>\n",
" <td>0.0</td>\n",
" <td>0</td>\n",
" <td>0.0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>False</td>\n",
" <td>sh-0</td>\n",
" <td>0.0</td>\n",
" <td>0</td>\n",
" <td>0.0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>False</td>\n",
" <td>sh-0</td>\n",
" <td>0.0</td>\n",
" <td>0</td>\n",
" <td>0.0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>False</td>\n",
" <td>sh-0</td>\n",
" <td>0.0</td>\n",
" <td>0</td>\n",
" <td>0.0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>False</td>\n",
" <td>sh-0</td>\n",
" <td>0.0</td>\n",
" <td>0</td>\n",
" <td>0.0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" adaptive alg best_score bracket epoch repeat total_partial_fit_calls\n",
"0 False sh-0 0.0 0 0.0 0 0\n",
"1 False sh-0 0.0 0 0.0 0 0\n",
"2 False sh-0 0.0 0 0.0 0 0\n",
"3 False sh-0 0.0 0 0.0 0 0\n",
"4 False sh-0 0.0 0 0.0 0 0"
]
},
"execution_count": 51,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import pandas as pd\n",
"df = pd.DataFrame(out)\n",
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": 149,
"metadata": {},
"outputs": [],
"source": [
"from datetime import datetime\n",
"today = datetime.now().isoformat()[:10]\n",
"# df.to_parquet(today + '-df-even.parquet')\n",
"# hist.to_parquet(today + '-show-even.parquet')"
]
},
{
"cell_type": "code",
"execution_count": 157,
"metadata": {},
"outputs": [
{
"data": {
"image/png": "\n",
"text/plain": [
"<Figure size 640x480 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"def get(df, key):\n",
" values = df[key].unique()\n",
" assert len(values) == 1\n",
" return values[0]\n",
"\n",
"\n",
"plt.style.use('bmh')\n",
"fig, ax = plt.subplots()\n",
"\n",
"import matplotlib\n",
"cmap = matplotlib.cm.get_cmap('plasma')\n",
"n = len(df.bracket.unique())\n",
"colors = [cmap(i / n) for i in range(n)]\n",
"\n",
"labels = []\n",
"for alg in df.alg.unique():\n",
" show = df[df.alg == alg]\n",
" bracket = get(show, 'bracket')\n",
" adaptive = get(show, 'adaptive')\n",
" show = show.pivot_table(values='best_score',\n",
" index='total_partial_fit_calls',\n",
" aggfunc=np.mean)\n",
" alpha = 1\n",
" if 'random' in alg:\n",
" color = 'black'\n",
" alpha = 0.50\n",
" if '2n' in alg:\n",
" alpha = 0.5\n",
" continue\n",
" else:\n",
" color = colors[bracket]\n",
" show.plot(ax=ax, color=color,\n",
" style='-.' if not adaptive else '-',\n",
" legend=False, alpha=alpha)\n",
" if 'sh' in alg:\n",
" bracket = alg.split('-')[-1]\n",
" label = f'succ-halv {bracket}'\n",
" else:\n",
" label = alg\n",
" labels += [label]\n",
"plt.legend(labels, loc='best')\n",
"plt.ylim(0.70, 1)\n",
"plt.ylabel('score')\n",
"# plt.grid()\n",
"plt.savefig('2018-08-01-even-243.png', dpi=300)\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.5"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
import math
import toolz
import numpy as np
def stop_on_plateau(info, patience=10, tol=0.001, max_iter=None):
    """Decide, per model, whether its score history has stopped improving.

    Parameters
    ----------
    info : dict
        Maps a model identifier to its list of records (oldest first).
        Each record is a mapping with at least a ``'score'`` entry.
    patience : int
        Size of the trailing window of records that is inspected.
    tol : float
        Improvement over the score at the start of the window that a record
        must reach for the model to count as still improving.
    max_iter : int or None
        When given, any model with more than ``max_iter`` records is stopped
        regardless of its scores.

    Returns
    -------
    dict
        Maps every identifier in ``info`` to ``0`` (stop) or ``1`` (continue).
        NOTE(review): presumably consumed as the number of additional
        ``partial_fit`` calls by the caller's fit loop -- confirm there.
    """
    out = {}
    for ident, records in info.items():
        if max_iter is not None and len(records) > max_iter:
            # Hard cap on history length takes priority over the plateau test.
            out[ident] = 0
        elif len(records) > patience:
            # Baseline is the first record of the trailing window.
            old = records[-patience]['score']
            window = records[-patience:]
            # Plateau: no record in the window reaches baseline + tol.
            plateaued = all(d['score'] < old + tol for d in window)
            out[ident] = 0 if plateaued else 1
        else:
            # Not enough history yet to judge; keep going.
            out[ident] = 1
    return out
class SHA:
    """Stateful successive-halving schedule, driven through :meth:`fit`.

    Parameters
    ----------
    n : number
        Number of models expected in the first call to :meth:`fit`.
    r : number
        ``partial_fit_calls`` target for the first round; later rounds use
        ``r * eta ** steps``.
    eta : number
        Reduction factor: round ``i`` keeps ``floor(n * eta ** -i)`` models.
    """

    def __init__(self, n, r, eta=3):
        self.steps = 0  # rounds started so far; 0 means fit() was never called
        self.n = n
        self.r = r
        self.eta = eta
        # Best model seen once fewer than ``eta`` survive. Initialised here so
        # a round that keeps no model at all cannot hit an unset attribute.
        self._best_arm = None

    def fit(self, info):
        """Return ``{model_id: 0 or 1}`` for the models to carry forward.

        ``info`` maps a model identifier to its list of records; the last
        record of each list must provide ``'score'`` and
        ``'partial_fit_calls'``.
        NOTE(review): the returned values look like "additional calls"
        instructions for the caller's fit loop -- confirm against the caller.
        """
        n, r, eta = self.n, self.r, self.eta
        n_i = math.floor(n * eta ** -self.steps)  # models to keep this round
        r_i = r * eta ** self.steps               # call target this round

        if self.steps == 0:
            # First call: accept every model and record the initial target.
            self.steps = 1
            assert len(info) == self.n
            self.to_reach = {k: r_i for k in info}
            return {k: 1 for k in info}

        keep_training = stop_on_plateau(info)
        still_improving = sum(keep_training.values())
        if still_improving == 0:
            # Every model has plateaued; hand back the all-zero mapping.
            return keep_training
        iteration_increase = len(info) / still_improving

        # keep_training has an entry for every key of info (stop_on_plateau
        # writes one per identifier), so this rebuild retains all models.
        info = {k: info[k] for k in keep_training}
        calls = {k: record[-1]['partial_fit_calls']
                 for k, record in info.items()}
        if calls != self.to_reach:
            # Targets not met yet: ask for one more step of every model.
            return {k: 1 for k in info}

        def last_score(k):
            return info[k][-1]['score']

        best = toolz.topk(n_i, info, key=last_score)
        if 1 <= len(best) < eta:
            self._best_arm = max(best, key=last_score)
        if len(best) in {0, 1}:
            winner = self._best_arm
            if winner is None:
                # No earlier round recorded a best arm (e.g. n < eta, so the
                # first cut keeps nothing): fall back to the top scorer.
                winner = max(best or info, key=last_score)
            return {winner: 0}

        to_reach = {k: r_i - info[k][-1]['partial_fit_calls']
                    for k in best}
        self.to_reach = {k: int(v * iteration_increase)
                         for k, v in to_reach.items()}
        self.steps += 1
        return {k: 1 for k in to_reach}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment