Skip to content

Instantly share code, notes, and snippets.

@TomAugspurger
Created November 8, 2017 15:53
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save TomAugspurger/dced18b10aa28cba434771a37a3576f7 to your computer and use it in GitHub Desktop.
Save TomAugspurger/dced18b10aa28cba434771a37a3576f7 to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 25,
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"import dask.array as da\n",
"from dask import compute, delayed, persist\n",
"\n",
"import dask_ml.decomposition as dd\n",
"import sklearn.decomposition as sd"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"pca = dd.PCA()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Small problem (1,000 rows x 500 columns)"
]
},
{
"cell_type": "code",
"execution_count": 40,
"metadata": {},
"outputs": [],
"source": [
"def make_data(N, P, f=16, seed=0):\n",
"    \"\"\"Build a persisted (N, P) dask array of uniform random data.\n",
"\n",
"    The first two columns are drawn from U(-5, 5); the remaining\n",
"    ``P - 2`` columns use ``uniform``'s default range, so the first two\n",
"    columns have a much larger variance than the rest.\n",
"\n",
"    Parameters\n",
"    ----------\n",
"    N, P : int\n",
"        Number of rows and columns of the result.\n",
"    f : int, default 16\n",
"        Row-chunking factor: each block has ``N // f`` rows (at least 1)\n",
"        and spans all ``P`` columns.\n",
"    seed : int or None, default 0\n",
"        Seed passed to ``da.random.RandomState`` so that re-running the\n",
"        notebook produces the same data. Pass ``None`` for unseeded draws.\n",
"\n",
"    Returns\n",
"    -------\n",
"    dask array\n",
"        The concatenated array, rechunked and passed through ``persist``.\n",
"    \"\"\"\n",
"    # Guard against N < f, where N // f would ask for zero-row chunks.\n",
"    K = (max(N // f, 1), P)\n",
"\n",
"    rng = da.random.RandomState(seed)\n",
"    ix = rng.uniform(low=-5, high=5, size=(N, 2), chunks=K)\n",
"    ox = rng.uniform(size=(N, P - 2), chunks=K)\n",
"    # Rechunk after concatenating so every block spans all P columns.\n",
"    X = da.concatenate([ix, ox], axis=1).rechunk(K)\n",
"    X, = persist(X)\n",
"    return X"
]
},
{
"cell_type": "code",
"execution_count": 41,
"metadata": {},
"outputs": [],
"source": [
"# Small problem: 1,000 rows by 500 columns.\n",
"N, P = 1_000, 500\n",
"X = make_data(N, P)"
]
},
{
"cell_type": "code",
"execution_count": 42,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 285 ms, sys: 48.9 ms, total: 334 ms\n",
"Wall time: 182 ms\n"
]
}
],
"source": [
"%%time\n",
"# dask-ml PCA using the 'full' SVD solver.\n",
"dpca = dd.PCA(\n",
"    n_components=2,\n",
"    svd_solver='full',\n",
"    iterated_power=2,\n",
"    random_state=0,\n",
")\n",
"dpca.fit(X)"
]
},
{
"cell_type": "code",
"execution_count": 43,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 14.2 ms, sys: 2.55 ms, total: 16.7 ms\n",
"Wall time: 9.15 ms\n"
]
}
],
"source": [
"%%time\n",
"# Use the 'full' solver here as well, matching the dask fit above, so the\n",
"# two small-problem timings compare the same algorithm.\n",
"# NOTE(review): the recorded timing was produced with 'randomized'; re-run.\n",
"spca = sd.PCA(n_components=2, random_state=0,\n",
"              svd_solver='full', iterated_power=2)\n",
"spca.fit(X.compute())"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"So there's some overhead from dask.\n",
"\n",
"Let's scale it up 100x and switch over to the 'randomized' solvers:"
]
},
{
"cell_type": "code",
"execution_count": 45,
"metadata": {},
"outputs": [],
"source": [
"# 100x more rows than the small case, same number of columns.\n",
"N, P = 100_000, 500\n",
"X = make_data(N, P)"
]
},
{
"cell_type": "code",
"execution_count": 46,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 2.85 s, sys: 715 ms, total: 3.56 s\n",
"Wall time: 841 ms\n"
]
}
],
"source": [
"%%time\n",
"# dask-ml PCA using the 'randomized' SVD solver.\n",
"dpca = dd.PCA(\n",
"    n_components=2,\n",
"    svd_solver='randomized',\n",
"    iterated_power=2,\n",
"    random_state=0,\n",
")\n",
"dpca.fit(X)"
]
},
{
"cell_type": "code",
"execution_count": 47,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 3.15 s, sys: 915 ms, total: 4.06 s\n",
"Wall time: 1.67 s\n"
]
}
],
"source": [
"%%time\n",
"# scikit-learn PCA with the same settings; X.compute() is timed as well.\n",
"spca = sd.PCA(\n",
"    n_components=2,\n",
"    svd_solver='randomized',\n",
"    iterated_power=2,\n",
"    random_state=0,\n",
")\n",
"spca.fit(X.compute())"
]
},
{
"cell_type": "code",
"execution_count": 48,
"metadata": {},
"outputs": [],
"source": [
"# Both fits must agree on explained variance within a relative tolerance of 1e-3.\n",
"np.testing.assert_allclose(\n",
"    spca.explained_variance_,\n",
"    dpca.explained_variance_,\n",
"    rtol=1e-3,\n",
")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Both are fast, but dask is about twice as fast in wall-clock time (0.84 s vs. 1.67 s). Let's go further and use 10x as many columns."
]
},
{
"cell_type": "code",
"execution_count": 49,
"metadata": {},
"outputs": [],
"source": [
"# Same number of rows, 10x more columns.\n",
"N, P = 100_000, 5_000\n",
"X = make_data(N, P)"
]
},
{
"cell_type": "code",
"execution_count": 50,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"4.0"
]
},
"execution_count": 50,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Total size of X in gigabytes (4 GB here).\n",
"X.nbytes / 1e9"
]
},
{
"cell_type": "code",
"execution_count": 51,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 30.5 s, sys: 18.2 s, total: 48.7 s\n",
"Wall time: 10.7 s\n"
]
}
],
"source": [
"%%time\n",
"# dask-ml randomized PCA on the 4 GB array.\n",
"dpca = dd.PCA(\n",
"    n_components=2,\n",
"    svd_solver='randomized',\n",
"    iterated_power=2,\n",
"    random_state=0,\n",
")\n",
"dpca.fit(X)"
]
},
{
"cell_type": "code",
"execution_count": 52,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 34.8 s, sys: 14.8 s, total: 49.6 s\n",
"Wall time: 35.1 s\n"
]
}
],
"source": [
"%%time\n",
"# Materialize the dask array explicitly, as in the earlier scikit-learn runs,\n",
"# instead of handing the dask array itself to fit().\n",
"spca = sd.PCA(n_components=2, random_state=0,\n",
"              svd_solver='randomized', iterated_power=2)\n",
"spca.fit(X.compute())"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.1"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment