Created
November 8, 2017 15:53
-
-
Save TomAugspurger/dced18b10aa28cba434771a37a3576f7 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": 25, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"import numpy as np\n", | |
"import dask.array as da\n", | |
"from dask import compute, delayed, persist\n", | |
"\n", | |
"import dask_ml.decomposition as dd\n", | |
"import sklearn.decomposition as sd" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 2, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"pca = dd.PCA()" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## Small problem: 1,000 rows by 500 columns" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 40, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"def make_data(N, P, f=16):\n", | |
"    \"\"\"Build and persist a random (N, P) dask array split into row blocks.\n", | |
"\n", | |
"    The first two columns are drawn uniformly with low=-5 and high=5; the\n", | |
"    remaining ``P - 2`` columns use ``da.random.uniform``'s default range.\n", | |
"\n", | |
"    Parameters\n", | |
"    ----------\n", | |
"    N, P : int\n", | |
"        Number of rows and columns. Two columns are always generated\n", | |
"        separately, so ``P`` should be at least 2.\n", | |
"    f : int, default 16\n", | |
"        Approximate number of row blocks; each block holds ``N // f`` rows\n", | |
"        (at least one) and all ``P`` columns.\n", | |
"\n", | |
"    Returns\n", | |
"    -------\n", | |
"    dask.array.Array\n", | |
"        The persisted array.\n", | |
"    \"\"\"\n", | |
"    # Guard against N < f, where N // f would be an invalid zero-row chunk.\n", | |
"    K = (max(1, N // f), P)\n", | |
"\n", | |
"    ix = da.random.uniform(low=-5, high=5, size=(N, 2), chunks=K)\n", | |
"    ox = da.random.uniform(size=(N, P - 2), chunks=K)\n", | |
"    X = da.concatenate([ix, ox], axis=1)\n", | |
"    # Rechunk so that each row block spans all P columns.\n", | |
"    X = X.rechunk(K)\n", | |
"    X, = persist(X)\n", | |
"    return X" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 41, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Small case: 1,000 rows, 500 columns.\n", | |
"N, P = 1_000, 500\n", | |
"X = make_data(N, P)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 42, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"CPU times: user 285 ms, sys: 48.9 ms, total: 334 ms\n", | |
"Wall time: 182 ms\n" | |
] | |
} | |
], | |
"source": [ | |
"%%time\n", | |
"# dask-ml PCA on the dask array, using the 'full' SVD solver.\n", | |
"dpca = dd.PCA(\n", | |
"    n_components=2,\n", | |
"    random_state=0,\n", | |
"    svd_solver='full',\n", | |
"    iterated_power=2,\n", | |
")\n", | |
"dpca.fit(X)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 43, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"CPU times: user 14.2 ms, sys: 2.55 ms, total: 16.7 ms\n", | |
"Wall time: 9.15 ms\n" | |
] | |
} | |
], | |
"source": [ | |
"%%time\n", | |
"# scikit-learn PCA on the materialised array, using the 'randomized' solver.\n", | |
"spca = sd.PCA(\n", | |
"    n_components=2,\n", | |
"    random_state=0,\n", | |
"    svd_solver='randomized',\n", | |
"    iterated_power=2,\n", | |
")\n", | |
"spca.fit(X.compute())" | |
] | |
}, | |
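{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"So on a problem this small, dask's overhead dominates: about 182 ms of wall time versus about 9 ms for scikit-learn. Note that the two runs above are not like-for-like, since dask used the 'full' solver while scikit-learn used 'randomized'.\n", | |
"\n", | |
"Let's scale the number of rows up 100x and use the 'randomized' solver for both:" | |
] | |
}, | |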
{ | |
"cell_type": "code", | |
"execution_count": 45, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Medium case: 100,000 rows, 500 columns.\n", | |
"N, P = 100_000, 500\n", | |
"X = make_data(N, P)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 46, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"CPU times: user 2.85 s, sys: 715 ms, total: 3.56 s\n", | |
"Wall time: 841 ms\n" | |
] | |
} | |
], | |
"source": [ | |
"%%time\n", | |
"# dask-ml PCA on the dask array, using the 'randomized' SVD solver.\n", | |
"dpca = dd.PCA(\n", | |
"    n_components=2,\n", | |
"    random_state=0,\n", | |
"    svd_solver='randomized',\n", | |
"    iterated_power=2,\n", | |
")\n", | |
"dpca.fit(X)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 47, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"CPU times: user 3.15 s, sys: 915 ms, total: 4.06 s\n", | |
"Wall time: 1.67 s\n" | |
] | |
} | |
], | |
"source": [ | |
"%%time\n", | |
"# scikit-learn PCA on the materialised array, using the 'randomized' solver.\n", | |
"spca = sd.PCA(\n", | |
"    n_components=2,\n", | |
"    random_state=0,\n", | |
"    svd_solver='randomized',\n", | |
"    iterated_power=2,\n", | |
")\n", | |
"spca.fit(X.compute())" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 48, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# The two implementations should agree on the explained variance.\n", | |
"np.testing.assert_allclose(\n", | |
"    spca.explained_variance_,\n", | |
"    dpca.explained_variance_,\n", | |
"    rtol=1e-3,\n", | |
")" | |
] | |
}, | |
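{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"Both are fast, but dask is now roughly twice as fast (0.84 s versus 1.67 s of wall time), and the explained variances agree to within a relative tolerance of 1e-3. Let's go further and make the data 10x wider." | |
] | |
}, | |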
{ | |
"cell_type": "code", | |
"execution_count": 49, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Large case: 100,000 rows, 5,000 columns.\n", | |
"N, P = 100_000, 5_000\n", | |
"X = make_data(N, P)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 50, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"4.0" | |
] | |
}, | |
"execution_count": 50, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"X.nbytes / 1e9  # array size in decimal gigabytes (4 GB here)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 51, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"CPU times: user 30.5 s, sys: 18.2 s, total: 48.7 s\n", | |
"Wall time: 10.7 s\n" | |
] | |
} | |
], | |
"source": [ | |
"%%time\n", | |
"# dask-ml PCA on the dask array, using the 'randomized' SVD solver.\n", | |
"dpca = dd.PCA(\n", | |
"    n_components=2,\n", | |
"    random_state=0,\n", | |
"    svd_solver='randomized',\n", | |
"    iterated_power=2,\n", | |
")\n", | |
"dpca.fit(X)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 52, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"CPU times: user 34.8 s, sys: 14.8 s, total: 49.6 s\n", | |
"Wall time: 35.1 s\n" | |
] | |
} | |
], | |
"source": [ | |
"%%time\n", | |
"spca = sd.PCA(n_components=2, random_state=0,\n", | |
"              svd_solver='randomized', iterated_power=2)\n", | |
"# Materialise the dask array explicitly, as in the earlier scikit-learn\n", | |
"# cells, rather than relying on implicit conversion inside fit().\n", | |
"spca.fit(X.compute())" | |
] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.6.1" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 2 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment