Created
June 2, 2018 13:18
-
-
Save rphes/48569eb0c929d33deef18c9de0d96aa8 to your computer and use it in GitHub Desktop.
Parallel K-Modes and K-Prototypes
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"import time\n", | |
"\n", | |
"import numpy as np\n", | |
"import pandas as pd\n", | |
"\n", | |
"from kmodes.kmodes import KModes\n", | |
"from kmodes.kprototypes import KPrototypes" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 2, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Placeholder: bind `data` to an object whose `x_data` attribute is a pandas DataFrame.\n", | |
"data = ...\n", | |
"\n", | |
"# Labels of the columns with categorical dtype.\n", | |
"cat_cols = data.x_data.select_dtypes('category').columns\n", | |
"\n", | |
"# Positional indices of those same columns.\n", | |
"# Derived from `cat_cols` so the two selections cannot drift apart, and so the\n", | |
"# deprecated `pd.api.types.is_categorical_dtype` is no longer needed.\n", | |
"cat_index = [data.x_data.columns.get_loc(column) for column in cat_cols]" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 4, | |
"metadata": { | |
"scrolled": true | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Finished with 1 jobs in 14.94139575958252 s.\n", | |
"Finished with 2 jobs in 7.5124900341033936 s.\n", | |
"Finished with 3 jobs in 5.480122327804565 s.\n", | |
"Finished with 4 jobs in 4.830286026000977 s.\n", | |
"Finished with 5 jobs in 5.453425884246826 s.\n", | |
"Finished with 6 jobs in 5.0629661083221436 s.\n", | |
"Finished with 7 jobs in 5.260700702667236 s.\n", | |
"Finished with 8 jobs in 6.280949831008911 s.\n" | |
] | |
} | |
], | |
"source": [ | |
"# Time KPrototypes.fit for 1..8 parallel jobs and check that every run\n", | |
"# produces the same labels as the previous one.\n", | |
"\n", | |
"# Draw the sample once, outside the timed region: with a fixed random_state it\n", | |
"# is identical for every run, so only `fit` itself should be measured.\n", | |
"kproto_sample = data.x_data.sample(n=1000, random_state=42)\n", | |
"\n", | |
"labels = []\n", | |
"for n_jobs in range(1, 9):\n", | |
"    np.random.seed(42)\n", | |
"    kproto = KPrototypes(20, n_jobs=n_jobs, random_state=42)\n", | |
"    # perf_counter is monotonic and high-resolution, unlike wall-clock time.time().\n", | |
"    start = time.perf_counter()\n", | |
"    # NOTE(review): the second positional argument is kept from the original call;\n", | |
"    # presumably it lands in fit's `y` slot and is ignored -- confirm.\n", | |
"    kproto.fit(kproto_sample, data.x_data, categorical=cat_index)\n", | |
"    elapsed = time.perf_counter() - start\n", | |
"    print(\"Finished with {} jobs in {} s.\".format(n_jobs, elapsed))\n", | |
"    labels.append(kproto.labels_)\n", | |
"\n", | |
"    # From the second run on, compare against the run just before it.\n", | |
"    if n_jobs > 1:\n", | |
"        assert np.array_equal(labels[-2], labels[-1]), (\n", | |
"            \"labels differ between {} and {} jobs\".format(n_jobs - 1, n_jobs))" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 7, | |
"metadata": { | |
"scrolled": true | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Finished with 1 jobs in 22.188632011413574 s.\n", | |
"Finished with 2 jobs in 26.93237280845642 s.\n", | |
"Finished with 3 jobs in 26.965989112854004 s.\n", | |
"Finished with 4 jobs in 26.93553900718689 s.\n", | |
"Finished with 5 jobs in 27.35899806022644 s.\n", | |
"Finished with 6 jobs in 28.72810435295105 s.\n", | |
"Finished with 7 jobs in 27.51678490638733 s.\n", | |
"Finished with 8 jobs in 27.7084379196167 s.\n" | |
] | |
} | |
], | |
"source": [ | |
"# Time KModes.fit for 1..8 parallel jobs and check that every run\n", | |
"# produces the same labels as the previous one.\n", | |
"\n", | |
"# Subset to the categorical columns and draw the sample once, outside the timed\n", | |
"# region: with a fixed random_state it is identical for every run, so only\n", | |
"# `fit` itself should be measured.\n", | |
"kmodes_sample = data.x_data[cat_cols].sample(n=100000, random_state=42)\n", | |
"\n", | |
"labels = []\n", | |
"for n_jobs in range(1, 9):\n", | |
"    np.random.seed(42)\n", | |
"    kmodes = KModes(20, n_jobs=n_jobs, random_state=42)\n", | |
"    # perf_counter is monotonic and high-resolution, unlike wall-clock time.time().\n", | |
"    start = time.perf_counter()\n", | |
"    # NOTE(review): the second positional argument is kept from the original call;\n", | |
"    # presumably it lands in fit's `y` slot and is ignored -- confirm.\n", | |
"    kmodes.fit(kmodes_sample, data.x_data)\n", | |
"    elapsed = time.perf_counter() - start\n", | |
"    print(\"Finished with {} jobs in {} s.\".format(n_jobs, elapsed))\n", | |
"    labels.append(kmodes.labels_)\n", | |
"\n", | |
"    # From the second run on, compare against the run just before it.\n", | |
"    if n_jobs > 1:\n", | |
"        assert np.array_equal(labels[-2], labels[-1]), (\n", | |
"            \"labels differ between {} and {} jobs\".format(n_jobs - 1, n_jobs))" | |
] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.6.5" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 2 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Thanks @rphes for the parallel execution in Kprototypes