Skip to content

Instantly share code, notes, and snippets.

@Sinusoidal36
Created December 7, 2020 06:32
Show Gist options
  • Save Sinusoidal36/5094341c5dd4a2ddddedfacd191ec55a to your computer and use it in GitHub Desktop.
Save Sinusoidal36/5094341c5dd4a2ddddedfacd191ec55a to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"import timeit\n",
"\n",
"import pandas as pd\n",
"import plotly.express as px\n",
"from IPython.display import Image\n",
"\n",
"# Root of the datasketch checkout under test. Set the DATASKETCH_PKG_PATH\n",
"# environment variable to point at another checkout instead of editing the\n",
"# notebook; the default keeps the original author's location.\n",
"# os.path.join(..., '') guarantees a trailing separator, which the setup\n",
"# snippets need because they append 'datasketch/minhash.py' by concatenation.\n",
"DATASKETCH_PKG_PATH = os.path.join(\n",
"    os.environ.get('DATASKETCH_PKG_PATH', '/home/sinusoidal/projects/datasketch/'),\n",
"    ''\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"# timeit setup for the baseline: the installed datasketch package.\n",
"# random is seeded so that both setups generate identical benchmark inputs;\n",
"# without it the two implementations are timed on different random tokens.\n",
"old_setup = '''\n",
"from datasketch import minhash\n",
"from xxhash import xxh3_64_intdigest as hashfunc\n",
"import random\n",
"\n",
"random.seed(42)\n",
"'''\n",
"\n",
"# timeit setup for the implementation under test: minhash.py and bulk.py are\n",
"# loaded directly from the checkout at DATASKETCH_PKG_PATH rather than from the\n",
"# installed package. The path is interpolated with !r so that backslashes or\n",
"# quotes in it still produce a valid string literal inside the snippet.\n",
"new_setup = f'''\n",
"import importlib.util\n",
"import random\n",
"from xxhash import xxh3_64_intdigest as hashfunc\n",
"DATASKETCH_PKG_PATH = {DATASKETCH_PKG_PATH!r}\n",
"\n",
"spec = importlib.util.spec_from_file_location(\"minhash\", DATASKETCH_PKG_PATH + \"datasketch/minhash.py\")\n",
"minhash = importlib.util.module_from_spec(spec)\n",
"spec.loader.exec_module(minhash)\n",
"\n",
"spec = importlib.util.spec_from_file_location(\"bulk\", DATASKETCH_PKG_PATH + \"datasketch/bulk.py\")\n",
"bulk = importlib.util.module_from_spec(spec)\n",
"spec.loader.exec_module(bulk)\n",
"\n",
"random.seed(42)\n",
"'''"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Updates\n",
"\n",
"The new `update_batch` method is substantially faster when given a whole list of tokens than calling `update` once per token.\n",
"\n",
"The chart shows the speedup (old median time / new median time) for each token count; the advantage grows with the number of tokens."
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [],
"source": [
"# timeit parameters: each of the `repeats` samples is the total time for\n",
"# `iterations` executions of the statement.\n",
"iterations = 100\n",
"repeats = 10\n",
"\n",
"# Appended to the version-specific setup; the placeholder is the token count.\n",
"update_setup = '''\n",
"byte_data = [str(random.randint(0, 1000000)).encode(\"utf8\") for n in range({})]\n",
"'''\n",
"# Baseline: one update() call per token.\n",
"iterative_statement = '''\n",
"m = minhash.MinHash(hashfunc=hashfunc)\n",
"for b in byte_data:\n",
" m.update(b)\n",
"'''\n",
"# Candidate: a single update_batch() call with the whole token list.\n",
"list_statement = '''\n",
"m = minhash.MinHash(hashfunc=hashfunc)\n",
"m.update_batch(byte_data)\n",
"'''\n",
"\n",
"# One record per timing sample, tagged with implementation and token count.\n",
"data = []\n",
"for token_count in (10, 100, 1000, 10000):\n",
"    sized_setup = update_setup.format(token_count)\n",
"    samples_by_version = {\n",
"        'old': timeit.repeat(iterative_statement, setup=old_setup + sized_setup, number=iterations, repeat=repeats),\n",
"        'new': timeit.repeat(list_statement, setup=new_setup + sized_setup, number=iterations, repeat=repeats),\n",
"    }\n",
"    for version, samples in samples_by_version.items():\n",
"        data.extend({'version': version, 'tokens': token_count, 't': t} for t in samples)"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [
{
"data": {
"image/png": "\n",
"text/plain": [
"<IPython.core.display.Image object>"
]
},
"execution_count": 18,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Per-run latency in milliseconds: each sample in `t` is the total for `iterations` runs.\n",
"timings = pd.DataFrame(data)\n",
"timings['millis'] = (timings['t'] * 1000) / iterations\n",
"\n",
"# Median latency per token count, with one column per implementation.\n",
"medians = timings.groupby(['version', 'tokens'])['millis'].median().unstack('version')\n",
"\n",
"# Speedup of the new implementation; values above 1 mean it is faster.\n",
"df = (medians['old'] / medians['new']).rename('ratio').reset_index()\n",
"\n",
"fig = px.bar(\n",
"    df,\n",
"    y='ratio',\n",
"    x='tokens',\n",
"    title='update_batch speedup over iterative update',\n",
"    labels={'tokens': 'tokens per MinHash', 'ratio': 'speedup (old median / new median)'}\n",
")\n",
"fig.update_layout(barmode='group')\n",
"# Treat token counts as categories so the bars are evenly spaced.\n",
"fig.update_xaxes(type='category')\n",
"Image(fig.to_image(format=\"png\"))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Bulk\n",
"\n",
"Computing many MinHashes in one call to `bulk.compute_minhashes` avoids a significant amount of overhead compared with building each MinHash separately.\n",
"\n",
"Combined with the new update method, this yields speedups of roughly 5-25x over the iterative approach."
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [],
"source": [
"# Fewer runs than the update benchmark: the largest case hashes 1000 x 1000 tokens per run.\n",
"iterations = 3\n",
"repeats = 2\n",
"\n",
"# Appended to the version-specific setup; placeholders are (tokens per MinHash, number of MinHashes).\n",
"bulk_setup = '''\n",
"minhash_data = [[str(random.randint(0, 1000000)).encode(\"utf8\") for n in range({})] for m in range({})]\n",
"'''\n",
"# Baseline: build every MinHash separately, one update() call per token.\n",
"# The inner loop body must be indented deeper than its for header.\n",
"old_statement = '''\n",
"for byte_data in minhash_data:\n",
"    m = minhash.MinHash(hashfunc=hashfunc)\n",
"    for b in byte_data:\n",
"        m.update(b)\n",
"'''\n",
"# Candidate: hand all token lists to bulk.compute_minhashes together with one MinHash.\n",
"new_statement = '''\n",
"m = minhash.MinHash(hashfunc=hashfunc)\n",
"bulk.compute_minhashes(minhash_data, m)\n",
"'''\n",
"\n",
"# One record per timing sample, tagged with implementation and problem size.\n",
"data = []\n",
"for minhash_count in [10, 100, 1000]:\n",
"    for token_count in [10, 50, 100, 1000]:\n",
"        _bulk_setup = bulk_setup.format(token_count, minhash_count)\n",
"        old = timeit.repeat(old_statement, setup=old_setup + _bulk_setup, number=iterations, repeat=repeats)\n",
"        new = timeit.repeat(new_statement, setup=new_setup + _bulk_setup, number=iterations, repeat=repeats)\n",
"        data += [{'version':'old', 'tokens':token_count, 'minhashes':minhash_count, 't':t} for t in old]\n",
"        data += [{'version':'new', 'tokens':token_count, 'minhashes':minhash_count, 't':t} for t in new]"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [
{
"data": {
"image/png": "\n",
"text/plain": [
"<IPython.core.display.Image object>"
]
},
"execution_count": 20,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Per-run latency in milliseconds: each sample in `t` is the total for `iterations` runs.\n",
"timings = pd.DataFrame(data)\n",
"timings['millis'] = (timings['t'] * 1000) / iterations\n",
"\n",
"# Median latency per (minhashes, tokens), with one column per implementation.\n",
"medians = timings.groupby(['version', 'minhashes', 'tokens'])['millis'].median().unstack('version')\n",
"\n",
"# Speedup of the bulk implementation; values above 1 mean it is faster.\n",
"df = (medians['old'] / medians['new']).rename('ratio').reset_index()\n",
"# Cast to str so plotly colours token counts as discrete groups, not a continuous scale.\n",
"df['tokens'] = df['tokens'].astype(str)\n",
"\n",
"fig = px.bar(\n",
"    df,\n",
"    y='ratio',\n",
"    x='minhashes',\n",
"    color='tokens',\n",
"    title='bulk.compute_minhashes speedup over iterative update',\n",
"    labels={\n",
"        'minhashes': 'number of MinHashes',\n",
"        'tokens': 'tokens per MinHash',\n",
"        'ratio': 'speedup (old median / new median)'\n",
"    }\n",
")\n",
"fig.update_layout(barmode='group')\n",
"# Treat MinHash counts as categories so the bar groups are evenly spaced.\n",
"fig.update_xaxes(type='category')\n",
"Image(fig.to_image(format=\"png\"))"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.6"
}
},
"nbformat": 4,
"nbformat_minor": 4
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment