Skip to content

Instantly share code, notes, and snippets.

@kfsone
Created April 29, 2024 07:50
Show Gist options
  • Save kfsone/f7f7de6d7c0216cf44cdf97b526a474e to your computer and use it in GitHub Desktop.
Save kfsone/f7f7de6d7c0216cf44cdf97b526a474e to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "c8616412-8781-44cd-b163-2e9616c1dbda",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"station ids: 244792, min = 128000000, max = 3929141760, range = 3801141760\n"
]
}
],
"source": [
"# Load the IDs from the database\n",
"import apsw ; db = apsw.Connection(\"./data/Tradedangerous.db\") ; sids = tuple(r[0] for r in db.execute(\"SELECT station_id FROM station\"))\n",
"print(f\"station ids: {len(sids)}, min = {min(sids)}, max = {max(sids)}, range = {max(sids) - min(sids)}\")"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "b041f0ef-b4aa-4808-971d-e3458cfdd992",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"exploring (30599,524288)\n"
]
}
],
"source": [
"# calculate some plausible numbers for reducing these into a worst-case 8 stations per bucket\n",
"lower = int(len(sids) / 8) # anything lower can't possibly bottom out at 8-per\n",
"for i in range(8, 64): # calculate a power-of-2 at least 2x the count\n",
" if (1 << i) >= 2*len(sids):\n",
" upper = 1 << i\n",
" break\n",
"print(f\"exploring ({lower},{upper})\")"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "7f8ea17b-3c1e-47dc-9f53-8e21e9728273",
"metadata": {},
"outputs": [],
"source": [
"# method that hashes all the ids we have and returns the most- and least- populated buckets, and count of empties\n",
"from collections import Counter\n",
"def analyze(seed: int):\n",
" buckets = Counter()\n",
" for sid in sids:\n",
" bucket = sid % seed\n",
" buckets[bucket] += 1\n",
" # most_common returns (value, count) in descending order of occupancy\n",
" stats = buckets.most_common()\n",
" most, least = stats[0], stats[-1]\n",
" empties = seed - len(stats)\n",
" max_count = 1\n",
" while max_count < len(sids) and stats[max_count][1] == most[1]:\n",
" max_count += 1\n",
" min_count = 1\n",
" while min_count > -len(sids) and stats[min_count][1] == least[1]:\n",
" min_count += 1\n",
" average_per_bucket = len(sids) / (seed - empties)\n",
" average_potential = len(sids) / seed\n",
" return most, max_count, least, min_count, empties, average_per_bucket, average_potential\n",
"\n",
"def describe(seed, most, maxc, least, minc, empties, avg, pot):\n",
" print(f\"{seed}: {most[1]}/bucket {maxc}x <-> {least[1]}/bucket {minc}x; {empties} empty. avg pop: {avg:.3f}, avg potential: {pot:.3f}\")"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "7a8492ce-898a-44fb-bebc-a5cc607b6cbb",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"example:\n",
"244792: 18/bucket 2x <-> 1/bucket 1x; 213593 empty. avg pop: 7.846, avg potential: 1.000\n"
]
}
],
"source": [
"print(\"example:\")\n",
"most, maxc, least, minc, empties, avg, pot = analyze(len(sids))\n",
"describe(len(sids), most, maxc, least, minc, empties, avg, pot)\n"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "4fdd0a59-837b-497d-9636-f2fd1dbde55a",
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "611f584eaa36481f9a86e7caf4c9a446",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Output()"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\">30599: 18/bucket 2x &lt;-&gt; 1/bucket 1x; 0 empty. avg pop: 8.000, avg potential: 8.000\n",
"</pre>\n"
],
"text/plain": [
"30599: 18/bucket 2x <-> 1/bucket 1x; 0 empty. avg pop: 8.000, avg potential: 8.000\n"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\">30601: 17/bucket 5x &lt;-&gt; 1/bucket 1x; 0 empty. avg pop: 7.999, avg potential: 7.999\n",
"</pre>\n"
],
"text/plain": [
"30601: 17/bucket 5x <-> 1/bucket 1x; 0 empty. avg pop: 7.999, avg potential: 7.999\n"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\">30645: 16/bucket 14x &lt;-&gt; 1/bucket 1x; 0 empty. avg pop: 7.988, avg potential: 7.988\n",
"</pre>\n"
],
"text/plain": [
"30645: 16/bucket 14x <-> 1/bucket 1x; 0 empty. avg pop: 7.988, avg potential: 7.988\n"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\">32191: 15/bucket 38x &lt;-&gt; 1/bucket 1x; 0 empty. avg pop: 7.604, avg potential: 7.604\n",
"</pre>\n"
],
"text/plain": [
"32191: 15/bucket 38x <-> 1/bucket 1x; 0 empty. avg pop: 7.604, avg potential: 7.604\n"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\">34665: 14/bucket 24x &lt;-&gt; 1/bucket 1x; 0 empty. avg pop: 7.062, avg potential: 7.062\n",
"</pre>\n"
],
"text/plain": [
"34665: 14/bucket 24x <-> 1/bucket 1x; 0 empty. avg pop: 7.062, avg potential: 7.062\n"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\">41049: 13/bucket 20x &lt;-&gt; 1/bucket 1x; 10 empty. avg pop: 5.965, avg potential: 5.963\n",
"</pre>\n"
],
"text/plain": [
"41049: 13/bucket 20x <-> 1/bucket 1x; 10 empty. avg pop: 5.965, avg potential: 5.963\n"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"></pre>\n"
],
"text/plain": []
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\">\n",
"</pre>\n"
],
"text/plain": [
"\n"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"ename": "KeyboardInterrupt",
"evalue": "",
"output_type": "error",
"traceback": [
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[1;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)",
"Cell \u001b[1;32mIn[5], line 13\u001b[0m\n\u001b[0;32m 10\u001b[0m desc \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mSearching...\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m 12\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m seed \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mrange\u001b[39m(lower, upper\u001b[38;5;241m+\u001b[39m\u001b[38;5;241m1\u001b[39m):\n\u001b[1;32m---> 13\u001b[0m most, maxc, least, minc, empties, avg, pot \u001b[38;5;241m=\u001b[39m \u001b[43manalyze\u001b[49m\u001b[43m(\u001b[49m\u001b[43mseed\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 14\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m most[\u001b[38;5;241m1\u001b[39m] \u001b[38;5;241m<\u001b[39m most_compact:\n\u001b[0;32m 15\u001b[0m describe(seed, most, maxc, least, minc, empties, avg, pot)\n",
"Cell \u001b[1;32mIn[3], line 7\u001b[0m, in \u001b[0;36manalyze\u001b[1;34m(seed)\u001b[0m\n\u001b[0;32m 5\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m sid \u001b[38;5;129;01min\u001b[39;00m sids:\n\u001b[0;32m 6\u001b[0m bucket \u001b[38;5;241m=\u001b[39m sid \u001b[38;5;241m%\u001b[39m seed\n\u001b[1;32m----> 7\u001b[0m buckets[bucket] \u001b[38;5;241m+\u001b[39m\u001b[38;5;241m=\u001b[39m \u001b[38;5;241m1\u001b[39m\n\u001b[0;32m 8\u001b[0m \u001b[38;5;66;03m# most_common returns (value, count) in descending order of occupancy\u001b[39;00m\n\u001b[0;32m 9\u001b[0m stats \u001b[38;5;241m=\u001b[39m buckets\u001b[38;5;241m.\u001b[39mmost_common()\n",
"\u001b[1;31mKeyboardInterrupt\u001b[0m: "
]
}
],
"source": [
"from rich import print as rprint\n",
"from rich.console import Console\n",
"from rich import progress as rp\n",
"\n",
"with rp.Progress(rp.SpinnerColumn(), rp.TextColumn(\"\"), rp.BarColumn(), rp.MofNCompleteColumn(), rp.TimeRemainingColumn(), transient=False, auto_refresh=True) as prog:\n",
" task = prog.add_task(\"Analyzing...\", total=upper, value=lower, auto_start=True)\n",
"\n",
" # start describing seed/distributions below this threshold\n",
" most_compact = 128\n",
" desc = \"Searching...\"\n",
" \n",
" for seed in range(lower, upper+1):\n",
" most, maxc, least, minc, empties, avg, pot = analyze(seed)\n",
" if most[1] < most_compact:\n",
" describe(seed, most, maxc, least, minc, empties, avg, pot)\n",
" most_compact = most[1]\n",
" prog.update(task, description=desc, completed=seed)\n",
" print(\"finished\")\n",
" prog.refresh()\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "e3c1d1a2-dd0f-4e9a-b9b8-9695fd71b6d9",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.6"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment