Skip to content

Instantly share code, notes, and snippets.

@kfsone
Created June 12, 2024 06:12
Show Gist options
  • Save kfsone/5a44d68809d369423b9e93e001dbff91 to your computer and use it in GitHub Desktop.
Save kfsone/5a44d68809d369423b9e93e001dbff91 to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 12,
"id": "1c4a5c94-56a3-4506-9bf1-b488049cdfc2",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Requirement already satisfied: apsw in c:\\semc\\python3.12\\venv\\lib\\site-packages (3.46.0.0)\n",
"Note: you may need to restart the kernel to use updated packages.\n"
]
}
],
"source": [
"%pip install apsw"
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "62e91b55-b6b5-46d0-a5e0-5e358b7338df",
"metadata": {},
"outputs": [],
"source": [
"from __future__ import annotations\n",
"\n",
"from pathlib import Path\n",
"from random import Random\n",
"import apsw\n",
"import os\n",
"import time"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "503c1238-8f20-44f9-9736-486e1a01a084",
"metadata": {},
"outputs": [],
"source": [
"STATS = {}\n",
"\n",
"def populate(filepath: os.PathLike, schema: str, rowid: bool, value_list: tuple[str]):\n",
" filepath = Path(filepath)\n",
" print(f\"-- {filepath}\")\n",
"\n",
" def size():\n",
" return f\"{int(filepath.stat().st_size/1024):12,d}Kb\"\n",
"\n",
" # Create a random seed, but we want consistency between tests.\n",
" rng = Random()\n",
" rng.seed(42)\n",
"\n",
" filepath.unlink(missing_ok=True)\n",
" db = apsw.Connection(str(filepath))\n",
" try:\n",
" db.execute(f\"CREATE TABLE test ({schema}) {\"\" if rowid else \"WITHOUT ROWID\"}\")\n",
" print(f\"| -- schema only. bytes: {size()}\")\n",
" \n",
" db = apsw.Connection(str(filepath))\n",
" iterations = 10_000\n",
" step = 1_024\n",
" total_count = iterations * step\n",
" # Populate the table 1024 entries at a time so we get *some* commit buffering,\n",
" # Shuffle the order we insert the blocks so they're scrambled.\n",
" blocks = list(range(0, iterations * step, step))\n",
" rng.shuffle(blocks)\n",
" start = time.time()\n",
" for block_range in blocks:\n",
" with db as cur:\n",
" # scramble the insertions in each group\n",
" values = [(block_range + i, value_list[(block_range + i) % len(value_list)]) for i in range(0, step)]\n",
" rng.shuffle(values)\n",
" for full_id, value in values:\n",
" db.execute(\"INSERT INTO test (item_id, has_market) VALUES (?, ?)\", (full_id, value))\n",
" print(f\"| -- populated. bytes: {size()}. took {time.time() - start:.4f}s\")\n",
"\n",
" STATS[filepath.name] = filepath.stat().st_size\n",
"\n",
" start = time.time()\n",
" db.execute(\"VACUUM\")\n",
" print(f\"| -- vacuumed. bytes: {size()}. took {time.time() - start:.4f}s\")\n",
" finally:\n",
" db.close()\n",
" del db\n",
"\n",
" filepath.unlink(missing_ok=False)\n"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "a2bedde3-c5d9-45f0-ae14-9402dc39b341",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"-- has-as-text-with-rowid\n",
"| -- schema only. bytes: 12Kb\n",
"| -- populated. bytes: 292,204Kb. took 62.7029s\n",
"| -- vacuumed. bytes: 264,600Kb. took 4.3991s\n"
]
}
],
"source": [
"populate(\"has-as-text-with-rowid\", \"item_id int not null primary key, has_market text(1) not null default '?'\", True, [\"?\", \"N\", \"Y\"])"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "f25cb94f-98e7-4c88-82a8-5e59ce34a03a",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"-- has-as-text-without-rowid\n",
"| -- schema only. bytes: 8Kb\n",
"| -- populated. bytes: 126,320Kb. took 58.4818s\n",
"| -- vacuumed. bytes: 102,280Kb. took 1.9747s\n"
]
}
],
"source": [
"populate(\"has-as-text-without-rowid\", \"item_id int not null primary key, has_market text(1) not null default '?'\", False, [\"?\", \"N\", \"Y\"])"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "36cbb3cb-0f40-4ed5-936f-433e05ac29bd",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"-- has-as-nullable-text-with-rowid\n",
"| -- schema only. bytes: 12Kb\n",
"| -- populated. bytes: 288,984Kb. took 64.9643s\n",
"| -- vacuumed. bytes: 261,380Kb. took 4.1477s\n"
]
}
],
"source": [
"populate(\"has-as-nullable-text-with-rowid\", \"item_id int not null primary key, has_market text(1)\", True, [None, \"N\", \"Y\"])"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "b59c222c-d220-464b-9f4f-6be092ed5266",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"-- has-as-nullable-text-without-rowid\n",
"| -- schema only. bytes: 8Kb\n",
"| -- populated. bytes: 122,292Kb. took 62.2043s\n",
"| -- vacuumed. bytes: 98,932Kb. took 2.5043s\n"
]
}
],
"source": [
"populate(\"has-as-nullable-text-without-rowid\", \"item_id int not null primary key, has_market text(1)\", False, [None, \"N\", \"Y\"])"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "7f032adf-083b-42f5-8e07-fa671df22820",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"-- has-as-int-with-rowid\n",
"| -- schema only. bytes: 12Kb\n",
"| -- populated. bytes: 285,620Kb. took 68.2471s\n",
"| -- vacuumed. bytes: 258,016Kb. took 6.3711s\n"
]
}
],
"source": [
"populate(\"has-as-int-with-rowid\", \"item_id int not null primary key, has_market int not null\", True, [-1, 0, 1])"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "63a07dd2-0a58-4d4f-b02a-72db277d1f9f",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"-- has-as-int-without-rowid\n",
"| -- schema only. bytes: 8Kb\n",
"| -- populated. bytes: 117,948Kb. took 62.5073s\n",
"| -- vacuumed. bytes: 95,428Kb. took 2.7009s\n"
]
}
],
"source": [
"populate(\"has-as-int-without-rowid\", \"item_id int not null primary key, has_market int not null\", False, [-1, 0, 1])"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "9b2bdeb7-42d3-4cda-acf6-6ac214e08d81",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"-- has-as-nullable-int-with-rowid\n",
"| -- schema only. bytes: 12Kb\n",
"| -- populated. bytes: 282,248Kb. took 69.5749s\n",
"| -- vacuumed. bytes: 254,644Kb. took 7.0157s\n"
]
}
],
"source": [
"populate(\"has-as-nullable-int-with-rowid\", \"item_id int not null primary key, has_market int\", True, [None, 0, 1])"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "af85506a-946a-40a9-b81b-703a8278b0bf",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"-- has-as-nullable-int-without-rowid\n",
"| -- schema only. bytes: 8Kb\n",
"| -- populated. bytes: 113,960Kb. took 62.2484s\n",
"| -- vacuumed. bytes: 92,128Kb. took 2.8635s\n"
]
}
],
"source": [
"populate(\"has-as-nullable-int-without-rowid\", \"item_id int not null primary key, has_market int\", False, [None, 0, 1])"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "880e6faf-d0ed-4209-a739-448ffcebc7c4",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"has-as-text-with-rowid : 299,216,896\n",
"has-as-nullable-text-with-rowid: 295,919,616\n",
"has-as-int-with-rowid : 292,474,880\n",
"has-as-nullable-int-with-rowid: 289,021,952\n",
"has-as-text-without-rowid : 129,351,680\n",
"has-as-nullable-text-without-rowid: 125,227,008\n",
"has-as-int-without-rowid : 120,778,752\n",
"has-as-nullable-int-without-rowid: 116,695,040\n"
]
}
],
"source": [
"ranks = sorted(STATS.keys(), key=lambda k: STATS[k], reverse=True)\n",
"for ranked in ranks:\n",
" print(f\"{ranked:30s}: {STATS[ranked]:15,}\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "53f48e39-9f4b-4efc-8be2-c36c1d4e19e3",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.3"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment