Skip to content

Instantly share code, notes, and snippets.

@strayge
Created March 1, 2023 07:54
Show Gist options
  • Save strayge/68df810813a13d2c96e265c3347e06bb to your computer and use it in GitHub Desktop.
Save strayge/68df810813a13d2c96e265c3347e06bb to your computer and use it in GitHub Desktop.
python sqlite performance test
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import sqlite3\n",
"import os\n",
"import json\n",
"from time import time\n",
"from typing import Iterable\n",
"from sqlalchemy import create_engine, Column, Integer, String, Identity\n",
"from sqlalchemy.orm import DeclarativeBase, MappedAsDataclass, Mapped, mapped_column, Session, declarative_base\n",
"\n",
"\n",
"def read_json_by_line(filename: str) -> Iterable[dict]:\n",
" with open(filename, 'r') as f:\n",
" for line in f:\n",
" if not line:\n",
" continue\n",
" yield json.loads(line) # type: ignore[no-any-return]\n"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Total time: 7.99 seconds, 516151 rows\n"
]
}
],
"source": [
"\"\"\"Raw sqlite3\"\"\"\n",
"\n",
"if os.path.exists('data.db'):\n",
" os.remove('data.db')\n",
"\n",
"conn = sqlite3.connect('data.db')\n",
"c = conn.cursor()\n",
"c.execute('''\n",
" CREATE TABLE packages (\n",
" name text PRIMARY KEY, \n",
" name_lower text, \n",
" version text, \n",
" upload_time integer, \n",
" home_page text, \n",
" summary text, \n",
" summary_lower text\n",
" )\n",
"'''\n",
")\n",
"\n",
"t2 = time()\n",
"for i, data in enumerate(read_json_by_line(os.path.join('data', 'metadata_lines.json'))):\n",
" name = data['name']\n",
" summary = data.get('summary') or ''\n",
" c.execute(\n",
" '''\n",
" INSERT INTO packages (\n",
" name, name_lower, version, upload_time, home_page, summary, summary_lower\n",
" )\n",
" VALUES (\n",
" ?, ?, ?, ?, ?, ?, ?\n",
" )\n",
" ''', \n",
" (\n",
" name, \n",
" name.lower(), \n",
" data['version'], \n",
" data['upload_time'], \n",
" data['home_page'], \n",
" summary, \n",
" summary.lower(), \n",
" )\n",
" )\n",
"conn.commit()\n",
"t3 = time()\n",
"\n",
"print(f'Total time: {t3 - t2:.2f} seconds, {i} rows')\n"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Inserted packages in 42.72 seconds, 516151 rows\n"
]
}
],
"source": [
"\"\"\"SQLAlchemy, dataclass, no batch\"\"\"\n",
"\n",
"if os.path.exists('data.db'):\n",
" os.remove('data.db')\n",
"\n",
"engine = create_engine('sqlite:///data.db')\n",
"\n",
"class BaseModel(MappedAsDataclass, DeclarativeBase):\n",
" pass\n",
"\n",
"class Package(BaseModel):\n",
" __tablename__ = 'packages'\n",
"\n",
" name: Mapped[str] = mapped_column(primary_key=True)\n",
" name_lower: Mapped[str]\n",
" version: Mapped[str]\n",
" upload_time: Mapped[str]\n",
" home_page: Mapped[str]\n",
" summary: Mapped[str]\n",
" summary_lower: Mapped[str]\n",
"\n",
"BaseModel.metadata.create_all(engine)\n",
"\n",
"t1 = time()\n",
"with Session(engine) as session:\n",
" for i, data in enumerate(read_json_by_line(os.path.join('data', 'metadata_lines.json'))):\n",
" name = data['name']\n",
" summary = data.get('summary') or ''\n",
" session.add(Package(\n",
" name=name, \n",
" name_lower=name.lower(), \n",
" version=data['version'], \n",
" upload_time=data['upload_time'], \n",
" home_page=data['home_page'] or '', \n",
" summary=summary, \n",
" summary_lower=summary.lower(), \n",
" ))\n",
" session.commit()\n",
"t2 = time()\n",
"print(f'Inserted packages in {t2 - t1:.2f} seconds, {i} rows')\n"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Inserted packages in 31.73 seconds, 516151 rows\n"
]
}
],
"source": [
"\"\"\"SQLAlchemy, dataclass, batch\"\"\"\n",
"\n",
"if os.path.exists('data.db'):\n",
" os.remove('data.db')\n",
"\n",
"engine = create_engine('sqlite:///data.db')\n",
"\n",
"class BaseModel(MappedAsDataclass, DeclarativeBase):\n",
" pass\n",
"\n",
"class Package(BaseModel):\n",
" __tablename__ = 'packages'\n",
"\n",
" name: Mapped[str] = mapped_column(primary_key=True)\n",
" name_lower: Mapped[str]\n",
" version: Mapped[str]\n",
" upload_time: Mapped[str]\n",
" home_page: Mapped[str]\n",
" summary: Mapped[str]\n",
" summary_lower: Mapped[str]\n",
"\n",
"BaseModel.metadata.create_all(engine)\n",
"\n",
"with open(os.path.join('data', 'metadata.json'), 'r') as f:\n",
" metadata = json.load(f)\n",
"\n",
"t1 = time()\n",
"with Session(engine) as session:\n",
" packages = []\n",
" i = 0\n",
" for i, data in enumerate(metadata.values()):\n",
" name = data['name']\n",
" summary = data.get('summary') or ''\n",
" p = Package(\n",
" name=name, \n",
" name_lower=name.lower(), \n",
" version=data['version'], \n",
" upload_time=data['upload_time'], \n",
" home_page=data['home_page'] or '', \n",
" summary=summary, \n",
" summary_lower=summary.lower(), \n",
" )\n",
" packages.append(p)\n",
" if i % 1000 == 0:\n",
" session.add_all(packages)\n",
" session.flush()\n",
" packages = []\n",
"\n",
" session.commit()\n",
"t2 = time()\n",
"print(f'Inserted packages in {t2 - t1:.2f} seconds, {i} rows')\n"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Read packages in 3.20 seconds\n",
"Inserted packages in 38.20 seconds\n",
"Total time: 41.39 seconds, 516151 rows\n"
]
}
],
"source": [
"\"\"\"SQLAlchemy, no dataclass, batch\"\"\"\n",
"\n",
"if os.path.exists('data.db'):\n",
" os.remove('data.db')\n",
"\n",
"engine = create_engine('sqlite:///data.db')\n",
"\n",
"Base = declarative_base()\n",
"\n",
"class Package(Base):\n",
" __tablename__ = 'packages'\n",
" name = Column(String, primary_key=True)\n",
" name_lower = Column(String)\n",
" version = Column(String)\n",
" upload_time = Column(String)\n",
" home_page = Column(String)\n",
" summary = Column(String)\n",
" summary_lower = Column(String)\n",
"\n",
"BaseModel.metadata.create_all(engine)\n",
"\n",
"t1 = time()\n",
"with open(os.path.join('data', 'metadata.json'), 'r') as f:\n",
" metadata = json.load(f)\n",
"\n",
"t2 = time()\n",
"with Session(engine) as session:\n",
" packages = []\n",
" for i, data in enumerate(metadata.values()):\n",
" name = data['name']\n",
" summary = data.get('summary') or ''\n",
" packages.append(Package(\n",
" name=name, \n",
" name_lower=name.lower(), \n",
" version=data['version'], \n",
" upload_time=data['upload_time'], \n",
" home_page=data['home_page'] or '', \n",
" summary=summary, \n",
" summary_lower=summary.lower(), \n",
" ))\n",
" if i % 10000 == 0:\n",
" session.add_all(packages)\n",
" session.flush()\n",
" packages = []\n",
" if packages:\n",
" session.add_all(packages)\n",
" session.flush()\n",
" packages = []\n",
" session.commit()\n",
"t3 = time()\n",
"\n",
"print(f'Read packages in {t2 - t1:.2f} seconds')\n",
"print(f'Inserted packages in {t3 - t2:.2f} seconds')\n",
"print(f'Total time: {t3 - t1:.2f} seconds, {i} rows')\n"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Total time: 23.40 seconds, 500000 rows\n"
]
}
],
"source": [
"\"\"\"SQLAlchemy, example from their performance test\"\"\"\n",
"\n",
"Base = declarative_base()\n",
"\n",
"\n",
"class Customer(Base):\n",
" __tablename__ = \"customer\"\n",
" id = Column(Integer, Identity(), primary_key=True)\n",
" name = Column(String(255))\n",
" description = Column(String(255))\n",
"\n",
"engine = create_engine(\"sqlite:///:memory:\")\n",
"Base.metadata.drop_all(engine)\n",
"Base.metadata.create_all(engine)\n",
"\n",
"t1 = time()\n",
"session = Session(bind=engine)\n",
"n = 500000\n",
"for chunk in range(0, n, 1000):\n",
" session.add_all(\n",
" [\n",
" Customer(\n",
" id=i + 1,\n",
" name=\"customer name %d\" % i,\n",
" description=\"customer description %d\" % i,\n",
" )\n",
" for i in range(chunk, chunk + 1000)\n",
" ]\n",
" )\n",
" session.flush()\n",
"session.commit()\n",
"\n",
"t2 = time()\n",
"print(f'Total time: {t2 - t1:.2f} seconds, {n} rows')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3.11.1 ('venv': venv)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.1"
},
"orig_nbformat": 4,
"vscode": {
"interpreter": {
"hash": "4f283fdae1340cad2e43ca9876253c01895e38babc9160fc03a0f6de2150e0d8"
}
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment