Skip to content

Instantly share code, notes, and snippets.

@coleifer
Last active August 29, 2015 14:10
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save coleifer/692c69724f08dc1dc8cc to your computer and use it in GitHub Desktop.
Save coleifer/692c69724f08dc1dc8cc to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"metadata": {
"name": "",
"signature": "sha256:84058d11511f32ba79be8c9c8e30392bb52c1fb9f4a60468021d349e5df76bcd"
},
"nbformat": 3,
"nbformat_minor": 0,
"worksheets": [
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### SQLite transitive closure extension demo"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"from peewee import *\n",
"from playhouse.sqlite_ext import *\n",
"\n",
"db = SqliteExtDatabase('/tmp/categories.db')\n",
"db.load_extension('/home/charles/envs/notebook/closure') # Note we leave off .so"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 2
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"class Category(Model):\n",
" name = CharField()\n",
" parent = ForeignKeyField('self', null=True, related_name='children')\n",
"\n",
" class Meta:\n",
" database = db\n",
"\n",
"# Create a virtual table using the transitive closure extension.\n",
"CategoryClosure = ClosureTable(Category)"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 3
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# Create the tables if they do not exist already.\n",
"Category.create_table(True)\n",
"CategoryClosure.create_table(True)\n",
"Category.delete().execute() # Blow away any data."
],
"language": "python",
"metadata": {},
"outputs": [
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 4,
"text": [
"0"
]
}
],
"prompt_number": 4
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"books = Category.create(name='Books')\n",
"fiction = Category.create(name='Fiction', parent=books)\n",
"scifi = Category.create(name='Sci-fi', parent=fiction)\n",
"westerns = Category.create(name='Westerns', parent=fiction)\n",
"non_fiction = Category.create(name='Non-fiction', parent=books)"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 5
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Get all descendant nodes"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# Using a JOIN.\n",
"all_descendants = (Category\n",
" .select()\n",
" .join(CategoryClosure, on=(Category.id == CategoryClosure.id))\n",
" .where(CategoryClosure.root == books))\n",
"print [cat.name for cat in all_descendants]\n",
"\n",
"# Using a subquery instead. \"<<\" translates to \"IN\".\n",
"subquery = (CategoryClosure\n",
" .select(CategoryClosure.id)\n",
" .where(CategoryClosure.root == books))\n",
"all_descendants = Category.select().where(Category.id << subquery)\n",
"print [cat.name for cat in all_descendants]\n",
"\n",
"# Using the helper method.\n",
"all_descendants = CategoryClosure.descendants(\n",
" books, include_node=True)\n",
"print [cat.name for cat in all_descendants]"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"[u'Books', u'Fiction', u'Sci-fi', u'Westerns', u'Non-fiction']\n",
"[u'Books', u'Fiction', u'Sci-fi', u'Westerns', u'Non-fiction']\n",
"[u'Books', u'Fiction', u'Sci-fi', u'Westerns', u'Non-fiction']\n"
]
}
],
"prompt_number": 7
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Get direct descendant nodes (child nodes)"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# We can use just the Category table in this case.\n",
"direct_descendants = Category.select().where(Category.parent == fiction)\n",
"print [cat.name for cat in direct_descendants]\n",
"\n",
"# We can join on the closure table.\n",
"direct_descendants = (Category\n",
" .select()\n",
" .join(\n",
" CategoryClosure,\n",
" on=(Category.id == CategoryClosure.id))\n",
" .where(\n",
" (CategoryClosure.root == fiction) &\n",
" (CategoryClosure.depth == 1)))\n",
"print [cat.name for cat in direct_descendants]\n",
"\n",
"# We can use a subquery.\n",
"subquery = (CategoryClosure\n",
" .select(CategoryClosure.id)\n",
" .where(\n",
" (CategoryClosure.root == fiction) &\n",
" (CategoryClosure.depth == 1)))\n",
"direct_descendants = Category.select().where(Category.id << subquery)\n",
"print [cat.name for cat in direct_descendants]\n",
"\n",
"# Using helper method.\n",
"direct_descendants = CategoryClosure.descendants(fiction, depth=1)"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"[u'Sci-fi', u'Westerns']\n",
"[u'Sci-fi', u'Westerns']\n",
"[u'Sci-fi', u'Westerns']\n"
]
}
],
"prompt_number": 10
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Get all sibling nodes"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# We can use just the Category table.\n",
"siblings = Category.select().where(Category.parent == scifi.parent)\n",
"print [cat.name for cat in siblings]\n",
"\n",
"# Or use the closure table.\n",
"siblings = (Category\n",
" .select()\n",
" .join(CategoryClosure, on=(Category.id == CategoryClosure.id))\n",
" .where(\n",
" (CategoryClosure.root == scifi.parent) &\n",
" (CategoryClosure.depth == 1)))\n",
"print [cat.name for cat in siblings]\n",
"\n",
"# Using helper method.\n",
"siblings = CategoryClosure.siblings(scifi, include_node=True)\n",
"print [cat.name for cat in siblings]"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"[u'Sci-fi', u'Westerns']\n",
"[u'Sci-fi', u'Westerns']\n",
"[u'Sci-fi', u'Westerns']\n"
]
}
],
"prompt_number": 13
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Get all ancestors"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# Using a JOIN.\n",
"ancestors = (Category\n",
" .select()\n",
" .join(CategoryClosure, on=(Category.id == CategoryClosure.root))\n",
" .where(CategoryClosure.id == scifi))\n",
"print [cat.name for cat in ancestors]\n",
"\n",
"# Using multiple tables in the FROM clause.\n",
"ancestors = (Category\n",
" .select()\n",
" .from_(Category, CategoryClosure)\n",
" .where(\n",
" (Category.id == CategoryClosure.root) &\n",
" (CategoryClosure.id == scifi)))\n",
"print [cat.name for cat in ancestors]\n",
"\n",
"# Using helper method.\n",
"ancestors = CategoryClosure.ancestors(scifi, include_node=True)\n",
"print [cat.name for cat in ancestors]"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"[u'Books', u'Fiction', u'Sci-fi']\n",
"[u'Books', u'Fiction', u'Sci-fi']\n",
"[u'Books', u'Fiction', u'Sci-fi']\n"
]
}
],
"prompt_number": 14
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Benchmarking performance\n",
"\n",
"In this sample benchmark, we'll measure the performance of SQLite using materialized path (as a delimited string with `LIKE` queries) and transitive closures. The test will measure the performance of inserting rows into the table and querying for direct descendants and *all* descendants of a sample set of rows. We will test different size trees to see if the table structure has any impact on performance."
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"from collections import namedtuple\n",
"import operator\n",
"import os\n",
"import time\n",
"\n",
"from peewee import *\n",
"from playhouse.sqlite_ext import *\n",
"from playhouse.test_utils import count_queries\n",
"\n",
"# Transitive-closure database.\n",
"db_tc = SqliteExtDatabase('/tmp/bench-tc.db')\n",
"db_tc.load_extension('/home/charles/envs/notebook/closure')\n",
"\n",
"# Materialized path database.\n",
"db_mp = SqliteExtDatabase('/tmp/bench-mp.db')"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 15
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"class CategoryTC(Model):\n",
" name = CharField()\n",
" parent = ForeignKeyField('self', index=True, null=True)\n",
" depth = IntegerField(default=0)\n",
"\n",
" class Meta:\n",
" database = db_tc\n",
"\n",
" def __unicode__(self):\n",
" return self.name\n",
"\n",
" def save(self, *args, **kwargs):\n",
" if self.parent:\n",
" self.depth = self.parent.depth + 1\n",
" return super(CategoryTC, self).save(*args, **kwargs)\n",
"\n",
" @classmethod\n",
" def query_direct_children(cls, name):\n",
" cat = cls.get(cls.name == name)\n",
" return (cls\n",
" .select()\n",
" .join(CategoryClosure, on=(cls.id == CategoryClosure.id))\n",
" .where((CategoryClosure.root == cat) & (CategoryClosure.depth == 1)))\n",
"\n",
" @classmethod\n",
" def query_all_children(cls, name):\n",
" cat = cls.get(cls.name == name)\n",
" return (cls\n",
" .select()\n",
" .join(CategoryClosure, on=(cls.id == CategoryClosure.id))\n",
" .where(CategoryClosure.root == cat))\n",
"\n",
"CategoryClosure = ClosureTable(CategoryTC)"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 16
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"class CategoryMP(Model):\n",
" name = CharField()\n",
" path = CharField(index=True)\n",
" parent = ForeignKeyField('self', index=True, null=True)\n",
" depth = IntegerField(default=0)\n",
"\n",
" class Meta:\n",
" database = db_mp\n",
"\n",
" def save(self, *args, **kwargs):\n",
" if self.parent:\n",
" self.path = self.parent.path + '.' + self.name\n",
" self.depth = self.parent.depth + 1\n",
" else:\n",
" self.path = self.name\n",
" self.depth = 0\n",
" inst = super(CategoryMP, self).save(*args, **kwargs)\n",
"\n",
" @classmethod\n",
" def query_direct_children(cls, name):\n",
" cat = cls.get(cls.name == name)\n",
" return cls.select().where(\n",
" (cls.path.startswith(cat.path + '.')) &\n",
" (cls.depth == cat.depth + 1))\n",
"\n",
" @classmethod\n",
" def query_all_children(cls, name):\n",
" cat = cls.get(cls.name == name)\n",
" return cls.select().where(cls.path.startswith(cat.path + '.'))"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 17
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# Helper class to measure the time it takes to run one or more operations.\n",
"class timed(object):\n",
" def __init__(self):\n",
" self.time = None\n",
"\n",
" def __enter__(self):\n",
" self.start = time.time()\n",
" return self\n",
"\n",
" def __exit__(self, *args, **kwargs):\n",
" self.time = time.time() - self.start"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 18
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# Helper function to construct a tree given a list of node counts\n",
"# describing the branching factor at each level of the tree.\n",
"def create_tree(model_class, node_counts):\n",
" def build_tree(parent, idx):\n",
" if idx == len(node_counts):\n",
" return\n",
" for i in range(node_counts[idx]):\n",
" if parent:\n",
" name = '%s.%s' % (parent.name, i)\n",
" else:\n",
" name = str(i)\n",
" inst = model_class.create(name=name, parent=parent)\n",
" build_tree(inst, idx + 1)\n",
"\n",
" build_tree(None, 0)"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 19
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# Helper function to query the table for all descendants and direct\n",
"# descendants. The query is \"consumed\" by calling `list`, otherwise\n",
"# it would not be evaluated.\n",
"def query(model_class, names, iters=3):\n",
" for name in names:\n",
" for i in range(iters):\n",
" list(model_class.query_all_children(name))\n",
" list(model_class.query_direct_children(name))"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 20
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# Helper function to generate a list of node names to query.\n",
"def make_names(structure):\n",
" depth = len(structure)\n",
" names = []\n",
" for i in range(depth):\n",
" n_at_depth = min(structure[i], 10)\n",
" nums = ['0'] * (i + 1)\n",
" names.append('.'.join(nums))\n",
" for i in range(1, n_at_depth):\n",
" nums[-1] = str(i)\n",
" names.append('.'.join(nums))\n",
" return names"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 21
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# Showing how make_names works:\n",
"make_names([2, 2, 2])"
],
"language": "python",
"metadata": {},
"outputs": [
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 22,
"text": [
"['0', '1', '0.0', '0.1', '0.0.0', '0.0.1']"
]
}
],
"prompt_number": 22
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# Helper class to store the results of a particular benchmark.\n",
"class Result(namedtuple('_R', ('structure', 'time', 'queries', 'timings'))):\n",
" def __new__(cls, structure, time, queries):\n",
" return super(Result, cls).__new__(cls, structure, time, queries, [])\n",
"\n",
" def __repr__(self):\n",
" return '<Result: %s nodes, created in %.3f, avg %.3f>' % (\n",
" self.nodes,\n",
" self.time,\n",
" self.avg)\n",
"\n",
" @property\n",
" def nodes(self):\n",
" return sum(reduce(operator.mul, self.structure[:i + 1])\n",
" for i in range(len(self.structure)))\n",
"\n",
" @property\n",
" def avg(self):\n",
" return sum(timing for _, timing in self.timings) / len(self.timings)\n",
"\n",
" def add_result(self, name, timing):\n",
" depth = len(name.split('.'))\n",
" self.timings.append((depth, timing))"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 23
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# The actual benchmarking code.\n",
"def benchmark(model_class, structure):\n",
" # Ensure the database connection is closed, if it is not already.\n",
" if not db_tc.is_closed():\n",
" db_tc.close()\n",
" if not db_mp.is_closed():\n",
" db_mp.close()\n",
"\n",
" # Delete any database files on disk so we are starting fresh each time.\n",
" filenames = ['/tmp/bench-tc.db', '/tmp/bench-mp.db']\n",
" for filename in filenames:\n",
" if os.path.exists(filename):\n",
" os.unlink(filename)\n",
"\n",
" # Re-create the tables.\n",
" CategoryTC.create_table()\n",
" CategoryClosure.create_table()\n",
" CategoryMP.create_table()\n",
"\n",
" with timed() as timer:\n",
" with count_queries() as cq:\n",
" create_tree(model_class, structure)\n",
"\n",
" result = Result(structure, timer.time, cq.count)\n",
" names = make_names(structure)\n",
"\n",
" for name in names:\n",
" with timed() as timer:\n",
" query(model_class, names)\n",
" result.add_result(name, timer.time)\n",
"\n",
" return result"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 24
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"structures = [\n",
" ('Deep', [3, 3, 3, 2, 2, 2, 2, 2, 2]),\n",
" ('Very deep', [2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1]),\n",
" ('Uniform', [5, 5, 5, 5, 3]),\n",
" ('Top-heavy', [50, 5, 5, 1]),\n",
" ('Bottom-heavy', [1, 1, 5, 10, 50]),\n",
" ('Very wide', [1000, 1]),\n",
"]"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 25
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# Disable logging while running the benchmark.\n",
"import logging\n",
"logging.disable(logging.DEBUG)"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 26
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# Run the benchmark!\n",
"for label, structure in structures:\n",
" print label \n",
" for model_class in [CategoryMP, CategoryTC]:\n",
" result = benchmark(model_class, structure)\n",
" print '%s: %r' % (model_class.__name__, result)"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"Deep\n",
"CategoryMP: <Result: 3441 nodes, created in 1.186, avg 0.498>"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"CategoryTC: <Result: 3441 nodes, created in 0.999, avg 0.440>"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Very deep\n",
"CategoryMP: <Result: 2046 nodes, created in 0.672, avg 0.371>"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"CategoryTC: <Result: 2046 nodes, created in 0.589, avg 0.338>"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Uniform\n",
"CategoryMP: <Result: 2655 nodes, created in 0.854, avg 0.334>"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"CategoryTC: <Result: 2655 nodes, created in 0.771, avg 0.315>"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Top-heavy\n",
"CategoryMP: <Result: 2800 nodes, created in 0.917, avg 0.168>"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"CategoryTC: <Result: 2800 nodes, created in 0.812, avg 0.131>"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Bottom-heavy\n",
"CategoryMP: <Result: 2557 nodes, created in 0.834, avg 0.703>"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"CategoryTC: <Result: 2557 nodes, created in 0.742, avg 0.653>"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Very wide\n",
"CategoryMP: <Result: 2000 nodes, created in 0.646, avg 0.060>"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"CategoryTC: <Result: 2000 nodes, created in 0.611, avg 0.048>"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n"
]
}
],
"prompt_number": 27
},
{
"cell_type": "code",
"collapsed": false,
"input": [],
"language": "python",
"metadata": {},
"outputs": []
}
],
"metadata": {}
}
]
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment