-
-
Save coleifer/692c69724f08dc1dc8cc to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"metadata": { | |
"name": "", | |
"signature": "sha256:84058d11511f32ba79be8c9c8e30392bb52c1fb9f4a60468021d349e5df76bcd" | |
}, | |
"nbformat": 3, | |
"nbformat_minor": 0, | |
"worksheets": [ | |
{ | |
"cells": [ | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### SQLite transitive closure extension demo" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"from peewee import *\n", | |
"from playhouse.sqlite_ext import *\n", | |
"\n", | |
"db = SqliteExtDatabase('/tmp/categories.db')\n", | |
"db.load_extension('/home/charles/envs/notebook/closure') # Note we leave off .so" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 2 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"class Category(Model):\n", | |
" name = CharField()\n", | |
" parent = ForeignKeyField('self', null=True, related_name='children')\n", | |
"\n", | |
" class Meta:\n", | |
" database = db\n", | |
"\n", | |
"# Create a virtual table using the transitive closure extension.\n", | |
"CategoryClosure = ClosureTable(Category)" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 3 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"# Create the tables if they do not exist already.\n", | |
"Category.create_table(True)\n", | |
"CategoryClosure.create_table(True)\n", | |
"Category.delete().execute() # Blow away any data." | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"metadata": {}, | |
"output_type": "pyout", | |
"prompt_number": 4, | |
"text": [ | |
"0" | |
] | |
} | |
], | |
"prompt_number": 4 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"books = Category.create(name='Books')\n", | |
"fiction = Category.create(name='Fiction', parent=books)\n", | |
"scifi = Category.create(name='Sci-fi', parent=fiction)\n", | |
"westerns = Category.create(name='Westerns', parent=fiction)\n", | |
"non_fiction = Category.create(name='Non-fiction', parent=books)" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 5 | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### Get all descendant nodes" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"# Using a JOIN.\n", | |
"all_descendants = (Category\n", | |
" .select()\n", | |
" .join(CategoryClosure, on=(Category.id == CategoryClosure.id))\n", | |
" .where(CategoryClosure.root == books))\n", | |
"print [cat.name for cat in all_descendants]\n", | |
"\n", | |
"# Using a subquery instead. \"<<\" translates to \"IN\".\n", | |
"subquery = (CategoryClosure\n", | |
" .select(CategoryClosure.id)\n", | |
" .where(CategoryClosure.root == books))\n", | |
"all_descendants = Category.select().where(Category.id << subquery)\n", | |
"print [cat.name for cat in all_descendants]\n", | |
"\n", | |
"# Using the helper method.\n", | |
"all_descendants = CategoryClosure.descendants(\n", | |
" books, include_node=True)\n", | |
"print [cat.name for cat in all_descendants]" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": [ | |
"[u'Books', u'Fiction', u'Sci-fi', u'Westerns', u'Non-fiction']\n", | |
"[u'Books', u'Fiction', u'Sci-fi', u'Westerns', u'Non-fiction']\n", | |
"[u'Books', u'Fiction', u'Sci-fi', u'Westerns', u'Non-fiction']\n" | |
] | |
} | |
], | |
"prompt_number": 7 | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### Get direct descendant nodes (child nodes)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"# We can use just the Category table in this case.\n", | |
"direct_descendants = Category.select().where(Category.parent == fiction)\n", | |
"print [cat.name for cat in direct_descendants]\n", | |
"\n", | |
"# We can join on the closure table.\n", | |
"direct_descendants = (Category\n", | |
" .select()\n", | |
" .join(\n", | |
" CategoryClosure,\n", | |
" on=(Category.id == CategoryClosure.id))\n", | |
" .where(\n", | |
" (CategoryClosure.root == fiction) &\n", | |
" (CategoryClosure.depth == 1)))\n", | |
"print [cat.name for cat in direct_descendants]\n", | |
"\n", | |
"# We can use a subquery.\n", | |
"subquery = (CategoryClosure\n", | |
" .select(CategoryClosure.id)\n", | |
" .where(\n", | |
" (CategoryClosure.root == fiction) &\n", | |
" (CategoryClosure.depth == 1)))\n", | |
"direct_descendants = Category.select().where(Category.id << subquery)\n", | |
"print [cat.name for cat in direct_descendants]\n", | |
"\n", | |
"# Using helper method.\n", | |
"direct_descendants = CategoryClosure.descendants(fiction, depth=1)" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": [ | |
"[u'Sci-fi', u'Westerns']\n", | |
"[u'Sci-fi', u'Westerns']\n", | |
"[u'Sci-fi', u'Westerns']\n" | |
] | |
} | |
], | |
"prompt_number": 10 | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### Get all sibling nodes" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"# We can use just the Category table.\n", | |
"siblings = Category.select().where(Category.parent == scifi.parent)\n", | |
"print [cat.name for cat in siblings]\n", | |
"\n", | |
"# Or use the closure table.\n", | |
"siblings = (Category\n", | |
" .select()\n", | |
" .join(CategoryClosure, on=(Category.id == CategoryClosure.id))\n", | |
" .where(\n", | |
" (CategoryClosure.root == scifi.parent) &\n", | |
" (CategoryClosure.depth == 1)))\n", | |
"print [cat.name for cat in siblings]\n", | |
"\n", | |
"# Using helper method.\n", | |
"siblings = CategoryClosure.siblings(scifi, include_node=True)\n", | |
"print [cat.name for cat in siblings]" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": [ | |
"[u'Sci-fi', u'Westerns']\n", | |
"[u'Sci-fi', u'Westerns']\n", | |
"[u'Sci-fi', u'Westerns']\n" | |
] | |
} | |
], | |
"prompt_number": 13 | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### Get all ancestors" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"# Using a JOIN.\n", | |
"ancestors = (Category\n", | |
" .select()\n", | |
" .join(CategoryClosure, on=(Category.id == CategoryClosure.root))\n", | |
" .where(CategoryClosure.id == scifi))\n", | |
"print [cat.name for cat in ancestors]\n", | |
"\n", | |
"# Using multiple tables in the FROM clause.\n", | |
"ancestors = (Category\n", | |
" .select()\n", | |
" .from_(Category, CategoryClosure)\n", | |
" .where(\n", | |
" (Category.id == CategoryClosure.root) &\n", | |
" (CategoryClosure.id == scifi)))\n", | |
"print [cat.name for cat in ancestors]\n", | |
"\n", | |
"# Using helper method.\n", | |
"ancestors = CategoryClosure.ancestors(scifi, include_node=True)\n", | |
"print [cat.name for cat in ancestors]" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": [ | |
"[u'Books', u'Fiction', u'Sci-fi']\n", | |
"[u'Books', u'Fiction', u'Sci-fi']\n", | |
"[u'Books', u'Fiction', u'Sci-fi']\n" | |
] | |
} | |
], | |
"prompt_number": 14 | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### Benchmarking performance\n", | |
"\n", | |
"In this sample benchmark, we'll measure the performance of SQLite using materialized path (as a delimited string with `LIKE` queries) and transitive closures. The test will measure the performance of inserting rows into the table and querying for direct descendants and *all* descendants of a sample set of rows. We will test different size trees to see if the table structure has any impact on performance." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"from collections import namedtuple\n", | |
"import operator\n", | |
"import os\n", | |
"import time\n", | |
"\n", | |
"from peewee import *\n", | |
"from playhouse.sqlite_ext import *\n", | |
"from playhouse.test_utils import count_queries\n", | |
"\n", | |
"# Transitive-closure database.\n", | |
"db_tc = SqliteExtDatabase('/tmp/bench-tc.db')\n", | |
"db_tc.load_extension('/home/charles/envs/notebook/closure')\n", | |
"\n", | |
"# Materialized path database.\n", | |
"db_mp = SqliteExtDatabase('/tmp/bench-mp.db')" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 15 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"class CategoryTC(Model):\n", | |
" name = CharField()\n", | |
" parent = ForeignKeyField('self', index=True, null=True)\n", | |
" depth = IntegerField(default=0)\n", | |
"\n", | |
" class Meta:\n", | |
" database = db_tc\n", | |
"\n", | |
" def __unicode__(self):\n", | |
" return self.name\n", | |
"\n", | |
" def save(self, *args, **kwargs):\n", | |
" if self.parent:\n", | |
" self.depth = self.parent.depth + 1\n", | |
" return super(CategoryTC, self).save(*args, **kwargs)\n", | |
"\n", | |
" @classmethod\n", | |
" def query_direct_children(cls, name):\n", | |
" cat = cls.get(cls.name == name)\n", | |
" return (cls\n", | |
" .select()\n", | |
" .join(CategoryClosure, on=(cls.id == CategoryClosure.id))\n", | |
" .where((CategoryClosure.root == cat) & (CategoryClosure.depth == 1)))\n", | |
"\n", | |
" @classmethod\n", | |
" def query_all_children(cls, name):\n", | |
" cat = cls.get(cls.name == name)\n", | |
" return (cls\n", | |
" .select()\n", | |
" .join(CategoryClosure, on=(cls.id == CategoryClosure.id))\n", | |
" .where(CategoryClosure.root == cat))\n", | |
"\n", | |
"CategoryClosure = ClosureTable(CategoryTC)" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 16 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"class CategoryMP(Model):\n", | |
" name = CharField()\n", | |
" path = CharField(index=True)\n", | |
" parent = ForeignKeyField('self', index=True, null=True)\n", | |
" depth = IntegerField(default=0)\n", | |
"\n", | |
" class Meta:\n", | |
" database = db_mp\n", | |
"\n", | |
" def save(self, *args, **kwargs):\n", | |
" if self.parent:\n", | |
" self.path = self.parent.path + '.' + self.name\n", | |
" self.depth = self.parent.depth + 1\n", | |
" else:\n", | |
" self.path = self.name\n", | |
" self.depth = 0\n", | |
" inst = super(CategoryMP, self).save(*args, **kwargs)\n", | |
"\n", | |
" @classmethod\n", | |
" def query_direct_children(cls, name):\n", | |
" cat = cls.get(cls.name == name)\n", | |
" return cls.select().where(\n", | |
" (cls.path.startswith(cat.path + '.')) &\n", | |
" (cls.depth == cat.depth + 1))\n", | |
"\n", | |
" @classmethod\n", | |
" def query_all_children(cls, name):\n", | |
" cat = cls.get(cls.name == name)\n", | |
" return cls.select().where(cls.path.startswith(cat.path + '.'))" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 17 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"# Helper class to measure the time it takes to run one or more operations.\n", | |
"class timed(object):\n", | |
" def __init__(self):\n", | |
" self.time = None\n", | |
"\n", | |
" def __enter__(self):\n", | |
" self.start = time.time()\n", | |
" return self\n", | |
"\n", | |
" def __exit__(self, *args, **kwargs):\n", | |
" self.time = time.time() - self.start" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 18 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"# Helper function to construct a tree given a list of node counts\n", | |
"# describing the branching factor at each level of the tree.\n", | |
"def create_tree(model_class, node_counts):\n", | |
" def build_tree(parent, idx):\n", | |
" if idx == len(node_counts):\n", | |
" return\n", | |
" for i in range(node_counts[idx]):\n", | |
" if parent:\n", | |
" name = '%s.%s' % (parent.name, i)\n", | |
" else:\n", | |
" name = str(i)\n", | |
" inst = model_class.create(name=name, parent=parent)\n", | |
" build_tree(inst, idx + 1)\n", | |
"\n", | |
" build_tree(None, 0)" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 19 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"# Helper function to query the table for all descendants and direct\n", | |
"# descendants. The query is \"consumed\" by calling `list`, otherwise\n", | |
"# it would not be evaluated.\n", | |
"def query(model_class, names, iters=3):\n", | |
" for name in names:\n", | |
" for i in range(iters):\n", | |
" list(model_class.query_all_children(name))\n", | |
" list(model_class.query_direct_children(name))" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 20 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"# Helper function to generate a list of node names to query.\n", | |
"def make_names(structure):\n", | |
" depth = len(structure)\n", | |
" names = []\n", | |
" for i in range(depth):\n", | |
" n_at_depth = min(structure[i], 10)\n", | |
" nums = ['0'] * (i + 1)\n", | |
" names.append('.'.join(nums))\n", | |
" for i in range(1, n_at_depth):\n", | |
" nums[-1] = str(i)\n", | |
" names.append('.'.join(nums))\n", | |
" return names" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 21 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"# Showing how make_names works:\n", | |
"make_names([2, 2, 2])" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"metadata": {}, | |
"output_type": "pyout", | |
"prompt_number": 22, | |
"text": [ | |
"['0', '1', '0.0', '0.1', '0.0.0', '0.0.1']" | |
] | |
} | |
], | |
"prompt_number": 22 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"# Helper class to store the results of a particular benchmark.\n", | |
"class Result(namedtuple('_R', ('structure', 'time', 'queries', 'timings'))):\n", | |
" def __new__(cls, structure, time, queries):\n", | |
" return super(Result, cls).__new__(cls, structure, time, queries, [])\n", | |
"\n", | |
" def __repr__(self):\n", | |
" return '<Result: %s nodes, created in %.3f, avg %.3f>' % (\n", | |
" self.nodes,\n", | |
" self.time,\n", | |
" self.avg)\n", | |
"\n", | |
" @property\n", | |
" def nodes(self):\n", | |
" return sum(reduce(operator.mul, self.structure[:i + 1])\n", | |
" for i in range(len(self.structure)))\n", | |
"\n", | |
" @property\n", | |
" def avg(self):\n", | |
" return sum(timing for _, timing in self.timings) / len(self.timings)\n", | |
"\n", | |
" def add_result(self, name, timing):\n", | |
" depth = len(name.split('.'))\n", | |
" self.timings.append((depth, timing))" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 23 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"# The actual benchmarking code.\n", | |
"def benchmark(model_class, structure):\n", | |
" # Ensure the database connection is closed, if it is not already.\n", | |
" if not db_tc.is_closed():\n", | |
" db_tc.close()\n", | |
" if not db_mp.is_closed():\n", | |
" db_mp.close()\n", | |
"\n", | |
" # Delete any database files on disk so we are starting fresh each time.\n", | |
" filenames = ['/tmp/bench-tc.db', '/tmp/bench-mp.db']\n", | |
" for filename in filenames:\n", | |
" if os.path.exists(filename):\n", | |
" os.unlink(filename)\n", | |
"\n", | |
" # Re-create the tables.\n", | |
" CategoryTC.create_table()\n", | |
" CategoryClosure.create_table()\n", | |
" CategoryMP.create_table()\n", | |
"\n", | |
" with timed() as timer:\n", | |
" with count_queries() as cq:\n", | |
" create_tree(model_class, structure)\n", | |
"\n", | |
" result = Result(structure, timer.time, cq.count)\n", | |
" names = make_names(structure)\n", | |
"\n", | |
" for name in names:\n", | |
" with timed() as timer:\n", | |
" query(model_class, names)\n", | |
" result.add_result(name, timer.time)\n", | |
"\n", | |
" return result" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 24 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"structures = [\n", | |
" ('Deep', [3, 3, 3, 2, 2, 2, 2, 2, 2]),\n", | |
" ('Very deep', [2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1]),\n", | |
" ('Uniform', [5, 5, 5, 5, 3]),\n", | |
" ('Top-heavy', [50, 5, 5, 1]),\n", | |
" ('Bottom-heavy', [1, 1, 5, 10, 50]),\n", | |
" ('Very wide', [1000, 1]),\n", | |
"]" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 25 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"# Disable logging while running the benchmark.\n", | |
"import logging\n", | |
"logging.disable(logging.DEBUG)" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 26 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"# Run the benchmark!\n", | |
"for label, structure in structures:\n", | |
" print label \n", | |
" for model_class in [CategoryMP, CategoryTC]:\n", | |
" result = benchmark(model_class, structure)\n", | |
" print '%s: %r' % (model_class.__name__, result)" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": [ | |
"Deep\n", | |
"CategoryMP: <Result: 3441 nodes, created in 1.186, avg 0.498>" | |
] | |
}, | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": [ | |
"\n", | |
"CategoryTC: <Result: 3441 nodes, created in 0.999, avg 0.440>" | |
] | |
}, | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": [ | |
"\n", | |
"Very deep\n", | |
"CategoryMP: <Result: 2046 nodes, created in 0.672, avg 0.371>" | |
] | |
}, | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": [ | |
"\n", | |
"CategoryTC: <Result: 2046 nodes, created in 0.589, avg 0.338>" | |
] | |
}, | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": [ | |
"\n", | |
"Uniform\n", | |
"CategoryMP: <Result: 2655 nodes, created in 0.854, avg 0.334>" | |
] | |
}, | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": [ | |
"\n", | |
"CategoryTC: <Result: 2655 nodes, created in 0.771, avg 0.315>" | |
] | |
}, | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": [ | |
"\n", | |
"Top-heavy\n", | |
"CategoryMP: <Result: 2800 nodes, created in 0.917, avg 0.168>" | |
] | |
}, | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": [ | |
"\n", | |
"CategoryTC: <Result: 2800 nodes, created in 0.812, avg 0.131>" | |
] | |
}, | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": [ | |
"\n", | |
"Bottom-heavy\n", | |
"CategoryMP: <Result: 2557 nodes, created in 0.834, avg 0.703>" | |
] | |
}, | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": [ | |
"\n", | |
"CategoryTC: <Result: 2557 nodes, created in 0.742, avg 0.653>" | |
] | |
}, | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": [ | |
"\n", | |
"Very wide\n", | |
"CategoryMP: <Result: 2000 nodes, created in 0.646, avg 0.060>" | |
] | |
}, | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": [ | |
"\n", | |
"CategoryTC: <Result: 2000 nodes, created in 0.611, avg 0.048>" | |
] | |
}, | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": [ | |
"\n" | |
] | |
} | |
], | |
"prompt_number": 27 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [] | |
} | |
], | |
"metadata": {} | |
} | |
] | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment