Skip to content

Instantly share code, notes, and snippets.

@BertrandBordage
Last active February 27, 2021 15:54
Show Gist options
  • Star 3 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save BertrandBordage/4aee04f6931e4fa8c2c351eb7eeb8e31 to your computer and use it in GitHub Desktop.
Save BertrandBordage/4aee04f6931e4fa8c2c351eb7eeb8e31 to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Before running\n",
"\n",
"`sudo -u postgres createdb test`"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## General utils"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"from collections import defaultdict, OrderedDict\n",
"import re\n",
"from time import time\n",
"\n",
"import psycopg2\n",
"\n",
"\n",
"def avg(iterable):\n",
" try:\n",
" return sum(iterable) / len(iterable)\n",
" except ZeroDivisionError:\n",
" return 0.0\n",
"\n",
"\n",
"def ms(t):\n",
" return '%.1f ms' % t\n",
"\n",
"\n",
"class TimeIt:\n",
" def __enter__(self):\n",
" self.start = time()\n",
" return self\n",
"\n",
" def __exit__(self, *args, **kwargs):\n",
" self.dt = 1000 * (time() - self.start)\n",
"\n",
"\n",
"class PostgreSQLBenchmark:\n",
" EXPLAIN_RE = re.compile(r'^(?:Planning|Execution) time: ([\\d\\.]+) ms$')\n",
"\n",
" def __init__(self, insert_sql, data, search_sql, add_index=False):\n",
" self.insert_sql = insert_sql\n",
" self.data = data\n",
" self.search_sql = search_sql\n",
" self.add_index = add_index\n",
" self.connection = psycopg2.connect(dbname='test', user='postgres')\n",
" self.connection.autocommit = True\n",
" self.cursor = self.connection.cursor()\n",
" self.cursor.execute('DROP TABLE IF EXISTS english_search;')\n",
" self.cursor.execute('CREATE TABLE english_search (body text, body_search tsvector);')\n",
" if add_index:\n",
" self.cursor.execute('CREATE INDEX ON english_search USING GIN (body_search);')\n",
"\n",
" self.insert_times = []\n",
" self.search_times = defaultdict(list)\n",
" self.search_explains = {}\n",
" self.search_explain_times = defaultdict(list)\n",
" self.search_results_counts = {}\n",
"\n",
" def __del__(self):\n",
" self.cursor.close()\n",
" self.connection.close()\n",
"\n",
" def insert(self):\n",
" self.cursor.executemany(self.insert_sql, self.data)\n",
"\n",
" def optimise_database(self):\n",
" self.cursor.execute('VACUUM FULL ANALYSE;')\n",
" self.cursor.execute('REINDEX DATABASE test;')\n",
" self.cursor.execute('CLUSTER;')\n",
"\n",
" def setup(self):\n",
" with TimeIt() as ti:\n",
" self.insert()\n",
" self.insert_times.append(ti.dt)\n",
" self.optimise_database()\n",
" \n",
" self.cursor.execute(\"SELECT pg_size_pretty(pg_database_size('test'));\")\n",
" self.size = self.cursor.fetchone()[0]\n",
"\n",
" def explain_analyse(self, sql, params):\n",
" self.cursor.execute('EXPLAIN ANALYSE ' + sql, params)\n",
" return [row[0] for row in self.cursor.fetchall()]\n",
"\n",
" def search(self, search_query):\n",
" params = (search_query,)\n",
" with TimeIt() as ti:\n",
" self.cursor.execute(self.search_sql, params)\n",
" results = self.cursor.fetchall()\n",
" self.search_times[search_query].append(ti.dt)\n",
" self.search_results_counts[search_query] = len(results)\n",
" \n",
" explain = self.explain_analyse(self.search_sql, params)\n",
" explain_without_measures = []\n",
" explain_time = 0.0\n",
" for line in explain:\n",
" match = self.EXPLAIN_RE.match(line)\n",
" if match is None:\n",
" explain_without_measures.append(line)\n",
" else:\n",
" explain_time += float(match.group(1))\n",
" self.search_explains[search_query] = '\\n'.join(explain_without_measures)\n",
" self.search_explain_times[search_query].append(explain_time)\n",
"\n",
" def print_time(self, times):\n",
" if len(times) < 3:\n",
" print(ms(avg(times)))\n",
" else:\n",
" print('min %s, avg %s, max %s'\n",
" % (ms(min(times)), ms(avg(times)), ms(max(times))))\n",
"\n",
" def test(self, *search_queries):\n",
" self.setup()\n",
" for search_query in search_queries:\n",
" for _ in range(10):\n",
" self.search(search_query)\n",
"\n",
" def values(self):\n",
" values = OrderedDict((\n",
" ('Inserting data', ms(avg(self.insert_times))),\n",
" ('Database size (incl. indexes)', self.size),\n",
" ))\n",
" for search_query in sorted(self.search_times):\n",
" label = '%s search' % repr(search_query)\n",
" values[label + ' count'] = self.search_results_counts[search_query]\n",
" values[label + ' time'] = ms(avg(self.search_times[search_query]))\n",
" values[label + ' EXPLAIN time'] = ms(avg(self.search_explain_times[search_query]))\n",
" return values\n",
"\n",
" def print_report(self):\n",
" print('Adding data:')\n",
" self.print_time(self.insert_times)\n",
" print()\n",
" for search_query in self.search_times:\n",
" print(\"Searching for %s:\" % repr(search_query))\n",
" print(self.search_results_counts[search_query], 'results')\n",
" print('Measured time:')\n",
" self.print_time(self.search_times[search_query])\n",
" print('Total time declared by EXPLAIN:')\n",
" self.print_time(self.search_explain_times[search_query])\n",
" print('EXPLAIN:')\n",
" print(self.search_explains[search_query])\n",
" print()\n",
" print('Full database size:', self.get_size())"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Conditions (database content & search queries)"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"from random import randint, choice\n",
"\n",
"with open('/usr/share/dict/british-english') as f:\n",
" words = f.read().split('\\n')\n",
"\n",
"INITIAL_DATA = []\n",
"for i in range(500000):\n",
" text = []\n",
" for _ in range(randint(1, 50)):\n",
" text.append(choice(words))\n",
" INITIAL_DATA.append(' '.join(text))\n",
"\n",
"\n",
"SEARCH_QUERIES = 'th', 'the', 'ther', 'there', 'theref'"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Karl’s approach"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"def get_left_edge_ngrams(text, min_length=2, max_length=15):\n",
" for term in text.split():\n",
" for length in range(min_length, max_length + 1):\n",
" if length > len(term):\n",
" break\n",
"\n",
" yield term[:length]\n",
"\n",
"\n",
"karl_data = [{'body': text,\n",
" 'body_search': ' '.join(get_left_edge_ngrams(text))}\n",
" for text in INITIAL_DATA]\n",
"\n",
"karl_insert_sql = 'INSERT INTO english_search VALUES (%(body)s, to_tsvector(%(body_search)s));'\n",
"karl_search_sql = \"SELECT body FROM english_search WHERE body_search @@ to_tsquery(%s)\""
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Without index"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"karl_benchmark_without_index = PostgreSQLBenchmark(karl_insert_sql, karl_data, karl_search_sql)\n",
"karl_benchmark_without_index.test(*SEARCH_QUERIES)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### With index"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"karl_benchmark = PostgreSQLBenchmark(karl_insert_sql, karl_data, karl_search_sql, add_index=True)\n",
"karl_benchmark.test(*SEARCH_QUERIES)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Bertrand’s approach"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"bertrand_data = [{'body': text} for text in INITIAL_DATA]\n",
"\n",
"bertrand_insert_sql = 'INSERT INTO english_search VALUES (%(body)s, to_tsvector(%(body)s));'\n",
"bertrand_search_sql = \"SELECT body FROM english_search WHERE body_search @@ to_tsquery(''%s':*')\""
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Without index"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"bertrand_benchmark_without_index = PostgreSQLBenchmark(bertrand_insert_sql, bertrand_data, bertrand_search_sql)\n",
"bertrand_benchmark_without_index.test(*SEARCH_QUERIES)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### With index"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"bertrand_benchmark = PostgreSQLBenchmark(bertrand_insert_sql, bertrand_data, bertrand_search_sql, add_index=True)\n",
"bertrand_benchmark.test(*SEARCH_QUERIES)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Results"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th>Name</th>\n",
" <th>Karl (without index)</th>\n",
" <th>Bertrand (without index)</th>\n",
" <th>Karl (with index)</th>\n",
" <th>Bertrand (with index)</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>Inserting data</th>\n",
" <td>410432.7 ms</td>\n",
" <td>228455.0 ms</td>\n",
" <td>810141.5 ms</td>\n",
" <td>287196.6 ms</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Database size (incl. indexes)</th>\n",
" <td>1110 MB</td>\n",
" <td>339 MB</td>\n",
" <td>1364 MB</td>\n",
" <td>388 MB</td>\n",
" </tr>\n",
" <tr>\n",
" <th>'th' search count</th>\n",
" <td>75432</td>\n",
" <td>75432</td>\n",
" <td>75432</td>\n",
" <td>75432</td>\n",
" </tr>\n",
" <tr>\n",
" <th>'th' search time</th>\n",
" <td>3833.4 ms</td>\n",
" <td>1654.1 ms</td>\n",
" <td>270.6 ms</td>\n",
" <td>276.6 ms</td>\n",
" </tr>\n",
" <tr>\n",
" <th>'th' search EXPLAIN time</th>\n",
" <td>3651.7 ms</td>\n",
" <td>1531.4 ms</td>\n",
" <td>275.5 ms</td>\n",
" <td>197.4 ms</td>\n",
" </tr>\n",
" <tr>\n",
" <th>'the' search count</th>\n",
" <td>20932</td>\n",
" <td>20932</td>\n",
" <td>20932</td>\n",
" <td>20932</td>\n",
" </tr>\n",
" <tr>\n",
" <th>'the' search time</th>\n",
" <td>3850.1 ms</td>\n",
" <td>1622.8 ms</td>\n",
" <td>96.2 ms</td>\n",
" <td>107.6 ms</td>\n",
" </tr>\n",
" <tr>\n",
" <th>'the' search EXPLAIN time</th>\n",
" <td>3718.1 ms</td>\n",
" <td>1552.9 ms</td>\n",
" <td>75.5 ms</td>\n",
" <td>78.7 ms</td>\n",
" </tr>\n",
" <tr>\n",
" <th>'ther' search count</th>\n",
" <td>7111</td>\n",
" <td>7111</td>\n",
" <td>7111</td>\n",
" <td>7111</td>\n",
" </tr>\n",
" <tr>\n",
" <th>'ther' search time</th>\n",
" <td>3945.5 ms</td>\n",
" <td>1696.3 ms</td>\n",
" <td>16.8 ms</td>\n",
" <td>19.1 ms</td>\n",
" </tr>\n",
" <tr>\n",
" <th>'ther' search EXPLAIN time</th>\n",
" <td>3893.4 ms</td>\n",
" <td>1686.1 ms</td>\n",
" <td>10.1 ms</td>\n",
" <td>11.4 ms</td>\n",
" </tr>\n",
" <tr>\n",
" <th>'there' search count</th>\n",
" <td>7111</td>\n",
" <td>7111</td>\n",
" <td>7111</td>\n",
" <td>7111</td>\n",
" </tr>\n",
" <tr>\n",
" <th>'there' search time</th>\n",
" <td>4124.0 ms</td>\n",
" <td>1865.1 ms</td>\n",
" <td>17.5 ms</td>\n",
" <td>19.4 ms</td>\n",
" </tr>\n",
" <tr>\n",
" <th>'there' search EXPLAIN time</th>\n",
" <td>4144.2 ms</td>\n",
" <td>1861.4 ms</td>\n",
" <td>10.3 ms</td>\n",
" <td>11.5 ms</td>\n",
" </tr>\n",
" <tr>\n",
" <th>'theref' search count</th>\n",
" <td>274</td>\n",
" <td>274</td>\n",
" <td>274</td>\n",
" <td>274</td>\n",
" </tr>\n",
" <tr>\n",
" <th>'theref' search time</th>\n",
" <td>3937.8 ms</td>\n",
" <td>1800.3 ms</td>\n",
" <td>0.7 ms</td>\n",
" <td>0.9 ms</td>\n",
" </tr>\n",
" <tr>\n",
" <th>'theref' search EXPLAIN time</th>\n",
" <td>4026.0 ms</td>\n",
" <td>1793.5 ms</td>\n",
" <td>0.4 ms</td>\n",
" <td>0.5 ms</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
"Name Karl (without index) Bertrand (without index) \\\n",
"Inserting data 410432.7 ms 228455.0 ms \n",
"Database size (incl. indexes) 1110 MB 339 MB \n",
"'th' search count 75432 75432 \n",
"'th' search time 3833.4 ms 1654.1 ms \n",
"'th' search EXPLAIN time 3651.7 ms 1531.4 ms \n",
"'the' search count 20932 20932 \n",
"'the' search time 3850.1 ms 1622.8 ms \n",
"'the' search EXPLAIN time 3718.1 ms 1552.9 ms \n",
"'ther' search count 7111 7111 \n",
"'ther' search time 3945.5 ms 1696.3 ms \n",
"'ther' search EXPLAIN time 3893.4 ms 1686.1 ms \n",
"'there' search count 7111 7111 \n",
"'there' search time 4124.0 ms 1865.1 ms \n",
"'there' search EXPLAIN time 4144.2 ms 1861.4 ms \n",
"'theref' search count 274 274 \n",
"'theref' search time 3937.8 ms 1800.3 ms \n",
"'theref' search EXPLAIN time 4026.0 ms 1793.5 ms \n",
"\n",
"Name Karl (with index) Bertrand (with index) \n",
"Inserting data 810141.5 ms 287196.6 ms \n",
"Database size (incl. indexes) 1364 MB 388 MB \n",
"'th' search count 75432 75432 \n",
"'th' search time 270.6 ms 276.6 ms \n",
"'th' search EXPLAIN time 275.5 ms 197.4 ms \n",
"'the' search count 20932 20932 \n",
"'the' search time 96.2 ms 107.6 ms \n",
"'the' search EXPLAIN time 75.5 ms 78.7 ms \n",
"'ther' search count 7111 7111 \n",
"'ther' search time 16.8 ms 19.1 ms \n",
"'ther' search EXPLAIN time 10.1 ms 11.4 ms \n",
"'there' search count 7111 7111 \n",
"'there' search time 17.5 ms 19.4 ms \n",
"'there' search EXPLAIN time 10.3 ms 11.5 ms \n",
"'theref' search count 274 274 \n",
"'theref' search time 0.7 ms 0.9 ms \n",
"'theref' search EXPLAIN time 0.4 ms 0.5 ms "
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import pandas\n",
"\n",
"karl_values_without_index = karl_benchmark_without_index.values()\n",
"karl_values_without_index['Approach'] = 'Karl (without index)'\n",
"karl_values = karl_benchmark.values()\n",
"karl_values['Approach'] = 'Karl (with index)'\n",
"\n",
"bertrand_values_without_index = bertrand_benchmark_without_index.values()\n",
"bertrand_values_without_index['Approach'] = 'Bertrand (without index)'\n",
"bertrand_values = bertrand_benchmark.values()\n",
"bertrand_values['Approach'] = 'Bertrand (with index)'\n",
"\n",
"df = pandas.DataFrame.from_records([\n",
" karl_values_without_index, bertrand_values_without_index,\n",
" karl_values, bertrand_values])\n",
"df.set_index('Approach').transpose()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.5.2"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment