MySQL UUID Field Benchmarks
#!/usr/bin/env python
import MySQLdb
import logging
import uuid
import time
import random
from contextlib import contextmanager

# Modified from http://iops.io/blog/storing-billions-uuid-fields-mysql-innodb/


def configure_logging():
    FORMAT = "[%(asctime)-15s] [%(filename)s:%(lineno)d] [%(process)d/%(processName)s] %(message)s"
    logging.basicConfig(format=FORMAT, level=logging.DEBUG)

configure_logging()
def flatten_list(seq):
    merged = []
    for s in seq:
        for x in s:
            merged.append(x)
    return merged
class Benchmark(object):
    def __init__(self):
        self.start = time.time()
        self.marks = []
        self.times = []

    @contextmanager
    def benchmark(self):
        # Time the body of the `with` block and record it
        t = time.time()
        yield
        now = time.time()
        elapsed = now - t
        o = (t, now, elapsed)
        self.times.append(o)

    def mark(self, data):
        # Record the total time accumulated since the last mark, then reset.
        # Note this is a plain method, called directly rather than via `with`.
        assert isinstance(data, dict)
        total_elapsed = sum(map(lambda x: x[2], self.times))
        o = (time.time(), total_elapsed, data)
        self.marks.append(o)
        self.times = []

    def dump(self):
        h = "%-20s %-20s %-20s"
        print h % ("total_rows", "chunk_size", "time_taken")
        for x in self.marks:
            d = x[2]
            print h % (d['total_rows'], d['chunk_size'], x[1])
class UUIDTest(object):
    # The number of UUIDs generated and inserted per commit.
    # Setting this too high may result in "MySQL server has gone away" errors.
    COMMIT_CHUNK_SIZE = 20000

    # Group timings into chunks of this many inserted rows before recording
    # a benchmark mark, so every reported row covers the same amount of work.
    LOOP_SIZE = 100000

    # The number of rows to insert in total for each test.
    TOTAL_ROWS = 400000

    # The number of UUIDs to select per iteration.
    SELECT_CHUNK_SIZE = 2000

    def __init__(self, dbconn):
        self.db = dbconn

    def start(self):
        self.create_database()
        self.run_test('char32')
        self.run_test('bin16')
        self.run_test('longblob')

    def generate_uuid(self, total):
        """Generate a list of random UUIDs. Time spent generating
        these is not counted when comparing performance between
        tests, since we are only interested in the db select/insert
        performance."""
        return map(lambda x: uuid.uuid4(), range(total))
    ######################################################
    # CHAR32 TESTS
    ######################################################
    def char32_insert(self, uuidlist):
        # Convert the list into the necessary format: hex strings without dashes
        ui = map(lambda x: (str(x).replace("-", ""), ), uuidlist)
        # Insert UUIDs into the database
        c = self.db.cursor()
        with self.ib.benchmark():
            c.executemany("""
                INSERT INTO `char32` (uuid)
                VALUES (%s)
            """, ui)
            self.db.commit()
        c.close()

    def char32_select(self, uuidlist):
        # Convert the list into the necessary format: hex strings without dashes
        ui = map(lambda x: (str(x).replace("-", ""), ), uuidlist)
        selectui_values = ui
        # One %s placeholder per UUID for the IN () clause
        selectui_sql = ['%s'] * len(selectui_values)
        selectui_sql = ",".join(selectui_sql)
        # select
        c = self.db.cursor()
        with self.sb.benchmark():
            sql = """
                SELECT
                    id
                FROM
                    `char32`
                WHERE
                    uuid IN (%s)
            """ % (selectui_sql, )
            c.execute(sql, selectui_values)
            r = c.fetchall()
            assert len(r) == self.SELECT_CHUNK_SIZE
        c.close()
    ######################################################
    # BIN16 TESTS
    ######################################################
    def bin16_insert(self, uuidlist):
        # Convert the list into the necessary format: raw 16-byte strings
        ui = map(lambda x: (x.bytes, ), uuidlist)
        # Insert UUIDs into the database
        c = self.db.cursor()
        with self.ib.benchmark():
            c.executemany("""
                INSERT INTO `bin16` (uuid)
                VALUES (%s)
            """, ui)
            self.db.commit()
        c.close()

    def bin16_select(self, uuidlist):
        # Convert the list into the necessary format: raw 16-byte strings
        ui = map(lambda x: (x.bytes, ), uuidlist)
        selectui_values = ui
        # One %s placeholder per UUID for the IN () clause
        selectui_sql = ['%s'] * len(selectui_values)
        selectui_sql = ",".join(selectui_sql)
        # select
        c = self.db.cursor()
        with self.sb.benchmark():
            sql = """
                SELECT
                    id
                FROM
                    `bin16`
                WHERE
                    uuid IN (%s)
            """ % (selectui_sql, )
            c.execute(sql, selectui_values)
            r = c.fetchall()
            assert len(r) == self.SELECT_CHUNK_SIZE
        c.close()
    ######################################################
    # LONGBLOB TESTS
    ######################################################
    def longblob_insert(self, uuidlist):
        # Convert the list into the necessary format: raw 16-byte strings
        ui = map(lambda x: (x.bytes, ), uuidlist)
        # Insert UUIDs into the database
        c = self.db.cursor()
        with self.ib.benchmark():
            c.executemany("""
                INSERT INTO `longblob` (uuid)
                VALUES (%s)
            """, ui)
            self.db.commit()
        c.close()

    def longblob_select(self, uuidlist):
        # Convert the list into the necessary format: raw 16-byte strings
        ui = map(lambda x: (x.bytes, ), uuidlist)
        selectui_values = ui
        # One %s placeholder per UUID for the IN () clause
        selectui_sql = ['%s'] * len(selectui_values)
        selectui_sql = ",".join(selectui_sql)
        # select
        c = self.db.cursor()
        with self.sb.benchmark():
            sql = """
                SELECT
                    id
                FROM
                    `longblob`
                WHERE
                    uuid IN (%s)
            """ % (selectui_sql, )
            c.execute(sql, selectui_values)
            r = c.fetchall()
            assert len(r) == self.SELECT_CHUNK_SIZE
        c.close()
    def run_test(self, test_name):
        # Create benchmark objects: one for inserts, one for selects
        self.ib = Benchmark()
        self.sb = Benchmark()
        cnt = 0
        loopcnt = 0
        selectcnt = 0
        while cnt < self.TOTAL_ROWS:
            loopcnt += self.COMMIT_CHUNK_SIZE
            cnt += self.COMMIT_CHUNK_SIZE
            selectcnt += self.SELECT_CHUNK_SIZE
            print "currently at", cnt
            # Generate a list of UUIDs
            uuidlist = self.generate_uuid(self.COMMIT_CHUNK_SIZE)
            # insert
            insert_fn = "%s_insert" % (test_name, )
            getattr(self, insert_fn)(uuidlist)
            # select a random sample of the UUIDs we just inserted
            selectui = random.sample(uuidlist, self.SELECT_CHUNK_SIZE)
            select_fn = "%s_select" % (test_name, )
            getattr(self, select_fn)(selectui)
            # To avoid skewing the graphs, group timings into
            # LOOP_SIZE-row chunks before recording a mark
            if loopcnt >= self.LOOP_SIZE:
                self.ib.mark({
                    'total_rows': cnt,
                    'chunk_size': loopcnt
                })
                self.sb.mark({
                    'total_rows': cnt,
                    'chunk_size': selectcnt
                })
                loopcnt = 0
                selectcnt = 0
        print "TEST: INSERT %s" % (test_name, )
        self.ib.dump()
        print "TEST: SELECT %s" % (test_name, )
        self.sb.dump()
    def create_database(self):
        """Create the test database and tables."""
        c = self.db.cursor()
        c.execute("DROP DATABASE IF EXISTS `uuidtest`")
        c.execute("CREATE DATABASE `uuidtest`")
        self.db.select_db("uuidtest")
        # A longblob column cannot be fully indexed, so the unique key uses
        # a 16-byte prefix, which covers the entire UUID anyway.
        c.execute("""
            CREATE TABLE `longblob` (
                `id` bigint(20) unsigned NOT NULL AUTO_INCREMENT,
                `uuid` longblob NOT NULL,
                PRIMARY KEY (`id`),
                UNIQUE KEY `uuid` (`uuid`(16))
            ) ENGINE=InnoDB DEFAULT CHARSET=utf8;
        """)
        c.execute("""
            CREATE TABLE `bin16` (
                `id` bigint(20) unsigned NOT NULL AUTO_INCREMENT,
                `uuid` binary(16) NOT NULL,
                PRIMARY KEY (`id`),
                UNIQUE KEY `uuid` (`uuid`)
            ) ENGINE=InnoDB DEFAULT CHARSET=utf8;
        """)
        c.execute("""
            CREATE TABLE `char32` (
                `id` bigint(20) unsigned NOT NULL AUTO_INCREMENT,
                `uuid` char(32) DEFAULT NULL,
                PRIMARY KEY (`id`),
                UNIQUE KEY `uuid` (`uuid`)
            ) ENGINE=InnoDB DEFAULT CHARSET=utf8;
        """)
        self.db.commit()
c = MySQLdb.connect('localhost', user='uuidtest', passwd='uuidtest')
u = UUIDTest(c)
u.start()
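For application code, converting between a uuid.UUID object and the 16-byte value stored in the bin16 column is a standard-library one-liner. A minimal sketch, separate from the benchmark itself:

import uuid

u = uuid.uuid4()
raw = u.bytes                    # the 16-byte string inserted into `bin16` above
restored = uuid.UUID(bytes=raw)  # rebuild the UUID from the column value
assert restored == u
# From the mysql client, HEX(uuid) and UNHEX('...') perform the same conversion.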
Results with a slightly larger data set (1,000,000 rows instead of the script's 400,000 TOTAL_ROWS; time_taken is in seconds):
TEST: INSERT char32
total_rows chunk_size time_taken
100000 100000 2.46878623962
200000 100000 1.79723906517
300000 100000 1.73530197144
400000 100000 1.73919773102
500000 100000 1.63856697083
600000 100000 1.59954094887
700000 100000 1.87198996544
800000 100000 2.29019093513
900000 100000 1.92165780067
1000000 100000 1.8013048172
TEST: SELECT char32
total_rows chunk_size time_taken
100000 10000 0.127480745316
200000 10000 0.126168727875
300000 10000 0.145627975464
400000 10000 0.127923727036
500000 10000 0.127927780151
600000 10000 0.129226922989
700000 10000 0.142919063568
800000 10000 0.130170106888
900000 10000 0.134680986404
1000000 10000 0.130425453186
TEST: INSERT bin16
total_rows chunk_size time_taken
100000 100000 2.27195692062
200000 100000 1.28541326523
300000 100000 1.55490493774
400000 100000 1.20640492439
500000 100000 1.34324932098
600000 100000 1.86617588997
700000 100000 1.31899404526
800000 100000 1.47097420692
900000 100000 1.38960409164
1000000 100000 1.34337806702
TEST: SELECT bin16
total_rows chunk_size time_taken
100000 10000 0.0792698860168
200000 10000 0.077299118042
300000 10000 0.0773229598999
400000 10000 0.0775167942047
500000 10000 0.0775558948517
600000 10000 0.0786061286926
700000 10000 0.0788822174072
800000 10000 0.0800371170044
900000 10000 0.0799143314362
1000000 10000 0.0797553062439
TEST: INSERT longblob
total_rows chunk_size time_taken
100000 100000 2.22277736664
200000 100000 1.17504191399
300000 100000 1.37590384483
400000 100000 1.10555195808
500000 100000 1.09845614433
600000 100000 1.39847588539
700000 100000 1.51331496239
800000 100000 1.67613840103
900000 100000 1.16158485413
1000000 100000 1.25154685974
TEST: SELECT longblob
total_rows chunk_size time_taken
100000 10000 0.141802072525
200000 10000 0.149259090424
300000 10000 0.146551847458
400000 10000 0.160199165344
500000 10000 0.150816202164
600000 10000 0.16828584671
700000 10000 0.161404132843
800000 10000 0.158686876297
900000 10000 0.153688192368
1000000 10000 0.154032468796
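Averaging the ten chunks per test makes the comparison concrete. A back-of-the-envelope sketch, with the sums taken from the time_taken columns above:

# Mean seconds per 100k-row insert chunk and per 10k-row select chunk:
insert_means = {'char32': 18.86 / 10, 'bin16': 15.05 / 10, 'longblob': 13.98 / 10}
select_means = {'char32': 1.32 / 10, 'bin16': 0.79 / 10, 'longblob': 1.54 / 10}
# inserts: longblob ~1.40 < bin16 ~1.51 < char32 ~1.89
# selects: bin16 ~0.079 < char32 ~0.132 < longblob ~0.154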
So, overall, a modest insert-time gain from using longblob (which Django supports), but the real payoff would be in using a binary(16) field directly, particularly for selects. Still not sure whether that's worth it.
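If you did want to try it from Django, a minimal sketch might look like this. This is a hypothetical model, not part of the benchmark: BinaryField maps to longblob on MySQL, so a true binary(16) column would still need a custom field or hand-written DDL.

import uuid
from django.db import models

class Record(models.Model):
    # Stored as longblob on MySQL; we keep the raw 16 bytes of a UUID in it.
    uuid_bin = models.BinaryField(max_length=16)

# Usage sketch:
#   rec = Record.objects.create(uuid_bin=uuid.uuid4().bytes)
#   restored = uuid.UUID(bytes=bytes(rec.uuid_bin))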