@kevinlondon, last active August 29, 2015
MySQL UUID Field Benchmarks
#!/usr/bin/env python
import MySQLdb
import logging
import uuid
import time
import random
from contextlib import contextmanager
from binascii import unhexlify, hexlify

# Modified from http://iops.io/blog/storing-billions-uuid-fields-mysql-innodb/


def configure_logging():
    FORMAT = "[%(asctime)-15s] [%(filename)s:%(lineno)d] [%(process)d/%(processName)s] %(message)s"
    logging.basicConfig(format=FORMAT, level=logging.DEBUG)

configure_logging()


def flatten_list(seq):
    merged = []
    for s in seq:
        for x in s:
            merged.append(x)
    return merged
class Benchmark(object):

    def __init__(self):
        self.start = time.time()
        self.marks = []
        self.times = []

    @contextmanager
    def benchmark(self):
        # Time the wrapped block and accumulate the elapsed time.
        t = time.time()
        yield
        now = time.time()
        elapsed = now - t
        self.times.append((t, now, elapsed))

    def mark(self, data):
        # Record a named measurement point from the accumulated
        # timings, then reset them for the next chunk.
        assert isinstance(data, dict)
        total_elapsed = sum(map(lambda x: x[2], self.times))
        self.marks.append((time.time(), total_elapsed, data))
        self.times = []

    def dump(self):
        h = "%-20s %-20s %-20s"
        print h % ("total_rows", "chunk_size", "time_taken")
        for x in self.marks:
            d = x[2]
            print h % (d['total_rows'], d['chunk_size'], x[1])
class UUIDTest(object):

    # The number of UUIDs pre-generated and inserted per database commit.
    # Setting this too high may result in "MySQL server has gone away" errors.
    COMMIT_CHUNK_SIZE = 20000

    # The number of inserted rows grouped into each benchmark mark.
    LOOP_SIZE = 100000

    # The total number of rows to insert for each test.
    TOTAL_ROWS = 400000

    # The number of UUIDs to select against per commit chunk.
    SELECT_CHUNK_SIZE = 2000

    def __init__(self, dbconn):
        self.db = dbconn

    def start(self):
        self.create_database()
        self.run_test('char32')
        self.run_test('bin16')
        self.run_test('longblob')

    def generate_uuid(self, total):
        """Generate a list of random UUIDs. Time spent generating
        these is not taken into consideration when comparing performance
        between the tests, because we are only interested in the
        db select/insert performance."""
        return map(lambda _: uuid.uuid4(), range(total))
    ######################################################
    # CHAR32 TESTS
    ######################################################

    def char32_insert(self, uuidlist):
        # Convert each UUID into a 32-char hex string (no dashes).
        ui = map(lambda x: (str(x).replace("-", ""), ), uuidlist)
        # Insert the UUIDs into the database.
        c = self.db.cursor()
        with self.ib.benchmark():
            c.executemany("""
                INSERT INTO `char32` (uuid)
                VALUES (%s)
            """, ui)
            self.db.commit()
        c.close()

    def char32_select(self, uuidlist):
        # Convert each UUID into a 32-char hex string (no dashes),
        # then flatten the 1-tuples into a flat parameter list.
        ui = map(lambda x: (str(x).replace("-", ""), ), uuidlist)
        selectui_values = flatten_list(ui)
        selectui_sql = ",".join(['%s'] * len(selectui_values))
        # Select the rows back by UUID.
        c = self.db.cursor()
        with self.sb.benchmark():
            sql = """
                SELECT id
                FROM `char32`
                WHERE uuid IN (%s)
            """ % (selectui_sql, )
            c.execute(sql, selectui_values)
            r = c.fetchall()
            assert len(r) == self.SELECT_CHUNK_SIZE
        c.close()
    ######################################################
    # BIN16 TESTS
    ######################################################

    def bin16_insert(self, uuidlist):
        # Convert each UUID into its 16 raw bytes.
        ui = map(lambda x: (x.bytes, ), uuidlist)
        # Insert the UUIDs into the database.
        c = self.db.cursor()
        with self.ib.benchmark():
            c.executemany("""
                INSERT INTO `bin16` (uuid)
                VALUES (%s)
            """, ui)
            self.db.commit()
        c.close()

    def bin16_select(self, uuidlist):
        # Convert each UUID into its 16 raw bytes, then flatten
        # the 1-tuples into a flat parameter list.
        ui = map(lambda x: (x.bytes, ), uuidlist)
        selectui_values = flatten_list(ui)
        selectui_sql = ",".join(['%s'] * len(selectui_values))
        # Select the rows back by UUID.
        c = self.db.cursor()
        with self.sb.benchmark():
            sql = """
                SELECT id
                FROM `bin16`
                WHERE uuid IN (%s)
            """ % (selectui_sql, )
            c.execute(sql, selectui_values)
            r = c.fetchall()
            assert len(r) == self.SELECT_CHUNK_SIZE
        c.close()
    ######################################################
    # LONGBLOB TESTS
    ######################################################

    def longblob_insert(self, uuidlist):
        # Convert each UUID into its 16 raw bytes.
        ui = map(lambda x: (x.bytes, ), uuidlist)
        # Insert the UUIDs into the database.
        c = self.db.cursor()
        with self.ib.benchmark():
            c.executemany("""
                INSERT INTO `longblob` (uuid)
                VALUES (%s)
            """, ui)
            self.db.commit()
        c.close()

    def longblob_select(self, uuidlist):
        # Convert each UUID into its 16 raw bytes, then flatten
        # the 1-tuples into a flat parameter list.
        ui = map(lambda x: (x.bytes, ), uuidlist)
        selectui_values = flatten_list(ui)
        selectui_sql = ",".join(['%s'] * len(selectui_values))
        # Select the rows back by UUID.
        c = self.db.cursor()
        with self.sb.benchmark():
            sql = """
                SELECT id
                FROM `longblob`
                WHERE uuid IN (%s)
            """ % (selectui_sql, )
            c.execute(sql, selectui_values)
            r = c.fetchall()
            assert len(r) == self.SELECT_CHUNK_SIZE
        c.close()
    def run_test(self, test_name):
        # Create benchmark objects for inserts (ib) and selects (sb).
        self.ib = Benchmark()
        self.sb = Benchmark()
        cnt = 0
        loopcnt = 0
        selectcnt = 0
        while cnt < self.TOTAL_ROWS:
            loopcnt += self.COMMIT_CHUNK_SIZE
            cnt += self.COMMIT_CHUNK_SIZE
            selectcnt += self.SELECT_CHUNK_SIZE
            print "currently at", cnt
            # Generate a list of UUIDs.
            uuidlist = self.generate_uuid(self.COMMIT_CHUNK_SIZE)
            # Insert them.
            insert_fn = "%s_insert" % (test_name, )
            getattr(self, insert_fn)(uuidlist)
            # Select a random sample of them back.
            selectui = random.sample(uuidlist, self.SELECT_CHUNK_SIZE)
            select_fn = "%s_select" % (test_name, )
            getattr(self, select_fn)(selectui)
            # To avoid skewing the graphs, group timings into
            # LOOP_SIZE-row chunks before recording a mark.
            if loopcnt >= self.LOOP_SIZE:
                self.ib.mark({
                    'total_rows': cnt,
                    'chunk_size': loopcnt,
                })
                self.sb.mark({
                    'total_rows': cnt,
                    'chunk_size': selectcnt,
                })
                loopcnt = 0
                selectcnt = 0
        print "TEST: INSERT %s" % (test_name, )
        self.ib.dump()
        print "TEST: SELECT %s" % (test_name, )
        self.sb.dump()
    def create_database(self):
        """Create the test database and tables."""
        c = self.db.cursor()
        c.execute("DROP DATABASE IF EXISTS `uuidtest`")
        c.execute("CREATE DATABASE `uuidtest`")
        self.db.select_db("uuidtest")
        c.execute("""
            CREATE TABLE `longblob` (
                `id` bigint(20) unsigned NOT NULL AUTO_INCREMENT,
                `uuid` longblob NOT NULL,
                PRIMARY KEY (`id`),
                UNIQUE KEY `uuid` (`uuid`(16))
            ) ENGINE=InnoDB DEFAULT CHARSET=utf8;
        """)
        c.execute("""
            CREATE TABLE `bin16` (
                `id` bigint(20) unsigned NOT NULL AUTO_INCREMENT,
                `uuid` binary(16) NOT NULL,
                PRIMARY KEY (`id`),
                UNIQUE KEY `uuid` (`uuid`)
            ) ENGINE=InnoDB DEFAULT CHARSET=utf8;
        """)
        c.execute("""
            CREATE TABLE `char32` (
                `id` bigint(20) unsigned NOT NULL AUTO_INCREMENT,
                `uuid` char(32) DEFAULT NULL,
                PRIMARY KEY (`id`),
                UNIQUE KEY `uuid` (`uuid`)
            ) ENGINE=InnoDB DEFAULT CHARSET=utf8;
        """)
        self.db.commit()


c = MySQLdb.connect('localhost', user='uuidtest', passwd='uuidtest')
u = UUIDTest(c)
u.start()
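
A quick illustration (not part of the benchmark script) of how the char32 and bin16/longblob representations of the same UUID relate, using the hexlify/unhexlify imports from the script:

u = uuid.uuid4()
char32 = str(u).replace("-", "")  # 32-char hex string, as stored in `char32`
bin16 = u.bytes                   # 16 raw bytes, as stored in `bin16` and `longblob`
assert unhexlify(char32) == bin16  # hex string -> raw bytes
assert hexlify(bin16) == char32    # raw bytes -> hex string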
@kevinlondon (Author):
TEST: INSERT char32
total_rows           chunk_size           time_taken
100000               100000               2.77846813202
200000               100000               1.848954916
300000               100000               1.37305521965
400000               100000               1.61225795746
TEST: SELECT char32
total_rows           chunk_size           time_taken
100000               10000                0.129801273346
200000               10000                0.127355575562
300000               10000                0.128769874573
400000               10000                0.135525465012


TEST: INSERT bin16
total_rows           chunk_size           time_taken
100000               100000               1.93657398224
200000               100000               1.30170202255
300000               100000               1.35442113876
400000               100000               1.13799524307
TEST: SELECT bin16
total_rows           chunk_size           time_taken
100000               10000                0.0754082202911
200000               10000                0.074910402298
300000               10000                0.0947077274323
400000               10000                0.0760788917542


TEST: INSERT longblob
total_rows           chunk_size           time_taken
100000               100000               1.95601987839
200000               100000               1.2258939743
300000               100000               1.33127903938
400000               100000               1.08236384392
TEST: SELECT longblob
total_rows           chunk_size           time_taken
100000               10000                0.105763196945
200000               10000                0.104069232941
300000               10000                0.106174945831
400000               10000                0.108389854431

So, overall, a modest gain from using longblob (which Django supports out of the box), but the real payoff would come from using a binary(16) field directly. Still, I'm not sure that's worth it.
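
For the curious, a rough sketch of what storing UUIDs in a binary(16) column could look like as a custom Django field. This is an untested illustration: the field name is made up, and the exact Field API hooks (notably from_db_value's signature) vary across Django versions.

import uuid
from django.db import models


class BinaryUUIDField(models.Field):
    """Hypothetical custom field storing uuid.UUID values in BINARY(16)."""

    def db_type(self, connection):
        # Assumes a MySQL backend.
        return 'binary(16)'

    def get_prep_value(self, value):
        # Python -> database: send the 16 raw bytes.
        if value is None:
            return None
        if not isinstance(value, uuid.UUID):
            value = uuid.UUID(str(value))
        return value.bytes

    def from_db_value(self, value, expression, connection):
        # Database -> Python: rebuild the UUID from the raw bytes.
        if value is None:
            return None
        return uuid.UUID(bytes=bytes(value))

With get_prep_value in place, a lookup like Model.objects.filter(uuid=some_uuid) would compare against the 16-byte index rather than a 32-char string.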

@kevinlondon (Author):
Results with a slightly larger data set:

TEST: INSERT char32
total_rows           chunk_size           time_taken
100000               100000               2.46878623962
200000               100000               1.79723906517
300000               100000               1.73530197144
400000               100000               1.73919773102
500000               100000               1.63856697083
600000               100000               1.59954094887
700000               100000               1.87198996544
800000               100000               2.29019093513
900000               100000               1.92165780067
1000000              100000               1.8013048172
TEST: SELECT char32
total_rows           chunk_size           time_taken
100000               10000                0.127480745316
200000               10000                0.126168727875
300000               10000                0.145627975464
400000               10000                0.127923727036
500000               10000                0.127927780151
600000               10000                0.129226922989
700000               10000                0.142919063568
800000               10000                0.130170106888
900000               10000                0.134680986404
1000000              10000                0.130425453186

TEST: INSERT bin16
total_rows           chunk_size           time_taken
100000               100000               2.27195692062
200000               100000               1.28541326523
300000               100000               1.55490493774
400000               100000               1.20640492439
500000               100000               1.34324932098
600000               100000               1.86617588997
700000               100000               1.31899404526
800000               100000               1.47097420692
900000               100000               1.38960409164
1000000              100000               1.34337806702
TEST: SELECT bin16
total_rows           chunk_size           time_taken
100000               10000                0.0792698860168
200000               10000                0.077299118042
300000               10000                0.0773229598999
400000               10000                0.0775167942047
500000               10000                0.0775558948517
600000               10000                0.0786061286926
700000               10000                0.0788822174072
800000               10000                0.0800371170044
900000               10000                0.0799143314362
1000000              10000                0.0797553062439


TEST: INSERT longblob
total_rows           chunk_size           time_taken
100000               100000               2.22277736664
200000               100000               1.17504191399
300000               100000               1.37590384483
400000               100000               1.10555195808
500000               100000               1.09845614433
600000               100000               1.39847588539
700000               100000               1.51331496239
800000               100000               1.67613840103
900000               100000               1.16158485413
1000000              100000               1.25154685974
TEST: SELECT longblob
total_rows           chunk_size           time_taken
100000               10000                0.141802072525
200000               10000                0.149259090424
300000               10000                0.146551847458
400000               10000                0.160199165344
500000               10000                0.150816202164
600000               10000                0.16828584671
700000               10000                0.161404132843
800000               10000                0.158686876297
900000               10000                0.153688192368
1000000              10000                0.154032468796
