MySQL UUID Field Benchmarks
#!/usr/bin/env python
import MySQLdb
import logging
import uuid
import time
import random
from contextlib import contextmanager

# Modified from http://iops.io/blog/storing-billions-uuid-fields-mysql-innodb/


def configure_logging():
    FORMAT = "[%(asctime)-15s] [%(filename)s:%(lineno)d] [%(process)d/%(processName)s] %(message)s"
    logging.basicConfig(format=FORMAT, level=logging.DEBUG)

configure_logging()
def flatten_list(seq):
    merged = []
    for s in seq:
        for x in s:
            merged.append(x)
    return merged
class Benchmark(object):
    def __init__(self):
        self.start = time.time()
        self.marks = []
        self.times = []

    @contextmanager
    def benchmark(self):
        # Time the body of the `with` block and record it
        t = time.time()
        yield
        now = time.time()
        elapsed = now - t
        o = (t, now, elapsed)
        self.times.append(o)

    def mark(self, data):
        # Record the total time accumulated since the last mark, then reset.
        # Note this is a plain method, called directly rather than via `with`.
        assert isinstance(data, dict)
        total_elapsed = sum(map(lambda x: x[2], self.times))
        o = (time.time(), total_elapsed, data)
        self.marks.append(o)
        self.times = []

    def dump(self):
        h = "%-20s %-20s %-20s"
        print h % ("total_rows", "chunk_size", "time_taken")
        for x in self.marks:
            d = x[2]
            print h % (d['total_rows'], d['chunk_size'], x[1])
class UUIDTest(object):
    # The number of UUIDs generated and inserted per commit.
    # Setting this too high may result in "MySQL server has gone away" errors.
    COMMIT_CHUNK_SIZE = 20000

    # Group timings into chunks of this many inserted rows before recording
    # a benchmark mark, so every reported row covers the same amount of work.
    LOOP_SIZE = 100000

    # The number of rows to insert in total for each test.
    TOTAL_ROWS = 400000

    # The number of UUIDs to select per iteration.
    SELECT_CHUNK_SIZE = 2000

    def __init__(self, dbconn):
        self.db = dbconn

    def start(self):
        self.create_database()
        self.run_test('char32')
        self.run_test('bin16')
        self.run_test('longblob')

    def generate_uuid(self, total):
        """Generate a list of random UUIDs. Time spent generating
        these is not counted when comparing performance between
        tests, since we are only interested in the db select/insert
        performance."""
        return map(lambda x: uuid.uuid4(), range(total))
    ######################################################
    # CHAR32 TESTS
    ######################################################
    def char32_insert(self, uuidlist):
        # Convert the list into the necessary format: hex strings without dashes
        ui = map(lambda x: (str(x).replace("-", ""), ), uuidlist)
        # Insert UUIDs into the database
        c = self.db.cursor()
        with self.ib.benchmark():
            c.executemany("""
                INSERT INTO `char32` (uuid)
                VALUES (%s)
            """, ui)
            self.db.commit()
        c.close()

    def char32_select(self, uuidlist):
        # Convert the list into the necessary format: hex strings without dashes
        ui = map(lambda x: (str(x).replace("-", ""), ), uuidlist)
        selectui_values = ui
        # One %s placeholder per UUID for the IN () clause
        selectui_sql = ['%s'] * len(selectui_values)
        selectui_sql = ",".join(selectui_sql)
        # select
        c = self.db.cursor()
        with self.sb.benchmark():
            sql = """
                SELECT
                    id
                FROM
                    `char32`
                WHERE
                    uuid IN (%s)
            """ % (selectui_sql, )
            c.execute(sql, selectui_values)
            r = c.fetchall()
            assert len(r) == self.SELECT_CHUNK_SIZE
        c.close()
    ######################################################
    # BIN16 TESTS
    ######################################################
    def bin16_insert(self, uuidlist):
        # Convert the list into the necessary format: raw 16-byte strings
        ui = map(lambda x: (x.bytes, ), uuidlist)
        # Insert UUIDs into the database
        c = self.db.cursor()
        with self.ib.benchmark():
            c.executemany("""
                INSERT INTO `bin16` (uuid)
                VALUES (%s)
            """, ui)
            self.db.commit()
        c.close()

    def bin16_select(self, uuidlist):
        # Convert the list into the necessary format: raw 16-byte strings
        ui = map(lambda x: (x.bytes, ), uuidlist)
        selectui_values = ui
        # One %s placeholder per UUID for the IN () clause
        selectui_sql = ['%s'] * len(selectui_values)
        selectui_sql = ",".join(selectui_sql)
        # select
        c = self.db.cursor()
        with self.sb.benchmark():
            sql = """
                SELECT
                    id
                FROM
                    `bin16`
                WHERE
                    uuid IN (%s)
            """ % (selectui_sql, )
            c.execute(sql, selectui_values)
            r = c.fetchall()
            assert len(r) == self.SELECT_CHUNK_SIZE
        c.close()
    ######################################################
    # LONGBLOB TESTS
    ######################################################
    def longblob_insert(self, uuidlist):
        # Convert the list into the necessary format: raw 16-byte strings
        ui = map(lambda x: (x.bytes, ), uuidlist)
        # Insert UUIDs into the database
        c = self.db.cursor()
        with self.ib.benchmark():
            c.executemany("""
                INSERT INTO `longblob` (uuid)
                VALUES (%s)
            """, ui)
            self.db.commit()
        c.close()

    def longblob_select(self, uuidlist):
        # Convert the list into the necessary format: raw 16-byte strings
        ui = map(lambda x: (x.bytes, ), uuidlist)
        selectui_values = ui
        # One %s placeholder per UUID for the IN () clause
        selectui_sql = ['%s'] * len(selectui_values)
        selectui_sql = ",".join(selectui_sql)
        # select
        c = self.db.cursor()
        with self.sb.benchmark():
            sql = """
                SELECT
                    id
                FROM
                    `longblob`
                WHERE
                    uuid IN (%s)
            """ % (selectui_sql, )
            c.execute(sql, selectui_values)
            r = c.fetchall()
            assert len(r) == self.SELECT_CHUNK_SIZE
        c.close()
    def run_test(self, test_name):
        # Create benchmark objects: one for inserts, one for selects
        self.ib = Benchmark()
        self.sb = Benchmark()
        cnt = 0
        loopcnt = 0
        selectcnt = 0
        while cnt < self.TOTAL_ROWS:
            loopcnt += self.COMMIT_CHUNK_SIZE
            cnt += self.COMMIT_CHUNK_SIZE
            selectcnt += self.SELECT_CHUNK_SIZE
            print "currently at", cnt
            # Generate a list of UUIDs
            uuidlist = self.generate_uuid(self.COMMIT_CHUNK_SIZE)
            # insert
            insert_fn = "%s_insert" % (test_name, )
            getattr(self, insert_fn)(uuidlist)
            # select a random sample of the UUIDs we just inserted
            selectui = random.sample(uuidlist, self.SELECT_CHUNK_SIZE)
            select_fn = "%s_select" % (test_name, )
            getattr(self, select_fn)(selectui)
            # To avoid skewing the graphs, group timings into
            # LOOP_SIZE-row chunks before recording a mark
            if loopcnt >= self.LOOP_SIZE:
                self.ib.mark({
                    'total_rows': cnt,
                    'chunk_size': loopcnt
                })
                self.sb.mark({
                    'total_rows': cnt,
                    'chunk_size': selectcnt
                })
                loopcnt = 0
                selectcnt = 0
        print "TEST: INSERT %s" % (test_name, )
        self.ib.dump()
        print "TEST: SELECT %s" % (test_name, )
        self.sb.dump()
    def create_database(self):
        """Create the test database and tables."""
        c = self.db.cursor()
        c.execute("DROP DATABASE IF EXISTS `uuidtest`")
        c.execute("CREATE DATABASE `uuidtest`")
        self.db.select_db("uuidtest")
        # A longblob column cannot be fully indexed, so the unique key uses
        # a 16-byte prefix, which covers the entire UUID anyway.
        c.execute("""
            CREATE TABLE `longblob` (
                `id` bigint(20) unsigned NOT NULL AUTO_INCREMENT,
                `uuid` longblob NOT NULL,
                PRIMARY KEY (`id`),
                UNIQUE KEY `uuid` (`uuid`(16))
            ) ENGINE=InnoDB DEFAULT CHARSET=utf8;
        """)
        c.execute("""
            CREATE TABLE `bin16` (
                `id` bigint(20) unsigned NOT NULL AUTO_INCREMENT,
                `uuid` binary(16) NOT NULL,
                PRIMARY KEY (`id`),
                UNIQUE KEY `uuid` (`uuid`)
            ) ENGINE=InnoDB DEFAULT CHARSET=utf8;
        """)
        c.execute("""
            CREATE TABLE `char32` (
                `id` bigint(20) unsigned NOT NULL AUTO_INCREMENT,
                `uuid` char(32) DEFAULT NULL,
                PRIMARY KEY (`id`),
                UNIQUE KEY `uuid` (`uuid`)
            ) ENGINE=InnoDB DEFAULT CHARSET=utf8;
        """)
        self.db.commit()
c = MySQLdb.connect('localhost', user='uuidtest', passwd='uuidtest')
u = UUIDTest(c)
u.start()
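For application code, converting between a uuid.UUID object and the 16-byte value stored in the bin16 column is a standard-library one-liner. A minimal sketch, separate from the benchmark itself:

import uuid

u = uuid.uuid4()
raw = u.bytes                    # the 16-byte string inserted into `bin16` above
restored = uuid.UUID(bytes=raw)  # rebuild the UUID from the column value
assert restored == u
# From the mysql client, HEX(uuid) and UNHEX('...') perform the same conversion.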
Results with a slightly larger data set (1,000,000 rows instead of the script's 400,000 TOTAL_ROWS; time_taken is in seconds):
TEST: INSERT char32
total_rows chunk_size time_taken
100000 100000 2.46878623962
200000 100000 1.79723906517
300000 100000 1.73530197144
400000 100000 1.73919773102
500000 100000 1.63856697083
600000 100000 1.59954094887
700000 100000 1.87198996544
800000 100000 2.29019093513
900000 100000 1.92165780067
1000000 100000 1.8013048172
TEST: SELECT char32
total_rows chunk_size time_taken
100000 10000 0.127480745316
200000 10000 0.126168727875
300000 10000 0.145627975464
400000 10000 0.127923727036
500000 10000 0.127927780151
600000 10000 0.129226922989
700000 10000 0.142919063568
800000 10000 0.130170106888
900000 10000 0.134680986404
1000000 10000 0.130425453186
TEST: INSERT bin16
total_rows chunk_size time_taken
100000 100000 2.27195692062
200000 100000 1.28541326523
300000 100000 1.55490493774
400000 100000 1.20640492439
500000 100000 1.34324932098
600000 100000 1.86617588997
700000 100000 1.31899404526
800000 100000 1.47097420692
900000 100000 1.38960409164
1000000 100000 1.34337806702
TEST: SELECT bin16
total_rows chunk_size time_taken
100000 10000 0.0792698860168
200000 10000 0.077299118042
300000 10000 0.0773229598999
400000 10000 0.0775167942047
500000 10000 0.0775558948517
600000 10000 0.0786061286926
700000 10000 0.0788822174072
800000 10000 0.0800371170044
900000 10000 0.0799143314362
1000000 10000 0.0797553062439
TEST: INSERT longblob
total_rows chunk_size time_taken
100000 100000 2.22277736664
200000 100000 1.17504191399
300000 100000 1.37590384483
400000 100000 1.10555195808
500000 100000 1.09845614433
600000 100000 1.39847588539
700000 100000 1.51331496239
800000 100000 1.67613840103
900000 100000 1.16158485413
1000000 100000 1.25154685974
TEST: SELECT longblob
total_rows chunk_size time_taken
100000 10000 0.141802072525
200000 10000 0.149259090424
300000 10000 0.146551847458
400000 10000 0.160199165344
500000 10000 0.150816202164
600000 10000 0.16828584671
700000 10000 0.161404132843
800000 10000 0.158686876297
900000 10000 0.153688192368
1000000 10000 0.154032468796
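Averaging the ten chunks per test makes the comparison concrete. A back-of-the-envelope sketch, with the sums taken from the time_taken columns above:

# Mean seconds per 100k-row insert chunk and per 10k-row select chunk:
insert_means = {'char32': 18.86 / 10, 'bin16': 15.05 / 10, 'longblob': 13.98 / 10}
select_means = {'char32': 1.32 / 10, 'bin16': 0.79 / 10, 'longblob': 1.54 / 10}
# inserts: longblob ~1.40 < bin16 ~1.51 < char32 ~1.89
# selects: bin16 ~0.079 < char32 ~0.132 < longblob ~0.154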
So, overall, a modest insert-time gain from using longblob (which Django supports), but the real payoff would be in using a binary(16) field directly, particularly for selects. Still not sure whether that's worth it.
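If you did want to try it from Django, a minimal sketch might look like this. This is a hypothetical model, not part of the benchmark: BinaryField maps to longblob on MySQL, so a true binary(16) column would still need a custom field or hand-written DDL.

import uuid
from django.db import models

class Record(models.Model):
    # Stored as longblob on MySQL; we keep the raw 16 bytes of a UUID in it.
    uuid_bin = models.BinaryField(max_length=16)

# Usage sketch:
#   rec = Record.objects.create(uuid_bin=uuid.uuid4().bytes)
#   restored = uuid.UUID(bytes=bytes(rec.uuid_bin))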