jacobian/benchmark-storage.py

## benchmark-storage.py
import sys
import collections
import gridfs
import io
import psycopg2
import pymongo
import random
import time

# For fairness use the same chunk size - 512k.
# The value is handed to GridFS as ``chunk_size`` and is also the amount read
# from the source file per write into a PostgreSQL large object, so both
# backends move data in identically sized pieces.
CHUNK_SIZE = 1024 * 512  # bytes

class GridFSBench(object):
    """Benchmark target that stores files in MongoDB through GridFS.

    Expected lifecycle: ``setup()`` once, ``store(fp)`` any number of times,
    then ``cleanup()``.
    """

    # Scratch database holding the benchmark files; removed by cleanup().
    DB_NAME = 'gridfs_benchmark_db'

    def setup(self):
        """Connect to MongoDB (driver defaults) and open GridFS on the
        scratch database."""
        self.mongo = pymongo.MongoClient()
        self.fs = gridfs.GridFS(self.mongo[self.DB_NAME])

    def store(self, fp):
        """Store the contents of the file-like object *fp* as one GridFS file."""
        self.fs.put(fp, chunk_size=CHUNK_SIZE)

    def cleanup(self):
        """Drop the scratch database and close the client connection."""
        try:
            # Dropped unconditionally: the previous existence check relied on
            # MongoClient.database_names(), which PyMongo deprecated in 3.6
            # and removed in 4.0. Dropping a database that does not exist is
            # expected to be a harmless no-op on the server.
            self.mongo.drop_database(self.DB_NAME)
        finally:
            # Release the client's sockets even if the drop failed.
            self.mongo.close()

class LobjectBench(object):
    """Benchmark target that stores files as PostgreSQL large objects.

    Expected lifecycle: ``setup()`` once, ``store(fp)`` any number of times,
    then ``cleanup()``.
    """

    def _admin_execute(self, statement):
        """Run one statement on a short-lived autocommit connection to
        ``template1`` and always close that connection afterwards."""
        admin = psycopg2.connect(dbname='template1')
        try:
            # CREATE/DROP DATABASE cannot run inside a transaction block,
            # hence autocommit.
            admin.autocommit = True
            admin.cursor().execute(statement)
        finally:
            admin.close()

    def setup(self):
        """Create a fresh scratch database and connect to it."""
        # A run that died before cleanup() leaves the database behind, which
        # would make CREATE DATABASE fail on every later run.
        self._admin_execute('DROP DATABASE IF EXISTS lobs_benchmark_db')
        self._admin_execute('CREATE DATABASE lobs_benchmark_db')
        self.conn = psycopg2.connect(dbname='lobs_benchmark_db')

    def store(self, fp):
        """Copy the file-like object *fp* into a new large object and commit.

        Data is read and written ``CHUNK_SIZE`` bytes at a time.
        """
        lob = self.conn.lobject(mode='wb')
        while True:
            chunk = fp.read(CHUNK_SIZE)
            if not chunk:
                break
            lob.write(chunk)
        self.conn.commit()

    def cleanup(self):
        """Close the benchmark connection and drop the scratch database."""
        # setup() may have failed before self.conn was assigned.
        conn = getattr(self, 'conn', None)
        if conn is not None:
            # The database cannot be dropped while this session is connected.
            conn.close()
            self.conn = None
        self._admin_execute('DROP DATABASE IF EXISTS lobs_benchmark_db')

class RandomData(object):
    """Endless, re-creatable stream of text chunks built from ``lorem.txt``.

    Two instances created with the same *seed* produce the same sequence of
    chunks, so every backend under test can be fed identical data.
    """
    # See http://jessenoller.com/blog/2008/05/30/making-re-creatable-random-data-files-really-fast-in-python

    def __init__(self, seed):
        """Derive a 20-digit rotation key from *seed* and load the word list.

        ``lorem.txt`` is read from the current working directory.
        """
        # random.Random honours the seed. random.SystemRandom (used before)
        # draws from the OS entropy source and ignores it, which made the
        # data different on every instantiation.
        rng = random.Random(seed)
        self.seed = "".join(str(rng.randint(0, 9)) for _ in range(20))
        with open("lorem.txt", "r") as source:
            # Newlines are removed rather than replaced by spaces, so the
            # last word of a line fuses with the first word of the next one.
            self.words = source.read().replace("\n", '').split()

    def __iter__(self):
        """Yield space-joined runs of up to 1024 words, forever.

        After each chunk the word list is rotated by the current key digit
        and the key itself is rotated by one position.
        """
        words = collections.deque(self.words)
        digits = collections.deque(self.seed)
        while True:
            yield ' '.join(list(words)[0:1024])
            words.rotate(int(digits[0]))
            digits.rotate(1)

def randombytes(datasource, size):
    """Return an ``io.BytesIO`` holding exactly *size* bytes from *datasource*.

    *datasource* is an iterator yielding chunks of ``bytes`` or text; text
    chunks are encoded as UTF-8. Chunks are appended until at least *size*
    bytes are buffered, then the buffer is cut to *size* and rewound to
    position 0.

    Raises ``ValueError`` if the iterator yields an empty chunk (the loop
    could otherwise never finish). ``StopIteration`` propagates if the
    iterator runs out before *size* bytes are available.
    """
    fp = io.BytesIO()
    while fp.tell() < size:
        # next() works on Python 2.6+ and 3; the .next() method is 2-only.
        chunk = next(datasource)
        if not isinstance(chunk, bytes):
            chunk = chunk.encode('utf-8')
        if not chunk:
            raise ValueError('datasource produced an empty chunk')
        fp.write(chunk)
    # Whole chunks were appended, so the buffer usually overshoots; trim it
    # so the stored file really has the advertised size.
    fp.truncate(max(int(size), 0))
    fp.seek(0)
    return fp

def humanize_bytes(size):
    """Format a byte count as a short label such as ``'50M'``, ``'10k'``
    or ``'512b'``.

    The largest unit (M = 2**20, k = 2**10) that does not exceed *size* is
    chosen; anything smaller than 1k is reported in bytes. The fractional
    part is dropped by the ``%i`` conversion.
    """
    units = ((1 << 20, 'M'), (1 << 10, 'k'))
    # First unit that fits, falling back to plain bytes.
    divisor, unit = next(
        (pair for pair in units if size >= pair[0]),
        (1, 'b'),
    )
    return '%i%s' % (size / divisor, unit)

def benchmark(size, reps):
    """Time *reps* stores of a *size*-byte file on each storage backend.

    For GridFS and then for PostgreSQL large objects, one elapsed time in
    seconds is written per line to ``gridfs-<size>-<reps>.txt`` and
    ``lobject-<size>-<reps>.txt`` in the current directory. Only the
    ``store()`` call is timed; generating the data is not. Both backends
    are driven from the same seed.
    """
    # Wall-clock timer available on Python 2 and 3. time.clock() reports CPU
    # time of this process on Unix, which leaves out the time spent waiting
    # for the database server, and it no longer exists in Python 3.8+.
    from timeit import default_timer

    seed = time.time()
    label = humanize_bytes(size)
    targets = (
        (GridFSBench(), 'gridfs-%s-%s.txt' % (label, reps)),
        (LobjectBench(), 'lobject-%s-%s.txt' % (label, reps)),
    )

    for bench, report_name in targets:
        data = iter(RandomData(seed))
        # The with-block guarantees the report is flushed and closed.
        with open(report_name, 'w') as report:
            bench.setup()
            try:
                for _ in range(reps):
                    fp = randombytes(data, size)
                    start = default_timer()
                    bench.store(fp)
                    elapsed = default_timer() - start
                    report.write('%s\n' % elapsed)
            finally:
                bench.cleanup()

if __name__ == '__main__':
    # Usage: python benchmark-storage.py SIZE_BYTES REPS
    if len(sys.argv) < 3:
        sys.exit('usage: %s SIZE_BYTES REPS' % sys.argv[0])
    size = int(sys.argv[1])
    reps = int(sys.argv[2])
    # A parenthesised single argument prints the same text whether print is
    # a statement (Python 2) or a function (Python 3).
    print("Benchmarking %s x %s..." % (humanize_bytes(size), reps))
    benchmark(size, reps)

## results.md

      
    Raw
  

              results.md
            
          
    Results summary

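Times are the average number of seconds required to store one file (20 runs per file size).
"Lobject diff" is how much less time Lobject took than GridFS, rounded to the nearest percent.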
File size   GridFS avg      Lobject avg  Lobject diff
==========  ==============  ===========  ============
1k          0.0017968       0.0002748    85%
10k         0.00180485      0.0003415    81%
1M          0.00667265      0.00271265   59%
10M         0.04660705      0.0214903    54%
50M         0.22207705      0.1296414    42%

Detailed results

(Courtesy of ministat.)
1k

$ python benchmark-storage.py 1024 20
$ ministat -c99 -w74 gridfs-1k-20.txt lobject-1k-20.txt
x gridfs-1k-20.txt
+ lobject-1k-20.txt
+--------------------------------------------------------------------------+
|   +                                                                      |
|   +                                                                      |
|   +                                                                      |
|   +                                                                      |
|   +                                                                      |
|   +                                                                      |
|   ++                                                                     |
|   ++                                                                     |
|   ++ +                         x                                         |
|   ++ +                         x   x                                     |
|   ++ +                 +       x  xxxx x x xx x   xx xx x x             x|
||__M_A___|                        |________M_A__________|                 |
+--------------------------------------------------------------------------+
    N           Min           Max        Median           Avg        Stddev
x  20      0.001297       0.00286     0.0017275     0.0017968   0.000412761
+  20      0.000199      0.000995      0.000227     0.0002748 0.00017411327
Difference at 99.0% confidence
    -0.001522 +/- 0.000271665
    -84.7061% +/- 15.1194%
    (Student's t, pooled s = 0.00031677)

10k

$ python benchmark-storage.py 10240 20
$ ministat -c99 -w74 gridfs-10k-20.txt lobject-10k-20.txt
x gridfs-10k-20.txt
+ lobject-10k-20.txt
+--------------------------------------------------------------------------+
|+                                                                         |
|+                                                                         |
|+ +                                                                       |
|+ ++ +                           x                                        |
|++++ +                           x  x  x                                  |
|+++++++                  x x   xxxx xxxxx   xx     x  x                  x|
||_A_|                       |_______M__A__________|                       |
+--------------------------------------------------------------------------+
    N           Min           Max        Median           Avg        Stddev
x  20      0.001258       0.00316     0.0017045    0.00180485 0.00042674154
+  20      0.000253      0.000498     0.0003245     0.0003415 7.8833268e-05
Difference at 99.0% confidence
    -0.00146335 +/- 0.000263164
    -81.0788% +/- 14.5809%
    (Student's t, pooled s = 0.000306857)

1M

$ python benchmark-storage.py 1048576 20
$ ministat -c99 -w74 gridfs-1M-20.txt lobject-1M-20.txt
x gridfs-1M-20.txt
+ lobject-1M-20.txt
+--------------------------------------------------------------------------+
| +                                                                        |
| +                                                                        |
| ++                                   x                                   |
| ++                                   x                                   |
| ++                               x x x                                   |
| ++++                             x x x     x                             |
|+++++  ++ +                     xxx x xxx  xxx                           x|
||_MA_|                         |______M_A_______|                         |
+--------------------------------------------------------------------------+
    N           Min           Max        Median           Avg        Stddev
x  20      0.005882      0.010248     0.0064845    0.00667265 0.00093660683
+  20      0.002418      0.003446     0.0026345    0.00271265  0.0002695611
Difference at 99.0% confidence
    -0.00396 +/- 0.000591034
    -59.3467% +/- 8.85756%
    (Student's t, pooled s = 0.000689165)

10M

$ python benchmark-storage.py 10485760 20
$ ministat -c99 -w74 gridfs-10M-20.txt lobject-10M-20.txt
x gridfs-10M-20.txt
+ lobject-10M-20.txt
+--------------------------------------------------------------------------+
| +                                                                        |
| +                                                                        |
| ++                                                                       |
| ++                                                                       |
| ++                                                                       |
| ++                                         x                             |
|+++                                       xxx                             |
|+++                                       xxx     x                       |
|++++                                      xxx x   x x   xxx x     x      x|
| A|                                      |___M____A________|              |
+--------------------------------------------------------------------------+
    N           Min           Max        Median           Avg        Stddev
x  20      0.042279      0.058557     0.0440435    0.04660705  0.0046465036
+  20      0.020823      0.022263     0.0215675     0.0214903 0.00034488108
Difference at 99.0% confidence
    -0.0251167 +/- 0.00282549
    -53.8905% +/- 6.06237%
    (Student's t, pooled s = 0.00329461)

50M

$ python benchmark-storage.py 52428800 20
$ ministat -c99 -w74 gridfs-50M-20.txt lobject-50M-20.txt
x gridfs-50M-20.txt
+ lobject-50M-20.txt
+--------------------------------------------------------------------------+
| +                                                                        |
|++                                                 x                      |
|++                                                 x                      |
|++                + +                       x  x x x                      |
|++  +           ++++++++                  xxxxxx x xxx       x    x  x   x|
| |________A_________|                      |______M_A_______|             |
+--------------------------------------------------------------------------+
    N           Min           Max        Median           Avg        Stddev
x  20      0.201251      0.269507     0.2179865    0.22207705   0.019609101
+  20       0.10715      0.157961     0.1285205     0.1296414   0.021250329
Difference at 99.0% confidence
    -0.0924356 +/- 0.0175349
    -41.6232% +/- 7.89584%
    (Student's t, pooled s = 0.0204462)
	import sys
	import collections
	import gridfs
	import io
	import psycopg2
	import pymongo
	import random
	import time

	# For fairness use the same chunk size - 512k.
	# The value is handed to GridFS as ``chunk_size`` and is also the amount
	# read from the source file per write into a PostgreSQL large object.
	CHUNK_SIZE = 1024 * 512  # bytes

	class GridFSBench(object):
	    """Benchmark target that stores files in MongoDB through GridFS.

	    Expected lifecycle: ``setup()`` once, ``store(fp)`` any number of
	    times, then ``cleanup()``.
	    """

	    # Scratch database holding the benchmark files; removed by cleanup().
	    DB_NAME = 'gridfs_benchmark_db'

	    def setup(self):
	        """Connect to MongoDB (driver defaults) and open GridFS on the
	        scratch database."""
	        self.mongo = pymongo.MongoClient()
	        self.fs = gridfs.GridFS(self.mongo[self.DB_NAME])

	    def store(self, fp):
	        """Store the contents of the file-like object *fp* as one GridFS file."""
	        self.fs.put(fp, chunk_size=CHUNK_SIZE)

	    def cleanup(self):
	        """Drop the scratch database and close the client connection."""
	        try:
	            # Dropped unconditionally: the previous existence check relied
	            # on MongoClient.database_names(), which PyMongo deprecated in
	            # 3.6 and removed in 4.0. Dropping a database that does not
	            # exist is expected to be a harmless no-op on the server.
	            self.mongo.drop_database(self.DB_NAME)
	        finally:
	            # Release the client's sockets even if the drop failed.
	            self.mongo.close()

	class LobjectBench(object):
	    """Benchmark target that stores files as PostgreSQL large objects.

	    Expected lifecycle: ``setup()`` once, ``store(fp)`` any number of
	    times, then ``cleanup()``.
	    """

	    def _admin_execute(self, statement):
	        """Run one statement on a short-lived autocommit connection to
	        ``template1`` and always close that connection afterwards."""
	        admin = psycopg2.connect(dbname='template1')
	        try:
	            # CREATE/DROP DATABASE cannot run inside a transaction block,
	            # hence autocommit.
	            admin.autocommit = True
	            admin.cursor().execute(statement)
	        finally:
	            admin.close()

	    def setup(self):
	        """Create a fresh scratch database and connect to it."""
	        # A run that died before cleanup() leaves the database behind,
	        # which would make CREATE DATABASE fail on every later run.
	        self._admin_execute('DROP DATABASE IF EXISTS lobs_benchmark_db')
	        self._admin_execute('CREATE DATABASE lobs_benchmark_db')
	        self.conn = psycopg2.connect(dbname='lobs_benchmark_db')

	    def store(self, fp):
	        """Copy the file-like object *fp* into a new large object and commit.

	        Data is read and written ``CHUNK_SIZE`` bytes at a time.
	        """
	        lob = self.conn.lobject(mode='wb')
	        while True:
	            chunk = fp.read(CHUNK_SIZE)
	            if not chunk:
	                break
	            lob.write(chunk)
	        self.conn.commit()

	    def cleanup(self):
	        """Close the benchmark connection and drop the scratch database."""
	        # setup() may have failed before self.conn was assigned.
	        conn = getattr(self, 'conn', None)
	        if conn is not None:
	            # The database cannot be dropped while this session is connected.
	            conn.close()
	            self.conn = None
	        self._admin_execute('DROP DATABASE IF EXISTS lobs_benchmark_db')

	class RandomData(object):
	    """Endless, re-creatable stream of text chunks built from ``lorem.txt``.

	    Two instances created with the same *seed* produce the same sequence
	    of chunks, so every backend under test can be fed identical data.
	    """
	    # See http://jessenoller.com/blog/2008/05/30/making-re-creatable-random-data-files-really-fast-in-python

	    def __init__(self, seed):
	        """Derive a 20-digit rotation key from *seed* and load the word list.

	        ``lorem.txt`` is read from the current working directory.
	        """
	        # random.Random honours the seed. random.SystemRandom (used before)
	        # draws from the OS entropy source and ignores it, which made the
	        # data different on every instantiation.
	        rng = random.Random(seed)
	        self.seed = "".join(str(rng.randint(0, 9)) for _ in range(20))
	        with open("lorem.txt", "r") as source:
	            # Newlines are removed rather than replaced by spaces, so the
	            # last word of a line fuses with the first word of the next.
	            self.words = source.read().replace("\n", '').split()

	    def __iter__(self):
	        """Yield space-joined runs of up to 1024 words, forever.

	        After each chunk the word list is rotated by the current key digit
	        and the key itself is rotated by one position.
	        """
	        words = collections.deque(self.words)
	        digits = collections.deque(self.seed)
	        while True:
	            yield ' '.join(list(words)[0:1024])
	            words.rotate(int(digits[0]))
	            digits.rotate(1)

	def randombytes(datasource, size):
	    """Return an ``io.BytesIO`` holding exactly *size* bytes from *datasource*.

	    *datasource* is an iterator yielding chunks of ``bytes`` or text; text
	    chunks are encoded as UTF-8. Chunks are appended until at least *size*
	    bytes are buffered, then the buffer is cut to *size* and rewound to
	    position 0.

	    Raises ``ValueError`` if the iterator yields an empty chunk (the loop
	    could otherwise never finish). ``StopIteration`` propagates if the
	    iterator runs out before *size* bytes are available.
	    """
	    fp = io.BytesIO()
	    while fp.tell() < size:
	        # next() works on Python 2.6+ and 3; the .next() method is 2-only.
	        chunk = next(datasource)
	        if not isinstance(chunk, bytes):
	            chunk = chunk.encode('utf-8')
	        if not chunk:
	            raise ValueError('datasource produced an empty chunk')
	        fp.write(chunk)
	    # Whole chunks were appended, so the buffer usually overshoots; trim
	    # it so the stored file really has the advertised size.
	    fp.truncate(max(int(size), 0))
	    fp.seek(0)
	    return fp

	def humanize_bytes(size):
	    """Format a byte count as a short label such as ``'50M'``, ``'10k'``
	    or ``'512b'``.

	    The largest unit (M = 2**20, k = 2**10) that does not exceed *size* is
	    chosen; anything smaller than 1k is reported in bytes. The fractional
	    part is dropped by the ``%i`` conversion.
	    """
	    units = ((1 << 20, 'M'), (1 << 10, 'k'))
	    # First unit that fits, falling back to plain bytes.
	    divisor, unit = next(
	        (pair for pair in units if size >= pair[0]),
	        (1, 'b'),
	    )
	    return '%i%s' % (size / divisor, unit)

	def benchmark(size, reps):
	    """Time *reps* stores of a *size*-byte file on each storage backend.

	    For GridFS and then for PostgreSQL large objects, one elapsed time in
	    seconds is written per line to ``gridfs-<size>-<reps>.txt`` and
	    ``lobject-<size>-<reps>.txt`` in the current directory. Only the
	    ``store()`` call is timed; generating the data is not. Both backends
	    are driven from the same seed.
	    """
	    # Wall-clock timer available on Python 2 and 3. time.clock() reports
	    # CPU time of this process on Unix, which leaves out the time spent
	    # waiting for the database server, and it is gone in Python 3.8+.
	    from timeit import default_timer

	    seed = time.time()
	    label = humanize_bytes(size)
	    targets = (
	        (GridFSBench(), 'gridfs-%s-%s.txt' % (label, reps)),
	        (LobjectBench(), 'lobject-%s-%s.txt' % (label, reps)),
	    )

	    for bench, report_name in targets:
	        data = iter(RandomData(seed))
	        # The with-block guarantees the report is flushed and closed.
	        with open(report_name, 'w') as report:
	            bench.setup()
	            try:
	                for _ in range(reps):
	                    fp = randombytes(data, size)
	                    start = default_timer()
	                    bench.store(fp)
	                    elapsed = default_timer() - start
	                    report.write('%s\n' % elapsed)
	            finally:
	                bench.cleanup()

	if __name__ == '__main__':
	    # Usage: python benchmark-storage.py SIZE_BYTES REPS
	    if len(sys.argv) < 3:
	        sys.exit('usage: %s SIZE_BYTES REPS' % sys.argv[0])
	    size = int(sys.argv[1])
	    reps = int(sys.argv[2])
	    # A parenthesised single argument prints the same text whether print
	    # is a statement (Python 2) or a function (Python 3).
	    print("Benchmarking %s x %s..." % (humanize_bytes(size), reps))
	    benchmark(size, reps)