Create a gist now

Instantly share code, notes, and snippets.

import collections
import io
import random
import sys
import time
import timeit

import gridfs
import psycopg2
import pymongo
# Chunk size in bytes (512 KiB) shared by the storage backends so that the
# comparison between them is fair.
CHUNK_SIZE = 512 * 1024
class GridFSBench(object):
    """Benchmark target that stores files in MongoDB through GridFS."""

    # Scratch database used by the benchmark; dropped again in cleanup().
    _DB_NAME = 'gridfs_benchmark_db'

    def setup(self):
        """Connect to MongoDB with client defaults and open GridFS on the scratch database."""
        client = pymongo.MongoClient()
        self.mongo = client
        self.fs = gridfs.GridFS(client[self._DB_NAME])

    def store(self, fp):
        """Store the file-like object ``fp`` in GridFS using CHUNK_SIZE-byte chunks."""
        self.fs.put(fp, chunk_size=CHUNK_SIZE)

    def cleanup(self):
        """Drop the scratch database if the server reports that it exists."""
        existing = self.mongo.database_names()
        if self._DB_NAME in existing:
            self.mongo.drop_database(self._DB_NAME)
class LobjectBench(object):
    """Benchmark target that stores files as PostgreSQL large objects."""

    def _run_on_template1(self, statement):
        """Execute one SQL statement over a short-lived autocommit connection to template1."""
        admin = psycopg2.connect(dbname='template1')
        # Autocommit is enabled because CREATE/DROP DATABASE cannot run
        # inside a transaction block.
        admin.autocommit = True
        admin.cursor().execute(statement)
        admin.close()

    def setup(self):
        """Create the scratch database and open the connection used by store()."""
        self._run_on_template1('CREATE DATABASE lobs_benchmark_db')
        self.conn = psycopg2.connect(dbname='lobs_benchmark_db')

    def store(self, fp):
        """Copy ``fp`` into a new large object in CHUNK_SIZE pieces, then commit."""
        lob = self.conn.lobject(mode='wb')
        chunk = fp.read(CHUNK_SIZE)
        while chunk:
            lob.write(chunk)
            chunk = fp.read(CHUNK_SIZE)
        self.conn.commit()

    def cleanup(self):
        """Close the benchmark connection and drop the scratch database."""
        # The connection must be closed first: the database is dropped from
        # a separate connection to template1.
        self.conn.close()
        self._run_on_template1('DROP DATABASE IF EXISTS lobs_benchmark_db')
class RandomData(object):
    """Endless, re-creatable stream of text chunks built from ``lorem.txt``.

    Two instances created with the same ``seed`` yield exactly the same
    sequence of chunks, so different backends can be fed identical data.

    See http://jessenoller.com/blog/2008/05/30/making-re-creatable-random-data-files-really-fast-in-python
    """

    def __init__(self, seed):
        """Derive a 20-digit rotation key from ``seed`` and load the word list.

        Reads ``lorem.txt`` from the current working directory; raises
        IOError/OSError if it cannot be opened.
        """
        # random.SystemRandom ignores its seed argument (it reads from the
        # OS entropy source), which made the data different on every
        # instantiation. A seeded random.Random is reproducible.
        rng = random.Random(seed)
        self.seed = "".join(str(rng.randint(0, 9)) for _ in range(20))
        with open("lorem.txt", "r") as source:
            text = source.read()
        # NOTE(review): newlines are removed rather than replaced by a space,
        # so the last word of a line is glued to the first word of the next
        # one. Kept as-is to preserve the generated data.
        self.words = text.replace("\n", '').split()

    def __iter__(self):
        """Yield space-joined chunks of up to 1024 words, forever."""
        words = collections.deque(self.words)
        digits = collections.deque(self.seed)
        while True:
            yield ' '.join(list(words)[0:1024])
            # Rotate the word list by the current key digit, then advance
            # to the next digit of the key.
            words.rotate(int(digits[0]))
            digits.rotate(1)
def randombytes(datasource, size):
    """Return an in-memory binary file holding exactly ``size`` bytes.

    ``datasource`` is an iterator yielding text or byte chunks; chunks are
    consumed until at least ``size`` bytes are buffered and the surplus of
    the last chunk is cut off. ``size`` is an integer byte count; a value
    of zero or less produces an empty buffer.

    The returned ``io.BytesIO`` is positioned at offset 0.

    Raises ValueError if ``datasource`` yields an empty chunk (the loop
    could otherwise never finish) and StopIteration if it runs dry before
    ``size`` bytes were produced.
    """
    fp = io.BytesIO()
    while fp.tell() < size:
        # next() works on both Python 2 and Python 3 iterators.
        chunk = next(datasource)
        if not isinstance(chunk, bytes):
            # BytesIO only accepts bytes; encode text chunks.
            chunk = chunk.encode('utf-8')
        if not chunk:
            raise ValueError('datasource yielded an empty chunk')
        fp.write(chunk)
    if size > 0:
        # Whole chunks were written, so the buffer may overshoot the
        # requested size; trim it so the caller stores exactly ``size`` bytes.
        fp.truncate(size)
    fp.seek(0)
    return fp
def humanize_bytes(size):
    """Format a byte count as a whole number with an 'M', 'k' or 'b' suffix.

    The largest unit that does not exceed ``size`` is used and the value is
    truncated to an integer, e.g. 1048576 -> '1M', 10240 -> '10k'.
    """
    units = ((1 << 20, 'M'), (1 << 10, 'k'), (1, 'b'))
    for factor, suffix in units:
        if size >= factor:
            return '%i%s' % (size / factor, suffix)
    # Sizes below one byte (zero or negative) are reported in the smallest unit.
    factor, suffix = units[-1]
    return '%i%s' % (size / factor, suffix)
def benchmark(size, reps):
    """Time ``reps`` stores of ``size``-byte files against each backend.

    For GridFS and then for PostgreSQL large objects, one elapsed time in
    seconds per store is written, one per line, to
    ``gridfs-<size>-<reps>.txt`` and ``lobject-<size>-<reps>.txt`` in the
    current directory. Only the ``store()`` call is timed; generating the
    data, setup and cleanup are excluded.
    """
    # The same seed is handed to the data generator of both backends.
    seed = time.time()
    label = humanize_bytes(size)
    targets = (
        (GridFSBench(), 'gridfs-%s-%s.txt' % (label, reps)),
        (LobjectBench(), 'lobject-%s-%s.txt' % (label, reps)),
    )
    for bench, report_name in targets:
        data = iter(RandomData(seed))
        # The context manager guarantees the report is flushed and closed
        # even if the backend fails part-way through.
        with open(report_name, 'w') as report:
            bench.setup()
            try:
                for _ in range(reps):
                    fp = randombytes(data, size)
                    # Wall-clock timer: time.clock() reports this process's
                    # CPU time on Unix, which leaves out the time spent
                    # waiting for the database server, and it no longer
                    # exists on Python 3.8+.
                    start = timeit.default_timer()
                    bench.store(fp)
                    elapsed = timeit.default_timer() - start
                    report.write('%s\n' % elapsed)
            finally:
                bench.cleanup()
if __name__ == '__main__':
    # Command line: python benchmark-storage.py SIZE_IN_BYTES REPETITIONS
    if len(sys.argv) < 3:
        # Exit with a usage hint instead of an IndexError traceback.
        sys.exit('usage: %s SIZE_IN_BYTES REPETITIONS' % sys.argv[0])
    size = int(sys.argv[1])
    reps = int(sys.argv[2])
    # A single parenthesised argument prints identically on Python 2 and 3.
    print("Benchmarking %s x %s..." % (humanize_bytes(size), reps))
    benchmark(size, reps)

Results summary

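Times are the seconds required to store a single file, averaged over 20 runs. "Lobject diff" is how much lower the Lobject average is than the GridFS average.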

File size   GridFS avg      Lobject avg  Lobject diff
==========  ==============  ===========  ============
1k          0.0017968       0.0002748    85%
10k         0.00180485      0.0003415    81%
1M          0.00667265      0.00271265   59%
10M         0.04660705      0.0214903    54%
50M         0.22207705      0.1296414    42%

Detailed results

(Courtesy of ministat.)

1k

$ python benchmark-storage.py 1024 20
$ ministat -c99 -w74 gridfs-1k-20.txt lobject-1k-20.txt
x gridfs-1k-20.txt
+ lobject-1k-20.txt
+--------------------------------------------------------------------------+
|   +                                                                      |
|   +                                                                      |
|   +                                                                      |
|   +                                                                      |
|   +                                                                      |
|   +                                                                      |
|   ++                                                                     |
|   ++                                                                     |
|   ++ +                         x                                         |
|   ++ +                         x   x                                     |
|   ++ +                 +       x  xxxx x x xx x   xx xx x x             x|
||__M_A___|                        |________M_A__________|                 |
+--------------------------------------------------------------------------+
    N           Min           Max        Median           Avg        Stddev
x  20      0.001297       0.00286     0.0017275     0.0017968   0.000412761
+  20      0.000199      0.000995      0.000227     0.0002748 0.00017411327
Difference at 99.0% confidence
    -0.001522 +/- 0.000271665
    -84.7061% +/- 15.1194%
    (Student's t, pooled s = 0.00031677)

10k

$ python benchmark-storage.py 10240 20
$ ministat -c99 -w74 gridfs-10k-20.txt lobject-10k-20.txt
x gridfs-10k-20.txt
+ lobject-10k-20.txt
+--------------------------------------------------------------------------+
|+                                                                         |
|+                                                                         |
|+ +                                                                       |
|+ ++ +                           x                                        |
|++++ +                           x  x  x                                  |
|+++++++                  x x   xxxx xxxxx   xx     x  x                  x|
||_A_|                       |_______M__A__________|                       |
+--------------------------------------------------------------------------+
    N           Min           Max        Median           Avg        Stddev
x  20      0.001258       0.00316     0.0017045    0.00180485 0.00042674154
+  20      0.000253      0.000498     0.0003245     0.0003415 7.8833268e-05
Difference at 99.0% confidence
    -0.00146335 +/- 0.000263164
    -81.0788% +/- 14.5809%
    (Student's t, pooled s = 0.000306857)

1M

$ python benchmark-storage.py 1048576 20
$ ministat -c99 -w74 gridfs-1M-20.txt lobject-1M-20.txt
x gridfs-1M-20.txt
+ lobject-1M-20.txt
+--------------------------------------------------------------------------+
| +                                                                        |
| +                                                                        |
| ++                                   x                                   |
| ++                                   x                                   |
| ++                               x x x                                   |
| ++++                             x x x     x                             |
|+++++  ++ +                     xxx x xxx  xxx                           x|
||_MA_|                         |______M_A_______|                         |
+--------------------------------------------------------------------------+
    N           Min           Max        Median           Avg        Stddev
x  20      0.005882      0.010248     0.0064845    0.00667265 0.00093660683
+  20      0.002418      0.003446     0.0026345    0.00271265  0.0002695611
Difference at 99.0% confidence
    -0.00396 +/- 0.000591034
    -59.3467% +/- 8.85756%
    (Student's t, pooled s = 0.000689165)

10M

$ python benchmark-storage.py 10485760 20
$ ministat -c99 -w74 gridfs-10M-20.txt lobject-10M-20.txt
x gridfs-10M-20.txt
+ lobject-10M-20.txt
+--------------------------------------------------------------------------+
| +                                                                        |
| +                                                                        |
| ++                                                                       |
| ++                                                                       |
| ++                                                                       |
| ++                                         x                             |
|+++                                       xxx                             |
|+++                                       xxx     x                       |
|++++                                      xxx x   x x   xxx x     x      x|
| A|                                      |___M____A________|              |
+--------------------------------------------------------------------------+
    N           Min           Max        Median           Avg        Stddev
x  20      0.042279      0.058557     0.0440435    0.04660705  0.0046465036
+  20      0.020823      0.022263     0.0215675     0.0214903 0.00034488108
Difference at 99.0% confidence
    -0.0251167 +/- 0.00282549
    -53.8905% +/- 6.06237%
    (Student's t, pooled s = 0.00329461)

50M

$ python benchmark-storage.py 52428800 20
$ ministat -c99 -w74 gridfs-50M-20.txt lobject-50M-20.txt
x gridfs-50M-20.txt
+ lobject-50M-20.txt
+--------------------------------------------------------------------------+
| +                                                                        |
|++                                                 x                      |
|++                                                 x                      |
|++                + +                       x  x x x                      |
|++  +           ++++++++                  xxxxxx x xxx       x    x  x   x|
| |________A_________|                      |______M_A_______|             |
+--------------------------------------------------------------------------+
    N           Min           Max        Median           Avg        Stddev
x  20      0.201251      0.269507     0.2179865    0.22207705   0.019609101
+  20       0.10715      0.157961     0.1285205     0.1296414   0.021250329
Difference at 99.0% confidence
    -0.0924356 +/- 0.0175349
    -41.6232% +/- 7.89584%
    (Student's t, pooled s = 0.0204462)
@yuwtennis

Have you tried benchmarking with the GridFS streaming API (upload_from_stream, download_to_stream)?

@boazin
boazin commented Jan 16, 2017

Great comparison.
Have you done something similar for read operations? (Fetching the data)

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment