Skip to content

Instantly share code, notes, and snippets.

@jacobian
Last active April 11, 2018 10:04
Show Gist options
  • Star 10 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save jacobian/5005548 to your computer and use it in GitHub Desktop.
Save jacobian/5005548 to your computer and use it in GitHub Desktop.
import sys
import collections
import gridfs
import io
import psycopg2
import pymongo
import random
import time
# For fairness use the same chunk size - 512k.
CHUNK_SIZE = 1024 * 512
class GridFSBench(object):
def setup(self):
self.mongo = pymongo.MongoClient()
self.fs = gridfs.GridFS(self.mongo.gridfs_benchmark_db)
def store(self, fp):
self.fs.put(fp, chunk_size=CHUNK_SIZE)
def cleanup(self):
if 'gridfs_benchmark_db' in self.mongo.database_names():
self.mongo.drop_database('gridfs_benchmark_db')
class LobjectBench(object):
def setup(self):
c = psycopg2.connect(dbname='template1')
c.autocommit = True
c.cursor().execute('CREATE DATABASE lobs_benchmark_db')
c.close()
self.conn = psycopg2.connect(dbname='lobs_benchmark_db')
def store(self, fp):
lob = self.conn.lobject(mode='wb')
while 1:
chunk = fp.read(CHUNK_SIZE)
if not chunk:
break
lob.write(chunk)
self.conn.commit()
def cleanup(self):
self.conn.close()
c = psycopg2.connect(dbname='template1')
c.autocommit = True
c.cursor().execute('DROP DATABASE IF EXISTS lobs_benchmark_db')
c.close()
class RandomData(object):
# See http://jessenoller.com/blog/2008/05/30/making-re-creatable-random-data-files-really-fast-in-python
def __init__(self, seed):
r = random.SystemRandom(seed)
self.seed = "".join(str(r.randint(0,9)) for i in range(20))
self.words = open("lorem.txt", "r").read().replace("\n", '').split()
def __iter__(self):
a = collections.deque(self.words)
b = collections.deque(self.seed)
while True:
yield ' '.join(list(a)[0:1024])
a.rotate(int(b[0]))
b.rotate(1)
def randombytes(datasource, size):
fp = io.BytesIO()
while fp.tell() < size:
fp.write(datasource.next())
fp.seek(0)
return fp
def humanize_bytes(size):
for factor, suffix in ((1<<20, 'M'), (1<<10, 'k'), (1, 'b')):
if size >= factor:
break
return '%i%s' % (size/factor, suffix)
def benchmark(size, reps):
seed = time.time()
gridbench = GridFSBench()
gridreport = open('gridfs-%s-%s.txt' % (humanize_bytes(size), reps), 'w')
lobbench = LobjectBench()
lobreport = open('lobject-%s-%s.txt' % (humanize_bytes(size), reps), 'w')
for (bench, report) in ((gridbench, gridreport), (lobbench, lobreport)):
data = iter(RandomData(seed))
bench.setup()
try:
for i in xrange(reps):
fp = randombytes(data, size)
start = time.clock()
bench.store(fp)
elapsed = time.clock() - start
report.write('%s\n' % elapsed)
finally:
bench.cleanup()
if __name__ == '__main__':
size = int(sys.argv[1])
reps = int(sys.argv[2])
print "Benchmarking %s x %s..." % (humanize_bytes(size), reps)
benchmark(size, reps)

Results summary

Times are seconds required to store a file.

File size   GridFS avg      Lobject avg  Lobject diff
==========  ==============  ===========  ============
1k          0.0017968       0.0002748    85%
10k         0.00180485      0.0003415    81%
1M          0.00667265      0.00271265   59%
10M         0.04660705      0.0214903    54%
50M         0.22207705      0.1296414    42%

Detailed results

(Courtesy of ministat.)

1k

$ python benchmark-storage.py 1024 20
$ ministat -c99 -w74 gridfs-1k-20.txt lobject-1k-20.txt
x gridfs-1k-20.txt
+ lobject-1k-20.txt
+--------------------------------------------------------------------------+
|   +                                                                      |
|   +                                                                      |
|   +                                                                      |
|   +                                                                      |
|   +                                                                      |
|   +                                                                      |
|   ++                                                                     |
|   ++                                                                     |
|   ++ +                         x                                         |
|   ++ +                         x   x                                     |
|   ++ +                 +       x  xxxx x x xx x   xx xx x x             x|
||__M_A___|                        |________M_A__________|                 |
+--------------------------------------------------------------------------+
    N           Min           Max        Median           Avg        Stddev
x  20      0.001297       0.00286     0.0017275     0.0017968   0.000412761
+  20      0.000199      0.000995      0.000227     0.0002748 0.00017411327
Difference at 99.0% confidence
    -0.001522 +/- 0.000271665
    -84.7061% +/- 15.1194%
    (Student's t, pooled s = 0.00031677)

10k

$ python benchmark-storage.py 10240 20
$ ministat -c99 -w74 gridfs-10k-20.txt lobject-10k-20.txt
x gridfs-10k-20.txt
+ lobject-10k-20.txt
+--------------------------------------------------------------------------+
|+                                                                         |
|+                                                                         |
|+ +                                                                       |
|+ ++ +                           x                                        |
|++++ +                           x  x  x                                  |
|+++++++                  x x   xxxx xxxxx   xx     x  x                  x|
||_A_|                       |_______M__A__________|                       |
+--------------------------------------------------------------------------+
    N           Min           Max        Median           Avg        Stddev
x  20      0.001258       0.00316     0.0017045    0.00180485 0.00042674154
+  20      0.000253      0.000498     0.0003245     0.0003415 7.8833268e-05
Difference at 99.0% confidence
    -0.00146335 +/- 0.000263164
    -81.0788% +/- 14.5809%
    (Student's t, pooled s = 0.000306857)

1M

$ python benchmark-storage.py 1048576 20
$ ministat -c99 -w74 gridfs-1M-20.txt lobject-1M-20.txt
x gridfs-1M-20.txt
+ lobject-1M-20.txt
+--------------------------------------------------------------------------+
| +                                                                        |
| +                                                                        |
| ++                                   x                                   |
| ++                                   x                                   |
| ++                               x x x                                   |
| ++++                             x x x     x                             |
|+++++  ++ +                     xxx x xxx  xxx                           x|
||_MA_|                         |______M_A_______|                         |
+--------------------------------------------------------------------------+
    N           Min           Max        Median           Avg        Stddev
x  20      0.005882      0.010248     0.0064845    0.00667265 0.00093660683
+  20      0.002418      0.003446     0.0026345    0.00271265  0.0002695611
Difference at 99.0% confidence
    -0.00396 +/- 0.000591034
    -59.3467% +/- 8.85756%
    (Student's t, pooled s = 0.000689165)

10M

$ python benchmark-storage.py 10485760 20
$ ministat -c99 -w74 gridfs-10M-20.txt lobject-10M-20.txt
x gridfs-10M-20.txt
+ lobject-10M-20.txt
+--------------------------------------------------------------------------+
| +                                                                        |
| +                                                                        |
| ++                                                                       |
| ++                                                                       |
| ++                                                                       |
| ++                                         x                             |
|+++                                       xxx                             |
|+++                                       xxx     x                       |
|++++                                      xxx x   x x   xxx x     x      x|
| A|                                      |___M____A________|              |
+--------------------------------------------------------------------------+
    N           Min           Max        Median           Avg        Stddev
x  20      0.042279      0.058557     0.0440435    0.04660705  0.0046465036
+  20      0.020823      0.022263     0.0215675     0.0214903 0.00034488108
Difference at 99.0% confidence
    -0.0251167 +/- 0.00282549
    -53.8905% +/- 6.06237%
    (Student's t, pooled s = 0.00329461)

50M

$ python benchmark-storage.py 52428800 20
$ ministat -c99 -w74 gridfs-50M-20.txt lobject-50M-20.txt
x gridfs-50M-20.txt
+ lobject-50M-20.txt
+--------------------------------------------------------------------------+
| +                                                                        |
|++                                                 x                      |
|++                                                 x                      |
|++                + +                       x  x x x                      |
|++  +           ++++++++                  xxxxxx x xxx       x    x  x   x|
| |________A_________|                      |______M_A_______|             |
+--------------------------------------------------------------------------+
    N           Min           Max        Median           Avg        Stddev
x  20      0.201251      0.269507     0.2179865    0.22207705   0.019609101
+  20       0.10715      0.157961     0.1285205     0.1296414   0.021250329
Difference at 99.0% confidence
    -0.0924356 +/- 0.0175349
    -41.6232% +/- 7.89584%
    (Student's t, pooled s = 0.0204462)
@yuwtennis
Copy link

Have you tried benchmarking using GridFS streaming api (upload_from_stream , download_to_stream)?

@boazin
Copy link

boazin commented Jan 16, 2017

Great comparison.
Have you done something similar for read operations? (Fetching the data)

@changrui0608
Copy link

Hey, firstly thank you for the work, this make my work a lot more easier.

But I found something different.

At first I tried the benchmark script, got similar result,
then I add print in each store method, but in each test I felt GridFS takes less time than LO.

So I searched for something, and tried change time.clock() to timeit.default_timer():

def benchmark(size, reps):
    # seed = time.time()
    seed = timeit.default_timer()

    gridbench = GridFSBench()
    gridreport = open('gridfs-%s-%s.txt' % (humanize_bytes(size), reps), 'w')
    lobbench = LobjectBench()
    lobreport = open('lobject-%s-%s.txt' % (humanize_bytes(size), reps), 'w')

    for (bench, report) in ((gridbench, gridreport), (lobbench, lobreport)):
        data = iter(RandomData(seed))
        bench.setup()
        try:
            for i in xrange(reps):
                fp = randombytes(data, size)
                # start = time.clock()
                start = timeit.default_timer()
                bench.store(fp)
                # elapsed = time.clock() - start
                elapsed = timeit.default_timer() - start
                report.write('%s\n' % elapsed)
        finally:
            bench.cleanup()

And I ran some benchmark, and got opposite result, from some of my trying, basically GridFS is faster than LO.

# gridfs-1K-3
0.0497398376465
0.00375580787659
0.0034351348877

# lobject-1k-3
0.0123190879822
0.0057430267334
0.00509405136108
# gridfs-10k-3
0.0447769165039
0.00345492362976
0.00291919708252

# lobject-10k-3
0.0106348991394
0.00574421882629
0.00505495071411
# gridfs-1M-3
0.0604109764099
0.012845993042
0.0128200054169

# lobject-1M-3
0.0277950763702
0.0214290618896
0.0322630405426
# gridfs-50M-3
0.579872131348
0.501438856125
0.47722697258

# lobject-50M-3
0.931740999222
0.932983160019
0.915488004684

There are some different in my environment (DB host, user, password, etc.), and I ran DB in Docker, on my MacBook laptop running macOS, connect with TCP socket not Unix Socket. But it should not be the reason, because with time.clock() I got "LO faster".

Maybe there is something different on time.clock(), not very sure though.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment