Skip to content

Instantly share code, notes, and snippets.

@chdorner
Last active June 23, 2023 20:13
Show Gist options
  • Save chdorner/4660b3069c69a211ea733fb5beaf0ba7 to your computer and use it in GitHub Desktop.
Save chdorner/4660b3069c69a211ea733fb5beaf0ba7 to your computer and use it in GitHub Desktop.
SQLAlchemy scan large table in batches

My database had 72k annotations at the time I ran these benchmarks; here are the results:

$ python scripts/batch_bench.py conf/development-app.ini dumb
Memory summary: start
      types |   # objects |   total size
=========== | =========== | ============
       dict |       13852 |     12.46 MB
  frozenset |         349 |     11.85 MB
VM: 327.29Mb
Memory summary: after dumb query
      types |   # objects |   total size
=========== | =========== | ============
       dict |       14201 |     15.79 MB
  frozenset |         350 |     11.85 MB
VM: 575.88Mb

$ python scripts/batch_bench.py conf/development-app.ini stream
Memory summary: start
      types |   # objects |   total size
=========== | =========== | ============
       dict |       13852 |     12.46 MB
  frozenset |         349 |     11.85 MB
VM: 327.29Mb
Memory summary: after streaming
      types |   # objects |   total size
=========== | =========== | ============
       dict |       14201 |     15.79 MB
  frozenset |         350 |     11.85 MB
VM: 495.90Mb

$ python scripts/batch_bench.py conf/development-app.ini yield_per
Memory summary: start
      types |   # objects |   total size
=========== | =========== | ============
       dict |       13852 |     12.46 MB
  frozenset |         349 |     11.85 MB
VM: 327.29Mb
Memory summary: after yield_per
      types |   # objects |   total size
=========== | =========== | ============
       dict |       14198 |     12.81 MB
  frozenset |         350 |     11.85 MB
VM: 326.30Mb
# -*- coding: utf-8 -*-
import argparse
import itertools
from pympler import summary, muppy
import psutil
from pyramid import paster
from pyramid.request import Request
from sqlalchemy.orm import scoped_session, sessionmaker
from h import db
from h.api import models
# Command-line interface: a paster configuration URI plus the benchmark
# variant to run ('dumb', 'stream', or 'yield_per' — see main()).
parser = argparse.ArgumentParser('batch_bench')
parser.add_argument('config_uri', help='paster configuration URI')
parser.add_argument('type')
# Thread-local session registry; bound to a real engine in main() once the
# app settings have been bootstrapped.
Session = scoped_session(sessionmaker())
def main():
    """Bootstrap the app, then run one batching strategy and report memory.

    The strategy is selected by the ``type`` CLI argument:

    * ``dumb`` — plain query; the full result set is buffered client-side.
    * ``stream`` — server-side cursor via ``stream_results=True``.
    * ``yield_per`` — SQLAlchemy's ``Query.yield_per`` partial loading.

    Each branch drains the query in batches of 100 and prints a memory
    summary before and after.
    """
    args = parser.parse_args()

    # Bootstrap a pyramid environment so registry settings are available.
    request = Request.blank('/')
    env = paster.bootstrap(args.config_uri, request=request)
    request.root = env['root']

    engine = db.make_engine(request.registry.settings)
    Session.configure(bind=engine)

    batchsize = 100
    memory_usage('start')

    if args.type == 'dumb':
        query = Session.query(models.Annotation)
        for _ in _batch_iter(batchsize, query):
            pass
        memory_usage('after dumb query')
    elif args.type == 'stream':
        query = Session.query(models.Annotation).execution_options(stream_results=True)
        for _ in _batch_iter(batchsize, query):
            pass
        memory_usage('after streaming')
    elif args.type == 'yield_per':
        query = Session.query(models.Annotation).yield_per(batchsize)
        for _ in _batch_iter(batchsize, query):
            pass
        memory_usage('after yield_per')
def get_virtual_memory_usage_kb():
    """Return the process's current virtual memory size in KiB, as a float.

    Uses ``psutil.Process().memory_info()`` rather than the deprecated
    ``memory_info_ex()`` (removed in psutil 4.x); both expose the same
    ``vms`` field, so the reported value is unchanged.
    """
    return float(psutil.Process().memory_info().vms) / 1024.0
def memory_usage(where):
"""
Print out a basic summary of memory usage.
Taken from: https://hyp.is/AVQ-y75KH9ZO4OKSl23K/www.mobify.com/blog/sqlalchemy-memory-magic/
"""
mem_summary = summary.summarize(muppy.get_objects())
print "Memory summary:", where
summary.print_(mem_summary, limit=2)
print "VM: %.2fMb" % (get_virtual_memory_usage_kb() / 1024.0)
def _batch_iter(n, iterable):
it = iter(iterable)
while True:
batch = list(itertools.islice(it, n))
if not batch:
return
yield batch
# Run the benchmark only when executed as a script, not on import.
if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment