Skip to content

Instantly share code, notes, and snippets.

@mlissner
Forked from dbrgn/queryset_generators.py
Created March 10, 2012 04:13
Show Gist options
  • Save mlissner/2010093 to your computer and use it in GitHub Desktop.
Save mlissner/2010093 to your computer and use it in GitHub Desktop.
Adds a date-based queryset generator
import gc
from datetime import datetime
from datetime import timedelta
def queryset_generator(queryset, chunksize=1000):
    """
    Iterate over a Django queryset in primary-key order, chunk by chunk.

    At most ``chunksize`` (default: 1000) rows are fetched per query, so only
    that many rows are held in memory at a time, while Django would normally
    load all rows of the queryset at once. Using the ``iterator()`` method on
    its own only causes Django to not preload all the classes.

    The queryset is re-ordered by ``pk``, so ordered querysets are not
    supported: any ordering already applied is discarded. Primary keys are
    assumed to be numeric, because the starting point is computed as
    ``first pk - 1``.

    An empty queryset yields nothing.
    """
    try:
        # Snapshot the highest pk up front; it is the loop's stop condition.
        last_pk = queryset.order_by('-pk')[0].pk
        queryset = queryset.order_by('pk')
        # Start just below the smallest pk so the first ``pk__gt`` filter
        # includes the first row.
        pk = queryset[0].pk - 1
    except IndexError:
        # Indexing an empty queryset raises IndexError: nothing to yield.
        return

    while pk < last_pk:
        chunk_was_empty = True
        for row in queryset.filter(pk__gt=pk)[:chunksize]:
            chunk_was_empty = False
            pk = row.pk
            yield row
        if chunk_was_empty:
            # No progress is possible (e.g. the trailing rows were deleted
            # while iterating, or chunksize is 0); stop instead of issuing
            # the same query forever.
            break
        # Free the rows of the finished chunk before fetching the next one.
        gc.collect()
def queryset_list_generator(queryset, listsize=10000, chunksize=1000):
    """
    Iterate over a Django queryset ordered by the primary key and yield
    lists of model objects of the size ``listsize``.

    Rows are pulled from ``queryset_generator``, which loads a maximum of
    ``chunksize`` (default: 1000) rows at a time, while Django would normally
    load all rows in its memory.

    In contrast to ``queryset_generator``, rows are not yielded one by one
    but as lists of ``listsize`` (default: 10000) rows. The final list holds
    whatever rows remain and may therefore be shorter than ``listsize``; an
    empty list is never yielded.

    Note that the implementation of the generator does not support ordered
    query sets.
    """
    row_list = []
    for row in queryset_generator(queryset, chunksize):
        row_list.append(row)
        if len(row_list) >= listsize:
            yield row_list
            # Start a fresh list; the yielded one now belongs to the caller.
            row_list = []
    if row_list:
        # Flush the trailing partial batch so that no rows are dropped when
        # the row count is not a multiple of listsize.
        yield row_list
def queryset_generator_by_date(queryset, date_field, start_date, end_date, chunksize=7):
    """
    Take a queryset and yield its rows in date-based chunks.

    Useful if sorting by pk isn't needed. For large querysets, such sorting
    can be very expensive.

    ``date_field`` is the name of the date field that should be used for
    chunking. This field should have ``db_index=True`` in your model.

    ``chunksize`` is given in days (default: 7) and must be at least 1.
    ``start_date`` and ``end_date`` are strings in the form ``2012-03-08``;
    both days are included in the result. If ``end_date`` is earlier than
    ``start_date``, nothing is yielded.

    Each chunk is selected with a half-open window
    (``field >= bottom`` and ``field < next bottom``). For a date-only field
    this equals an inclusive ``<=`` on the chunk's last day; for a field that
    also stores a time of day it keeps the rows of that last day, which an
    inclusive bound evaluated at midnight would miss.

    Raises ValueError if ``chunksize`` is smaller than 1 or if a date string
    does not match the expected format.
    """
    if chunksize < 1:
        # A window shorter than one day never advances a date, which would
        # make the loop below run forever.
        raise ValueError('chunksize must be at least 1 day, got %r' % (chunksize,))

    step = timedelta(chunksize)
    end_date = datetime.strptime(end_date, '%Y-%m-%d').date()
    bottom_date = datetime.strptime(start_date, '%Y-%m-%d').date()
    # Exclusive upper bound of the whole range: the day after end_date.
    stop_date = end_date + timedelta(1)

    gte_lookup = '%s__gte' % date_field
    lt_lookup = '%s__lt' % date_field

    while bottom_date < stop_date:
        # Clamp the last window so it never reaches past end_date.
        next_bottom = min(bottom_date + step, stop_date)
        keywords = {gte_lookup: bottom_date, lt_lookup: next_bottom}
        for row in queryset.filter(**keywords):
            yield row
        bottom_date = next_bottom
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment