Skip to content

Instantly share code, notes, and snippets.

@jonathanmorgan
Forked from dbrgn/queryset_generators.py
Created May 6, 2011 05:40
Show Gist options
  • Save jonathanmorgan/958496 to your computer and use it in GitHub Desktop.
Save jonathanmorgan/958496 to your computer and use it in GitHub Desktop.
queryset_generator and queryset_list_generator
'''
queryset_generator and queryset_list_generator based on:
https://gist.github.com/897894
'''
#===============================================================================
# imports (in alphabetical order by package, then by name)
#===============================================================================
# python standard libraries
import gc
import logging
def queryset_generator( queryset_IN, chunksize_IN = 1000 ):
    """
    Iterate over a Django QuerySet ordered by primary key, fetching it in
    chunks.

    Each chunk is retrieved with its own query: the queryset is ordered by
    'pk', filtered to rows whose pk is greater than the last pk yielded, and
    sliced to at most chunksize_IN rows.  The incoming queryset itself is
    never evaluated (no len() or truth test is applied to it), because doing
    so would make Django load and cache every row, which is exactly what
    this generator exists to avoid.

    Note that any ordering already applied to the queryset is replaced by
    ordering on the primary key.  The primary key does not have to be
    numeric; it only has to be usable with order_by() and pk__gt.

    Parameters:
    - queryset_IN - QuerySet to iterate over.  If None, nothing is yielded.
    - chunksize_IN - maximum number of rows requested per query
        (default: 1000).  Must be at least 1.

    Yields: each row of the queryset, in ascending primary key order.

    Raises: ValueError if chunksize_IN is less than 1.

    Usage:
        my_queryset = queryset_generator( MyItem.objects.all() )
        for item in my_queryset:
            item.do_something()
    """

    # declare variables
    queryset = None
    chunk = None
    current_pk = None
    row_count = -1

    # a chunk size below 1 could never make progress.
    if ( chunksize_IN < 1 ):

        raise ValueError( "chunksize_IN must be at least 1, got " + str( chunksize_IN ) )

    #-- END check to make sure chunk size is usable --#

    # got a queryset?  Compare to None explicitly - a truth test or len()
    #     would evaluate the entire queryset.
    if ( queryset_IN is not None ):

        # order queryset by primary key, ascending.
        queryset = queryset_IN.order_by( 'pk' )

        # keep fetching chunks until one comes back short.
        while True:

            if ( current_pk is None ):

                # first chunk - start from the beginning.
                chunk = queryset[ : chunksize_IN ]

            else:

                # subsequent chunks - everything after the last row seen.
                chunk = queryset.filter( pk__gt = current_pk )[ : chunksize_IN ]

            #-- END check to see if first chunk --#

            row_count = 0
            for row in chunk:

                # record current pk
                current_pk = row.pk
                row_count += 1

                # yield current row
                yield row

            #-- END loop over this chunk --#

            # clear memory.
            chunk = None
            gc.collect()

            # a short (or empty) chunk means there are no more rows.  This
            #     also guarantees termination if rows disappear while we
            #     are iterating.
            if ( row_count < chunksize_IN ):

                break

            #-- END check to see if we ran out of rows --#

        #-- END loop over chunks --#

    #-- END check to see if anything in queryset --#

#-- END function queryset_generator() --#
def queryset_list_generator(queryset, listsize=10000, chunksize=1000):
    """
    Iterate over a Django QuerySet ordered by the primary key and return
    lists of model objects of up to 'listsize' rows each.

    Rows are pulled from queryset_generator(), which is passed 'chunksize'
    as its per-query chunk size.  In contrast to queryset_generator(), this
    does not yield each row on its own, but a list of listsize
    (default: 10000) rows at a time.  The last list yielded holds whatever
    rows remain, so it may be shorter than listsize; no rows are dropped.
    Nothing is yielded when there are no rows at all.

    Note that any ordering already applied to the queryset is not preserved;
    rows come back in primary key order.

    Parameters:
    - queryset - QuerySet to iterate over.
    - listsize - number of rows per yielded list (default: 10000).
    - chunksize - chunk size handed to queryset_generator() (default: 1000).

    Yields: lists of rows, each a new list object.
    """
    row_list = []
    for row in queryset_generator(queryset, chunksize):

        row_list.append(row)

        # list full?  Hand it off and start a new one (a new list object,
        #     so the caller can keep the one just yielded).
        if len(row_list) >= listsize:

            yield row_list
            row_list = []

        #-- END check to see if list is full --#

    #-- END loop over rows --#

    # yield the final, partially filled list so trailing rows are not lost.
    if row_list:

        yield row_list

    #-- END check to see if rows left over --#

#-- END function queryset_list_generator() --#
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment