ncoghlan/helpers.py

## helpers.py
from concurrent.futures import Future

# This first part is a helper function to wait for the first Future in a group to complete
def _wait_first(futures):
    # futures must be a set as items will be removed as they complete
    # we create a signalling future to return to our caller. We will copy
    # the result of the first future to complete to this signalling future
    signal = Future()
    signal.completed = None
    def copy_result(completed):
        # We ignore every callback after the first one
        if signal.done():
            return
        # Keep track of which ones have been processed across multiple calls
        futures.remove(completed)
        # And report the underlying future which completed
        signal.completed = completed
        # It would be nice if we could also remove our callback from all the other futures at
        # this point, but the Future API doesn't currently allow that
        # Now we pass the result of this future through to our signalling future
        signal.completed_future = completed
        if completed.cancelled():
            signal.cancel()
            signal.set_running_or_notify_cancel()
        else:
            try:
                result = completed.result()
            except Exception as exc:
                signal.set_exception(exc)
            else:
                signal.set_result(result)
    # Here we hook our signalling future up to all our actual operations
    # If any of them are already complete, then the callback will fire immediately
    # and we're OK with that
    for f in futures:
        f.add_done_callback(copy_result)
    # And, for our signalling future to be useful, the caller must be able to access it
    return signal

# This is just a public version of the above helper that works with arbitrary iterables:
def wait_first(futures):
    """Wait for the first of *futures* (any iterable) to complete.

    The iterable is materialised into a fresh set up front. That gives the
    helper the mutable set it needs and, when a generator is passed, consumes
    it completely so every operation it produces is started immediately.

    Returns the signalling Future created by ``_wait_first``.
    """
    pending = set(futures)
    return _wait_first(pending)

# This is the API I'm most interested in, as it's the async equivalent of
# https://docs.python.org/3/library/concurrent.futures.html#concurrent.futures.as_completed,
# which powers the URL retrieval example in the docs:
# https://docs.python.org/3/library/concurrent.futures.html#threadpoolexecutor-example

# Note that this is an *ordinary iterator* that produces futures, not a tasklet
def as_completed(futures):
    """Yield one signalling Future for each completion among *futures*.

    This is a plain generator, not a tasklet. The input iterable is turned
    into a set straight away, which consumes a generator argument in full so
    that all of its operations have been started.

    The original futures are never yielded directly. Each item produced is a
    new signalling Future from ``_wait_first``; the underlying future is only
    dropped from the working set once it completes, so the consumer is
    expected to wait on each yielded future before asking for the next one.
    Callbacks registered for earlier signals stay attached to the remaining
    futures, since there is no way to say "we're not interested any more".
    """
    pending = set(futures)
    while True:
        if not pending:
            return
        yield _wait_first(pending)

## url_loading.py
# Now for the classic "load multiple pages in parallel" async IO example

# First, a tasklet for loading a single page
@task
def load_url_async(url):
    """Tasklet that opens *url* asynchronously and returns what was read.

    NOTE(review): ``task`` and ``urllib.urlopen_async`` are not defined in
    this file; the behaviour described in the comments below is the
    contract this example assumes for them.
    """
    # The async URL open operation does three things:
    # 1. kicks off the connection process
    # 2. registers a callback with the event handler that will signal a
    #    Future object when IO is complete
    # 3. returns the future object
    # We then *yield* the Future object, at which point the task decorator
    # takes over and registers a callback with the *Future* object to resume
    # this generator with the *result* that was passed to the Future object
    conn = yield urllib.urlopen_async(url)
    # We assume "conn.read()" is defined in such a way that it allows both
    # "read everything at once" usage *and* a usage where you read the
    # individual bits of data as they arrive like this:
    #    for wait_for_chunk in conn.read():
    #        chunk = yield wait_for_chunk
    # The secret is that conn.read() would be an *ordinary generator* in that
    # case rather than a tasklet. You could also do a version that *only*
    # supported complete reads, in which case the "from" wouldn't be needed.
    # The parentheses are required: a yield expression used as the operand of
    # "return" is a syntax error unless it is parenthesised.
    return (yield from conn.read())

# And now the payoff: a tasklet that reads a bunch of URLs in parallel and
# processes each page as soon as it has loaded, rather than in the order the
# requests were made or only after the slowest load has completed

@task
def example(urls):
    """Tasklet that loads every URL in *urls* and reports each one as it finishes.

    For each URL a load is started via ``load_url_async`` and the returned
    future is remembered alongside its URL. The futures are then consumed
    through ``as_completed``; for every completion either the number of bytes
    loaded or the failure is printed together with the URL it belongs to.
    """
    # Start one load per URL, remembering which future belongs to which URL
    url_by_future = {}
    for address in urls:
        url_by_future[load_url_async(address)] = address
    # Use the helper iterable to run things in parallel and pick up the
    # results as they complete
    for next_page in as_completed(url_by_future):
        # Before the yield next_page.completed is still None; it is filled in
        # by the time we are resumed
        try:
            payload = yield next_page
        except Exception as failure:
            source = url_by_future[next_page.completed]
            print("Something broke for {!r} ({}: {})".format(source, type(failure), failure))
        else:
            source = url_by_future[next_page.completed]
            print("Loaded {} bytes from {!r}".format(len(payload), source))

# The real kicker here? Replace "yield wait_for_page" with "wait_for_page.result()" and "future_to_url[wait_for_page.completed]" with "future_to_url[wait_for_page]" and you have the equivalent concurrent.futures code. The difference is that with the concurrent.futures Executor model, as_completed is a *blocking* iterator, so it knows which underlying future to pass back on each iteration. For async operation, we instead pass back a *new* Future on each iteration, which then needs to be yielded in order to say "tell me when the next operation is complete".
	from concurrent.futures import Future

	# This first part is a helper function to wait for the first Future in a group to complete
	def _wait_first(futures):
	# futures must be a set as items will be removed as they complete
	# we create a signalling future to return to our caller. We will copy
	# the result of the first future to complete to this signalling future
	signal = Future()
	signal.completed = None
	def copy_result(completed):
	# We ignore every callback after the first one
	if signal.done():
	return
	# Keep track of which ones have been processed across multiple calls
	futures.remove(completed)
	# And report the underlying future which completed
	signal.completed = completed
	# It would be nice if we could also remove our callback from all the other futures at
	# this point, but the Future API doesn't currently allow that
	# Now we pass the result of this future through to our signalling future
	signal.completed_future = completed
	if completed.cancelled():
	signal.cancel()
	signal.set_running_or_notify_cancel()
	else:
	try:
	result = completed.result()
	except Exception as exc:
	signal.set_exception(exc)
	else:
	signal.set_result(result)
	# Here we hook our signalling future up to all our actual operations
	# If any of them are already complete, then the callback will fire immediately
	# and we're OK with that
	for f in futures:
	f.add_done_callback(copy_result)
	# And, for our signalling future to be useful, the caller must be able to access it
	return signal

	# This is just a public version of the above helper that works with arbitrary iterables:
	def wait_first(futures):
	    """Wait for the first of *futures* (any iterable) to complete.

	    The iterable is copied into a real set because the helper needs one;
	    this also makes sure all operations start immediately when a
	    generator is passed.

	    Returns the signalling Future created by ``_wait_first``.
	    """
	    return _wait_first(set(futures))

	# This is the API I'm most interested in, as it's the async equivalent of
	# https://docs.python.org/3/library/concurrent.futures.html#concurrent.futures.as_completed,
	# which powers the URL retrieval example in the docs:
	# https://docs.python.org/3/library/concurrent.futures.html#threadpoolexecutor-example

	# Note that this is an ordinary iterator that produces futures, not a tasklet
	def as_completed(futures):
	    """Yield one signalling Future for each completion among *futures*.

	    This is an ordinary generator, not a tasklet. The consumer is expected
	    to wait on each yielded future before asking for the next one, since
	    a future only leaves the working set once it has completed.
	    """
	    # We ensure all the operations have started, and get ourselves a set
	    # to work with
	    remaining = set(futures)
	    while remaining:
	        # The trick here is that we don't yield the original futures
	        # directly. Instead, we yield the signalling Futures created by
	        # _wait_first. We do end up with a bunch of useless callbacks
	        # registered since there's no way to say "we're not interested
	        # any more" :(
	        yield _wait_first(remaining)
	# Now for the classic "load multiple pages in parallel" async IO example

	# First, a tasklet for loading a single page
	@task
	def load_url_async(url):
	    """Tasklet that opens *url* asynchronously and returns what was read.

	    NOTE(review): ``task`` and ``urllib.urlopen_async`` are not defined in
	    this file; the comments below describe the contract this example
	    assumes for them.
	    """
	    # The async URL open operation does three things:
	    # 1. kicks off the connection process
	    # 2. registers a callback with the event handler that will signal a
	    #    Future object when IO is complete
	    # 3. returns the future object
	    # We then yield the Future object, at which point the task decorator
	    # takes over and registers a callback with the Future object to resume
	    # this generator with the result that was passed to the Future object
	    conn = yield urllib.urlopen_async(url)
	    # We assume "conn.read()" is defined in such a way that it allows both
	    # "read everything at once" usage and a usage where you read the
	    # individual bits of data as they arrive like this:
	    #    for wait_for_chunk in conn.read():
	    #        chunk = yield wait_for_chunk
	    # The secret is that conn.read() would be an ordinary generator in
	    # that case rather than a tasklet. You could also do a version that
	    # only supported complete reads, in which case the "from" wouldn't be
	    # needed. The parentheses are required: a yield expression used as the
	    # operand of "return" is a syntax error unless it is parenthesised.
	    return (yield from conn.read())

	# And now the payoff: defining a tasklet to read a bunch of URLs in parallel, processing them in the order of loading rather than the order of requesting them or having to wait until the slowest load completes before doing anything

	@task
	def example(urls):
	    """Tasklet that loads every URL in *urls* and reports each one as it finishes.

	    Prints either the number of bytes loaded or the failure, together with
	    the URL it belongs to, for every completed load.
	    """
	    # We define the tasks we want to run based on the given URLs
	    # This results in an iterable of Future objects that will fire when
	    # the associated page has been read completely
	    future_to_url = {load_url_async(url): url for url in urls}
	    # And now we use our helper iterable to run things in parallel
	    # and get access to the results as they complete
	    for wait_for_page in as_completed(future_to_url):
	        # At this point, wait_for_page.completed is still None
	        # It will be set once we resume after the yield
	        try:
	            data = yield wait_for_page
	        except Exception as exc:
	            url = future_to_url[wait_for_page.completed]
	            print("Something broke for {!r} ({}: {})".format(url, type(exc), exc))
	        else:
	            url = future_to_url[wait_for_page.completed]
	            print("Loaded {} bytes from {!r}".format(len(data), url))

	# The real kicker here? Replace "yield wait_for_page" with "wait_for_page.result()" and "future_to_url[wait_for_page.completed]" with "future_to_url[wait_for_page]" and you have the equivalent concurrent.futures code. The difference is that with the concurrent.futures Executor model, as_completed is a blocking iterator, so it knows which underlying future to pass back on each iteration. For async operation, we instead pass back a new Future on each iteration, which then needs to be yielded in order to say "tell me when the next operation is complete".