Skip to content

Instantly share code, notes, and snippets.

@hadware
Created January 9, 2017 22:36
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save hadware/1342e76ee910212450b7f9a06801357a to your computer and use it in GitHub Desktop.
Save hadware/1342e76ee910212450b7f9a06801357a to your computer and use it in GitHub Desktop.
A small scraper that uses asyncio to run many concurrent requests against an API. It uses a semaphore to limit the number of outgoing connections open at any one time.
import asyncio
import json
from os import path
import aiohttp
import async_timeout
# Numeric parts of the IMDb ids to look up; the "tt" prefix is supplied by
# the URL template below.
imdb_ids = [
    '0114319',
    '0112302',
    '0114576',
    '0113189',
    '0112346',
    '0112896',
    '0112453',
    '0113987',
    '0112760',
    '0112641',
    '0114388',
    '0113101',
    '0112281',
    '0113845',
]

# Request template; "%s" is replaced with one entry of imdb_ids.
api_url = "http://www.omdbapi.com/?i=tt%s&plot=full&r=json"

# Results collected by retrieve_plot(): IMDb id -> plot text.
all_plots = {}
async def fetch(session, url):
    """Return the body of a GET request to *url*, decoded as text.

    Args:
        session: Client session used to issue the request (an
            ``aiohttp.ClientSession`` at the call site in this file).
        url: Address to fetch.

    The request and the reading of the body together are bounded by a
    10-second timeout.
    """
    # The synchronous ``with`` form of async_timeout.timeout() was removed in
    # async-timeout 4.0; ``async with`` is the supported spelling.
    async with async_timeout.timeout(10):
        async with session.get(url) as response:
            return await response.text()
async def retrieve_plot(imdb_id, count):
    """Fetch the plot for one film and store it in ``all_plots``.

    Args:
        imdb_id: Numeric part of the IMDb id, substituted into ``api_url``.
        count: Index of this request; used only in the progress messages.

    Nothing is stored when the response is not valid JSON or has no "Plot"
    entry; a message is printed instead.
    """
    # Entering the semaphore takes one of its slots and leaving gives it
    # back, so no more coroutines than the semaphore's initial value can be
    # inside this block at the same time.
    async with connection_limit:
        async with aiohttp.ClientSession() as session:  # async context for the client
            print("Fetching %i (%s)" % (count, imdb_id))
            response_text = await fetch(session, api_url % imdb_id)
            print("Got %i (%s)" % (count, imdb_id))

    # Parsing needs neither the session nor a semaphore slot, so it happens
    # after both have been released.
    try:
        plot = json.loads(response_text)["Plot"]
    except (KeyError, ValueError):
        # KeyError: JSON object without a "Plot" entry.
        # ValueError: body was not valid JSON (json.JSONDecodeError is a
        # ValueError subclass).
        print("No plot for %i (%s)" % (count, imdb_id))
    else:
        all_plots[imdb_id] = plot
loop = asyncio.get_event_loop()

# Shared by every retrieve_plot() call: at most 10 of them hold a slot (and
# therefore an open connection) at any one time.
connection_limit = asyncio.Semaphore(10)

# One task per film; the index is only used to label the progress output.
tasks = [
    asyncio.ensure_future(retrieve_plot(imdb_id, i))
    for i, imdb_id in enumerate(imdb_ids)
]
loop.run_until_complete(asyncio.wait(tasks))

# json.dump() writes str, so the output file has to be opened in text mode.
with open("plots.json", "w") as jsonfile:
    json.dump(all_plots, jsonfile)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment