Skip to content

Instantly share code, notes, and snippets.

@sergiolucero
Last active October 6, 2019 17:15
Show Gist options
  • Save sergiolucero/fb952c32a9d6b5d53af3e9c54e097ed8 to your computer and use it in GitHub Desktop.
Save sergiolucero/fb952c32a9d6b5d53af3e9c54e097ed8 to your computer and use it in GitHub Desktop.
async scraping
import aiohttp
import asyncio
import time, pandas as pd
def async_http_get(urls, extractor=None, json_response=True, *, max_concurrency=32):
    """Fetch every URL concurrently and return ``(url, payload)`` pairs.

    Args:
        urls: Iterable of URL strings. It is consumed once, so a generator
            is accepted.
        extractor: Optional callable applied to each decoded JSON document;
            its return value replaces the document in the result. It is
            only used when ``json_response`` is true.
        json_response: When true, each body is decoded with
            ``response.json()``; otherwise the body text is returned.
        max_concurrency: Upper bound on the number of requests in flight at
            the same time.

    Returns:
        A list of ``(url, payload)`` tuples in the same order as ``urls``.
        An empty input yields an empty list without opening a session.

    Raises:
        ValueError: If ``max_concurrency`` is smaller than 1.
        RuntimeError: If called while an event loop is already running in
            this thread (``asyncio.run`` refuses to nest).

    Any exception raised by a single request propagates to the caller.
    """
    if max_concurrency < 1:
        # Semaphore(0) would block every request forever.
        raise ValueError('max_concurrency must be at least 1')

    # Materialize once: supports generators and makes the empty check valid.
    urls = list(urls)
    if not urls:
        return []

    async def fetch(session, url):
        async with session.get(url) as response:
            if not json_response:
                return (url, await response.text())
            payload = await response.json()
            if extractor:
                payload = extractor(payload)
            return (url, payload)

    async def fetch_all():
        # Built inside the coroutine so the semaphore and the session are
        # bound to the loop that actually runs the requests.
        limiter = asyncio.Semaphore(max_concurrency)

        async def limited_fetch(session, url):
            async with limiter:
                return await fetch(session, url)

        async with aiohttp.ClientSession() as session:
            # gather() preserves the order of its arguments in the result.
            return await asyncio.gather(
                *(limited_fetch(session, url) for url in urls)
            )

    # asyncio.run creates a fresh loop, cancels leftover tasks on failure and
    # closes the loop afterwards, replacing the deprecated
    # get_event_loop()/run_until_complete() pairing.
    return asyncio.run(fetch_all())
if __name__ == '__main__':
    # Demo: download ten test documents concurrently and time the run.
    t0 = time.perf_counter()
    URL = 'https://pydataflowtest.appspot.com/fetch/%d'
    urls = [URL % ix for ix in range(10)]
    wot = async_http_get(urls, json_response=False)
    dt = time.perf_counter() - t0

    # Every result is a (url, body) pair, so len(pair) is always 2; total the
    # body lengths to report how much text was actually downloaded.
    swot = sum(len(body) for _, body in wot)
    # %.2f keeps sub-second timings visible (%d truncated them to 0).
    print('SWOT=%d dt=%.2f' % (swot, dt))

    # NOTE(review): eval() executes whatever the remote server returned. If
    # the payload is a plain literal or JSON, switch to ast.literal_eval or
    # json.loads -- confirm the response format before changing it.
    df = pd.DataFrame(eval(body) for _, body in wot).T
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment