Last active
October 6, 2019 17:15
-
-
Save sergiolucero/fb952c32a9d6b5d53af3e9c54e097ed8 to your computer and use it in GitHub Desktop.
async scraping
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import aiohttp | |
import asyncio | |
import time, pandas as pd | |
def async_http_get(urls, extractor=None, json_response=True, max_concurrency=32):
    """Fetch every URL in *urls* concurrently and return ``(url, payload)`` pairs.

    Args:
        urls: Iterable of URL strings to GET. It is materialised once, so a
            generator is accepted.
        extractor: Optional callable applied to each decoded JSON payload.
            It is only consulted when ``json_response`` is true.
        json_response: When true, each body is decoded with
            ``response.json()``; otherwise the body text from
            ``response.text()`` is returned unchanged.
        max_concurrency: Upper bound on the number of requests in flight at
            the same time (defaults to the previously hard-coded 32).

    Returns:
        A list of ``(url, payload)`` tuples, in the same order as *urls*.
        An empty *urls* yields ``[]`` without starting an event loop.

    Raises:
        RuntimeError: If called while an event loop is already running in
            this thread (``asyncio.run`` cannot be nested).
        Exception: Any error raised by a request or by *extractor*
            propagates to the caller.
    """
    urls = list(urls)
    if not urls:
        # Nothing to fetch: skip creating an event loop and an HTTP session.
        return []

    async def fetch(session, url):
        async with session.get(url) as response:
            if not json_response:
                return (url, await response.text())
            # NOTE(review): the original indentation was lost; this keeps the
            # extractor inside the JSON branch, as the statement order
            # suggests -- confirm against the intended behaviour.
            payload = await response.json()
            if extractor:
                payload = extractor(payload)
            return (url, payload)

    async def bounded_fetch(sem, session, url):
        # The semaphore caps how many requests run at once.
        async with sem:
            return await fetch(session, url)

    async def run():
        # Create the semaphore inside the running loop so it can never be
        # bound to a different loop than the one executing the requests.
        sem = asyncio.Semaphore(max_concurrency)
        # The deprecated ``loop=`` argument is omitted: the session picks up
        # the running loop on its own.
        async with aiohttp.ClientSession() as session:
            pending = [bounded_fetch(sem, session, url) for url in urls]
            # gather() preserves input order in its result list.
            return await asyncio.gather(*pending)

    # asyncio.run() replaces the deprecated get_event_loop() +
    # run_until_complete() pairing and closes the loop when done.
    return asyncio.run(run())
if __name__ == '__main__':
    # Demo: fetch ten test URLs as text, report total payload size and
    # elapsed time, then load the results into a DataFrame.
    # perf_counter() is monotonic, so it is the right clock for intervals.
    t0 = time.perf_counter()
    URL = 'https://pydataflowtest.appspot.com/fetch/%d'
    urls = [URL % ix for ix in range(10)]
    wot = async_http_get(urls, json_response=False)
    dt = time.perf_counter() - t0
    # Each result is a (url, body) tuple; summing len() of the tuple itself
    # always gave 2 per URL, so measure the body text instead.
    swot = sum(len(body) for _, body in wot)
    # %.2f keeps sub-second timings visible (%d truncated them to 0).
    print('SWOT=%d dt=%.2f' % (swot, dt))
    # NOTE(review): eval() on a network response executes arbitrary code if
    # the server is compromised or changes its output. If the endpoint
    # returns JSON or a plain literal, switch to json.loads /
    # ast.literal_eval -- confirm the response format first.
    df = pd.DataFrame(eval(w[1]) for w in wot).T
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment