Created
March 16, 2022 10:58
-
-
Save tierpod/172cbb00ce396caec583188687b84bf4 to your computer and use it in GitHub Desktop.
Example: use Python asyncio + aiohttp + a semaphore limit for scraping URLs
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
# Usage: scrape.py urls.txt | |
# Based on: https://pawelmhm.github.io/asyncio/python/aiohttp/2016/04/22/asyncio-aiohttp.html | |
import sys | |
import asyncio | |
import aiohttp | |
async def bound_fetch(sem, url, session):
    """Fetch *url* while holding one slot of *sem*, capping concurrency."""
    # Explicit acquire/release is equivalent to `async with sem:`.
    await sem.acquire()
    try:
        await fetch(url, session)
    finally:
        sem.release()
async def fetch(url, session):
    """GET *url* through *session* and print a report line for responses
    that look wrong (non-200 status or an essentially empty body)."""
    async with session.get(url) as resp:
        text = await resp.text()
        size = len(text)
        looks_bad = resp.status != 200 or size <= 1
        if looks_bad:
            # Only the first 20 characters of the body are shown.
            print(f"status={resp.status} body_len={size} body={text[:20]} url={url}")
async def main():
    """Read URLs (one per line) from the file named in sys.argv[1] and
    fetch them all concurrently, at most 1000 requests in flight.

    Raises:
        IndexError: if no input file is given on the command line.
        OSError: if the input file cannot be opened.
    """
    input_file = sys.argv[1]
    print(f"process file: {input_file}")
    # Cap in-flight requests so we don't exhaust sockets/file descriptors.
    sem = asyncio.Semaphore(1000)
    async with aiohttp.ClientSession() as session:
        # Text mode with an explicit encoding replaces the original
        # binary read + manual per-line decode; iterating the file object
        # streams lines instead of materializing them with readlines().
        with open(input_file, encoding="utf-8") as f:
            tasks = [
                asyncio.ensure_future(bound_fetch(sem, url, session))
                for url in (line.strip() for line in f)
                if url  # skip blank lines: an empty URL would raise and abort the gather
            ]
        print(f"{len(tasks)} tasks")
        await asyncio.gather(*tasks)
# Guard the entry point so importing this module doesn't start a scrape.
if __name__ == "__main__":
    asyncio.run(main())
    print("done")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment