
@tierpod
Created March 16, 2022 10:58
Example: use Python asyncio + aiohttp + a concurrency limit for scraping URLs
#!/usr/bin/env python3
# Usage: scrape.py urls.txt
# Based on: https://pawelmhm.github.io/asyncio/python/aiohttp/2016/04/22/asyncio-aiohttp.html
import sys
import asyncio

import aiohttp


async def bound_fetch(sem, url, session):
    # The semaphore caps how many fetches run concurrently.
    async with sem:
        await fetch(url, session)


async def fetch(url, session):
    async with session.get(url) as response:
        html = await response.text()
        body_len = len(html)
        # Report only suspicious responses: a non-200 status or an (almost) empty body.
        if response.status != 200 or body_len <= 1:
            print(f"status={response.status} body_len={body_len} body={html[:20]} url={url}")


async def main():
    input_file = sys.argv[1]
    print(f"process file: {input_file}")

    tasks = []
    # Allow at most 1000 requests in flight at the same time.
    sem = asyncio.Semaphore(1000)

    # Keep the session open until all tasks have been gathered.
    async with aiohttp.ClientSession() as session:
        with open(input_file, "rb") as f:
            for bline in f:
                url = bline.decode().strip()
                task = asyncio.ensure_future(bound_fetch(sem, url, session))
                tasks.append(task)

        print(f"{len(tasks)} tasks")
        await asyncio.gather(*tasks)


asyncio.run(main())
print("done")