Created
March 16, 2022 10:58
-
-
Save tierpod/172cbb00ce396caec583188687b84bf4 to your computer and use it in GitHub Desktop.
Example: use Python asyncio + aiohttp + a semaphore limit for scraping URLs
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
# Usage: scrape.py urls.txt | |
# Based on: https://pawelmhm.github.io/asyncio/python/aiohttp/2016/04/22/asyncio-aiohttp.html | |
import sys | |
import asyncio | |
import aiohttp | |
async def bound_fetch(sem, url, session):
    """Fetch *url* while holding one slot of *sem*, capping concurrency."""
    # Explicit acquire/release is equivalent to `async with sem:`.
    await sem.acquire()
    try:
        await fetch(url, session)
    finally:
        sem.release()
async def fetch(url, session):
    """GET *url* through *session* and print a report line for responses
    that look wrong (non-200 status or an essentially empty body)."""
    async with session.get(url) as resp:
        text = await resp.text()
        size = len(text)
        looks_bad = resp.status != 200 or size <= 1
        if looks_bad:
            # Only the first 20 characters of the body are shown.
            print(f"status={resp.status} body_len={size} body={text[:20]} url={url}")
async def main():
    """Read URLs (one per line) from the file named in sys.argv[1] and
    fetch them all concurrently, at most 1000 requests in flight.

    Raises:
        IndexError: if no input file is given on the command line.
        OSError: if the input file cannot be opened.
    """
    input_file = sys.argv[1]
    print(f"process file: {input_file}")
    # Cap in-flight requests so we don't exhaust sockets/file descriptors.
    sem = asyncio.Semaphore(1000)
    async with aiohttp.ClientSession() as session:
        # Text mode with an explicit encoding replaces the original
        # binary read + manual per-line decode; iterating the file object
        # streams lines instead of materializing them with readlines().
        with open(input_file, encoding="utf-8") as f:
            tasks = [
                asyncio.ensure_future(bound_fetch(sem, url, session))
                for url in (line.strip() for line in f)
                if url  # skip blank lines: an empty URL would raise and abort the gather
            ]
        print(f"{len(tasks)} tasks")
        await asyncio.gather(*tasks)
# Guard the entry point so importing this module doesn't start a scrape.
if __name__ == "__main__":
    asyncio.run(main())
    print("done")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment