@niazangels
Created April 12, 2022 20:57
Async Python crawler with a semaphore limit
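The script below fetches every URL in URLS concurrently with aiohttp, uses an asyncio.Semaphore to keep at most 20 downloads in flight at a time, and writes each response body to webpages/<id>.html with aiofiles.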
# References:
# https://github.com/PrettyPrinted/youtube_video_code/blob/master/2020/12/31/How%20to%20Speed%20Up%20API%20Requests%20With%20Async%20Python/apiasync/script.py
# https://stackoverflow.com/questions/47934212/how-to-use-python-aiohttp-library-to-download-multiple-webpages
# https://pawelmhm.github.io/asyncio/python/aiohttp/2016/04/22/asyncio-aiohttp.html
import asyncio
import os

import aiofiles
import aiohttp

URLS = []  # populate this

async def main():
    # Cap the crawl at 20 concurrent downloads.
    sem = asyncio.Semaphore(20)
    # The output directory must exist before aiofiles.open() can write into it.
    os.makedirs("webpages", exist_ok=True)
    tasks = [
        asyncio.ensure_future(bound_save_webpage(page_id, url, sem))
        for page_id, url in enumerate(URLS)
    ]
    await asyncio.gather(*tasks)

async def bound_save_webpage(page_id, url, sem):
    # Blocks here once 20 downloads are already in flight.
    async with sem:
        await save_webpage(page_id, url)

async def save_webpage(page_id, url):
    try:
        async with aiohttp.ClientSession() as session:
            async with session.get(url) as response:
                content = await response.read()
                print(f"Fetched {page_id}")
    except Exception:  # a bare except would also swallow task cancellation
        print(f"Failed {page_id}")
    else:
        async with aiofiles.open(f"webpages/{page_id}.html", mode="wb") as f:
            await f.write(content)
        print(f"Saved {page_id}")

asyncio.run(main())
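A common refinement, not part of the original gist, is to open one ClientSession and share it across every request instead of creating a session per URL, so aiohttp can reuse connections. The following is a minimal sketch of that variant under the same assumptions as above (a populated URLS list, a limit of 20, output under webpages/); the parameter name page_id is illustrative.

import asyncio
import os

import aiofiles
import aiohttp

URLS = []  # populate this, as in the gist above

async def save_webpage(session, sem, page_id, url):
    # The semaphore caps concurrent downloads at 20; the shared session
    # lets aiohttp reuse connections across requests.
    async with sem:
        try:
            async with session.get(url) as response:
                content = await response.read()
        except aiohttp.ClientError as exc:
            print(f"Failed {page_id}: {exc}")
            return
    # Write outside the semaphore so the slot is freed during disk I/O.
    async with aiofiles.open(f"webpages/{page_id}.html", mode="wb") as f:
        await f.write(content)
    print(f"Saved {page_id}")

async def main():
    sem = asyncio.Semaphore(20)
    os.makedirs("webpages", exist_ok=True)
    async with aiohttp.ClientSession() as session:
        await asyncio.gather(
            *(save_webpage(session, sem, i, url) for i, url in enumerate(URLS))
        )

asyncio.run(main())

Alternatively, aiohttp can enforce the limit itself: passing connector=aiohttp.TCPConnector(limit=20) to ClientSession caps concurrent connections without an explicit semaphore.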