Skip to content

Instantly share code, notes, and snippets.

@barraponto
Last active August 23, 2023 08:00
Show Gist options
  • Save barraponto/4b918fdc7d7191c1cd172f848feaeec5 to your computer and use it in GitHub Desktop.
Save barraponto/4b918fdc7d7191c1cd172f848feaeec5 to your computer and use it in GitHub Desktop.
Basic Webscraper in Python, dealing with cards from Neonmob.com
import asyncio
import logging
from functools import partial
from pathlib import Path

import aiometer
from aiocsv.readers import AsyncDictReader
from aiofiles import open
from aiohttp import ClientSession as Session, ClientTimeout
from yarl import URL
# Directory the downloaded files are written into.
DEFAULT_PATH = "cartinhas"

# Host the assets are fetched from. It is handed to the session constructor,
# and individual requests then pass only the path part of each asset URL.
HOST = "http://d1wwra234reihl.cloudfront.net"

# Headers attached to the session (and therefore to every request it makes).
DEFAULT_HEADERS = {"Referer": "https://www.neonmob.com/"}

# Keyword arguments used to build the HTTP session.
# NOTE(review): total=None presumably disables the overall request timeout,
# so a stalled download can wait indefinitely -- confirm this is intended.
DEFAULT_SESSION_KWARGS = {
    "headers": DEFAULT_HEADERS,
    "raise_for_status": True,
    "timeout": ClientTimeout(total=None),
}
async def download(session: Session, url_path: str, destination: str) -> tuple[str, str]:
    """Fetch ``url_path`` through ``session`` and save the body to ``destination``.

    The whole response body is read before the destination file is opened,
    so a transfer that fails midway does not leave an empty file behind.

    Args:
        session: HTTP session used to issue the GET request.
        url_path: Path requested through the session.
        destination: Filesystem path the response body is written to
            (opened in binary write mode, so an existing file is replaced).

    Returns:
        The ``(url_path, destination)`` pair, so callers can report what
        was downloaded and where it went.

    Raises:
        Any exception raised by the request, the body read, or the file
        write; nothing is caught here.
    """
    async with session.get(url_path) as response:
        body = await response.read()
    # The response is already released at this point; only the file write
    # remains. ``open`` is the asynchronous one imported from aiofiles.
    async with open(destination, "wb") as file:
        await file.write(body)
    return url_path, destination
async def make_download(session: Session, card: dict):
    """Download the asset of one card, logging instead of raising on failure.

    The request path is the path component of ``card["asset"]``; the file is
    stored under ``DEFAULT_PATH`` as the card's ``id`` followed by the
    asset's file extension.

    Args:
        session: HTTP session forwarded to ``download``.
        card: Mapping that provides at least the ``"asset"`` and ``"id"`` keys.

    Returns:
        Whatever ``download`` returns on success, or ``None`` when building
        the destination or the download itself raised (the exception is
        logged with its traceback).

    Raises:
        KeyError: If ``card`` has no ``"asset"`` key; that lookup happens
            before the guarded section.
    """
    url_path = URL(card["asset"]).path
    extension = Path(url_path).suffix
    try:
        # The destination is built inside the guarded section on purpose:
        # a missing "id" is reported like any other failed download.
        target = f"{DEFAULT_PATH}/{card['id']}{extension}"
        outcome = await download(session, url_path, target)
    except Exception:
        logging.exception(f"Download failed for {url_path}.")
        return None
    else:
        return outcome
async def main() -> None:
    """Download the asset of every card listed in ``./remaining.csv``.

    Rows are read with ``AsyncDictReader`` and passed to ``make_download``
    through ``aiometer.amap``, limited to 8 downloads at once and 4 new
    downloads per second. Each successful download is logged at debug
    level; failed ones come back as a falsy result and are skipped here
    (``make_download`` has already logged them).
    """
    # Create the output directory up front; without it every file write
    # would fail.
    Path(DEFAULT_PATH).mkdir(parents=True, exist_ok=True)

    async with Session(HOST, **DEFAULT_SESSION_KWARGS) as session:
        async with open("./remaining.csv") as cards_file:
            # AsyncDictReader consumes the header line by itself on the first
            # read. An explicit ``anext(reader)`` here would silently drop the
            # first card (and raise StopAsyncIteration on a header-only file).
            reader = AsyncDictReader(cards_file)
            # The rows are materialized as a list before being handed to
            # amap, so the CSV file can be closed before downloads start.
            pending = [card async for card in reader]

        async with aiometer.amap(
            partial(make_download, session),
            pending,
            max_at_once=8,
            max_per_second=4,
        ) as cards:
            async for result in cards:
                if not result:
                    continue
                path, destination = result
                logging.debug(f"Succesfully downloaded {path} to {destination}.")
if __name__ == "__main__":
    # Start the downloads only when executed as a script, so importing this
    # module (e.g. to reuse ``download``) does not trigger network access.
    asyncio.run(main())
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment