Skip to content

Instantly share code, notes, and snippets.

@cuducos
Last active June 28, 2023 21:59
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save cuducos/b4d764715ec49bcb546d3c88b628e022 to your computer and use it in GitHub Desktop.
Save cuducos/b4d764715ec49bcb546d3c88b628e022 to your computer and use it in GitHub Desktop.
from asyncio import Semaphore, as_completed, run
from pathlib import Path
from urllib.parse import urlparse
from bs4 import BeautifulSoup
from fire import Fire
from httpx import AsyncClient, get
from tqdm import tqdm
class Downloader:
    """Download every high-resolution picture listed in an athlete's gallery.

    The work happens in two phases when the instance is awaited (see
    ``__call__``):

    1. The gallery pages ``{athlete_url}{page}`` are fetched one by one
       (synchronously) and the picture-page URLs found in the
       ``div.thumbContainer`` elements are collected in ``self.queued``.
    2. Every queued picture page is processed concurrently: the image id is
       read from the download button, posted to the download API, and the
       file the API points to is saved in the current working directory.
    """

    # Used both as the CSS class of the download button on a picture page and
    # as the ``action`` field posted to the download API.
    ACTION = "downloadHighRes"
    # Initial value of the asyncio semaphore that throttles the downloads.
    SEMAPHORE = 8

    def __init__(self, domain, event_id, bib):
        """Build the gallery and API URLs for one athlete.

        Args:
            domain: Host name of the photo site (no scheme).
            event_id: Event identifier, used as a path segment of the gallery.
            bib: Athlete's bib number, used as a path segment of the gallery.
        """
        self.athlete_url = f"https://{domain}/gallery/{event_id}/{bib}/"
        self.api_url = f"https://{domain}/_api/func_download_p3.php"
        self.queued = set()  # page-independent picture-page URLs found so far
        self.last_page = None  # set once pagination is known to be exhausted
        self.semaphore = None  # created in __call__, inside the running loop

    @staticmethod
    def _page_independent_url(href, page):
        """Return *href* without its trailing ``/{page}`` path segment.

        ``str.strip`` must not be used for this: its argument is a *set of
        characters*, so ``"…/abc11/1".strip("/1")`` would also eat the
        trailing digits of the picture identifier itself.  Only one exact
        ``/{page}`` suffix (and surrounding trailing slashes) is removed here.
        """
        suffix = f"/{page}"
        url = href.rstrip("/")
        if url.endswith(suffix):
            url = url[: -len(suffix)]
        return url.rstrip("/")

    def picture_page_urls(self, page):
        """Queue the picture-page URLs found on gallery page *page*.

        Pagination is considered finished, and ``self.last_page`` is set to
        *page*, when either:

        * a URL on this page is already queued (the listing started to
          repeat), or
        * the page contains no thumbnails at all; without this check an empty
          gallery would keep the caller's ``while not self.last_page`` loop
          running forever.

        Args:
            page: 1-based gallery page number.
        """
        if self.last_page and page > self.last_page:
            return

        print(f"Fetching page {page}…", end="\r")
        resp = get(f"{self.athlete_url}{page}")
        dom = BeautifulSoup(resp.read(), "html.parser")
        urls = [
            self._page_independent_url(a["href"], page)
            for div in dom.find_all("div", {"class": "thumbContainer"})
            for a in div.find_all("a")
        ]

        if not urls:
            self.last_page = page
            return

        for url in urls:
            if url in self.queued:
                self.last_page = page
                break
            self.queued.add(url)

    async def download(self, client, picture_page_url):
        """Save the high-resolution file behind one picture page.

        Args:
            client: The shared ``httpx.AsyncClient`` used for all requests.
            picture_page_url: URL of the picture page, as stored in
                ``self.queued``.

        Raises:
            TypeError: If the picture page has no ``button`` with class
                ``ACTION`` (``dom.find`` returns ``None`` in that case).
        """
        # NOTE(review): the source this was recovered from had lost its
        # indentation, so the exact extent of the two semaphore-guarded
        # sections is reconstructed — confirm against the original.
        async with self.semaphore:
            resp = await client.get(picture_page_url)
            dom = BeautifulSoup(await resp.aread(), "html.parser")

        btn = dom.find("button", {"class": self.ACTION})
        data = {"action": self.ACTION, "image_id": btn["e-num"]}

        async with self.semaphore:
            # The API response body is the URL of the file to download.
            download = await client.post(self.api_url, data=data)
            download_url = (await download.aread()).decode("utf-8")
            # Save under the last path segment of the download URL, in the
            # current working directory.
            name = Path(urlparse(download_url).path).name
            resp = await client.get(download_url)
            Path(name).write_bytes(await resp.aread())

    async def __call__(self):
        """Collect all picture pages, then download them concurrently."""
        page = 1
        while not self.last_page:
            self.picture_page_urls(page)
            page += 1

        self.semaphore = Semaphore(self.SEMAPHORE)
        async with AsyncClient() as client:
            downloads = tuple(self.download(client, url) for url in self.queued)
            # as_completed lets the progress bar advance as soon as any
            # download finishes, regardless of submission order.
            for download in tqdm(as_completed(downloads), total=len(downloads)):
                await download
def cli(domain: str, event_id: int, bib: int):
    """Download all high-resolution pictures of one athlete in one event.

    Args:
        domain: Host name of the photo site (no scheme).
        event_id: Identifier of the event on that site.
        bib: The athlete's bib number.
    """
    # Calling the Downloader instance yields the coroutine that does the work;
    # asyncio's run() drives it to completion.
    job = Downloader(domain, event_id, bib)()
    run(job)
if __name__ == "__main__":
    # Expose `cli` as the command-line entry point; Fire maps the command-line
    # arguments onto its parameters.
    Fire(cli)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment