Skip to content

Instantly share code, notes, and snippets.

@justinturpin
Created June 10, 2024 15:48
Show Gist options
  • Save justinturpin/50b80c3ff9e7497f9fab1b776708756e to your computer and use it in GitHub Desktop.
Save justinturpin/50b80c3ff9e7497f9fab1b776708756e to your computer and use it in GitHub Desktop.
humble bundle scraper
#!/usr/bin/env python3
import httpx
from bs4 import BeautifulSoup
import json
import click
from pathlib import Path
# Root URL of the Humble Bundle site; the homepage is fetched from here.
BASE_URL = "https://humblebundle.com"
def get_bundles(page_text: str) -> list[dict]:
    """Extract the product entries embedded in the Humble Bundle homepage.

    The page carries its data as JSON inside the element whose id is
    ``webpack-json-data``. Every entry of that JSON's ``"mosaic"`` list
    contributes its ``"products"`` to the result.

    Args:
        page_text: HTML of the homepage.

    Returns:
        A flat list of the products of every mosaic section, in page order.

    Raises:
        ValueError: If the page has no ``webpack-json-data`` element, or if
            the element's text is not valid JSON (``json.JSONDecodeError``
            is a ``ValueError`` subclass).
        KeyError: If the JSON lacks ``"mosaic"`` or a section lacks
            ``"products"``.
    """
    soup = BeautifulSoup(page_text, "html.parser")
    data_tag = soup.find(id="webpack-json-data")
    if data_tag is None:
        # find() returns None on a miss; without this guard the failure would
        # surface as an opaque AttributeError on ``None.text``.
        raise ValueError(
            'no element with id "webpack-json-data" found in page; '
            "the page layout may have changed"
        )
    webpack_data = json.loads(data_tag.text)
    return [
        product
        for section in webpack_data["mosaic"]
        for product in section["products"]
    ]
def get_homepage_html(use_cache: bool = True) -> str:
    """Return the Humble Bundle homepage HTML, optionally from a local cache.

    When ``use_cache`` is true and ``cache.html`` exists in the current
    working directory, its contents are returned and no request is made.
    Otherwise the page is downloaded from ``BASE_URL`` (following redirects)
    and ``cache.html`` is (re)written with the response, even when
    ``use_cache`` is false.

    Keep ``use_cache`` enabled during development to avoid hitting the
    Humble Bundle website repeatedly; pass ``False`` when it's time to deploy
    to production.

    Args:
        use_cache: Whether an existing ``cache.html`` may be returned instead
            of downloading the page.

    Returns:
        The homepage HTML as text.

    Raises:
        Exceptions from ``httpx.get`` and ``response.raise_for_status()``
        propagate unchanged when the download fails.
    """
    cache_path = Path("cache.html")
    if use_cache and cache_path.is_file():
        return cache_path.read_text(encoding="utf-8")

    response = httpx.get(BASE_URL, follow_redirects=True)
    response.raise_for_status()
    # Pin the encoding so the cache round-trips identically on every platform
    # instead of depending on the locale's default codec.
    cache_path.write_text(response.text, encoding="utf-8")
    return response.text
@click.command()
@click.option(
    "--use-cache/--no-cache",
    default=True,
    show_default=True,
    help="Reuse the locally cached homepage (cache.html) instead of downloading it.",
)
def cmd(use_cache: bool = True) -> None:
    """Print the bundles listed on the Humble Bundle homepage as indented JSON."""
    # The flag defaults to the previously hard-coded behaviour (cache enabled);
    # --no-cache forces a fresh download without editing the source.
    bundles = get_bundles(get_homepage_html(use_cache=use_cache))
    click.echo(json.dumps(bundles, indent=2))


if __name__ == "__main__":
    cmd()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment