justinturpin/scrape.py

## scrape.py
#!/usr/bin/env python3

import httpx
from bs4 import BeautifulSoup
import json

import click
from pathlib import Path


# Root URL of the Humble Bundle site; the homepage HTML is fetched from here.
BASE_URL = "https://humblebundle.com"


def get_bundles(page_text: str) -> list[dict]:
    """Extract the list of bundles from the Humble Bundle homepage HTML.

    The page data is read as JSON from the element whose id is
    ``webpack-json-data``. The ``"products"`` of every section listed under
    the JSON's ``"mosaic"`` key are concatenated, in order, into one list.

    Args:
        page_text: Raw HTML of the homepage.

    Returns:
        A flat list with every product from every mosaic section.

    Raises:
        ValueError: If the page has no ``webpack-json-data`` element, or if
            the element's text is not valid JSON (``json.JSONDecodeError``
            is a ``ValueError`` subclass).
        KeyError: If the JSON lacks ``"mosaic"`` or a section lacks
            ``"products"``.
    """
    soup = BeautifulSoup(page_text, "html.parser")
    data_element = soup.find(id="webpack-json-data")

    # find() returns None when nothing matches; fail with a clear message
    # instead of an opaque AttributeError on ``None.text``.
    if data_element is None:
        raise ValueError("No element with id 'webpack-json-data' found in page")

    webpack_data = json.loads(data_element.text)

    return [
        product
        for section in webpack_data["mosaic"]
        for product in section["products"]
    ]


def get_homepage_html(use_cache: bool = True) -> str:
    """Return the HTML of the Humble Bundle homepage.

    When ``use_cache`` is true and ``cache.html`` exists in the current
    working directory, its contents are returned without any network request.
    Otherwise the page is downloaded from ``BASE_URL`` (following redirects)
    and the response text is written to ``cache.html`` before being returned,
    so the cache file is refreshed on every download.

    Keep ``use_cache`` enabled during development to avoid hitting the site
    repeatedly; set it to ``False`` when it's time to deploy to production.

    Args:
        use_cache: Whether an existing ``cache.html`` may be returned instead
            of fetching the page.

    Returns:
        The homepage HTML as text.

    Raises:
        httpx.HTTPStatusError: If the response has an error status code.
    """
    cache_path = Path("cache.html")

    # Read and write the cache as UTF-8 explicitly: the platform default
    # encoding may be unable to represent characters found in the page.
    if use_cache and cache_path.is_file():
        return cache_path.read_text(encoding="utf-8")

    response = httpx.get(BASE_URL, follow_redirects=True)
    response.raise_for_status()

    cache_path.write_text(response.text, encoding="utf-8")

    return response.text


@click.command()
@click.option(
    "--cache/--no-cache",
    "use_cache",
    default=True,
    show_default=True,
    help="Reuse cache.html if it exists instead of fetching the homepage.",
)
def cmd(use_cache: bool = True) -> None:
    """Print the bundles found on the Humble Bundle homepage as indented JSON."""
    # The flag defaults to the previously hard-coded behaviour (cache on), so
    # running the command without arguments works exactly as before.
    bundles = get_bundles(get_homepage_html(use_cache=use_cache))
    click.echo(json.dumps(bundles, indent=2))


# Run the click command only when executed as a script, not when imported.
if __name__ == "__main__":
    cmd()
	#!/usr/bin/env python3

	import httpx
	from bs4 import BeautifulSoup
	import json

	import click
	from pathlib import Path


	# Root URL of the Humble Bundle site; the homepage HTML is fetched from here.
	BASE_URL = "https://humblebundle.com"


	def get_bundles(page_text: str) -> list[dict]:
		"""Extract the list of bundles from the Humble Bundle homepage HTML.

		The page data is read as JSON from the element whose id is
		``webpack-json-data``. The ``"products"`` of every section listed
		under the JSON's ``"mosaic"`` key are concatenated, in order, into
		one list.

		Args:
			page_text: Raw HTML of the homepage.

		Returns:
			A flat list with every product from every mosaic section.

		Raises:
			ValueError: If the page has no ``webpack-json-data`` element, or
				if the element's text is not valid JSON.
			KeyError: If the JSON lacks ``"mosaic"`` or a section lacks
				``"products"``.
		"""
		soup = BeautifulSoup(page_text, "html.parser")
		data_element = soup.find(id="webpack-json-data")

		# find() returns None when nothing matches; fail with a clear message
		# instead of an opaque AttributeError on ``None.text``.
		if data_element is None:
			raise ValueError("No element with id 'webpack-json-data' found in page")

		webpack_data = json.loads(data_element.text)

		bundles = []
		for section in webpack_data["mosaic"]:
			bundles.extend(section["products"])

		return bundles


	def get_homepage_html(use_cache: bool = True) -> str:
		"""Return the HTML of the Humble Bundle homepage.

		When ``use_cache`` is true and ``cache.html`` exists in the current
		working directory, its contents are returned without any network
		request. Otherwise the page is downloaded from ``BASE_URL``
		(following redirects) and the response text is written to
		``cache.html`` before being returned.

		Keep ``use_cache`` enabled during development to avoid hitting the
		site repeatedly; set it to ``False`` when it's time to deploy to
		production.

		Args:
			use_cache: Whether an existing ``cache.html`` may be returned
				instead of fetching the page.

		Returns:
			The homepage HTML as text.

		Raises:
			httpx.HTTPStatusError: If the response has an error status code.
		"""
		cache_path = Path("cache.html")

		# Read and write the cache as UTF-8 explicitly: the platform default
		# encoding may be unable to represent characters found in the page.
		if use_cache and cache_path.is_file():
			return cache_path.read_text(encoding="utf-8")

		response = httpx.get(BASE_URL, follow_redirects=True)
		response.raise_for_status()

		cache_path.write_text(response.text, encoding="utf-8")

		return response.text


	@click.command()
	@click.option(
		"--cache/--no-cache",
		"use_cache",
		default=True,
		show_default=True,
		help="Reuse cache.html if it exists instead of fetching the homepage.",
	)
	def cmd(use_cache: bool = True) -> None:
		"""Print the bundles found on the Humble Bundle homepage as indented JSON."""
		# The flag defaults to the previously hard-coded behaviour (cache on),
		# so running the command without arguments works exactly as before.
		bundles = get_bundles(get_homepage_html(use_cache=use_cache))
		click.echo(json.dumps(bundles, indent=2))


	# Run the click command only when executed as a script, not when imported.
	if __name__ == "__main__":
		cmd()