
@simonw
Created March 26, 2025 00:54

2025-03-26T00:51:34 conversation: 01jq7yywdpz84x8g8ddxyttrmp id: 01jq7yywdt0qfxg93gc67tnrbn

Model: gemini-2.5-pro-exp-03-25

Prompt

import sys
import click
import requests
from bs4 import BeautifulSoup
from datetime import datetime
import xml.etree.ElementTree as ET
from urllib.parse import urljoin
from lxml import etree


def parse_timestamp(title_attr):
    """Parse timestamp from title attribute in the format 'Mar 13, 2025 12:39 PM UTC'."""
    if not title_attr:
        return datetime.now().isoformat()

    try:
        dt = datetime.strptime(title_attr, "%b %d, %Y %I:%M %p %Z")
        return dt.isoformat()
    except ValueError:
        # Default to current time if unable to parse
        return datetime.now().isoformat()


def create_atom_feed(soup, base_url):
    """Create an Atom feed from the parsed HTML."""
    # Create the root element
    feed = ET.Element("feed", xmlns="http://www.w3.org/2005/Atom")

    # Add feed metadata
    ET.SubElement(feed, "title").text = "Ollama models"
    ET.SubElement(feed, "id").text = base_url
    author = ET.SubElement(feed, "author")
    ET.SubElement(author, "name").text = "Model Library"
    ET.SubElement(feed, "link", href=base_url, rel="self")

    # Find all model items
    models = soup.find_all("li", attrs={"x-test-model": True})

    for model in models:
        entry = ET.SubElement(feed, "entry")

        # Get the model name
        title_elem = model.find("span", attrs={"x-test-search-response-title": True})
        title = title_elem.text if title_elem else "Unknown Model"
        ET.SubElement(entry, "title").text = title

        # Get the model URL
        href = model.find("a")["href"] if model.find("a") else ""
        model_url = urljoin(base_url, href)
        ET.SubElement(entry, "id").text = model_url
        ET.SubElement(entry, "link", href=model_url)

        # Get the model description
        description = (
            model.find("p", class_="max-w-lg").text.strip()
            if model.find("p", class_="max-w-lg")
            else ""
        )
        ET.SubElement(entry, "summary").text = description

        # Get updated time from the title attribute
        date_span = model.find("span", class_="flex items-center", title=True)
        title_attr = date_span.get("title") if date_span else None
        updated_iso = parse_timestamp(title_attr)
        ET.SubElement(entry, "updated").text = updated_iso

        # Get model sizes as categories
        sizes = model.find_all("span", attrs={"x-test-size": True})
        for size in sizes:
            ET.SubElement(entry, "category", term=size.text.strip())

        # Get capabilities as categories
        capabilities = model.find_all("span", attrs={"x-test-capability": True})
        for capability in capabilities:
            ET.SubElement(entry, "category", term=capability.text.strip())

        # Additional stats in content
        pull_count = model.find("span", attrs={"x-test-pull-count": True})
        tag_count = model.find("span", attrs={"x-test-tag-count": True})

        # Include the summary at the top of the content
        content = f"<p>{description}</p>"
        content += f"<p>Pulls: {pull_count.text if pull_count else 'N/A'}</p>"
        content += f"<p>Tags: {tag_count.text if tag_count else 'N/A'}</p>"

        ET.SubElement(entry, "content", type="html").text = content

    # Convert to ElementTree element, then to lxml element for pretty printing
    xml_string = ET.tostring(feed, encoding="utf-8")
    parser = etree.XMLParser(remove_blank_text=True)
    root = etree.fromstring(xml_string, parser)
    return etree.tostring(root, encoding="unicode", pretty_print=True)


@click.command()
@click.argument("url")
@click.option(
    "--output",
    "-o",
    type=click.File("w"),
    default="-",
    help="Output file path. Defaults to stdout.",
)
def html_to_atom(url, output):
    """
    Convert HTML from the given URL to an Atom feed.

    URL: The web page URL to convert to Atom
    """
    try:
        # If URL is actually a file path, read locally
        if url.startswith("file://"):
            with open(url[7:], "r") as f:
                html_content = f.read()
        else:
            # Fetch the HTML content
            response = requests.get(url)
            response.raise_for_status()
            html_content = response.text

        # Parse the HTML
        soup = BeautifulSoup(html_content, "lxml")

        # Create the atom feed
        atom_feed = create_atom_feed(soup, url)

        # Write the output
        output.write('<?xml version="1.0" encoding="utf-8"?>\n')
        output.write(atom_feed)

    except Exception as e:
        click.echo(f"Error: {e}", err=True)
        sys.exit(1)


if __name__ == "__main__":
    html_to_atom()

System

rewrite this script so that instead of outputting Atom to stdout it saves two files, one called atom.xml with everything and another called atom-recent-20.xml with just the most recent 20 items - remove the output option entirely

Response

import sys
import click
import requests
from bs4 import BeautifulSoup
from datetime import datetime, timezone
import xml.etree.ElementTree as ET
from urllib.parse import urljoin
from lxml import etree
import copy
import os

def parse_timestamp(title_attr):
    """
    Parse timestamp from title attribute like 'Mar 13, 2025 12:39 PM UTC'.
    Returns a timezone-aware datetime object.
    """
    if not title_attr:
        # Default to current UTC time if no attribute provided
        return datetime.now(timezone.utc)

    try:
        # Parse the string and make it timezone-aware (assuming UTC)
        dt_naive = datetime.strptime(title_attr, "%b %d, %Y %I:%M %p %Z")
        dt_aware = dt_naive.replace(tzinfo=timezone.utc)
        return dt_aware
    except ValueError:
        # Default to current UTC time if unable to parse
        # Log this potential issue? For now, just return current time.
        print(f"Warning: Could not parse timestamp '{title_attr}'. Defaulting to now.", file=sys.stderr)
        return datetime.now(timezone.utc)


def create_base_feed_and_entries(soup, base_url):
    """
    Parses HTML, creates the base Atom feed structure, and extracts entry data.
    Returns the base feed element (without entries) and a list of tuples:
    [(datetime_updated, entry_element)].
    """
    # Create the root element for the feed
    feed = ET.Element("feed", xmlns="http://www.w3.org/2005/Atom")

    # Add feed metadata
    ET.SubElement(feed, "title").text = "Ollama models"
    ET.SubElement(feed, "id").text = base_url
    author = ET.SubElement(feed, "author")
    ET.SubElement(author, "name").text = "Model Library"
    ET.SubElement(feed, "link", href=base_url, rel="self")
    # Add a general updated time for the feed itself (last checked time)
    ET.SubElement(feed, "updated").text = datetime.now(timezone.utc).isoformat()

    # Find all model items
    models = soup.find_all("li", attrs={"x-test-model": True})
    entries_data = []

    for model in models:
        entry = ET.Element("entry")

        # Get the model name
        title_elem = model.find("span", attrs={"x-test-search-response-title": True})
        title = title_elem.text if title_elem else "Unknown Model"
        ET.SubElement(entry, "title").text = title

        # Get the model URL
        href = model.find("a")["href"] if model.find("a") else ""
        model_url = urljoin(base_url, href)
        ET.SubElement(entry, "id").text = model_url
        ET.SubElement(entry, "link", href=model_url)

        # Get the model description
        description_elem = model.find("p", class_="max-w-lg")
        description = description_elem.text.strip() if description_elem else ""
        ET.SubElement(entry, "summary").text = description

        # Get updated time from the title attribute
        date_span = model.find("span", class_="flex items-center", title=True)
        title_attr = date_span.get("title") if date_span else None
        # Parse timestamp into a datetime object for sorting
        updated_dt = parse_timestamp(title_attr)
        # Format the datetime object into ISO format string for the XML element
        ET.SubElement(entry, "updated").text = updated_dt.isoformat()

        # Get model sizes as categories
        sizes = model.find_all("span", attrs={"x-test-size": True})
        for size in sizes:
            ET.SubElement(entry, "category", term=size.text.strip())

        # Get capabilities as categories
        capabilities = model.find_all("span", attrs={"x-test-capability": True})
        for capability in capabilities:
            ET.SubElement(entry, "category", term=capability.text.strip())

        # Additional stats in content
        pull_count = model.find("span", attrs={"x-test-pull-count": True})
        tag_count = model.find("span", attrs={"x-test-tag-count": True})

        # Include the summary at the top of the content
        content = f"<p>{description}</p>" if description else ""
        content += f"<p>Pulls: {pull_count.text if pull_count else 'N/A'}</p>"
        content += f"<p>Tags: {tag_count.text if tag_count else 'N/A'}</p>"

        ET.SubElement(entry, "content", type="html").text = content

        # Store the datetime object and the entry element for sorting
        entries_data.append((updated_dt, entry))

    return feed, entries_data


def save_atom_feed(filename, base_feed_element, entries_data):
    """
    Adds entries to a copy of the base feed element, serializes,
    and saves the Atom feed to a file.

    Args:
        filename (str): The path to save the file.
        base_feed_element (ET.Element): The basic <feed> structure.
        entries_data (list): A list of (datetime_obj, entry_element) tuples.
                             Assumed to be sorted if order matters.
    """
    # Create a deep copy to avoid modifying the original base element
    feed_copy = copy.deepcopy(base_feed_element)

    # Add entry elements to the copied feed
    for _, entry_element in entries_data:
        feed_copy.append(entry_element)

    # Convert ElementTree element to bytes, then parse with lxml for pretty printing
    xml_string_bytes = ET.tostring(feed_copy, encoding="utf-8")
    parser = etree.XMLParser(remove_blank_text=True)
    try:
        root = etree.fromstring(xml_string_bytes, parser)
        pretty_xml_string = etree.tostring(root, encoding="unicode", pretty_print=True)
    except Exception as e:
        print(f"Error pretty-printing XML for {filename}: {e}", file=sys.stderr)
        # Fallback to non-pretty printing if lxml fails
        pretty_xml_string = xml_string_bytes.decode('utf-8')


    # Write the output to the specified file
    try:
        with open(filename, "w", encoding="utf-8") as f:
            f.write('<?xml version="1.0" encoding="utf-8"?>\n')
            f.write(pretty_xml_string)
        print(f"Successfully saved feed to {filename}")
    except IOError as e:
        print(f"Error writing file {filename}: {e}", file=sys.stderr)


@click.command()
@click.argument("url")
def html_to_atom(url):
    """
    Fetches HTML from a URL, converts it to an Atom feed,
    and saves two files: 'atom.xml' (all items) and
    'atom-recent-20.xml' (most recent 20 items).

    URL: The web page URL (or file://path/to/file.html) to convert.
    """
    output_full_filename = "atom.xml"
    output_recent_filename = "atom-recent-20.xml"

    try:
        # Determine if URL is a local file path
        if url.startswith("file://"):
            filepath = url[7:]
            # Ensure file exists
            if not os.path.exists(filepath):
                raise FileNotFoundError(f"Local file not found: {filepath}")
            with open(filepath, "r", encoding="utf-8") as f:
                html_content = f.read()
            # Use the local file's directory as the base URL; ollama.com's hrefs
            # are absolute paths, so urljoin still resolves them correctly
            base_url = os.path.dirname(filepath)
        else:
            # Fetch the HTML content via HTTP/S
            headers = {'User-Agent': 'AtomFeedGenerator/1.0'} # Be a good citizen
            response = requests.get(url, headers=headers, timeout=30) # Add timeout
            response.raise_for_status() # Raise HTTPError for bad responses (4xx or 5xx)
            html_content = response.text
            base_url = url # Use the fetched URL as the base

        # Parse the HTML
        soup = BeautifulSoup(html_content, "lxml") # Use lxml parser

        # Create the base feed structure and extract entries with timestamps
        base_feed_element, entries_data = create_base_feed_and_entries(soup, base_url)

        # Sort entries by updated timestamp, most recent first
        entries_data.sort(key=lambda item: item[0], reverse=True)

        # --- Save the full feed ---
        save_atom_feed(output_full_filename, base_feed_element, entries_data)

        # --- Save the recent feed (top 20) ---
        recent_entries_data = entries_data[:20]
        save_atom_feed(output_recent_filename, base_feed_element, recent_entries_data)

    except requests.exceptions.RequestException as e:
        click.echo(f"Error fetching URL {url}: {e}", err=True)
        sys.exit(1)
    except FileNotFoundError as e:
        click.echo(f"Error accessing file: {e}", err=True)
        sys.exit(1)
    except Exception as e:
        # Catch other potential errors (parsing, file writing, etc.)
        click.echo(f"An unexpected error occurred: {e}", err=True)
        import traceback
        traceback.print_exc() # Print full traceback for debugging
        sys.exit(1)


if __name__ == "__main__":
    html_to_atom()

Key Changes:

  1. Removed --output Option: The @click.option for --output and the corresponding output parameter in html_to_atom have been removed.
  2. Defined Output Filenames: Hardcoded filenames atom.xml and atom-recent-20.xml are defined within html_to_atom.
  3. Separated Feed Creation and Entry Extraction:
    • create_base_feed_and_entries now focuses on parsing the HTML, creating the main <feed> structure (without entries), and extracting data for each <entry>.
    • It returns the base <feed> element and a list of tuples (datetime_object, entry_element). The datetime_object is used for sorting.
  4. Timestamp Handling:
    • parse_timestamp now returns timezone-aware datetime objects (assuming UTC from the source format). This is crucial for correct sorting. It also handles potential parsing errors more gracefully.
    • The ISO format string is still generated from the datetime object when setting the updated element's text within create_base_feed_and_entries.
  5. Sorting: The entries_data list is explicitly sorted in reverse chronological order (most recent first) in the html_to_atom function after parsing and before saving (see the sketch after this list).
  6. save_atom_feed Helper Function:
    • A new function save_atom_feed handles the process of adding entries to a copy of the base feed, serializing the result to a pretty-printed XML string using lxml, and writing it to the specified file.
    • It uses copy.deepcopy to ensure that adding entries for one file doesn't affect the base structure used for the other file.
  7. Saving Two Files: html_to_atom now calls save_atom_feed twice:
    • Once with all the sorted entries_data for atom.xml.
    • Once with a slice entries_data[:20] containing only the first 20 entries (the most recent) for atom-recent-20.xml.
  8. File Handling: Uses with open(...) for writing files to ensure they are properly closed, specifying utf-8 encoding.
  9. Error Handling: Includes specific checks for requests.exceptions.RequestException and FileNotFoundError, along with a general Exception catch. Added more informative error messages and a traceback print for unexpected errors.
  10. Minor Improvements: Added a User-Agent to the requests.get call, included a timeout, and used lxml parser explicitly in BeautifulSoup. Added a top-level <updated> tag to the feed itself.
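
For the sorting and copying behaviour described in points 4-7, here is a minimal, self-contained sketch (not part of the generated script; the model names and timestamps are invented for illustration):

import copy
from datetime import datetime
import xml.etree.ElementTree as ET

# Hypothetical (updated, entry) pairs standing in for entries_data;
# titles and timestamps are made up for this example.
entries_data = []
for title, iso in [
    ("model-a", "2025-03-13T12:39:00+00:00"),
    ("model-b", "2025-03-20T08:00:00+00:00"),
    ("model-c", "2025-01-02T09:15:00+00:00"),
]:
    entry = ET.Element("entry")
    ET.SubElement(entry, "title").text = title
    entries_data.append((datetime.fromisoformat(iso), entry))

# Sort newest first, as html_to_atom does before saving either file
entries_data.sort(key=lambda item: item[0], reverse=True)

# The "recent" feed is simply a slice of the same sorted list
recent_entries_data = entries_data[:2]  # the real script slices [:20]

# Each output file appends entries to its own deep copy of the base <feed>,
# so building one feed never mutates the structure used for the other
base_feed = ET.Element("feed", xmlns="http://www.w3.org/2005/Atom")
feed_copy = copy.deepcopy(base_feed)
for _, entry in recent_entries_data:
    feed_copy.append(entry)

print([e.find("title").text for _, e in entries_data])
# ['model-b', 'model-a', 'model-c']

Because the parsed datetimes are timezone-aware, comparisons (and therefore the sort) remain well defined even if the source page ever mixed timezones.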

Token usage:

1,421 input, 3,056 output, {"promptTokensDetails": [{"modality": "TEXT", "tokenCount": 1421}]}

@wizlijun

not bad
