Skip to content

Instantly share code, notes, and snippets.

@simonw
Created March 10, 2025 07:40
Show Gist options
  • Select an option

  • Save simonw/5aed8bd87016c77465c23e0dc4563ec9 to your computer and use it in GitHub Desktop.

Select an option

Save simonw/5aed8bd87016c77465c23e0dc4563ec9 to your computer and use it in GitHub Desktop.
import asyncio
import httpx
import sqlite3
import tempfile
import pathlib
from typing import Optional
class DatabaseTooLargeError(Exception):
"""Raised when a database exceeds the maximum allowed size."""
pass
class DatabaseValidationError(Exception):
"""Raised when a downloaded file fails SQLite validation."""
pass
class ContentLengthMismatchError(Exception):
"""Raised when the actual content length differs from the declared length."""
pass
async def download_db(url: str, max_size_bytes: int = 5 * 1024 * 1024) -> pathlib.Path:
"""
Download a SQLite database from a URL to a temporary file.
Args:
url: The URL of the SQLite database to download
max_size_bytes: Maximum allowed size in bytes (default: 5MB)
Returns:
pathlib.Path: Path to the downloaded database file
Raises:
httpx.HTTPError: If there's an HTTP-related error during download
DatabaseTooLargeError: If the database exceeds the size limit
ContentLengthMismatchError: If the actual size exceeds the declared size
DatabaseValidationError: If the downloaded file is not a valid SQLite database
"""
# Create a temporary directory for the database
temp_dir = pathlib.Path(tempfile.mkdtemp())
db_path = temp_dir / "database.db"
# Track the actual bytes received
bytes_received = 0
content_length: Optional[int] = None
async with httpx.AsyncClient() as client:
async with client.stream("GET", url) as response:
response.raise_for_status()
# Check for Content-Length header
if "content-length" in response.headers:
content_length = int(response.headers["content-length"])
if content_length > max_size_bytes:
raise DatabaseTooLargeError(
f"Database size ({content_length} bytes) exceeds the maximum allowed size "
f"({max_size_bytes} bytes)"
)
else:
# We'll still enforce the limit while streaming
content_length = None
# Create and open the file for writing
with open(db_path, "wb") as f:
async for chunk in response.aiter_bytes(chunk_size=8192):
# Update bytes received counter
bytes_received += len(chunk)
# Check if we've exceeded the size limit
if bytes_received > max_size_bytes:
# Clean up the file
try:
db_path.unlink()
except:
pass
if content_length is not None and bytes_received > content_length:
raise ContentLengthMismatchError(
f"Actual size ({bytes_received} bytes) exceeds declared size "
f"({content_length} bytes)"
)
else:
raise DatabaseTooLargeError(
f"Database size exceeds the maximum allowed size "
f"({max_size_bytes} bytes)"
)
# Check if content length is being honest when provided
if content_length is not None and bytes_received > content_length:
# Clean up the file
try:
db_path.unlink()
except:
pass
raise ContentLengthMismatchError(
f"Actual size ({bytes_received} bytes) exceeds declared size "
f"({content_length} bytes)"
)
# Write the chunk to the file
f.write(chunk)
# Validate that this is a legitimate SQLite database
try:
conn = sqlite3.connect(str(db_path))
cursor = conn.cursor()
# Run PRAGMA quick_check to validate the database
cursor.execute("PRAGMA quick_check")
result = cursor.fetchone()[0]
if result != "ok":
try:
db_path.unlink()
except:
pass
raise DatabaseValidationError(f"Database validation failed: {result}")
cursor.close()
conn.close()
except sqlite3.Error as e:
# Clean up the file if it's not a valid SQLite database
try:
db_path.unlink()
except:
pass
raise DatabaseValidationError(f"Not a valid SQLite database: {str(e)}")
return db_path
@bobek
Copy link

bobek commented Mar 13, 2025

For anybody wondering what blogpost is everybody referring to: https://simonwillison.net/2025/Mar/11/using-llms-for-code/

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment