Skip to content

Instantly share code, notes, and snippets.

@kmad
Created August 2, 2025 19:15
Show Gist options
  • Select an option

  • Save kmad/76814081edd55f9cc03e612f4d9731b5 to your computer and use it in GitHub Desktop.

Select an option

Save kmad/76814081edd55f9cc03e612f4d9731b5 to your computer and use it in GitHub Desktop.
DSPy Document Boundary Detection
# /// script
# dependencies = [
# "requests<3",
# "rich",
# "dspy",
# "python-dotenv",
# "pymupdf",
# ]
# ///
import logging
import dspy
import asyncio
import os
from dotenv import load_dotenv
import pymupdf
import io
import base64
import sys
from typing import List, Literal, Dict, Tuple
# Configure logging
# INFO level so the classification and boundary results logged by this
# script are visible by default.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
### CLASS DEFINITIONS ###
# Closed set of page labels. process_pdf turns this tuple into a Literal type
# so the page classifier can only answer with one of these exact strings.
# NOTE(review): "COVER PAGE" and "SCHEDULE OR TABLE" use spaces while the
# other labels use underscores — presumably unintentional; confirm before
# normalising, since these strings are the values returned to callers.
CLASSES = (
    "COVER PAGE",
    "TERMS_AND_CONDITIONS",
    "SIGNATURE_PAGE",
    "SCHEDULE OR TABLE",
    "START_OF_APPENDIX",
    "START_OF_EXHIBIT",
)
class ClassifyPage(dspy.Signature):
    """
    Classifies a single page from a PDF order form into one of several predefined classes.
    """

    # Input: the rendered image of one PDF page.
    page_image: dspy.Image = dspy.InputField(
        desc="The image of a single page from the PDF order form."
    )
    # Output: the page label. It is declared without a type here;
    # PDFBoundaryDetector narrows it to a Literal over CLASSES via
    # with_updated_fields before building the predictor.
    page_class = dspy.OutputField(desc="The type or class of the page.")
class PDFBoundaryDetector:
    """
    Detect document boundaries inside a single PDF file.

    The pipeline renders every page to a PNG image, classifies each page
    into one of ``CLASSES``, and then lets a ReAct agent group the classified
    pages into documents.  Page numbers are 0-based indices into
    ``page_images``.
    """

    def __init__(self, pdf_file: str):
        """Remember the PDF path and start with empty results."""
        self.pdf_file = pdf_file
        self.page_images: List[dspy.Image] = []
        self.page_classifications: Dict[int, str] = {}
        self.document_boundaries: Dict[str, Tuple[int, int]] = {}

    # NOTE: this method is passed to dspy.ReAct as a tool, so its docstring is
    # presumably surfaced to the model as the tool description — keep the
    # wording below unchanged unless a prompt change is intended.
    def get_page_images(self, pages: list[int]) -> List[dspy.Image]:
        """Get the page images. Be mindful of context length restrictions and don't return more than 8 images at a time."""
        return [self.page_images[i] for i in pages]

    def convert_to_img(self, data: bytes, pages: int = -1) -> List[dspy.Image]:
        """Render PDF bytes into one PNG-backed ``dspy.Image`` per page.

        Args:
            data: Raw bytes of the PDF file.
            pages: Maximum number of leading pages to render.  Any value
                ``<= 0`` (the default) renders the whole document; values
                larger than the document are clamped to its page count.

        Returns:
            The rendered pages, in page order.
        """
        images: List[dspy.Image] = []
        # The context manager closes the document even if rendering fails.
        with pymupdf.open(stream=io.BytesIO(data), filetype="pdf") as pdf_reader:
            page_count = pdf_reader.page_count
            # Clamp so that asking for more pages than exist does not raise
            # from load_page.
            max_pages = min(pages, page_count) if pages > 0 else page_count
            for page_num in range(max_pages):
                pix = pdf_reader.load_page(page_num).get_pixmap()
                encoded = base64.b64encode(pix.tobytes("png")).decode("utf-8")
                # Build the image straight from the data URI; from_PIL is
                # meant for PIL image objects, not strings.
                images.append(dspy.Image(url=f"data:image/png;base64,{encoded}"))
        return images

    @staticmethod
    def _smart_lm_config() -> Dict[str, object]:
        """Build the keyword arguments for the boundary-detection LM from the environment."""
        return {
            "model": os.getenv("DSPY_SMART_MODEL"),
            "api_key": os.getenv("DSPY_API_KEY"),
            "api_base": os.getenv("DSPY_ENDPOINT"),
            "api_version": os.getenv("DSPY_SMART_API_VERSION"),
            # Defaults mirror the fast-model configuration in the script
            # entry point; without them an unset variable made int()/float()
            # raise TypeError on None.
            "max_tokens": int(os.getenv("DSPY_SMART_MAX_TOKENS", 50_000)),
            "temperature": float(os.getenv("DSPY_SMART_TEMPERATURE", 1.0)),
            "cache": True,
        }

    async def _classify_pages(self) -> Dict[int, str]:
        """Classify every image in ``self.page_images`` concurrently, keyed by page index."""
        # Constrain the output field to the known labels.
        # Optionally drop with_updated_fields to leave the label unconstrained.
        classify_signature = ClassifyPage.with_updated_fields(
            "page_class",
            type_=Literal[tuple(CLASSES)],  # type: ignore
        )
        # No dspy.context is set here, so this runs under whichever LM is
        # configured globally.
        classifier = dspy.Predict(classify_signature)

        async def classify_page(i: int, img: dspy.Image):
            result = await classifier.acall(page_image=img)
            return i, result.page_class

        results = await asyncio.gather(
            *(classify_page(i, img) for i, img in enumerate(self.page_images))
        )
        # gather returns a list of (index, label) pairs; the attribute and
        # its getter are typed as a dict, so convert before returning.
        return dict(results)

    async def process_pdf(self) -> Dict[str, Tuple[int, int]]:
        """Process the PDF file to detect document boundaries.

        Reads ``self.pdf_file``, fills ``self.page_images`` and
        ``self.page_classifications``, then asks a ReAct agent (running on the
        LM described by the ``DSPY_SMART_*`` environment variables) to group
        the pages.

        Returns:
            The detected boundaries, also stored in
            ``self.document_boundaries``.  Empty when the PDF has no pages.
        """
        with open(self.pdf_file, "rb") as f:
            data = f.read()
        self.page_images = self.convert_to_img(data)

        # Nothing to classify or group: skip the model calls entirely.
        if not self.page_images:
            self.page_classifications = {}
            self.document_boundaries = {}
            return self.document_boundaries

        self.page_classifications = await self._classify_pages()
        logger.info(self.page_classifications)

        # Detect boundaries
        boundary_detector = dspy.Signature(
            "pages_and_classifications -> document_boundaries: dict[str, tuple[int, int]]"
        ).with_instructions(
            "Detect boundaries between documents, such as order forms or agreements. A typical order form has a header, details, and signature page. You should separate things like schedules, appendices, and exhibits."
        )
        # The agent may call get_page_images to look at specific pages.
        detector = dspy.ReAct(
            boundary_detector, tools=[self.get_page_images], max_iters=10
        )
        lm_smart = dspy.LM(**self._smart_lm_config())
        with dspy.context(lm=lm_smart):
            response = await detector.acall(
                pages_and_classifications=self.page_classifications
            )
        logger.info(response)

        self.document_boundaries = response.document_boundaries
        return self.document_boundaries

    def get_page_classifications(self) -> Dict[int, str]:
        """Get the page classifications."""
        return self.page_classifications

    def get_document_boundaries(self) -> Dict[str, Tuple[int, int]]:
        """Get the detected document boundaries."""
        return self.document_boundaries
async def process_pdf(pdf_file: str) -> dict:
    """Backward-compatible functional entry point.

    Builds a ``PDFBoundaryDetector`` for ``pdf_file``, runs its
    ``process_pdf`` coroutine, and returns the resulting boundaries.
    """
    return await PDFBoundaryDetector(pdf_file).process_pdf()
if __name__ == "__main__":
    # Validate the command line first, so a missing argument yields the usage
    # message even when the LM environment variables are absent or malformed.
    if len(sys.argv) < 2:
        logger.info("Usage: python detect_boundaries.py <pdf_file>")
        sys.exit(1)
    pdf_file = sys.argv[1]

    # Pull DSPY_* settings from a .env file, if one is present.
    load_dotenv()

    # Default ("fast") LM: used for every DSPy call that does not run inside
    # its own dspy.context override.
    LM_CONFIG = {
        "model": os.getenv("DSPY_FAST_MODEL"),
        "api_key": os.getenv("DSPY_API_KEY"),
        "api_base": os.getenv("DSPY_ENDPOINT"),
        "api_version": os.getenv("DSPY_FAST_API_VERSION"),
        "max_tokens": int(os.getenv("DSPY_FAST_MAX_TOKENS", 50_000)),
        "temperature": float(os.getenv("DSPY_FAST_TEMPERATURE", 1.0)),
        "cache": True,
    }
    lm = dspy.LM(**LM_CONFIG)
    dspy.configure(lm=lm)

    # Run the async pipeline to completion and report the result.
    boundaries = asyncio.run(process_pdf(pdf_file))
    logger.info("Detected boundaries: \n%s", boundaries)
@Nasreddine
Copy link

 Thanks for sharing the code.
Since we can have multiple classes on a single page, how should we handle this case?

@kmad
Copy link
Author

kmad commented Sep 22, 2025

Thanks, @Nasreddine - that's up to you and your use case. If you explicitly want multiple classes per page, you could set the `page_class` OutputField type to something like List[Literal[tuple(CLASSES)]] (in the `with_updated_fields` call, line 93 of the gist), which would give you more than one category while still constraining each one to that list. Otherwise, the code as it stands should just return a single classification per page.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment