Created
August 2, 2025 19:15
-
-
Save kmad/76814081edd55f9cc03e612f4d9731b5 to your computer and use it in GitHub Desktop.
DSPy Document Boundary Detection
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| # /// script | |
| # dependencies = [ | |
| # "requests<3", | |
| # "rich", | |
| # "dspy", | |
| # "python-dotenv", | |
| # "pymupdf", | |
| # ] | |
| # /// | |
| import logging | |
| import dspy | |
| import asyncio | |
| import os | |
| from dotenv import load_dotenv | |
| import pymupdf | |
| import io | |
| import base64 | |
| import sys | |
| from typing import List, Literal, Dict, Tuple | |
# Logging: route INFO and above through the root handler installed by
# basicConfig, and expose a module-scoped logger for the rest of the file.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


### CLASS DEFINITIONS ###
# Closed set of labels a page may be classified as. The exact spelling
# (including the mix of spaces and underscores) is part of each label and is
# reproduced verbatim.
CLASSES: tuple[str, ...] = (
    "COVER PAGE",
    "TERMS_AND_CONDITIONS",
    "SIGNATURE_PAGE",
    "SCHEDULE OR TABLE",
    "START_OF_APPENDIX",
    "START_OF_EXHIBIT",
)
# DSPy signature describing the single-page classification task.
# NOTE(review): the class docstring and the `desc` strings below are presumably
# used by DSPy as prompt text, so treat them as runtime strings — confirm
# before rewording them.
class ClassifyPage(dspy.Signature):
    """
    Classifies a single page from a PDF order form into one of several predefined classes.
    """

    # Input: one page of the PDF, supplied as a dspy.Image.
    page_image: dspy.Image = dspy.InputField(
        desc="The image of a single page from the PDF order form."
    )
    # Output: declared without a type here; PDFBoundaryDetector.process_pdf
    # narrows it to Literal[CLASSES] via with_updated_fields before predicting.
    page_class = dspy.OutputField(desc="The type or class of the page.")
class PDFBoundaryDetector:
    """
    A class to detect boundaries in PDF documents by converting pages to images
    and classifying them to identify document sections.

    Call ``await PDFBoundaryDetector(path).process_pdf()``. Afterwards the
    rendered pages, the per-page classifications and the detected boundaries
    remain available on the instance.
    """

    def __init__(self, pdf_file: str):
        """Store the PDF path and initialise all result containers as empty."""
        self.pdf_file = pdf_file
        self.page_images: List[dspy.Image] = []
        self.page_classifications: Dict[int, str] = {}
        self.document_boundaries: Dict[str, Tuple[int, int]] = {}

    # NOTE: this method is handed to dspy.ReAct as a tool in process_pdf; its
    # docstring is presumably surfaced to the model as the tool description,
    # so it is kept verbatim.
    def get_page_images(self, pages: list[int]) -> List[dspy.Image]:
        """Get the page images. Be mindful of context length restrictions and don't return more than 8 images at a time."""
        return [self.page_images[i] for i in pages]

    def convert_to_img(self, data: bytes, pages: int = -1) -> List[dspy.Image]:
        """Convert PDF data to base64 encoded images.

        Args:
            data: Raw bytes of the PDF file.
            pages: Maximum number of leading pages to render. Any value <= 0
                (the default) renders every page; values larger than the
                document are clamped to the page count.

        Returns:
            One ``dspy.Image`` per rendered page, in page order, each holding
            a ``data:image/png;base64,...`` URI.
        """
        images: List[dspy.Image] = []
        # The context manager closes the document even if rendering fails.
        with pymupdf.open(stream=io.BytesIO(data), filetype="pdf") as pdf_reader:
            total_pages = pdf_reader.page_count
            # Clamp so a too-large `pages` cannot index past the last page.
            max_pages = min(pages, total_pages) if pages > 0 else total_pages
            for page_num in range(max_pages):
                pix = pdf_reader.load_page(page_num).get_pixmap()
                encoded = base64.b64encode(pix.tobytes("png")).decode("utf-8")
                # The payload is already a data URI, so build the Image from
                # it directly rather than through the PIL-object constructor.
                images.append(dspy.Image(url=f"data:image/png;base64,{encoded}"))
        return images

    async def _classify_pages(self) -> Dict[int, str]:
        """Classify every image in ``self.page_images`` concurrently.

        Returns:
            Mapping of zero-based page index to the predicted page class.
        """
        # Optionally toggle with_updated_fields to constrain the classifications (or not)
        classify_signature = ClassifyPage.with_updated_fields(
            "page_class",
            type_=Literal[tuple(CLASSES)],  # type: ignore
        )
        classifier = dspy.Predict(classify_signature)

        async def classify_page(i: int, img: dspy.Image):
            result = await classifier.acall(page_image=img)
            return i, result.page_class

        # gather preserves input order and yields (index, class) pairs.
        results = await asyncio.gather(
            *(classify_page(i, img) for i, img in enumerate(self.page_images))
        )
        return dict(results)

    @staticmethod
    def _build_smart_lm():
        """Build the boundary-detection LM from the ``DSPY_SMART_*`` variables.

        Returns:
            A ``dspy.LM``, or ``None`` when ``DSPY_SMART_MODEL`` is unset.
        """
        model = os.getenv("DSPY_SMART_MODEL")
        if not model:
            return None
        return dspy.LM(
            model=model,
            api_key=os.getenv("DSPY_API_KEY"),
            api_base=os.getenv("DSPY_ENDPOINT"),
            api_version=os.getenv("DSPY_SMART_API_VERSION"),
            # Defaults mirror the fast-model config in the script entry point,
            # so unset variables no longer raise TypeError from int(None).
            max_tokens=int(os.getenv("DSPY_SMART_MAX_TOKENS", 50_000)),
            temperature=float(os.getenv("DSPY_SMART_TEMPERATURE", 1.0)),
            cache=True,
        )

    async def process_pdf(self) -> Dict[str, Tuple[int, int]]:
        """Process the PDF file to detect document boundaries.

        Reads ``self.pdf_file``, renders its pages, classifies each page and
        then runs a ReAct agent (which may call ``get_page_images``) to group
        the pages into documents.

        Returns:
            The detected boundaries, also stored on ``self.document_boundaries``.
            An empty dict is returned for a PDF with no pages.

        Raises:
            OSError: If the PDF file cannot be opened or read.
        """
        # Convert PDF to images and save to self.page_images
        with open(self.pdf_file, "rb") as f:
            data = f.read()
        self.page_images = self.convert_to_img(data)

        if not self.page_images:
            # Nothing to classify; skip the model calls entirely.
            self.page_classifications = {}
            self.document_boundaries = {}
            return self.document_boundaries

        # Classify pages and save the index -> class mapping.
        self.page_classifications = await self._classify_pages()
        logger.info(self.page_classifications)

        # Detect boundaries
        boundary_detector = dspy.Signature(
            "pages_and_classifications -> document_boundaries: dict[str, tuple[int, int]]"
        ).with_instructions(
            "Detect boundaries between documents, such as order forms or agreements. A typical order form has a header, details, and signature page. You should separate things like schedules, appendices, and exhibits."
        )
        detector = dspy.ReAct(
            boundary_detector, tools=[self.get_page_images], max_iters=10
        )

        lm_smart = self._build_smart_lm()
        if lm_smart is None:
            logger.warning(
                "DSPY_SMART_MODEL is not set; using the globally configured LM for boundary detection."
            )
            lm_smart = dspy.settings.lm

        with dspy.context(lm=lm_smart):
            response = await detector.acall(
                pages_and_classifications=self.page_classifications
            )
        logger.info(response)

        # Save boundaries to self.document_boundaries
        self.document_boundaries = response.document_boundaries
        return self.document_boundaries

    def get_page_classifications(self) -> Dict[int, str]:
        """Get the page classifications."""
        return self.page_classifications

    def get_document_boundaries(self) -> Dict[str, Tuple[int, int]]:
        """Get the detected document boundaries."""
        return self.document_boundaries
async def process_pdf(pdf_file: str) -> dict:
    """Legacy module-level entry point, kept for backward compatibility.

    Builds a ``PDFBoundaryDetector`` for ``pdf_file`` and returns the result
    of awaiting its ``process_pdf`` method.
    """
    boundaries = await PDFBoundaryDetector(pdf_file).process_pdf()
    return boundaries
if __name__ == "__main__":
    # Validate the command line before touching the environment or building
    # an LM, so a missing argument always yields the usage message rather than
    # a model-configuration error.
    if len(sys.argv) < 2:
        logger.error("Usage: python detect_boundaries.py <pdf_file>")
        sys.exit(1)
    pdf_file = sys.argv[1]

    # Pull DSPY_* settings from a .env file, if one is present.
    load_dotenv()

    # Fast model: used as the global default LM (per-page classification).
    LM_CONFIG = {
        "model": os.getenv("DSPY_FAST_MODEL"),
        "api_key": os.getenv("DSPY_API_KEY"),
        "api_base": os.getenv("DSPY_ENDPOINT"),
        "api_version": os.getenv("DSPY_FAST_API_VERSION"),
        "max_tokens": int(os.getenv("DSPY_FAST_MAX_TOKENS", 50_000)),
        "temperature": float(os.getenv("DSPY_FAST_TEMPERATURE", 1.0)),
        "cache": True,
    }
    lm = dspy.LM(**LM_CONFIG)
    dspy.configure(lm=lm)

    # Run the async function
    boundaries = asyncio.run(process_pdf(pdf_file))
    logger.info("Detected boundaries: \n%s", boundaries)
Author
Thanks, @Nasreddine - that's up to you and your use case. If you explicitly want multiple classes you could set the OutputField type to something like List[Literal[tuple(CLASSES)]] (on line 93) which would give you more than one category but still constrained to that list. Otherwise the code as it stands should just return a single classification for a page.
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Thanks for sharing the code.
Since we can have multiple classes on a single page, how should we handle this case?