Skip to content

Instantly share code, notes, and snippets.

@philippdubach
Created November 24, 2025 18:08
Show Gist options
  • Select an option

  • Save philippdubach/0087fdaefb8ca905e5df87176c1a31e3 to your computer and use it in GitHub Desktop.

Select an option

Save philippdubach/0087fdaefb8ca905e5df87176c1a31e3 to your computer and use it in GitHub Desktop.
A Python script that extracts text from PDF files and converts them to Markdown format with intelligent token counting and chunking for LLM processing.
#pip install PyMuPDF Pillow pytesseract tiktoken
import fitz
from PIL import Image
import pytesseract
import re
import tiktoken
def extract_pdf_optimized(pdf_path, max_tokens=8000, model="gpt-4"):
    """Extract text from a PDF and write it out as Markdown files.

    Each page is read with PyMuPDF's direct text extraction; pages that
    yield almost no text (fewer than 50 characters, i.e. likely scanned
    images) fall back to Tesseract OCR on a 2x-zoom render.  The combined
    text is cleaned, stripped of repeated header/footer lines, and written
    to ``output.md`` — or split into ``output_partN.md`` files when its
    token count exceeds ``max_tokens``.

    Args:
        pdf_path: Path to the source PDF file.
        max_tokens: Maximum tokens per output file before chunking occurs.
        model: Model name used to select the tiktoken encoding.  New
            optional parameter; defaults to the previously hard-coded
            "gpt-4", so existing callers are unaffected.

    Returns:
        The full cleaned text of the document.
    """
    doc = fitz.open(pdf_path)
    try:  # bug fix: the document handle was never closed (resource leak)
        page_texts = []
        for page in doc:  # iterate pages directly instead of range(len(doc))
            text = page.get_text()
            # Use OCR only when direct extraction found almost nothing --
            # the page is probably a scanned image.
            if len(text.strip()) < 50:
                # Render at 2x zoom so Tesseract has enough resolution.
                pix = page.get_pixmap(matrix=fitz.Matrix(2, 2))
                img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
                text = pytesseract.image_to_string(img)
            page_texts.append(text)
    finally:
        doc.close()

    # Join once instead of quadratic string += accumulation.
    all_text = "".join(t + "\n\n" for t in page_texts)
    all_text = clean_text(all_text)
    all_text = remove_headers_footers(all_text)

    # Count tokens with the requested model's encoding.
    encoding = tiktoken.encoding_for_model(model)
    token_count = len(encoding.encode(all_text))
    print(f"Total tokens: {token_count}")

    # If too large, split into numbered Markdown parts.
    if token_count > max_tokens:
        chunks = chunk_by_tokens(all_text, max_tokens, model)
        for i, chunk in enumerate(chunks):
            with open(f"output_part{i+1}.md", "w", encoding="utf-8") as f:
                f.write(chunk)
        print(f"Split into {len(chunks)} chunks")
    else:
        with open("output.md", "w", encoding="utf-8") as f:
            f.write(all_text)
    return all_text
def clean_text(text):
    """Normalize whitespace and bullet glyphs in extracted PDF text.

    Collapses runs of three or more (possibly blank-padded) newlines to a
    single blank line, squeezes repeated spaces and tabs, converts common
    bullet characters to "-", and strips surrounding whitespace.
    """
    # Applied in order; order matters for the whitespace passes.
    substitutions = (
        (r'\n\s*\n\s*\n+', '\n\n'),  # 3+ blank-ish lines -> one blank line
        (r' +', ' '),                # squeeze runs of spaces
        (r'\t+', ' '),               # tabs become a single space
        (r'[•●▪▫■□]', '-'),          # normalize bullet glyphs
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    return text.strip()
def remove_headers_footers(text):
    """Drop lines that repeat more than twice across the document.

    Lines between 6 and 99 characters (after stripping) that appear three
    or more times are treated as running headers/footers and removed;
    everything else is kept in its original order.
    """
    rows = text.split('\n')

    # First pass: count occurrences of plausible header/footer lines.
    occurrences = {}
    for row in rows:
        stripped = row.strip()
        if not (5 < len(stripped) < 100):
            continue  # too short/long to be a running header or footer
        occurrences[stripped] = occurrences.get(stripped, 0) + 1

    boilerplate = {line for line, seen in occurrences.items() if seen > 2}

    # Second pass: keep every line that is not identified boilerplate.
    kept = []
    for row in rows:
        if row.strip() not in boilerplate:
            kept.append(row)
    return '\n'.join(kept)
def chunk_by_tokens(text, max_tokens=6000, model="gpt-4"):
    """Split text into chunks of at most ``max_tokens`` tokens.

    Text is split on blank lines and paragraphs are packed greedily into
    chunks.  Bug fix: previously a single paragraph longer than
    ``max_tokens`` became its own over-sized chunk; such paragraphs are now
    hard-split at the token level so no chunk exceeds the limit.

    Note: the token cost of the "\\n\\n" separators re-inserted on join is
    not counted, matching the original accounting.

    Args:
        text: The full document text.
        max_tokens: Upper bound on tokens per chunk.
        model: Model name used to select the tiktoken encoding.

    Returns:
        List of text chunks, each at most ``max_tokens`` tokens.
    """
    encoding = tiktoken.encoding_for_model(model)
    chunks = []
    current_chunk = []
    current_tokens = 0
    for para in text.split('\n\n'):
        para_tokens = len(encoding.encode(para))
        if para_tokens > max_tokens:
            # Oversized paragraph: flush what we have, then split the
            # paragraph itself into max_tokens-sized token slices.
            if current_chunk:
                chunks.append('\n\n'.join(current_chunk))
                current_chunk, current_tokens = [], 0
            ids = encoding.encode(para)
            for start in range(0, len(ids), max_tokens):
                chunks.append(encoding.decode(ids[start:start + max_tokens]))
            continue
        if current_tokens + para_tokens > max_tokens and current_chunk:
            chunks.append('\n\n'.join(current_chunk))
            current_chunk, current_tokens = [], 0
        current_chunk.append(para)
        current_tokens += para_tokens
    if current_chunk:
        chunks.append('\n\n'.join(current_chunk))
    return chunks
# Usage
if __name__ == "__main__":
    # 50K tokens per chunk leaves headroom below typical LLM context
    # windows (NOTE(review): the original comment claimed a 2,000,000-token
    # Claude limit -- verify against current provider documentation).
    extract_pdf_optimized("jpm.pdf", max_tokens=50000)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment