-
-
Save philippdubach/0087fdaefb8ca905e5df87176c1a31e3 to your computer and use it in GitHub Desktop.
A Python script that extracts text from PDF files and converts them to Markdown format with intelligent token counting and chunking for LLM processing.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# pip install PyMuPDF Pillow pytesseract tiktoken
import re
from collections import Counter

import fitz  # PyMuPDF
import pytesseract
import tiktoken
from PIL import Image
def extract_pdf_optimized(pdf_path, max_tokens=8000):
    """Extract text from a PDF, clean it, and write it out as Markdown.

    Tries the fast native text layer of each page first and falls back to
    OCR (tesseract on a 2x-zoom render) only for pages that yield almost
    no text, e.g. scanned images.

    Args:
        pdf_path: Path to the PDF file to process.
        max_tokens: Token budget per output file. Larger documents are
            split into ``output_part<N>.md`` files; otherwise a single
            ``output.md`` is written.

    Returns:
        The full cleaned document text as one string.
    """
    doc = fitz.open(pdf_path)
    try:
        pages = []
        for page in doc:
            # Fast path: use the PDF's embedded text layer.
            text = page.get_text()
            # Fallback: a (near-)empty text layer usually means a scanned
            # image -- render at 2x for better OCR accuracy.
            if len(text.strip()) < 50:
                pix = page.get_pixmap(matrix=fitz.Matrix(2, 2))
                img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
                text = pytesseract.image_to_string(img)
            pages.append(text)
    finally:
        # Bug fix: the document handle was never closed (resource leak).
        doc.close()

    # Join with blank lines (paragraph separators used by chunk_by_tokens).
    all_text = "".join(p + "\n\n" for p in pages)
    all_text = clean_text(all_text)
    all_text = remove_headers_footers(all_text)

    encoding = tiktoken.encoding_for_model("gpt-4")
    token_count = len(encoding.encode(all_text))
    print(f"Total tokens: {token_count}")

    # Split into multiple files only when the budget is exceeded.
    if token_count > max_tokens:
        chunks = chunk_by_tokens(all_text, max_tokens)
        for i, chunk in enumerate(chunks):
            with open(f"output_part{i+1}.md", "w", encoding="utf-8") as f:
                f.write(chunk)
        print(f"Split into {len(chunks)} chunks")
    else:
        with open("output.md", "w", encoding="utf-8") as f:
            f.write(all_text)
    return all_text
def clean_text(text):
    """Normalize whitespace and bullet glyphs in extracted PDF text.

    Collapses three or more (possibly whitespace-padded) newlines into a
    single blank line, squeezes runs of spaces and tabs, rewrites common
    bullet characters to "-", and strips leading/trailing whitespace.
    """
    # (pattern, replacement) pairs applied in order.
    substitutions = (
        (r'\n\s*\n\s*\n+', '\n\n'),  # 3+ blank-ish lines -> one blank line
        (r' +', ' '),                # squeeze repeated spaces
        (r'\t+', ' '),               # tab runs -> single space
        (r'[•●▪▫■□]', '-'),          # unify bullet glyphs
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    return text.strip()
def remove_headers_footers(text):
    """Remove lines that repeat more than twice across the document.

    Page headers and footers typically recur on every page, so any
    stripped line of plausible header length (6-99 characters) that
    appears more than twice is dropped everywhere. Very short or very
    long repeated lines are kept, since they are more likely to be real
    content.

    Args:
        text: Full document text with '\n'-separated lines.

    Returns:
        The text with repeated header/footer lines removed.
    """
    lines = text.split('\n')
    stripped = [line.strip() for line in lines]
    # Idiom: collections.Counter replaces the hand-rolled dict counting.
    counts = Counter(s for s in stripped if 5 < len(s) < 100)
    repeated = {s for s, n in counts.items() if n > 2}
    return '\n'.join(line for line in lines if line.strip() not in repeated)
def chunk_by_tokens(text, max_tokens=6000, model="gpt-4"):
    """Split text into chunks of at most ``max_tokens`` tokens each.

    Splits on paragraph boundaries ("\n\n") and packs paragraphs greedily
    into chunks. Bug fix: the original silently emitted an over-budget
    chunk whenever a single paragraph alone exceeded ``max_tokens``; such
    paragraphs are now hard-split at exact token boundaries so no chunk
    can exceed the limit.

    Args:
        text: Source text, paragraphs separated by blank lines.
        max_tokens: Maximum tokens allowed per chunk.
        model: Model name used to select the tiktoken encoding.

    Returns:
        List of chunk strings (paragraphs re-joined with blank lines).
    """
    encoding = tiktoken.encoding_for_model(model)
    chunks = []
    current = []
    current_tokens = 0

    def flush():
        # Emit the accumulated paragraphs as one chunk, if any.
        nonlocal current, current_tokens
        if current:
            chunks.append('\n\n'.join(current))
            current = []
            current_tokens = 0

    for para in text.split('\n\n'):
        tokens = encoding.encode(para)
        if len(tokens) > max_tokens:
            # Paragraph alone blows the budget: flush what we have, then
            # split this paragraph mid-text at token boundaries.
            flush()
            for start in range(0, len(tokens), max_tokens):
                chunks.append(encoding.decode(tokens[start:start + max_tokens]))
            continue
        if current_tokens + len(tokens) > max_tokens:
            flush()
        current.append(para)
        current_tokens += len(tokens)
    flush()
    return chunks
if __name__ == "__main__":
    # Run only when executed as a script, not on import.
    # NOTE(review): the original comment claimed a 2,000,000-token Claude
    # limit -- verify against current model documentation before relying
    # on this budget.
    extract_pdf_optimized("jpm.pdf", max_tokens=50000)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment