Last active
May 5, 2025 20:38
-
-
Save jfrobbins/3b77b26319ef609c4cefea6cc08ceb07 to your computer and use it in GitHub Desktop.
Python script that allows you to export pages from a PDF file to PNG images. The script can be run from the command line, supports exporting specific pages or a range of pages, and exports all pages when no selection is given. The output images are saved in a directory named after the PDF file (without the extension) within the same directory as the PDF file.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Filename: pdf_to_png.py
# Summary: Converts specified or all PDF pages to PNG images on macOS, extracting the first non-empty text line as the title for filenames.
# Filenames are structured as <file-name>_pg-<page-number>_<slide-title>.png for better sorting by page number.
# Page numbers are padded with leading zeros based on the total number of pages.
# Usage:
# - Convert all pages: python pdf_to_png.py input.pdf
# - Convert a range (e.g., pages 3-7): python pdf_to_png.py input.pdf --pages 3-7
# - Convert specific pages (e.g., pages 1,3,5): python pdf_to_png.py input.pdf --pages 1,3,5
# Dependencies:
# - pdf2image: Install via `pip install pdf2image`
# - PyPDF2: Install via `pip install PyPDF2`
# - pdfminer.six: Install via `pip install pdfminer.six`
# - poppler: Install on macOS via `brew install poppler`
import argparse | |
import os | |
import re | |
from pdf2image import convert_from_path | |
from PyPDF2 import PdfReader | |
from pdfminer.high_level import extract_text | |
def parse_pages(pages_str):
    """Parse a ``--pages`` argument into a list of 1-based page numbers.

    The argument is a comma-separated list whose items are either a single
    page (``"5"``) or an inclusive range (``"3-7"``). Items may be mixed, so
    ``"1,3-5"`` yields ``[1, 3, 4, 5]``. Pages are returned in the order
    given; duplicates are not removed.

    Args:
        pages_str: The raw page selection string.

    Returns:
        A list of ints, one per selected page.

    Raises:
        ValueError: If an item is not a number or a ``start-end`` pair, or if
            a range has its start after its end.
    """
    pages = []
    for token in pages_str.split(','):
        try:
            if '-' in token:
                # Exactly two parts are required; "3-5-7" fails to unpack and
                # is reported as a format error below.
                start_text, end_text = token.split('-')
                start, end = int(start_text), int(end_text)
            else:
                start = end = int(token)
        except ValueError as err:
            raise ValueError(
                "Invalid pages format. Use '3-7' for a range or '1,3,5' for specific pages."
            ) from err
        # Checked outside the try block so this specific message is not
        # replaced by the generic format error.
        if start > end:
            raise ValueError("Start page must be less than or equal to end page")
        pages.extend(range(start, end + 1))
    return pages
def group_contiguous_pages(pages):
    """Split page numbers into runs of consecutive values.

    The input is de-duplicated and sorted first, so the result is a list of
    ascending lists such as ``[[1, 2, 3], [5], [7, 8]]``. An empty input
    produces an empty list.
    """
    runs = []
    for page in sorted(set(pages)):
        # Extend the most recent run when this page directly follows it;
        # otherwise start a new run.
        if runs and page == runs[-1][-1] + 1:
            runs[-1].append(page)
        else:
            runs.append([page])
    return runs
def extract_title_from_page(pdf_path, page_num):
    """Return the first non-blank text line of a PDF page, or None.

    ``page_num`` is 1-based; it is converted to the 0-based index that
    ``extract_text`` takes in ``page_numbers``. The returned line has its
    surrounding whitespace stripped.
    """
    page_text = extract_text(pdf_path, page_numbers=[page_num - 1])
    stripped_lines = (raw.strip() for raw in page_text.split('\n'))
    # First line with any content left after stripping; None when the page
    # has no such line.
    return next((candidate for candidate in stripped_lines if candidate), None)
def process_title(title):
    """Turn a page title into a filename-safe fragment.

    Returns None for a missing or empty title. Otherwise every character
    other than ASCII letters, digits, spaces and underscores is dropped, the
    text is lower-cased with spaces turned into underscores, and the result
    is cut to at most 50 characters.
    """
    if not title:
        return None
    kept = re.sub(r'[^a-zA-Z0-9 _]', '', title)
    slug = '_'.join(kept.lower().split(' '))
    return slug[:50]
def main():
    """Command-line entry point: render selected PDF pages to PNG files.

    Reads the PDF path and the optional ``--pages`` selection from the
    command line, creates an output directory next to the PDF (named after
    the lower-cased file stem with spaces replaced by underscores) and
    writes one image per page as ``<base>_pg-<page>[_<title>].png``.

    A malformed or out-of-range ``--pages`` value is reported through
    ``parser.error`` (usage message, exit status 2) rather than as an
    uncaught ``ValueError`` traceback.
    """
    # Parse command-line arguments
    parser = argparse.ArgumentParser(description="Convert PDF pages to PNG images on macOS, including titles in filenames")
    parser.add_argument("pdf_path", help="Path to the input PDF file (e.g., input.pdf)")
    parser.add_argument("--pages", help="Pages to convert (e.g., '3-7' or '1,3,5'); omit for all pages")
    args = parser.parse_args()
    pdf_path = args.pdf_path

    # Determine total number of pages
    reader = PdfReader(pdf_path)
    num_pages = len(reader.pages)

    # Select pages to convert
    if args.pages:
        try:
            pages_to_convert = parse_pages(args.pages)
        except ValueError as err:
            # parser.error prints the usage line and exits; it does not return.
            parser.error(str(err))
        if not all(1 <= p <= num_pages for p in pages_to_convert):
            parser.error(f"Page numbers out of range. PDF has {num_pages} pages.")
    else:
        pages_to_convert = list(range(1, num_pages + 1))

    # Process the base filename (without extension)
    filename = os.path.basename(pdf_path)
    base = os.path.splitext(filename)[0].replace(" ", "_").lower()
    output_dir = os.path.join(os.path.dirname(pdf_path), base)
    os.makedirs(output_dir, exist_ok=True)

    # Pad page numbers to the width of the largest page number so that the
    # output files sort in page order.
    num_digits = len(str(num_pages))

    # Convert each contiguous run with a single convert_from_path call.
    for group in group_contiguous_pages(pages_to_convert):
        images = convert_from_path(pdf_path, first_page=group[0], last_page=group[-1])
        # Images are paired with page numbers in order, assuming one image
        # per page of the requested range.
        for p, image in zip(group, images):
            padded_page = f"{p:0{num_digits}d}"
            processed_title = process_title(extract_title_from_page(pdf_path, p))
            title_suffix = f"_{processed_title}" if processed_title else ""
            output_filename = os.path.join(output_dir, f"{base}_pg-{padded_page}{title_suffix}.png")
            image.save(output_filename, 'PNG')


if __name__ == "__main__":
    main()
# MIT License | |
# | |
# Copyright (c) 2025 Jon Robbins | |
# | |
# Permission is hereby granted, free of charge, to any person obtaining a copy | |
# of this software and associated documentation files (the "Software"), to deal | |
# in the Software without restriction, including without limitation the rights | |
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | |
# copies of the Software, and to permit persons to whom the Software is | |
# furnished to do so, subject to the following conditions: | |
# | |
# The above copyright notice and this permission notice shall be included in all | |
# copies or substantial portions of the Software. | |
# | |
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | |
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | |
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | |
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | |
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | |
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | |
# SOFTWARE. |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment