Skip to content

Instantly share code, notes, and snippets.

@jfrobbins
Last active May 5, 2025 20:38
Show Gist options
  • Save jfrobbins/3b77b26319ef609c4cefea6cc08ceb07 to your computer and use it in GitHub Desktop.
Save jfrobbins/3b77b26319ef609c4cefea6cc08ceb07 to your computer and use it in GitHub Desktop.
Python script that allows you to export pages from a PDF file to PNG images. The script can be run from the command line, supports exporting specific pages or a range of pages, and includes an option to export all pages. The output images are saved in a directory named after the PDF file (without the extension) within the same directory as the PDF file.
# Filename: pdf_to_png.py
# Summary: Converts specified or all PDF pages to PNG images on macOS, extracting the first non-empty text line as the title for filenames.
# Filenames are structured as <file-name>_pg-<page-number>_<slide-title>.png for better sorting by page number.
# Page numbers are padded with leading zeros based on the total number of pages.
# Usage:
# - Convert all pages: python pdf_to_png.py input.pdf
# - Convert a range (e.g., pages 3-7): python pdf_to_png.py input.pdf --pages 3-7
# - Convert specific pages (e.g., pages 1,3,5): python pdf_to_png.py input.pdf --pages 1,3,5
# Dependencies:
# - pdf2image: Install via `pip install pdf2image`
# - PyPDF2: Install via `pip install PyPDF2`
# - pdfminer.six: Install via `pip install pdfminer.six`
# - poppler: Install on macOS via `brew install poppler`
import argparse
import os
import re
from pdf2image import convert_from_path
from PyPDF2 import PdfReader
from pdfminer.high_level import extract_text
def parse_pages(pages_str):
    """Parse the ``--pages`` argument into a list of 1-based page numbers.

    The argument is a comma-separated list whose items are either a single
    page (``5``) or an inclusive range (``3-7``). Items may be mixed, e.g.
    ``1,3-5,9``. Pages are returned in the order given; duplicates are kept.

    Args:
        pages_str: The raw string supplied on the command line.

    Returns:
        A list of ints, one entry per requested page.

    Raises:
        ValueError: If an item is not an integer or a ``start-end`` pair, or
            if a range has a start page greater than its end page.
    """
    format_error = "Invalid pages format. Use '3-7' for a range or '1,3,5' for specific pages."
    pages = []
    for token in pages_str.split(','):
        bounds = token.split('-')
        if len(bounds) > 2:
            raise ValueError(format_error)
        try:
            numbers = [int(bound) for bound in bounds]
        except ValueError:
            # Hide the low-level int() message; the format hint is more useful.
            raise ValueError(format_error) from None
        if len(numbers) == 1:
            pages.append(numbers[0])
            continue
        start, end = numbers
        if start > end:
            # Raised outside the try block so this specific message is not
            # replaced by the generic format error.
            raise ValueError("Start page must be less than or equal to end page")
        pages.extend(range(start, end + 1))
    return pages
def group_contiguous_pages(pages):
    """Split page numbers into runs of consecutive values.

    The input is de-duplicated and sorted first, so the result is a list of
    ascending lists in which each element is exactly one greater than the
    previous one. An empty input yields an empty list.
    """
    runs = []
    for page in sorted(set(pages)):
        # Extend the latest run when this page directly follows its last
        # entry; otherwise start a new run.
        if runs and page == runs[-1][-1] + 1:
            runs[-1].append(page)
        else:
            runs.append([page])
    return runs
def extract_title_from_page(pdf_path, page_num):
    """Return the first non-blank text line of a PDF page, or None.

    ``page_num`` is 1-based; it is shifted by one because the text extractor
    is given 0-based page numbers. The returned line has surrounding
    whitespace removed.
    """
    page_text = extract_text(pdf_path, page_numbers=[page_num - 1])
    stripped_lines = (raw_line.strip() for raw_line in page_text.split('\n'))
    # The first stripped line that still has content is the title.
    return next((candidate for candidate in stripped_lines if candidate), None)
def process_title(title):
    """Turn a page title into a short, filename-safe slug.

    Only ASCII letters, digits and underscores are kept. Any run of
    whitespace and/or underscores becomes a single underscore, so tabs
    separate words and removed punctuation does not leave doubled
    underscores. The result is lower-cased, limited to 50 characters, and
    has no leading or trailing underscore.

    Args:
        title: The raw title text, or None / empty if the page had no text.

    Returns:
        The cleaned slug, or None when there is no title or nothing usable
        remains after cleaning.
    """
    if not title:
        return None
    # Keep whitespace for now so it can act as a word separator below.
    cleaned = re.sub(r'[^a-zA-Z0-9_\s]', '', title)
    # Collapse every separator run into a single underscore.
    cleaned = re.sub(r'[\s_]+', '_', cleaned).lower()
    # Truncate first, then trim, so the cut cannot leave a trailing underscore.
    cleaned = cleaned[:50].strip('_')
    return cleaned or None
def main():
    """Command-line entry point: render the requested PDF pages as PNG files.

    Reads the PDF path and the optional ``--pages`` selection from the
    command line, then writes one PNG per selected page into a directory
    named after the PDF (lower-cased, spaces replaced by underscores,
    extension removed) next to the PDF itself. Files are named
    ``<base>_pg-<page>[_<title>].png``, with the page number zero-padded to
    the width of the document's total page count.

    A missing input file, an unparsable ``--pages`` value, or a page outside
    the document is reported through ``parser.error``, which prints the
    usage message and exits instead of showing a traceback.

    Raises:
        RuntimeError: If the renderer returns a different number of images
            than the number of pages requested for a group.
    """
    # Parse command-line arguments
    parser = argparse.ArgumentParser(description="Convert PDF pages to PNG images on macOS, including titles in filenames")
    parser.add_argument("pdf_path", help="Path to the input PDF file (e.g., input.pdf)")
    parser.add_argument("--pages", help="Pages to convert (e.g., '3-7' or '1,3,5'); omit for all pages")
    args = parser.parse_args()
    pdf_path = args.pdf_path
    if not os.path.isfile(pdf_path):
        parser.error(f"PDF file not found: {pdf_path}")

    # Determine total number of pages
    reader = PdfReader(pdf_path)
    num_pages = len(reader.pages)

    # Select pages to convert
    if args.pages:
        try:
            pages_to_convert = parse_pages(args.pages)
        except ValueError as exc:
            parser.error(str(exc))
        if not all(1 <= p <= num_pages for p in pages_to_convert):
            parser.error(f"Page numbers out of range. PDF has {num_pages} pages.")
    else:
        pages_to_convert = list(range(1, num_pages + 1))

    # Process the base filename (without extension)
    filename = os.path.basename(pdf_path)
    base = os.path.splitext(filename)[0].replace(" ", "_").lower()
    output_dir = os.path.join(os.path.dirname(pdf_path), base)
    os.makedirs(output_dir, exist_ok=True)

    # Pad page numbers to the width of the total page count so that the
    # output files sort by page number.
    num_digits = len(str(num_pages))

    # Each contiguous run of pages is rendered with a single converter call.
    for group in group_contiguous_pages(pages_to_convert):
        images = convert_from_path(pdf_path, first_page=group[0], last_page=group[-1])
        if len(images) != len(group):
            raise RuntimeError(
                f"Expected {len(group)} images for pages {group[0]}-{group[-1]}, got {len(images)}"
            )
        for page, image in zip(group, images):
            padded_page = f"{page:0{num_digits}d}"
            processed_title = process_title(extract_title_from_page(pdf_path, page))
            title_suffix = f"_{processed_title}" if processed_title else ""
            output_filename = os.path.join(output_dir, f"{base}_pg-{padded_page}{title_suffix}.png")
            image.save(output_filename, 'PNG')


if __name__ == "__main__":
    main()
# MIT License
#
# Copyright (c) 2025 Jon Robbins
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment