# jfrobbins/pdf_to_png.py

## pdf_to_png.py
# Filename: pdf_to_png.py
# Summary: Converts specified or all PDF pages to PNG images on macOS, extracting the first non-empty text line as the title for filenames.
#          Filenames are structured as <file-name>_pg-<page-number>_<slide-title>.png for better sorting by page number.
#          Page numbers are padded with leading zeros based on the total number of pages.
# Usage:
#   - Convert all pages: python pdf_to_png.py input.pdf
#   - Convert a range (e.g., pages 3-7): python pdf_to_png.py input.pdf --pages 3-7
#   - Convert specific pages (e.g., pages 1,3,5): python pdf_to_png.py input.pdf --pages 1,3,5
# Dependencies:
#   - pdf2image: Install via `pip install pdf2image`
#   - PyPDF2: Install via `pip install PyPDF2`
#   - pdfminer.six: Install via `pip install pdfminer.six`
#   - poppler: Install on macOS via `brew install poppler`

import argparse
import os
import re
from pdf2image import convert_from_path
from PyPDF2 import PdfReader
from pdfminer.high_level import extract_text

def parse_pages(pages_str):
    """Parse a ``--pages`` argument into a list of page numbers.

    The argument is a comma-separated list whose items are either a single
    page (``"5"``) or an inclusive range (``"3-7"``). Both forms may be
    mixed, e.g. ``"1,3-5,9"``. Whitespace around numbers is ignored.

    Args:
        pages_str: The raw string supplied on the command line.

    Returns:
        The page numbers in the order written. Duplicates are kept.

    Raises:
        ValueError: If an item is neither a number nor a range, or if a
            range's start is greater than its end.
    """
    invalid_format = "Invalid pages format. Use '3-7' for a range or '1,3,5' for specific pages."
    pages = []
    for item in pages_str.split(','):
        start_text, dash, end_text = item.partition('-')
        try:
            start = int(start_text)
            # A bare number is treated as a one-page range.
            end = int(end_text) if dash else start
        except ValueError as err:
            raise ValueError(invalid_format) from err
        # Checked outside the try block so this specific message is not
        # replaced by the generic format error.
        if start > end:
            raise ValueError("Start page must be less than or equal to end page")
        pages.extend(range(start, end + 1))
    return pages

def group_contiguous_pages(pages):
    """Split page numbers into runs of consecutive values.

    The input is de-duplicated and sorted first, so the result is a list of
    ascending lists in which each element is exactly one more than the
    previous one. An empty input yields an empty list.
    """
    runs = []
    for number in sorted(set(pages)):
        # Extend the latest run when this page directly follows its last
        # entry; otherwise start a new run.
        if runs and number == runs[-1][-1] + 1:
            runs[-1].append(number)
        else:
            runs.append([number])
    return runs

def extract_title_from_page(pdf_path, page_num):
    """Return the first non-blank text line of a page, or None if there is none.

    ``page_num`` is 1-based; it is shifted by one because ``extract_text``
    is called with 0-based page numbers. The returned line has surrounding
    whitespace removed.
    """
    page_text = extract_text(pdf_path, page_numbers=[page_num - 1])
    stripped_lines = (raw_line.strip() for raw_line in page_text.split('\n'))
    return next((candidate for candidate in stripped_lines if candidate), None)

def process_title(title):
    """Turn a page title into a filename-safe slug.

    Characters other than ASCII letters, digits, spaces and underscores are
    dropped, spaces become underscores, the text is lowercased, and the
    result is cut to at most 50 characters. A falsy title (``None`` or an
    empty string) yields ``None``.
    """
    if title:
        # Only ASCII letters, digits, spaces and underscores survive.
        kept = re.sub(r'[^a-zA-Z0-9 _]', '', title)
        slug = kept.lower().replace(' ', '_')
        return slug[:50]
    return None

def main():
    """Command-line entry point: save the requested PDF pages as PNG files.

    Reads the PDF path and the optional ``--pages`` selection from the
    command line, creates an output directory beside the PDF named after the
    PDF's lowercased, underscore-separated base name, and writes one PNG per
    selected page as ``<base>_pg-<padded page>[_<title>].png``.

    Raises:
        ValueError: If ``--pages`` cannot be parsed or names a page outside
            the document.
    """
    cli = argparse.ArgumentParser(description="Convert PDF pages to PNG images on macOS, including titles in filenames")
    cli.add_argument("pdf_path", help="Path to the input PDF file (e.g., input.pdf)")
    cli.add_argument("--pages", help="Pages to convert (e.g., '3-7' or '1,3,5'); omit for all pages")
    options = cli.parse_args()

    pdf_path = options.pdf_path

    # The page count is used for range validation and for the padding width.
    num_pages = len(PdfReader(pdf_path).pages)

    if options.pages:
        selected = parse_pages(options.pages)
        if any(p < 1 or p > num_pages for p in selected):
            raise ValueError(f"Page numbers out of range. PDF has {num_pages} pages.")
    else:
        selected = list(range(1, num_pages + 1))

    # Output goes into a directory beside the PDF, named after the PDF.
    parent_dir, filename = os.path.split(pdf_path)
    stem, _extension = os.path.splitext(filename)
    base = stem.replace(" ", "_").lower()
    output_dir = os.path.join(parent_dir, base)
    os.makedirs(output_dir, exist_ok=True)

    # Pad page numbers to the width of the largest page number in the file.
    pad_width = len(str(num_pages))

    # Each contiguous run of pages is rendered with a single conversion call.
    for run in group_contiguous_pages(selected):
        rendered = convert_from_path(pdf_path, first_page=run[0], last_page=run[-1])
        for offset, page_number in enumerate(run):
            image = rendered[offset]
            padded_page = f"{page_number:0{pad_width}d}"
            processed_title = process_title(extract_title_from_page(pdf_path, page_number))
            if processed_title:
                title_suffix = f"_{processed_title}"
            else:
                title_suffix = ""
            output_filename = os.path.join(output_dir, f"{base}_pg-{padded_page}{title_suffix}.png")
            image.save(output_filename, 'PNG')

# Run the conversion only when this file is executed as a script.
if __name__ == "__main__":
    main()

# MIT License
#
# Copyright (c) 2025 Jon Robbins
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
	# Filename: pdf_to_png.py
	# Summary: Converts specified or all PDF pages to PNG images on macOS, extracting the first non-empty text line as the title for filenames.
	# Filenames are structured as <file-name>_pg-<page-number>_<slide-title>.png for better sorting by page number.
	# Page numbers are padded with leading zeros based on the total number of pages.
	# Usage:
	# - Convert all pages: python pdf_to_png.py input.pdf
	# - Convert a range (e.g., pages 3-7): python pdf_to_png.py input.pdf --pages 3-7
	# - Convert specific pages (e.g., pages 1,3,5): python pdf_to_png.py input.pdf --pages 1,3,5
	# Dependencies:
	# - pdf2image: Install via `pip install pdf2image`
	# - PyPDF2: Install via `pip install PyPDF2`
	# - pdfminer.six: Install via `pip install pdfminer.six`
	# - poppler: Install on macOS via `brew install poppler`

	import argparse
	import os
	import re
	from pdf2image import convert_from_path
	from PyPDF2 import PdfReader
	from pdfminer.high_level import extract_text

	def parse_pages(pages_str):
		"""Parse a ``--pages`` argument into a list of page numbers.

		The argument is a comma-separated list whose items are either a single
		page (``"5"``) or an inclusive range (``"3-7"``). Both forms may be
		mixed, e.g. ``"1,3-5,9"``. Whitespace around numbers is ignored.

		Args:
			pages_str: The raw string supplied on the command line.

		Returns:
			The page numbers in the order written. Duplicates are kept.

		Raises:
			ValueError: If an item is neither a number nor a range, or if a
				range's start is greater than its end.
		"""
		invalid_format = "Invalid pages format. Use '3-7' for a range or '1,3,5' for specific pages."
		pages = []
		for item in pages_str.split(','):
			start_text, dash, end_text = item.partition('-')
			try:
				start = int(start_text)
				# A bare number is treated as a one-page range.
				end = int(end_text) if dash else start
			except ValueError as err:
				raise ValueError(invalid_format) from err
			# Checked outside the try block so this specific message is not
			# replaced by the generic format error.
			if start > end:
				raise ValueError("Start page must be less than or equal to end page")
			pages.extend(range(start, end + 1))
		return pages

	def group_contiguous_pages(pages):
		"""Split page numbers into runs of consecutive values.

		The input is de-duplicated and sorted first, so the result is a list
		of ascending lists in which each element is exactly one more than the
		previous one. An empty input yields an empty list.
		"""
		pages = sorted(set(pages))  # Remove duplicates and sort
		groups = []
		if not pages:
			return groups
		current_group = [pages[0]]
		for p in pages[1:]:
			if p == current_group[-1] + 1:
				current_group.append(p)
			else:
				# Gap found: close the current run and start a new one.
				groups.append(current_group)
				current_group = [p]
		groups.append(current_group)
		return groups

	def extract_title_from_page(pdf_path, page_num):
		"""Return the first non-blank text line of a page, or None if there is none.

		``page_num`` is 1-based; it is shifted by one because ``extract_text``
		is called with 0-based page numbers. The returned line has surrounding
		whitespace removed.
		"""
		text = extract_text(pdf_path, page_numbers=[page_num - 1])  # 0-based page numbering
		lines = text.split('\n')
		for line in lines:
			if line.strip():
				return line.strip()
		return None

	def process_title(title):
		"""Turn a page title into a filename-safe slug.

		Characters other than ASCII letters, digits, spaces and underscores
		are dropped, spaces become underscores, the text is lowercased, and
		the result is cut to at most 50 characters. A falsy title (``None``
		or an empty string) yields ``None``.
		"""
		if not title:
			return None
		# Keep only alphanumeric characters, spaces, and underscores
		title = re.sub(r'[^a-zA-Z0-9 _]', '', title)
		# Replace spaces with underscores and convert to lowercase
		title = title.replace(' ', '_').lower()
		# Limit length to 50 characters
		return title[:50]

	def main():
		"""Command-line entry point: save the requested PDF pages as PNG files.

		Reads the PDF path and the optional ``--pages`` selection from the
		command line, creates an output directory beside the PDF named after
		the PDF's lowercased, underscore-separated base name, and writes one
		PNG per selected page as ``<base>_pg-<padded page>[_<title>].png``.

		Raises:
			ValueError: If ``--pages`` cannot be parsed or names a page outside
				the document.
		"""
		# Parse command-line arguments
		parser = argparse.ArgumentParser(description="Convert PDF pages to PNG images on macOS, including titles in filenames")
		parser.add_argument("pdf_path", help="Path to the input PDF file (e.g., input.pdf)")
		parser.add_argument("--pages", help="Pages to convert (e.g., '3-7' or '1,3,5'); omit for all pages")
		args = parser.parse_args()

		pdf_path = args.pdf_path

		# Determine total number of pages
		reader = PdfReader(pdf_path)
		num_pages = len(reader.pages)

		# Select pages to convert
		if args.pages:
			pages_to_convert = parse_pages(args.pages)
			if not all(1 <= p <= num_pages for p in pages_to_convert):
				raise ValueError(f"Page numbers out of range. PDF has {num_pages} pages.")
		else:
			pages_to_convert = list(range(1, num_pages + 1))

		# Process the base filename (without extension)
		filename = os.path.basename(pdf_path)
		base = os.path.splitext(filename)[0].replace(" ", "_").lower()
		output_dir = os.path.join(os.path.dirname(pdf_path), base)
		os.makedirs(output_dir, exist_ok=True)

		# Calculate padding for page numbers based on total pages
		num_digits = len(str(num_pages))

		# Group pages for efficient conversion
		groups = group_contiguous_pages(pages_to_convert)

		# Convert each group of pages to PNG
		for group in groups:
			first_page = group[0]
			last_page = group[-1]
			images = convert_from_path(pdf_path, first_page=first_page, last_page=last_page)
			for i, p in enumerate(group):
				image = images[i]
				padded_page = f"{p:0{num_digits}d}"
				title = extract_title_from_page(pdf_path, p)
				processed_title = process_title(title)
				title_suffix = f"_{processed_title}" if processed_title else ""
				output_filename = os.path.join(output_dir, f"{base}_pg-{padded_page}{title_suffix}.png")
				image.save(output_filename, 'PNG')

	# Run the conversion only when this file is executed as a script.
	if __name__ == "__main__":
		main()

	# MIT License
	#
	# Copyright (c) 2025 Jon Robbins
	#
	# Permission is hereby granted, free of charge, to any person obtaining a copy
	# of this software and associated documentation files (the "Software"), to deal
	# in the Software without restriction, including without limitation the rights
	# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
	# copies of the Software, and to permit persons to whom the Software is
	# furnished to do so, subject to the following conditions:
	#
	# The above copyright notice and this permission notice shall be included in all
	# copies or substantial portions of the Software.
	#
	# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
	# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
	# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
	# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
	# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
	# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
	# SOFTWARE.