Skip to content

Instantly share code, notes, and snippets.

@Pandapip1
Created February 19, 2024 17:22
Show Gist options
  • Save Pandapip1/a0ce0e0806effe84e71d872589cfb073 to your computer and use it in GitHub Desktop.
Save Pandapip1/a0ce0e0806effe84e71d872589cfb073 to your computer and use it in GitHub Desktop.
A simple gist to pull all the images from a PDF file
#!/usr/bin/env python3
# coding=utf-8
from PIL import Image
import fitz
import os
import sys
import io
def pdf_extract_images(pdf_path, output_dir):
    """Extract every image referenced by a PDF's pages and save each as PNG.

    Files are written to ``output_dir`` as
    ``page_<page_number>_image_<image_index>.png``, where both numbers are
    zero-based. An image referenced from several pages is written once per
    referencing page.

    Args:
        pdf_path: Path of the PDF file to read.
        output_dir: Directory that receives the PNG files. It is created if
            it does not exist yet.
    """
    # Modes Pillow's PNG writer accepts. Anything else (e.g. CMYK, which is
    # common in print-oriented PDFs) has to be converted before saving,
    # otherwise ``save`` raises "cannot write mode ... as PNG".
    png_modes = {"1", "L", "LA", "I", "I;16", "P", "RGB", "RGBA"}

    # An empty output_dir means "current directory"; makedirs("") would raise.
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # The context manager closes the document even if extraction fails midway.
    with fitz.open(pdf_path) as pdf:
        for page_number in range(pdf.page_count):
            page = pdf[page_number]
            for image_index, img in enumerate(page.get_images(full=True)):
                # The first item of each image entry is the object's xref.
                xref = img[0]
                image_bytes = pdf.extract_image(xref)["image"]
                target = os.path.join(
                    output_dir,
                    f"page_{page_number}_image_{image_index}.png",
                )
                with Image.open(io.BytesIO(image_bytes)) as image:
                    if image.mode in png_modes:
                        image.save(target)
                    else:
                        image.convert("RGB").save(target)
if __name__ == "__main__":
    # Usage: script.py <pdf_path> [output_dir]
    if len(sys.argv) < 2:
        print(f"Usage: {sys.argv[0]} <pdf_path> [output_dir]", file=sys.stderr)
        sys.exit(2)

    pdf_path = sys.argv[1]
    if len(sys.argv) > 2:
        output_dir = sys.argv[2]
    else:
        # Default to the PDF path minus its extension. splitext only strips
        # the final suffix (any case), unlike str.replace(".pdf", ""), which
        # would also mangle ".pdf" occurring elsewhere in the path.
        output_dir = os.path.splitext(pdf_path)[0]
        if output_dir == pdf_path:
            # No extension to strip: avoid colliding with the input file.
            output_dir += "_images"

    if not os.path.exists(output_dir):
        os.makedirs(output_dir, exist_ok=True)
    elif input(f"Output directory {output_dir} already exists. Continue? (y/n): ").strip().lower() != "y":
        sys.exit(1)

    pdf_extract_images(pdf_path, output_dir)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment