Skip to content

Instantly share code, notes, and snippets.

@dpanic
Created April 4, 2023 11:37
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save dpanic/6ec64fb9c86fd654ddce0a2a09412e51 to your computer and use it in GitHub Desktop.
Save dpanic/6ec64fb9c86fd654ddce0a2a09412e51 to your computer and use it in GitHub Desktop.
# Extract the images embedded in a PDF file using PyMuPDF (fitz).
import os
import sys
import fitz
def extract_images(pdf_path, output_folder):
    """Extract the images referenced by each page of a PDF and save them.

    Each image is written to ``output_folder`` as
    ``image_page<P>_img<I>.<ext>``, where ``P`` is the zero-based page
    index, ``I`` is the zero-based index of the image within that page's
    image list, and ``ext`` is the format reported by PyMuPDF for the
    extracted bytes.

    Args:
        pdf_path: Path of the PDF document to read.
        output_folder: Directory that receives the image files. It is
            created if it does not exist yet.
    """
    # Writing below would fail with FileNotFoundError on a missing folder.
    os.makedirs(output_folder, exist_ok=True)

    # The context manager closes the document even if extraction fails.
    with fitz.open(pdf_path) as doc:
        for page_num, page in enumerate(doc):
            for img_index, img in enumerate(page.get_images(full=True)):
                xref = img[0]  # first item of each entry is the image xref
                base_image = doc.extract_image(xref)

                # extract_image() hands back the bytes in their stored
                # format, so name the file after the reported extension
                # instead of always claiming ".png".
                file_name = (
                    f"image_page{page_num}_img{img_index}.{base_image['ext']}"
                )
                with open(os.path.join(output_folder, file_name), "wb") as f:
                    f.write(base_image["image"])
if __name__ == "__main__":
    # Command-line entry point: extract all images of the PDF given as the
    # first argument into the "out" directory.
    if len(sys.argv) < 2:
        # Exit with a usage hint instead of an IndexError traceback.
        sys.exit(f"usage: {sys.argv[0]} <pdf_path>")

    pdf_path = sys.argv[1]
    output_folder = "out"

    # The original called an undefined `mkdir` and swallowed the resulting
    # NameError, so the folder was never created. exist_ok=True keeps the
    # intended "ignore if already present" behaviour without hiding errors.
    os.makedirs(output_folder, exist_ok=True)

    extract_images(pdf_path, output_folder)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment