Skip to content

Instantly share code, notes, and snippets.

@Arkanayan
Last active June 2, 2018 05:45
Show Gist options
  • Save Arkanayan/f86de17f48e4130c27a40440acacdbd8 to your computer and use it in GitHub Desktop.
Save Arkanayan/f86de17f48e4130c27a40440acacdbd8 to your computer and use it in GitHub Desktop.
Extract image(s) from pdf
import subprocess
import tempfile
import shutil
from pathlib import Path
import os
def path2num(path):
    """
    Collect the decimal digits found in the last component of a path.

    '/hello/hi/my2and3.jpg' -> '23'

    :param path: str or pathlib.Path Path whose final component is scanned
    :return: str Digits of the final component, in order of appearance
             ('' when there are none or the path is empty)
    """
    # The docstring example is a plain string, so accept one as well as a Path.
    if isinstance(path, str):
        path = Path(path)
    parts = path.parts
    # An empty path ('' or '.') has no components at all.
    last_component = parts[-1] if parts else ''
    return ''.join(filter(str.isdigit, last_component))
def extract_images_from_pdf(pdf_path, destination_path):
    """
    Extract images from pdfs (using poppler-utils)

    Runs ``pdfimages -all -p`` into a temporary directory, then copies every
    produced file into ``destination_path`` as ``<pdf name>.<n><extension>``
    where ``n`` starts at 1 and follows the page order.

    :param pdf_path: str Path to the pdf file
    :param destination_path: str Destination directory where the extracted images are saved
                             (it is not created by this function)
    :return: list Names of the extracted images ordered by pages
    :raises FileNotFoundError: if the ``pdfimages`` executable cannot be started
    :raises Exception: if ``pdfimages`` exits with a non-zero status or writes to stderr
    """
    install_hint = ('Please make sure poppler-utils is installed. see: '
                    'https://www.howtogeek.com/228531/')
    destination_dir = Path(destination_path).absolute()
    pdf_filename = os.path.split(pdf_path)[-1]
    fname = os.path.splitext(pdf_filename)[0]

    def page_order(image):
        # pdfimages -p names its output '<root>-<page>-<image>.<ext>'.  Compare
        # the numeric fields as integers: the zero padding is only three digits
        # wide, so comparing digit strings misorders page/image 1000 and above.
        fields = image.stem[len(fname):].split('-')
        return tuple(int(field) for field in fields if field.isdigit())

    extracted_images = []
    with tempfile.TemporaryDirectory() as tmpdirname:
        tmpdir = Path(tmpdirname)
        args = ['pdfimages', '-all', '-p', pdf_path, str(tmpdir / fname)]
        try:
            proc = subprocess.run(args, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        except FileNotFoundError as err:
            # Same exception type as before, but now carrying the install hint
            # for the one situation the hint is actually about.
            raise FileNotFoundError(
                err.errno,
                'pdfimages executable not found. ' + install_hint,
                err.filename,
            ) from err

        # Any stderr output is treated as a failure, even with exit status 0.
        if proc.returncode != 0 or proc.stderr:
            details = proc.stderr.decode(errors='replace').strip()
            raise Exception(
                'PDF extraction error. ' + install_hint
                + '\npdfimages exit code ' + str(proc.returncode)
                + (': ' + details if details else '')
            )

        # sort images by the order of the pages they are extracted from
        images = sorted(tmpdir.glob('*.*'), key=page_order)
        for idx, image in enumerate(images, start=1):
            image_extension = os.path.splitext(image.name)[-1]
            # output format: file.1.jpg, file.2.jpg
            new_filename = fname + '.' + str(idx) + image_extension
            shutil.copyfile(str(image), str(destination_dir / new_filename))
            extracted_images.append(new_filename)
    return extracted_images
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment