Last active
June 2, 2018 05:45
-
-
Save Arkanayan/f86de17f48e4130c27a40440acacdbd8 to your computer and use it in GitHub Desktop.
Extract image(s) from pdf
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import subprocess | |
import tempfile | |
import shutil | |
from pathlib import Path | |
import os | |
def path2num(path):
    """
    Collect the digit characters found in the last component of a path.

    Example: '/hello/hi/my2and3.jpg' -> '23'

    :param path: str or pathlib.Path Path whose final component is scanned
    :return: str The digits of the final component, in order of appearance
             (an empty string when it contains none)
    """
    # Path(...) accepts both plain strings and path objects, and ``.name`` is
    # simply '' for an empty path, where indexing ``parts[-1]`` would fail.
    return ''.join(filter(str.isdigit, Path(path).name))
def _pdfimages_sort_key(image):
    """Sort key for a ``pdfimages -p`` output file: page number, then image number."""
    # With -p the files are named <root>-<page>-<image>.<ext>.  The two
    # counters are compared as integers so the order still holds once they
    # outgrow their zero padding (page 1000 must come after page 999).
    pieces = image.stem.rsplit('-', 2)
    if len(pieces) == 3 and pieces[1].isdecimal() and pieces[2].isdecimal():
        return (0, int(pieces[1]), int(pieces[2]), image.name)
    # Unexpected file name: keep the file, but after every recognised one.
    return (1, 0, 0, image.name)


def extract_images_from_pdf(pdf_path, destination_path):
    """
    Extract images from pdfs (using the ``pdfimages`` tool from poppler-utils)

    The images are first written to a temporary directory and then copied to
    ``destination_path`` as ``<pdf name>.1.<ext>``, ``<pdf name>.2.<ext>``, ...
    The destination directory is created when it does not exist yet.

    :param pdf_path: str Path to the pdf file
    :param destination_path: str Destination directory where the extracted images are saved
    :return: list Names of the extracted images ordered by pages
    :raises Exception: if ``pdfimages`` exits with a non-zero status or writes
                       anything to stderr
    """
    extracted_images = []
    destination_dir = Path(destination_path).absolute()
    # Name of the pdf without directory and extension; used both as the
    # pdfimages output prefix and as the prefix of the final file names.
    fname = os.path.splitext(os.path.basename(pdf_path))[0]

    with tempfile.TemporaryDirectory() as tmpdirname:
        tmpdir = Path(tmpdirname)
        # -all keeps every image in its native format, -p puts the page
        # number into each output file name.
        command = ['pdfimages', '-all', '-p', str(pdf_path), str(tmpdir / fname)]
        proc = subprocess.run(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)

        # Any stderr output is treated as a failure, even with exit status 0.
        if proc.returncode != 0 or proc.stderr:
            message = ('PDF extraction error. Please make sure poppler-utils is installed. see: '
                       'https://www.howtogeek.com/228531/')
            details = proc.stderr.decode(errors='replace').strip()
            if details:
                message += '\npdfimages reported: ' + details
            raise Exception(message)

        # sort images by the order of the pages they are extracted from
        images = sorted(tmpdir.glob('*.*'), key=_pdfimages_sort_key)

        destination_dir.mkdir(parents=True, exist_ok=True)
        for number, image in enumerate(images, start=1):
            # output format: file.1.jpg, file.2.jpg
            new_filename = '{}.{}{}'.format(fname, number, image.suffix)
            shutil.copyfile(str(image), str(destination_dir / new_filename))
            extracted_images.append(new_filename)

    return extracted_images
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment