Skip to content

Instantly share code, notes, and snippets.

@Arkanayan
Created August 29, 2018 11:15
Show Gist options
  • Save Arkanayan/35ed90e3c051893729cc0fc6210eff26 to your computer and use it in GitHub Desktop.
Save Arkanayan/35ed90e3c051893729cc0fc6210eff26 to your computer and use it in GitHub Desktop.
Extract image from pdf
# Miniconda-based image: installs the system packages listed below, applies
# environment.yml to the base conda environment, and starts run.py from /app.
FROM continuumio/miniconda3
# MAINTAINER is deprecated; the maintainer label carries the same information.
LABEL maintainer="Arka Nayan <arka@mantralabsglobal.com>"
VOLUME ["/app"]
WORKDIR /app
# Keep `apt-get update` and every `apt-get install` in ONE layer: a separate
# install layer can be rebuilt against a stale, cached package index and fail.
# The apt lists are removed afterwards because no later step needs them.
RUN apt-get update && \
    apt-get install -y \
        build-essential \
        libgtk2.0-dev \
        libgl1-mesa-glx \
        tesseract-ocr \
        libtesseract-dev \
        poppler-utils \
        enchant && \
    rm -rf /var/lib/apt/lists/*
# -y: image builds are non-interactive, so conda must not prompt for confirmation.
RUN conda update -y -n base conda
#RUN conda install -y -c menpo opencv3
COPY environment.yml /tmp/
RUN conda env update --name base -f /tmp/environment.yml
EXPOSE 5000
CMD ["python", "run.py"]
import os
import re
import shutil
import subprocess
import tempfile
from pathlib import Path
# Dependencies: poppler-utils (provides the `pdfimages` command used below).
# See the Dockerfile for how to install poppler-utils, or https://www.howtogeek.com/228531/
# Tested on Linux.
# Distributed with absolutely no guarantee.
def _pdfimages_sort_key(image_path):
    """Return a sort key ordering pdfimages output by page, then image number.

    Assumes the ``-p`` naming scheme ``<root>-<page>-<image>.<ext>``; the two
    numbers are compared as integers so ordering does not rely on zero
    padding. Files that do not match the pattern sort last, by name.
    """
    match = re.search(r'-(\d+)-(\d+)$', image_path.stem)
    if match is None:
        return (float('inf'), float('inf'), image_path.name)
    return (int(match.group(1)), int(match.group(2)), image_path.name)


def extract_images_from_pdf(pdf_path, destination_path):
    """
    Extract images from pdfs (using ``pdfimages`` from poppler-utils).

    The images are written to a temporary directory first and then copied to
    ``destination_path`` as ``<pdf name>.<n>.<ext>``, with ``n`` counting from 1.

    :param pdf_path: str Path to the pdf file
    :param destination_path: str Destination directory where the extracted images
        are saved (created if it does not exist yet)
    :return: list Names of the extracted images ordered by pages
    :raises RuntimeError: if ``pdfimages`` is not installed, exits with a
        non-zero status, or writes anything to stderr
    """
    error_hint = ('PDF extraction error. Please make sure poppler-utils is installed: see: '
                  'https://www.howtogeek.com/228531/')
    destination_dir = Path(destination_path).absolute()
    fname = os.path.splitext(os.path.basename(str(pdf_path)))[0]
    extracted_images = []

    with tempfile.TemporaryDirectory() as tmpdirname:
        tmpdir = Path(tmpdirname)
        # -png: write images as PNG; -p: include the page number in each file name.
        command = ['pdfimages', '-png', '-p', str(pdf_path), str(tmpdir / fname)]
        try:
            proc = subprocess.run(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        except FileNotFoundError as err:
            # The executable itself is missing: report it with the install hint
            # instead of leaking a bare FileNotFoundError for 'pdfimages'.
            raise RuntimeError(error_hint) from err

        # Any stderr output is treated as a failure, even with a zero exit code.
        if proc.returncode != 0 or proc.stderr:
            detail = proc.stderr.decode(errors='replace').strip()
            if detail:
                raise RuntimeError(error_hint + ' (pdfimages: ' + detail + ')')
            raise RuntimeError(error_hint)

        # sort images by the order of the pages they are extracted from
        images = sorted(
            (path for path in tmpdir.glob('*.*') if path.is_file()),
            key=_pdfimages_sort_key,
        )

        destination_dir.mkdir(parents=True, exist_ok=True)
        for idx, image in enumerate(images, start=1):
            image_extension = os.path.splitext(image.name)[1]
            # output format: file.1.png, file.2.png
            new_filename = fname + '.' + str(idx) + image_extension
            # Copy before the temporary directory is removed on leaving the `with`.
            shutil.copyfile(str(image), str(destination_dir / new_filename))
            extracted_images.append(new_filename)

    return extracted_images
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment