Arkanayan/Dockerfile

## Dockerfile
# NOTE(review): the base image is untagged, so every rebuild may pull a
# different OS / conda release. Pin a tag (or digest) once a known-good
# version has been verified against the apt package names used below.
FROM continuumio/miniconda3

# MAINTAINER is deprecated; the OCI "authors" label carries the same contact.
LABEL org.opencontainers.image.authors="Arka Nayan <arka@mantralabsglobal.com>"

WORKDIR /app

# System packages: compiler toolchain, GTK 2 / Mesa GL libraries, Tesseract OCR,
# poppler-utils (provides `pdfimages`) and enchant.
# `apt-get update` and the install share ONE layer so a cached, stale package
# index can never be combined with a fresh install step; the index is removed
# in the same layer so it does not end up in the image.
# NOTE(review): recommended packages are intentionally still installed --
# enchant's dictionaries may only arrive as "recommends"; confirm before
# adding --no-install-recommends.
RUN apt-get update && \
    apt-get install -y \
        build-essential \
        enchant \
        libgl1-mesa-glx \
        libgtk2.0-dev \
        libtesseract-dev \
        poppler-utils \
        tesseract-ocr && \
    rm -rf /var/lib/apt/lists/*

# -y keeps the conda self-update from waiting on a confirmation prompt.
RUN conda update -y -n base conda
#RUN conda install -y -c menpo opencv3

# Only the environment spec is copied, so the dependency layer below is
# rebuilt only when environment.yml changes.
COPY environment.yml /tmp/

# Install the Python dependencies into the base env and drop conda's package
# cache in the same layer.
RUN conda env update --name base -f /tmp/environment.yml && \
    conda clean --all --yes

# Declared after all build steps: anything written to a volume path later in
# the build would be discarded. The application code (run.py) is not copied
# into the image, so it is expected to be mounted here at run time.
VOLUME ["/app"]

EXPOSE 5000

CMD ["python", "run.py"]

## extract_image_from_pdf.py
import os
import shutil
import subprocess
import tempfile
from pathlib import Path

# Dependencies: poppler-utils (provides the `pdfimages` command used below)
# See the Dockerfile for how to install poppler-utils, and https://www.howtogeek.com/228531/
# Tested on Linux
# Distributed with absolutely no guarantee

def _page_order_key(image_path):
    """
    Sort key that orders ``pdfimages -p`` output by page, then by image index.

    With ``-p`` the tool names its files ``<root>-<page>-<index>.<ext>`` (see the
    pdfimages man page). The two trailing numbers are compared as integers so
    that page 1000 sorts after page 999. Names that do not follow the pattern
    sort last, alphabetically.
    :param image_path: pathlib.Path of one extracted image
    :return: tuple usable as a ``sorted`` key
    """
    pieces = image_path.stem.rsplit('-', 2)
    try:
        return (0, int(pieces[-2]), int(pieces[-1]), image_path.name)
    except (IndexError, ValueError):
        return (1, 0, 0, image_path.name)


def extract_images_from_pdf(pdf_path, destination_path):
    """
    Extract images from pdfs (using poppler-utils)
    :param pdf_path: str Path to the pdf file
    :param destination_path: str Destination directory where the extracted images are saved
                             (created if it does not exist)
    :return: list Names of the extracted images ordered by pages
    :raises RuntimeError: if ``pdfimages`` is not installed, exits with a non-zero
                          status, or writes anything to stderr
    """
    install_hint = ('PDF extraction error. Please make sure poppler-utils is installed: see: '
                    'https://www.howtogeek.com/228531/')
    destination_dir = Path(destination_path).absolute()
    extracted_images = []

    # pdfimages writes into a scratch directory; the files are copied out under
    # their final names before the directory is removed.
    with tempfile.TemporaryDirectory() as tmpdirname:
        tmpdir = Path(tmpdirname)
        fname = os.path.splitext(os.path.basename(pdf_path))[0]
        args = ['pdfimages', '-png', '-p', str(pdf_path), str(tmpdir / fname)]

        try:
            proc = subprocess.run(args, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        except FileNotFoundError as exc:
            # The pdfimages executable itself is missing.
            raise RuntimeError(install_hint) from exc

        # Any stderr output is treated as a failure, even with exit status 0.
        if proc.returncode != 0 or proc.stderr:
            details = proc.stderr.decode(errors='replace').strip()
            if details:
                raise RuntimeError(install_hint + ' (pdfimages: ' + details + ')')
            raise RuntimeError(install_hint)

        destination_dir.mkdir(parents=True, exist_ok=True)
        # sort images by the order of the pages they are extracted from
        images = sorted((p for p in tmpdir.glob('*.*') if p.is_file()), key=_page_order_key)
        for idx, image in enumerate(images, start=1):
            # output format: file.1.png, file.2.png
            new_filename = fname + '.' + str(idx) + image.suffix
            shutil.copyfile(str(image), str(destination_dir / new_filename))
            extracted_images.append(new_filename)

    return extracted_images
	# NOTE(review): the base image is untagged, so every rebuild may pull a
	# different OS / conda release. Pin a tag (or digest) once a known-good
	# version has been verified against the apt package names used below.
	FROM continuumio/miniconda3

	# MAINTAINER is deprecated; the OCI "authors" label carries the same contact.
	LABEL org.opencontainers.image.authors="Arka Nayan <arka@mantralabsglobal.com>"

	WORKDIR /app

	# System packages: compiler toolchain, GTK 2 / Mesa GL libraries, Tesseract OCR,
	# poppler-utils (provides `pdfimages`) and enchant.
	# `apt-get update` and the install share ONE layer so a cached, stale package
	# index can never be combined with a fresh install step; the index is removed
	# in the same layer so it does not end up in the image.
	# NOTE(review): recommended packages are intentionally still installed --
	# enchant's dictionaries may only arrive as "recommends"; confirm before
	# adding --no-install-recommends.
	RUN apt-get update && \
	    apt-get install -y \
	        build-essential \
	        enchant \
	        libgl1-mesa-glx \
	        libgtk2.0-dev \
	        libtesseract-dev \
	        poppler-utils \
	        tesseract-ocr && \
	    rm -rf /var/lib/apt/lists/*

	# -y keeps the conda self-update from waiting on a confirmation prompt.
	RUN conda update -y -n base conda
	#RUN conda install -y -c menpo opencv3

	# Only the environment spec is copied, so the dependency layer below is
	# rebuilt only when environment.yml changes.
	COPY environment.yml /tmp/

	# Install the Python dependencies into the base env and drop conda's package
	# cache in the same layer.
	RUN conda env update --name base -f /tmp/environment.yml && \
	    conda clean --all --yes

	# Declared after all build steps: anything written to a volume path later in
	# the build would be discarded. The application code (run.py) is not copied
	# into the image, so it is expected to be mounted here at run time.
	VOLUME ["/app"]

	EXPOSE 5000

	CMD ["python", "run.py"]
	from pathlib import Path
	import os
	import tempfile
	import subprocess

	# Dependencies: poppler-utils (provides the `pdfimages` command used below)
	# See the Dockerfile for how to install poppler-utils, and https://www.howtogeek.com/228531/
	# Tested on Linux
	# Distributed with absolutely no guarantee

	def _page_order_key(image_path):
	    """
	    Sort key that orders ``pdfimages -p`` output by page, then by image index.

	    With ``-p`` the tool names its files ``<root>-<page>-<index>.<ext>`` (see the
	    pdfimages man page). The two trailing numbers are compared as integers so
	    that page 1000 sorts after page 999. Names that do not follow the pattern
	    sort last, alphabetically.
	    :param image_path: pathlib.Path of one extracted image
	    :return: tuple usable as a ``sorted`` key
	    """
	    pieces = image_path.stem.rsplit('-', 2)
	    try:
	        return (0, int(pieces[-2]), int(pieces[-1]), image_path.name)
	    except (IndexError, ValueError):
	        return (1, 0, 0, image_path.name)


	def extract_images_from_pdf(pdf_path, destination_path):
	    """
	    Extract images from pdfs (using poppler-utils)
	    :param pdf_path: str Path to the pdf file
	    :param destination_path: str Destination directory where the extracted images are saved
	                             (created if it does not exist)
	    :return: list Names of the extracted images ordered by pages
	    :raises RuntimeError: if ``pdfimages`` is not installed, exits with a non-zero
	                          status, or writes anything to stderr
	    """
	    install_hint = ('PDF extraction error. Please make sure poppler-utils is installed: see: '
	                    'https://www.howtogeek.com/228531/')
	    destination_dir = Path(destination_path).absolute()
	    extracted_images = []

	    # pdfimages writes into a scratch directory; the files are copied out under
	    # their final names before the directory is removed.
	    with tempfile.TemporaryDirectory() as tmpdirname:
	        tmpdir = Path(tmpdirname)
	        fname = os.path.splitext(os.path.basename(pdf_path))[0]
	        args = ['pdfimages', '-png', '-p', str(pdf_path), str(tmpdir / fname)]

	        try:
	            proc = subprocess.run(args, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
	        except FileNotFoundError as exc:
	            # The pdfimages executable itself is missing.
	            raise RuntimeError(install_hint) from exc

	        # Any stderr output is treated as a failure, even with exit status 0.
	        if proc.returncode != 0 or proc.stderr:
	            details = proc.stderr.decode(errors='replace').strip()
	            if details:
	                raise RuntimeError(install_hint + ' (pdfimages: ' + details + ')')
	            raise RuntimeError(install_hint)

	        destination_dir.mkdir(parents=True, exist_ok=True)
	        # sort images by the order of the pages they are extracted from
	        images = sorted((p for p in tmpdir.glob('*.*') if p.is_file()), key=_page_order_key)
	        for idx, image in enumerate(images, start=1):
	            # output format: file.1.png, file.2.png
	            new_filename = fname + '.' + str(idx) + image.suffix
	            shutil.copyfile(str(image), str(destination_dir / new_filename))
	            extracted_images.append(new_filename)

	    return extracted_images