Skip to content

Instantly share code, notes, and snippets.

@lebedov lebedov/
Last active Aug 4, 2019

What would you like to do?
How to call pdfbox's API with JPype.
#!/usr/bin/env python3
"""How to call pdfbox's API with JPype."""
import jpype
import jpype.imports
import numpy as np
# Replace with path to pdfbox jar file:
# NOTE(review): placeholder path -- the pdfbox classes imported below can only
# be resolved once this points at a real pdfbox jar (or the jar is otherwise
# on the JVM classpath).
jpype.addClassPath('/path/to/pdfbox-app.jar')

# Start the JVM once per process; headless mode lets java.awt image classes be
# used without a display. convertStrings=False keeps Java strings as Java
# objects instead of converting them to Python str automatically.
if not jpype.isJVMStarted():
    jpype.startJVM(jpype.getDefaultJVMPath(), '-Djava.awt.headless=true', convertStrings=False)

# These imports go through jpype.imports, so they must stay below the JVM
# startup above.
from java.awt.image import BufferedImage
from java.io import File
from org.apache.pdfbox.pdmodel import PDDocument
from org.apache.pdfbox.rendering import ImageType, PDFRenderer
def _packed_argb_to_rgb(packed, height, width):
    """Unpack 0xAARRGGBB int32 pixels into a (height, width, 3) uint8 RGB array."""
    # Forcing little-endian storage pins the in-memory byte order of each
    # pixel to B, G, R, A regardless of the host's native byte order.
    bgra = packed.astype('<i4', copy=False).view(np.uint8).reshape(height, width, 4)
    # Drop alpha and reverse the remaining bytes: B, G, R -> R, G, B.
    return bgra[..., 2::-1]


def extract_images(in_file, pages=None, dpi=72):
    """
    Extract pages of PDF file as images.

    Parameters
    ----------
    in_file : str
        Path to input PDF file.
    pages : iterable
        Numbers of pages to extract (0-indexed). If None, return all pages.
    dpi : int
        Resolution at which to render output images.

    Returns
    -------
    output : list of numpy.ndarray
        PDF pages rendered into RGB numpy arrays, one array of shape
        (height, width, 3) and dtype uint8 per requested page, in the
        order the pages were requested.
    """
    doc = PDDocument.load(File(in_file))
    try:
        pdf_renderer = PDFRenderer(doc)
        if pages is None:
            pages = range(doc.getNumberOfPages())

        output = []
        for i in pages:
            im = pdf_renderer.renderImageWithDPI(i, dpi)
            h = im.getHeight()
            w = im.getWidth()

            # Retrieve data as numpy array of ARGB values packed into int32.
            # np.asarray guarantees a real ndarray whether or not the JPype
            # array slice already is one.
            packed = np.asarray(im.getRGB(0, 0, w, h, None, 0, w)[:],
                                dtype=np.int32)

            # Separate RGB channels, return as array of bytes:
            output.append(_packed_argb_to_rgb(packed, h, w))
        return output
    finally:
        # Always release the document, even if rendering a page fails.
        doc.close()
if __name__ == '__main__':
    # Demo: download a sample PDF into a temporary file and render its pages.
    import os
    import tempfile
    # `import urllib` alone does not make the `urllib.request` submodule
    # available; it has to be imported explicitly.
    import urllib.request

    # URL of a sample multipage PDF. It is blank here; set it before running
    # the demo.
    sample_url = ''
    if not sample_url:
        raise SystemExit('Set sample_url to the URL of a multipage PDF.')

    # Download sample multipage PDF:
    with urllib.request.urlopen(sample_url) as response:
        data = response.read()

    fd, name = tempfile.mkstemp()
    try:
        # Wrap the descriptor returned by mkstemp so it is closed exactly once,
        # and make sure the bytes are on disk before pdfbox opens the file.
        with os.fdopen(fd, 'wb') as f:
            f.write(data)
        result = extract_images(name)
    finally:
        os.remove(name)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.