Skip to content

Instantly share code, notes, and snippets.

@mara004
Last active June 20, 2024 16:44
Show Gist options
  • Save mara004/51c3216a9eabd3dcbc78a86d877a61dc to your computer and use it in GitHub Desktop.
Save mara004/51c3216a9eabd3dcbc78a86d877a61dc to your computer and use it in GitHub Desktop.
PDF rendering with PDFBox, from Python
# SPDX-FileCopyrightText: 2023 geisserml <geisserml@gmail.com>
# SPDX-License-Identifier: Apache-2.0
# Assuming you have an Apache PDFBox 3 jar in the same directory
from pathlib import Path
import jpype
import jpype.imports
import PIL.Image
# Directory containing this script; the PDFBox jar is looked up relative to it.
RELATIVE_DIR = Path(__file__).resolve().parent
# The PDFBox jar is expected to sit next to this script.
PDFBOX_JAR = RELATIVE_DIR / "pdfbox.jar"
# Put the jar on the JVM classpath, then start the JVM with AWT in headless mode.
# NOTE(review): this must stay ahead of the `java.*` / `org.apache.*` imports that
# follow - presumably jpype.imports can only resolve Java packages once the JVM
# is running with the jar on its classpath; confirm before reordering.
jpype.addClassPath(PDFBOX_JAR)
jpype.startJVM("-Djava.awt.headless=true")
import java.io as jio
import org.apache.pdfbox as pdfbox
from org.apache.pdfbox.rendering import ImageType
# Resolution handed to the PDFBox renderer, in dots per inch.
DPI = 300

# Input document; a leading "~" is expanded to the current user's home directory.
TEST_FILE = Path("~/projects/scripts/out/38.pdf").expanduser()

# Rendered pages are written to an "out" folder next to this script.
OUTPUT_DIR = RELATIVE_DIR.joinpath("out")
OUTPUT_DIR.mkdir(exist_ok=True)

# Maps each PDFBox ImageType to the PIL raw mode used to decode its raster buffer.
# The trailing comments name the corresponding java.awt BufferedImage type.
ImageTypeToPIL = {
    ImageType.BINARY: "1",    # TYPE_BYTE_BINARY
    ImageType.GRAY: "L",      # TYPE_BYTE_GRAY
    ImageType.RGB: "BGRX",    # TYPE_INT_RGB, actually BGRX in memory
    ImageType.ARGB: "BGRA",   # TYPE_INT_ARGB, actually BGRA in memory
    ImageType.BGR: "BGR",     # TYPE_3BYTE_BGR
}

# Image type requested from the renderer for this run.
imgtype = ImageType.BGR
# Raw (source) mode of the Java buffer for that image type ...
pil_srcmode = ImageTypeToPIL[imgtype]
# ... and the PIL image mode to decode into: same layout name with BGR swapped to RGB.
pil_dstmode = pil_srcmode.replace("BGR", "RGB")
# Open the document. The returned PDDocument is closed explicitly in the
# `finally` clause below so it is released even if rendering or saving a page fails.
pdf = pdfbox.Loader.loadPDF(jio.File(str(TEST_FILE)))
try:
    renderer = pdfbox.rendering.PDFRenderer(pdf)
    n_pages = int(pdf.getNumberOfPages())
    for i in range(n_pages):
        # Page indices passed to the renderer are 0-based; messages and file names are 1-based.
        print(f"Rendering page {i+1} ...")
        j_image = renderer.renderImageWithDPI(i, DPI, imgtype)
        w, h = int(j_image.getWidth()), int(j_image.getHeight())
        # Grab the primitive array backing the rendered image's raster.
        # TODO: a non-zero data buffer offset is not handled here.
        j_data = j_image.getRaster().getDataBuffer().getData()
        py_data = memoryview(j_data)  # requires PIL >= 9.5
        # Decode the Java buffer (laid out as pil_srcmode) into a PIL image of mode pil_dstmode.
        py_image = PIL.Image.frombuffer(pil_dstmode, (w, h), py_data, "raw", pil_srcmode, 0, 1)
        py_image.save(OUTPUT_DIR / f"render_{i+1}.jpg")
finally:
    pdf.close()
jpype.shutdownJVM()
@mara004
Copy link
Author

mara004 commented Jun 22, 2023

TODO: In theory the data buffer's offset should be handled, although in my testing it has always been 0 in practice.

I also want to look into other input types (bytes, byte buffer) and into parallelization, both of which would be relevant for a generic API.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment