This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# SPDX-FileCopyrightText: 2023 geisserml <geisserml@gmail.com> | |
# SPDX-License-Identifier: Apache-2.0 | |
# Assumes you have an Apache PDFBox 3 jar in the same directory.
from pathlib import Path | |
import jpype | |
import jpype.imports | |
import PIL.Image |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# SPDX-FileCopyrightText: 2023 geisserml <geisserml@gmail.com> | |
# SPDX-License-Identifier: Apache-2.0 | |
import re | |
from datetime import datetime | |
from urllib.request import urlopen | |
from packaging.version import Version | |
PB_RELEASE_URL = "https://archive.apache.org/dist/pdfbox/" | |
PB_DISTS_RE = r'<a href="([\d\.]+.+?)/">.+</a>\s+([\d\-]+ [\d:]+)' |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# SPDX-FileCopyrightText: 2023 geisserml <geisserml@gmail.com> | |
# SPDX-License-Identifier: Apache-2.0 | |
# See also https://github.com/extremeheat/JSPyBridge/blob/master/examples/python/pdfjs.py | |
# Py-Depends: pillow, javascript >= 1.1.0 (jspybridge) | |
# Js-Depends: pdfjs-dist, canvas | |
# Use `python -m pip install` and `python -m javascript --install` | |
import argparse |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# SPDX-FileCopyrightText: 2023 geisserml <geisserml@gmail.com> | |
# SPDX-License-Identifier: Apache-2.0 OR BSD-3-Clause OR MPL-2.0 | |
# Safer tar extraction that (hopefully) prevents CVE-2007-4559 and similar issues.
# Tries to use the most elegant strategy available in the caller's Python version (>= 3.6).
__all__ = ["safer_tar_unpack"] | |
import sys | |
if sys.version_info >= (3, 11, 4): # PEP 706 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# SPDX-FileCopyrightText: 2023 geisserml <geisserml@gmail.com> | |
# SPDX-License-Identifier: CC-BY-4.0 OR Apache-2.0 OR BSD-3-Clause | |
# Unlike repository files, GH releases have no "raw view", but we can extract the plain markdown content using the GH web API.
# See also https://stackoverflow.com/q/76995969/15547292
# The following code snippet shows how to get a release title from pdfium-binaries in order to extract the full version.
import re | |
import json |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# SPDX-FileCopyrightText: 2024 geisserml <geisserml@gmail.com> | |
# SPDX-License-Identifier: Apache-2.0 OR BSD-3-Clause | |
import math | |
import argparse | |
import itertools | |
import pypdfium2 as pdfium | |
from pathlib import Path | |
parser = argparse.ArgumentParser() |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# SPDX-FileCopyrightText: 2024 geisserml <geisserml@gmail.com> | |
# SPDX-License-Identifier: Apache-2.0 OR BSD-3-Clause | |
import sys | |
import argparse | |
if sys.version_info >= (3, 9): | |
from argparse import BooleanOptionalAction | |
else: |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# SPDX-FileCopyrightText: 2024 geisserml <geisserml@gmail.com> | |
# SPDX-License-Identifier: MPL-2.0 | |
# Sophisticated parser for a page number specification mini-language.
# Technically, this might be a use case for a parser generator like pyparsing or PLY, but this is a manual implementation based on common string operations.
__all__ = ["parse_pagenums"] | |
import logging | |
from collections import namedtuple |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# SPDX-FileCopyrightText: 2024 geisserml <geisserml@gmail.com> | |
# SPDX-License-Identifier: MPL-2.0 | |
# Note that Poppler is GPL-licensed, so this code as a whole is affected by copyleft.
import math | |
from pathlib import Path | |
import PIL.Image | |
import cairo | |
import gi |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# SPDX-FileCopyrightText: 2024 geisserml <geisserml@gmail.com> | |
# SPDX-License-Identifier: MPL-2.0 | |
# Note that Poppler is GPL-licensed, so this code as a whole is affected by copyleft.
import PIL.Image | |
import poppler # python-poppler | |
from poppler.cpp.page_renderer import render_hint | |
def _translate_rotation(rotation): |
OlderNewer