This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# SPDX-FileCopyrightText: 2024 mara004 <geisserml@gmail.com> | |
# SPDX-License-Identifier: Apache-2.0 OR BSD-3-Clause OR CC-BY-4.0 | |
import sys | |
import argparse | |
if sys.version_info >= (3, 9): | |
from argparse import BooleanOptionalAction | |
else: |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# SPDX-FileCopyrightText: 2023 mara004 | |
# SPDX-License-Identifier: CC-BY-4.0 OR Apache-2.0 OR BSD-3-Clause | |
import math | |
import argparse | |
import itertools | |
import pypdfium2 as pdfium | |
from pathlib import Path | |
parser = argparse.ArgumentParser() |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# SPDX-FileCopyrightText: 2023 geisserml <geisserml@gmail.com> | |
# SPDX-License-Identifier: CC-BY-4.0 OR Apache-2.0 OR BSD-3-Clause | |
# Unlike repository files, there is no "raw view" for GH releases, but we can extract the plain markdown content using GH web API | |
# See also https://stackoverflow.com/q/76995969/15547292 | |
# The following code snippet shows how to get a release title from pdfium-binaries to extract the full version | |
import re | |
import json |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# SPDX-FileCopyrightText: 2023 mara004 | |
# SPDX-License-Identifier: CC-BY-4.0 OR Apache-2.0 OR BSD-3-Clause | |
# Safer tar extraction (hopefully) preventing CVE-2007-4559 etc. | |
# Tries to use the most elegant strategy available in the caller's python version (>= 3.6) | |
__all__ = ["safer_tar_unpack"] | |
import sys | |
if sys.version_info >= (3, 11, 4): # PEP 706 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# SPDX-FileCopyrightText: 2023 mara004 | |
# SPDX-License-Identifier: CC-BY-4.0 OR Apache-2.0 | |
# See also https://github.com/extremeheat/JSPyBridge/blob/master/examples/python/pdfjs.py | |
# Py-Depends: pillow, javascript >= 1.1.0 (jspybridge) | |
# Js-Depends: pdfjs-dist, canvas | |
# Use `python -m pip install` and `python -m javascript --install` | |
import argparse |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# SPDX-FileCopyrightText: 2023 geisserml <geisserml@gmail.com> | |
# SPDX-License-Identifier: CC-BY-4.0 OR Apache-2.0 | |
import re | |
from datetime import datetime | |
from urllib.request import urlopen | |
from packaging.version import Version | |
PB_RELEASE_URL = "https://archive.apache.org/dist/pdfbox/" | |
PB_DISTS_RE = r'<a href="([\d\.]+.+?)/">.+</a>\s+([\d\-]+ [\d:]+)' |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# SPDX-FileCopyrightText: 2023 mara004 | |
# SPDX-License-Identifier: CC-BY-4.0 OR Apache-2.0 | |
# Assuming you have an Apache PDFBox 3 jar in the same directory | |
from pathlib import Path | |
import jpype | |
import jpype.imports | |
import PIL.Image |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# SPDX-FileCopyrightText: 2023 geisserml <geisserml@gmail.com> | |
# SPDX-License-Identifier: CC-BY-4.0 | |
# Parser for a page text mini-language | |
# Technically, this might be a use case for some parser generator, but for now | |
# it's implemented "manually" with common string operations | |
__all__ = ["parse_pagetext"] | |
try: |