Skip to content

Instantly share code, notes, and snippets.

View mara004's full-sized avatar
💭
Might stop working on software soon

mara004

💭
Might stop working on software soon
View GitHub Profile
@mara004
mara004 / pnp.py
Last active July 2, 2024 19:07
Page number spec parser [Draft]
# SPDX-FileCopyrightText: 2024 geisserml <geisserml@gmail.com>
# SPDX-License-Identifier: MPL-2.0
# Sophisticated parser for a page number specification mini-language
# Technically, this might be a use case for a parser generator like pyparsing or PLY, but this is a manual implementation based on common string operations.
__all__ = ["parse_pagenums"]
# stdlib
import logging
@mara004
mara004 / argparse_compat.py
Last active June 20, 2024 10:19
Argparse compat extensions
# SPDX-FileCopyrightText: 2024 geisserml <geisserml@gmail.com>
# SPDX-License-Identifier: Apache-2.0 OR BSD-3-Clause OR CC-BY-4.0
import sys
import argparse
if sys.version_info >= (3, 9):
from argparse import BooleanOptionalAction
else:
@mara004
mara004 / tile.py
Last active June 20, 2024 16:45
JPEG to PDF N-up with pypdfium2
# SPDX-FileCopyrightText: 2023 geisserml <geisserml@gmail.com>
# SPDX-License-Identifier: Apache-2.0 OR BSD-3-Clause
import math
import argparse
import itertools
import pypdfium2 as pdfium
from pathlib import Path
parser = argparse.ArgumentParser()
@mara004
mara004 / parse_gh_release.py
Last active September 26, 2023 00:28
Extract information from GitHub release notes
# SPDX-FileCopyrightText: 2023 geisserml <geisserml@gmail.com>
# SPDX-License-Identifier: CC-BY-4.0 OR Apache-2.0 OR BSD-3-Clause
# Unlike repository files, there is no "raw view" for GitHub releases, but we can extract the plain markdown content using the GitHub web API
# See also https://stackoverflow.com/q/76995969/15547292
# The following code snippet shows how to get a release title from pdfium-binaries to extract the full version
import re
import json
@mara004
mara004 / safer_tar_extract.py
Last active June 20, 2024 10:21
Safer tar extraction
# SPDX-FileCopyrightText: 2023 geisserml <geisserml@gmail.com>
# SPDX-License-Identifier: CC-BY-4.0 OR Apache-2.0 OR BSD-3-Clause
# Safer tar extraction (hopefully) preventing CVE-2007-4559 etc.
# Tries to use the most elegant strategy available in the caller's Python version (>= 3.6)
__all__ = ["safer_tar_unpack"]
import sys
if sys.version_info >= (3, 11, 4): # PEP 706
@mara004
mara004 / pypdfjs.py
Last active June 20, 2024 16:45
PDF rendering with pdf.js, from Python
# SPDX-FileCopyrightText: 2023 geisserml <geisserml@gmail.com>
# SPDX-License-Identifier: Apache-2.0
# See also https://github.com/extremeheat/JSPyBridge/blob/master/examples/python/pdfjs.py
# Py-Depends: pillow, javascript >= 1.1.0 (jspybridge)
# Js-Depends: pdfjs-dist, canvas
# Use `python -m pip install` and `python -m javascript --install`
import argparse
@mara004
mara004 / pdfbox_version_parsing.py
Last active July 14, 2023 11:50
Parse pdfbox versions and build a nice, robust representation
# SPDX-FileCopyrightText: 2023 geisserml <geisserml@gmail.com>
# SPDX-License-Identifier: CC-BY-4.0 OR Apache-2.0
import re
from datetime import datetime
from urllib.request import urlopen
from packaging.version import Version
PB_RELEASE_URL = "https://archive.apache.org/dist/pdfbox/"
PB_DISTS_RE = r'<a href="([\d\.]+.+?)/">.+</a>\s+([\d\-]+ [\d:]+)'
@mara004
mara004 / pdfbox.py
Last active June 20, 2024 16:44
PDF rendering with PDFBox, from Python
# SPDX-FileCopyrightText: 2023 geisserml <geisserml@gmail.com>
# SPDX-License-Identifier: Apache-2.0
# Assuming you have an Apache PDFBox 3 jar in the same directory
from pathlib import Path
import jpype
import jpype.imports
import PIL.Image