Skip to content

Instantly share code, notes, and snippets.

View mara004's full-sized avatar
💭
Might stop working on software soon

mara004

💭
Might stop working on software soon
View GitHub Profile
@mara004
mara004 / argparse_compat.py
Last active March 31, 2024 14:03
Argparse compat extensions
# SPDX-FileCopyrightText: 2024 mara004 <geisserml@gmail.com>
# SPDX-License-Identifier: Apache-2.0 OR BSD-3-Clause OR CC-BY-4.0
import sys
import argparse
if sys.version_info >= (3, 9):
from argparse import BooleanOptionalAction
else:
@mara004
mara004 / tile.py
Last active January 10, 2024 18:30
JPEG to PDF N-up with pypdfium2
# SPDX-FileCopyrightText: 2023 mara004
# SPDX-License-Identifier: CC-BY-4.0 OR Apache-2.0 OR BSD-3-Clause
import math
import argparse
import itertools
import pypdfium2 as pdfium
from pathlib import Path
parser = argparse.ArgumentParser()
@mara004
mara004 / parse_gh_release.py
Last active September 26, 2023 00:28
Extract information from GitHub release notes
# SPDX-FileCopyrightText: 2023 geisserml <geisserml@gmail.com>
# SPDX-License-Identifier: CC-BY-4.0 OR Apache-2.0 OR BSD-3-Clause
# Unlike repository files, there is no "raw view" for GitHub releases, but we can extract the plain markdown content using the GitHub web API
# See also https://stackoverflow.com/q/76995969/15547292
# The following code snippet shows how to get a release title from pdfium-binaries to extract the full version
import re
import json
@mara004
mara004 / safer_tar_extract.py
Last active February 23, 2024 02:13
Safer tar extraction
# SPDX-FileCopyrightText: 2023 mara004
# SPDX-License-Identifier: CC-BY-4.0 OR Apache-2.0 OR BSD-3-Clause
# Safer tar extraction (hopefully) preventing CVE-2007-4559 etc.
# Tries to use the most elegant strategy available in the caller's Python version (>= 3.6)
__all__ = ["safer_tar_unpack"]
import sys
if sys.version_info >= (3, 11, 4): # PEP 706
@mara004
mara004 / pypdfjs.py
Last active May 5, 2024 14:39
PDF rendering with pdf.js, from Python
# SPDX-FileCopyrightText: 2023 mara004
# SPDX-License-Identifier: CC-BY-4.0 OR Apache-2.0
# See also https://github.com/extremeheat/JSPyBridge/blob/master/examples/python/pdfjs.py
# Py-Depends: pillow, javascript >= 1.1.0 (jspybridge)
# Js-Depends: pdfjs-dist, canvas
# Use `python -m pip install` and `python -m javascript --install`
import argparse
@mara004
mara004 / pdfbox_version_parsing.py
Last active July 14, 2023 11:50
Parse pdfbox versions and build a nice, robust representation
# SPDX-FileCopyrightText: 2023 geisserml <geisserml@gmail.com>
# SPDX-License-Identifier: CC-BY-4.0 OR Apache-2.0
import re
from datetime import datetime
from urllib.request import urlopen
from packaging.version import Version
PB_RELEASE_URL = "https://archive.apache.org/dist/pdfbox/"
PB_DISTS_RE = r'<a href="([\d\.]+.+?)/">.+</a>\s+([\d\-]+ [\d:]+)'
@mara004
mara004 / pdfbox.py
Last active March 22, 2024 14:57
PDF rendering with PDFBox, from Python
# SPDX-FileCopyrightText: 2023 mara004
# SPDX-License-Identifier: CC-BY-4.0 OR Apache-2.0
# Assumes that an Apache PDFBox 3 jar is present in the same directory
from pathlib import Path
import jpype
import jpype.imports
import PIL.Image
@mara004
mara004 / ptp.py
Created June 16, 2023 19:52
Parser for a page text mini-language (WIP)
# SPDX-FileCopyrightText: 2023 geisserml <geisserml@gmail.com>
# SPDX-License-Identifier: CC-BY-4.0
# Parser for a page text mini-language
# Technically, this might be a use case for some parser generator, but for now
# it's implemented "manually" with common string operations
__all__ = ["parse_pagetext"]
try: