Skip to content

Instantly share code, notes, and snippets.

View mara004's full-sized avatar
💭
Might stop working on software soon

mara004

💭
Might stop working on software soon
View GitHub Profile
@mara004
mara004 / pnp.py
Last active June 21, 2024 13:09
Page number spec parser [Draft]
# SPDX-FileCopyrightText: 2024 geisserml <geisserml@gmail.com>
# SPDX-License-Identifier: MPL-2.0
# Sophisticated parser for a page number specification mini-language
# Technically, this might be a use case for a parser generator like pyparsing or PLY, but this is a manual implementation based on common string operations.
__all__ = ["parse_pagenums"]
import enum
import logging
@mara004
mara004 / tile.py
Last active June 20, 2024 16:45
JPEG to PDF N-up with pypdfium2
# SPDX-FileCopyrightText: 2023 geisserml <geisserml@gmail.com>
# SPDX-License-Identifier: Apache-2.0 OR BSD-3-Clause
import math
import argparse
import itertools
import pypdfium2 as pdfium
from pathlib import Path
parser = argparse.ArgumentParser()
@mara004
mara004 / parse_gh_release.py
Last active September 26, 2023 00:28
Extract information from GitHub release notes
# SPDX-FileCopyrightText: 2023 geisserml <geisserml@gmail.com>
# SPDX-License-Identifier: CC-BY-4.0 OR Apache-2.0 OR BSD-3-Clause
# Unlike repository files, there is no "raw view" for GitHub releases, but we can extract the plain Markdown content using the GitHub web API
# See also https://stackoverflow.com/q/76995969/15547292
# The following code snippet shows how to get a release title from pdfium-binaries to extract the full version
import re
import json
@mara004
mara004 / safer_tar_extract.py
Last active June 20, 2024 10:21
Safer tar extraction
# SPDX-FileCopyrightText: 2023 geisserml <geisserml@gmail.com>
# SPDX-License-Identifier: CC-BY-4.0 OR Apache-2.0 OR BSD-3-Clause
# Safer tar extraction (hopefully) preventing CVE-2007-4559 etc.
# Tries to use the most elegant strategy available in the caller's Python version (>= 3.6)
__all__ = ["safer_tar_unpack"]
import sys
if sys.version_info >= (3, 11, 4): # PEP 706
@mara004
mara004 / pypdfjs.py
Last active June 20, 2024 16:45
PDF rendering with pdf.js, from Python
# SPDX-FileCopyrightText: 2023 geisserml <geisserml@gmail.com>
# SPDX-License-Identifier: Apache-2.0
# See also https://github.com/extremeheat/JSPyBridge/blob/master/examples/python/pdfjs.py
# Py-Depends: pillow, javascript >= 1.1.0 (jspybridge)
# Js-Depends: pdfjs-dist, canvas
# Use `python -m pip install` and `python -m javascript --install`
import argparse
@mara004
mara004 / pdfbox_version_parsing.py
Last active July 14, 2023 11:50
Parse pdfbox versions and build a nice, robust representation
# SPDX-FileCopyrightText: 2023 geisserml <geisserml@gmail.com>
# SPDX-License-Identifier: CC-BY-4.0 OR Apache-2.0
import re
from datetime import datetime
from urllib.request import urlopen
from packaging.version import Version
PB_RELEASE_URL = "https://archive.apache.org/dist/pdfbox/"
PB_DISTS_RE = r'<a href="([\d\.]+.+?)/">.+</a>\s+([\d\-]+ [\d:]+)'
@mara004
mara004 / pdfbox.py
Last active June 20, 2024 16:44
PDF rendering with PDFBox, from Python
# SPDX-FileCopyrightText: 2023 geisserml <geisserml@gmail.com>
# SPDX-License-Identifier: Apache-2.0
# Assuming you have an Apache PDFBox 3 jar in the same directory
from pathlib import Path
import jpype
import jpype.imports
import PIL.Image
@lebedov
lebedov / jpype_pdf_text_stripper.py
Created April 28, 2021 12:29
How to use pdfbox's PDFTextStripper class in Python.
#!/usr/bin/env python3
"""
How to use pdfbox's PDFTextStripper class in Python.
"""
import pathlib
import pkg_resources
import re
import urllib.request
@lebedov
lebedov / jpype_api_demo.py
Last active July 10, 2023 14:04
How to call pdfbox's API with JPype.
#!/usr/bin/env python3
"""
How to call pdfbox's API with JPype.
"""
import pathlib
import pkg_resources
import re
import urllib.request
@bittner
bittner / keyboard-keys.md
Created February 28, 2019 22:50
Keyboard keys markup in Markdown

Ctrl + Alt + Space