eliminmax/gh-md-header-id-patterngen.py

## gh-md-header-id-patterngen.py
#!/usr/bin/env python3
# Copyright © 2024 Eli Array Minkoff

# Zero-Clause BSD
# =============
#
# Permission to use, copy, modify, and/or distribute this software for
# any purpose with or without fee is hereby granted.
#
# THE SOFTWARE IS PROVIDED “AS IS” AND THE AUTHOR DISCLAIMS ALL
# WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES
# OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE
# FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY
# DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
# AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
# OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.

from operator import itemgetter
from itertools import groupby
# unicodedata2 has newer info and the same API as unicodedata; use it if able
# (installable from Debian's python3-unicodedata2 package or with pip)
try:
    import unicodedata2 as unicodedata
except ModuleNotFoundError:
    from sys import stderr
    import unicodedata
    print(
        "Failed to import unicodedata2.",
        "Falling back on unicodedata, which may be out of date.",
        file=stderr
    )

# name of the file that the generated AWK gsub() lines are written to
# (a relative path, opened for writing further down in this script)
PATTERNFILE = "pattern"

# inspired by script/generate-regex.js from github-slugger
# https://github.com/Flet/github-slugger/
# implementation is entirely original

# characters that AWK treats as regex operators, even if '\u'-encoded;
# they are kept out of the '\u'-escaped ranges and are instead written
# on a final gsub() line of their own
AWK_REGEX_OPERATORS = '[]{}\\^$|*?().'

# Unicode general categories to exclude, declared per major class and then
# flattened (in this order) into the single list the filter consults.
_NUMBER_CATS = (
    "No",  # Other Number
)
_PUNCTUATION_CATS = (
    "Pe",  # Close Punctuation
    "Pf",  # Final Punctuation
    "Pi",  # Initial Punctuation
    "Ps",  # Open Punctuation
    "Po",  # Other Punctuation
    "Pd",  # Dash Punctuation
)
_SYMBOL_CATS = (
    "S",   # Symbol
    "Sc",  # Currency Symbol (Subcategory of Symbol)
    "Sk",  # Modifier Symbol (Subcategory of Symbol)
    "Sm",  # Math Symbol (Subcategory of Symbol)
    "So",  # Other Symbol (Subcategory of Symbol)
)
_OTHER_CATS = (
    "Cc",  # Control
    "Co",  # Private Use
    "Cf",  # Format
    "Cn",  # Unassigned
)
_SEPARATOR_CATS = (
    "Z",   # Separator
    "Zl",  # Line Separator (Subcategory of Separator)
    "Zp",  # Paragraph Separator (Subcategory of Separator)
    "Zs",  # Space Separator (Subcategory of Separator)
)

badcats = [
    *_NUMBER_CATS,
    *_PUNCTUATION_CATS,
    *_SYMBOL_CATS,
    *_OTHER_CATS,
    *_SEPARATOR_CATS,
]

# Walk every Unicode codepoint other than NUL and record the numeric value of
# each one that has to be stripped. A codepoint is recorded when all of the
# following hold:
#   * its general category is listed in badcats
#   * it is not one of the AWK regex operators (those mess with AWK)
#   * it is neither an alphabetical character nor '-'
chars_to_filter = [
    codepoint
    for codepoint, char in ((n, chr(n)) for n in range(1, 0x110000))
    if unicodedata.category(char) in badcats
    and char not in AWK_REGEX_OPERATORS
    and not char.isalpha()
    and char != '-'
]

# Collapse the ascending codepoints into inclusive (first, last) pairs, one
# per run of consecutive values. Inside such a run, the difference between a
# value's position in the list and the value itself stays constant, so that
# difference is used as the grouping key.
# https://stackoverflow.com/a/2154437
codepoint_of = itemgetter(1)
char_ranges = [
    (run[0], run[-1])
    for run in (
        # materialise each run right away: groupby invalidates the previous
        # group's iterator as soon as it advances to the next one
        [codepoint_of(pair) for pair in indexed_run]
        for _, indexed_run in groupby(
            enumerate(chars_to_filter),
            key=lambda pair: pair[0] - pair[1],
        )
    )
]

# Emit the pattern file: a series of AWK gsub() statements, each deleting one
# bracket expression's worth of characters from head_id. Taken together they
# form a behemoth of a group regex pattern.
# Everything written is plain ASCII; the encoding is pinned so the result
# does not depend on the locale.
with open(PATTERNFILE, "w", encoding="utf-8") as f:
    line_prefix = "    gsub(/["
    line_suffix = "]/, \"\", head_id)"
    # the length of the boilerplatey parts of the line (27), derived from the
    # template itself so the two can never drift apart
    base_line_len = len(line_prefix) + len(line_suffix)
    max_line_len = 80  # emitted lines stay strictly shorter than this

    def write_formatted_line(s):
        """Write s as the bracket expression of one gsub() line."""
        f.write(f"{line_prefix}{s}{line_suffix}\n")

    line = ""
    # write each character range as unicode escapes, split line-by-line
    for range_start, range_end in char_ranges:
        if range_start == range_end:
            # if the range only has one element, then write that element here
            range_str = f"\\u{range_start:x}"
        else:
            # if it has more than one, write it as a regex range
            range_str = f"\\u{range_start:x}-\\u{range_end:x}"
        line_is_full = (
            base_line_len + len(line) + len(range_str) >= max_line_len
        )
        if line and line_is_full:
            # the pending line cannot take this range; flush and start anew
            write_formatted_line(line)
            line = range_str
        else:
            line += range_str
    # flush whatever is still pending; skipped when there were no ranges at
    # all, as an empty bracket expression is not a valid regex
    if line:
        write_formatted_line(line)

    # add the AWK_REGEX_OPERATORS back on their own line, with every one of
    # them backslash-escaped (a plain '\\'.join() would only put backslashes
    # between the characters and leave the first one bare)
    write_formatted_line("".join(f"\\{op}" for op in AWK_REGEX_OPERATORS))