Skip to content

Instantly share code, notes, and snippets.

@eliminmax
Last active March 20, 2024 17:49
Show Gist options
  • Save eliminmax/c61a3522431f5c675ba4bdd7fcdfc9a2 to your computer and use it in GitHub Desktop.
Save eliminmax/c61a3522431f5c675ba4bdd7fcdfc9a2 to your computer and use it in GitHub Desktop.
A tool to generate a series of AWK `gsub` statements that remove anything that is removed in GitHub Markdown's header ids from the `head_id` variable, created for the cmark-gfm-heading-generator script from my mini-utils project.
#!/usr/bin/env python3
# Copyright © 2024 Eli Array Minkoff
# Zero-Clause BSD
# =============
#
# Permission to use, copy, modify, and/or distribute this software for
# any purpose with or without fee is hereby granted.
#
# THE SOFTWARE IS PROVIDED “AS IS” AND THE AUTHOR DISCLAIMS ALL
# WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES
# OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE
# FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY
# DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
# AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
# OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
from operator import itemgetter
from itertools import groupby
# Prefer unicodedata2 when it is available: it has the same API as the
# standard-library unicodedata module, but newer data.
# (installable from Debian's python3-unicodedata2 package or with pip)
try:
    import unicodedata2 as unicodedata
except ModuleNotFoundError:
    # Fall back on the standard-library module and say so on stderr.
    import unicodedata
    from sys import stderr

    print(
        "Failed to import unicodedata2.",
        "Falling back on unicodedata, which may be out of date.",
        file=stderr
    )
# name of the file that the generated AWK statements are written to
PATTERNFILE = "pattern"

# inspired by script/generate-regex.js from github-slugger
# https://github.com/Flet/github-slugger/
# implementation is entirely original

# characters that AWK treats as regex operators, even if '\u'-encoded
AWK_REGEX_OPERATORS = '[]{}\\^$|*?().'

# Unicode general categories to exclude.
#
# Stored as a frozenset because membership is tested once per code point,
# and a hashed lookup avoids scanning a list on each test.
#
# NOTE: unicodedata.category() returns two-letter codes, so the one-letter
# major-class entries ("S", "Z") never match a character by themselves; they
# are kept as headings for the subcategories listed beneath them.
badcats = frozenset((
    "No",  # Other Number
    "Pe",  # Close Punctuation
    "Pf",  # Final Punctuation
    "Pi",  # Initial Punctuation
    "Ps",  # Open Punctuation
    "Po",  # Other Punctuation
    "Pd",  # Dash Punctuation
    "S",   # Symbol
    "Sc",  # Currency Symbol (Subcategory of Symbol)
    "Sk",  # Modifier Symbol (Subcategory of Symbol)
    "Sm",  # Math Symbol (Subcategory of Symbol)
    "So",  # Other Symbol (Subcategory of Symbol)
    "Cc",  # Control
    "Co",  # Private Use
    "Cf",  # Format
    "Cn",  # Unassigned
    "Z",   # Separator
    "Zl",  # Line Separator (Subcategory of Separator)
    "Zp",  # Paragraph Separator (Subcategory of Separator)
    "Zs",  # Space Separator (Subcategory of Separator)
))
# Walk every Unicode code point except NUL and keep the numeric value of
# each one that has to be filtered out.  A code point is kept when:
#   * its general category is one of the excluded categories,
#   * it is not one of the characters that mess with AWK, and
#   * it is neither '-' nor an alphabetical character.
chars_to_filter = [
    code_point
    for code_point, char in ((n, chr(n)) for n in range(1, 0x110000))
    if unicodedata.category(char) in badcats
    and char not in AWK_REGEX_OPERATORS
    and char != '-'
    and not char.isalpha()
]
# Collapse the code points into inclusive (first, last) pairs, one pair per
# run of consecutive values.  Inside such a run the difference between a
# value's position in the list and the value itself is constant, so grouping
# on that difference separates the runs.
# https://stackoverflow.com/a/2154437
char_ranges = []
indexed_points = enumerate(chars_to_filter)
for _, run in groupby(indexed_points, lambda pair: pair[0] - pair[1]):
    members = [code_point for _, code_point in run]
    char_ranges.append((members[0], members[-1]))
# Write the AWK statements: a series of gsub() calls, each deleting one
# bracket expression's worth of characters from head_id.  The output only
# contains ASCII, so the explicit encoding just removes any dependence on the
# platform default.
with open(PATTERNFILE, "w", encoding="utf-8") as f:
    def write_formatted_line(s):
        """Write one gsub() call whose bracket expression contains *s*."""
        f.write(f" gsub(/[{s}]/, \"\", head_id)\n")

    # NOTE(review): the template above has 24 fixed characters as written
    # here; 27 would correspond to a four-space indent - confirm which one
    # is intended.  The value is left unchanged so line wrapping is the same.
    base_line_len = 27  # the length of the boilerplatey parts of the line
    line = ""
    # write each character range as unicode escapes, split line-by-line
    for range_start, range_end in char_ranges:
        # a lone code point is written by itself, a longer run as "first-last"
        range_str = f"\\u{range_start:x}"
        if range_end != range_start:
            range_str += f"-\\u{range_end:x}"
        if base_line_len + len(line) + len(range_str) >= 80:
            # the pending line is full: flush it and start a new one
            write_formatted_line(line)
            line = range_str
        else:
            line += range_str
    # flush whatever is still pending; skipped when nothing was collected,
    # because an empty bracket expression is not a valid pattern
    if line:
        write_formatted_line(line)
    # add the AWK_REGEX_OPERATORS back on their own line, backslash-escaped.
    # Every operator gets its own backslash, including the first one, so the
    # result does not depend on which character happens to come first.
    write_formatted_line("".join(f"\\{op}" for op in AWK_REGEX_OPERATORS))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment