Last active
March 20, 2024 17:49
-
-
Save eliminmax/c61a3522431f5c675ba4bdd7fcdfc9a2 to your computer and use it in GitHub Desktop.
A tool to generate a series of AWK `gsub` statements that strip from the `head_id` variable everything that GitHub Markdown removes from its header ids, created for the cmark-gfm-heading-generator script from my mini-utils project.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
# Copyright © 2024 Eli Array Minkoff | |
# Zero-Clause BSD | |
# ============= | |
# | |
# Permission to use, copy, modify, and/or distribute this software for | |
# any purpose with or without fee is hereby granted. | |
# | |
# THE SOFTWARE IS PROVIDED “AS IS” AND THE AUTHOR DISCLAIMS ALL | |
# WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES | |
# OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE | |
# FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY | |
# DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN | |
# AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT | |
# OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. | |
from operator import itemgetter | |
from itertools import groupby | |
# unicodedata2 has newer info and the same API as unicodedata; use it if able
# (installable from Debian's python3-unicodedata2 package or with pip) | |
try: | |
import unicodedata2 as unicodedata | |
except ModuleNotFoundError: | |
from sys import stderr | |
import unicodedata | |
print( | |
"Failed to import unicodedata2.", | |
"Falling back on unicodedata, which may be out of date.", | |
file=stderr | |
) | |
# Name of the file that the generated AWK statements are written to.
PATTERNFILE = "pattern"

# inspired by script/generate-regex.js from github-slugger
# https://github.com/Flet/github-slugger/
# implementation is entirely original

# characters that AWK treats as regex operators, even if '\u'-encoded
AWK_REGEX_OPERATORS = '[]{}\\^$|*?().'

# Unicode general categories to exclude.
#
# unicodedata.category() only ever returns two-letter codes, so the bare
# major classes "S" (Symbol) and "Z" (Separator) could never match and are
# kept below as section headings only. A frozenset is used because this table
# is probed once per Unicode code point.
badcats = frozenset({
    "No",  # Other Number
    "Pe",  # Close Punctuation
    "Pf",  # Final Punctuation
    "Pi",  # Initial Punctuation
    "Ps",  # Open Punctuation
    "Po",  # Other Punctuation
    "Pd",  # Dash Punctuation
    # S: Symbol
    "Sc",  # Currency Symbol
    "Sk",  # Modifier Symbol
    "Sm",  # Math Symbol
    "So",  # Other Symbol
    "Cc",  # Control
    "Co",  # Private Use
    "Cf",  # Format
    "Cn",  # Unassigned
    # Z: Separator
    "Zl",  # Line Separator
    "Zp",  # Paragraph Separator
    "Zs",  # Space Separator
})
def _is_filtered(char):
    """Return True if *char* belongs in the generated removal pattern.

    A character qualifies when its general category is one of ``badcats``,
    it is not one of ``AWK_REGEX_OPERATORS``, and it is neither alphabetic
    nor '-'.
    """
    if unicodedata.category(char) not in badcats:
        return False
    # skip characters that mess with AWK
    if char in AWK_REGEX_OPERATORS:
        return False
    return not char.isalpha() and char != '-'


# Code points of every character to strip, in ascending order. NUL is
# deliberately left out of the scan, so the range starts at 1.
chars_to_filter = [
    code_point
    for code_point in range(1, 0x110000)
    if _is_filtered(chr(code_point))
]
# Collapse the code points into inclusive (first, last) runs of consecutive
# values. Inside such a run, "position in the list minus value" stays
# constant, so grouping on that difference splits the list at every gap.
# https://stackoverflow.com/a/2154437
char_ranges = []
runs_by_offset = groupby(
    enumerate(chars_to_filter),
    key=lambda pair: pair[0] - pair[1],
)
for _, run in runs_by_offset:
    members = [code_point for _, code_point in run]
    char_ranges.append((members[0], members[-1]))
def _escape_each(chars):
    """Return *chars* with a backslash placed in front of every character.

    ``'\\'.join(chars)`` only puts backslashes *between* characters, which
    leaves the first one unescaped; this escapes all of them uniformly.
    """
    return "".join("\\" + ch for ch in chars)


# The output consists solely of ASCII (hex escapes, the fixed gsub text and
# the ASCII operators), so pinning the encoding keeps the bytes identical
# regardless of the platform's default.
with open(PATTERNFILE, "w", encoding="utf-8") as f:
    # a behemoth of a group regex pattern
    def write_formatted_line(s):
        """Write one AWK gsub statement whose bracket expression is *s*."""
        f.write(f" gsub(/[{s}]/, \"\", head_id)\n")

    # the length of the boilerplatey parts of the line
    # NOTE(review): the fixed text in the statement above is 24 characters
    # as written (27 would match a four-space indent before `gsub`) --
    # confirm which indent is intended.
    base_line_len = 27
    line = ""
    # write each character range as unicode escapes, split line-by-line
    for range_start, range_end in char_ranges:
        if range_start == range_end:
            # a one-element range is written as a single escape
            range_str = f"\\u{range_start:x}"
        else:
            # anything longer is written as a regex range
            range_str = f"\\u{range_start:x}-\\u{range_end:x}"
        if base_line_len + len(line) + len(range_str) >= 80:
            # the statement would reach 80 columns: flush and start afresh
            write_formatted_line(line)
            line = range_str
        else:
            line += range_str
    # flush whatever is left over from the loop
    write_formatted_line(line)
    # add the AWK_REGEX_OPERATORS back on their own line, each one
    # backslash-escaped (including the first).
    write_formatted_line(_escape_each(AWK_REGEX_OPERATORS))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment