Skip to content

Instantly share code, notes, and snippets.

@marzer
Last active August 20, 2020 12:54
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save marzer/e7649c763cf4522686f85812f06022aa to your computer and use it in GitHub Desktop.
Save marzer/e7649c763cf4522686f85812f06022aa to your computer and use it in GitHub Desktop.
Python script for enumerating and grouping Unicode character categories in ABNF notation.
#!/usr/bin/env python3
# dependencies:
# pip install --upgrade requests
import os.path
import sys
import re
import requests
import traceback
def print_character_range(s, e, count):
    """Print one ABNF hex value or value range to stdout, without a newline.

    s     -- first codepoint of the range.
    e     -- last codepoint of the range; when it equals ``s`` (or is 0)
             only the single value ``%xS`` is printed instead of ``%xS-E``.
    count -- number of ranges already printed for the current rule; it
             decides which separators are emitted before this range.
    """
    prefix = ""
    if count > 0:
        # every range except the first is an ABNF alternative
        prefix += " / "
    if count % 4 == 0:
        # start a fresh tab-indented line every four ranges (including the first)
        prefix += "\n\t"
    print(prefix, end='')

    single_value = (s == e) or (e == 0)
    if single_value:
        text = "%x{:X}".format(s)
    else:
        text = "%x{:X}-{:X}".format(s, e)
    print(text, end='')
def print_abnf_for_categories(name, categories, codepoints):
    """Print an ABNF rule listing every codepoint whose category is selected.

    name       -- rule name printed on the left of the ``=``.
    categories -- collection of category strings to include; also joined
                  with ', ' for the leading comment line.
    codepoints -- iterable of (codepoint, category) pairs. Runs are only
                  merged when consecutive matching entries differ by exactly
                  one, so the caller is expected to pass them in ascending
                  order.

    After the ranges, a comment line with the total number of matched
    codepoints is printed.
    """
    print("\n; unicode codepoints from categories {}".format(', '.join(categories)))
    print("{} = ".format(name), end='')

    # -1 in run_first means "no run has been opened yet"
    run_first = -1
    run_last = -1
    ranges_printed = 0
    total = 0

    for value, cat in codepoints:
        if cat not in categories:
            continue

        # contiguous with the open run: just extend it
        if run_first != -1 and run_last == value - 1:
            run_last = value
            continue

        # a gap: flush the open run (if any) before starting a new one
        if run_first != -1:
            print_character_range(run_first, run_last, ranges_printed)
            total += run_last - run_first + 1
            ranges_printed += 1

        run_first = value
        run_last = value

    # flush whatever run is still open once the input is exhausted
    if run_first != -1:
        print_character_range(run_first, run_last, ranges_printed)
        total += run_last - run_first + 1

    print("\n\t; {} codepoints in total\n".format(total))
def _load_unicode_database(path):
    """Return the text of UnicodeData.txt, downloading it to *path* if absent.

    Raises requests.HTTPError when the download returns an error status, so
    that an error page is never cached on disk as if it were the database.
    """
    if os.path.exists(path):
        print("Reading unicode database file into memory")
        with open(path, 'r', encoding='utf-8') as database_file:
            return database_file.read()

    print("Couldn't find unicode database file, will download")
    response = requests.get(
        'https://www.unicode.org/Public/UCD/latest/ucd/UnicodeData.txt',
        timeout=1
    )
    # fail loudly on 4xx/5xx instead of writing the error body to the cache
    response.raise_for_status()
    database_text = response.text
    with open(path, 'w', encoding='utf-8') as database_file:
        database_file.write(database_text)
    return database_text


def _parse_codepoints(database_text):
    """Parse UnicodeData.txt text into a list of (codepoint, category) tuples.

    An entry whose name field ends in ', First>' opens a range; the next
    entry closes it, and every codepoint in between (inclusive) is emitted
    with the closing entry's category. Lines that do not match the entry
    pattern are skipped.

    Raises Exception if a range is opened but not closed by the next line.
    """
    entry_pattern = re.compile(r'^([0-9a-fA-F]+);(.+?);([a-zA-Z]+);')
    range_start = -1  # -1 means no range is currently open
    codepoints = []
    for line in database_text.split('\n'):
        match = entry_pattern.search(line)
        if match is None:
            if range_start > -1:
                raise Exception('Previous codepoint indicated the start of a range but the next one was null')
            continue

        codepoint = int(match.group(1), 16)
        category = match.group(3)
        if range_start > -1:
            # this entry closes the open range; expand it to single codepoints
            codepoints.extend(
                (cp, category) for cp in range(range_start, codepoint + 1)
            )
            range_start = -1
        elif match.group(2).endswith(', First>'):
            range_start = codepoint
        else:
            codepoints.append((codepoint, category))

    # a range opened on the very last line (no trailing newline) must not be
    # dropped silently
    if range_start > -1:
        raise Exception('Previous codepoint indicated the start of a range but the next one was null')
    return codepoints


def main():
    """Load the Unicode character database and print ABNF rules for it.

    The database is read from 'UnicodeData.txt' next to the script
    (resolved from sys.argv[0]); if that file does not exist it is
    downloaded from unicode.org and saved there first. Progress messages
    and the generated ABNF are both written to stdout.
    """
    # get unicode character database
    script_dir = os.path.dirname(os.path.realpath(sys.argv[0]))
    codepoint_file_path = os.path.join(script_dir, 'UnicodeData.txt')
    codepoint_list = _load_unicode_database(codepoint_file_path)

    # parse the database file into codepoints
    codepoints = _parse_codepoints(codepoint_list)
    print("Parsed {} codepoints from unicode database file.".format(len(codepoints)))
    # range merging in print_abnf_for_categories relies on ascending order
    codepoints.sort(key=lambda r: r[0])

    # print categories
    print_abnf_for_categories("letters", ('Ll', 'Lm', 'Lo', 'Lt', 'Lu'), codepoints)
    print_abnf_for_categories("numbers", ('Nd', 'Nl'), codepoints)
    print_abnf_for_categories("combining_marks", ('Mn', 'Mc'), codepoints)
if __name__ == '__main__':
    # Run the script; on any exception report it on stderr (summary line
    # followed by the full traceback) and exit with status 1.
    exit_status = None
    try:
        main()
    except Exception as err:
        summary = 'Fatal error: [{}] {}'.format(type(err).__name__, str(err))
        print(summary, file=sys.stderr)
        traceback.print_exc(file=sys.stderr)
        exit_status = 1
    sys.exit(exit_status)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment