# marzer/rangify_unicode_categories.py

## rangify_unicode_categories.py
#!/usr/bin/env python3

# dependencies:
# pip install --upgrade requests

import os.path
import sys
import re
import requests
import traceback


def print_character_range(s, e, count):
	"""Print one ABNF hex value or value range, without a trailing newline.

	s and e are the first and last codepoint of the range (inclusive) and are
	written as uppercase hex after a "%x" prefix. The range collapses to a
	single value when s equals e or when e is 0.

	count is the number of ranges already printed for the current rule: when
	it is non-zero the " / " alternation separator is emitted first, and on
	every fourth range a newline plus tab follows it to wrap the rule.
	"""
	pieces = []
	if count > 0:
		pieces.append(" / ")
		if count % 4 == 0:
			pieces.append("\n\t")

	is_single_value = (s == e) or (e == 0)
	if is_single_value:
		pieces.append("%x{:X}".format(s))
	else:
		pieces.append("%x{:X}-{:X}".format(s, e))

	print(''.join(pieces), end='')


def print_abnf_for_categories(name, categories, codepoints):
	"""Print an ABNF rule listing every codepoint in the given categories.

	name is the rule name, categories is a collection of category strings
	and codepoints is an iterable of (codepoint, category) pairs. Consecutive
	matching codepoints are merged into a single range, so the pairs are
	expected to arrive in ascending codepoint order. A comment line naming
	the categories is printed before the rule and a comment with the total
	number of codepoints covered is printed after it.
	"""
	print("\n; unicode codepoints from categories {}".format(', '.join(categories)))
	print("{} = ".format(name), end='')

	# collapse the matching codepoints into [first, last] runs of adjacent values
	runs = []
	for value, cat in codepoints:
		if cat not in categories:
			continue
		if runs and runs[-1][1] == value - 1:
			runs[-1][1] = value
		else:
			runs.append([value, value])

	# emit each run; its index doubles as the "ranges printed so far" counter
	total = 0
	for index, (first, last) in enumerate(runs):
		print_character_range(first, last, index)
		total += last - first + 1

	print("\n\t; {} codepoints in total\n".format(total))


def main():
	"""Load the Unicode character database and print ABNF rules for it.

	UnicodeData.txt is looked up next to the script (resolved from
	sys.argv[0]). If it is not there it is downloaded from unicode.org and
	saved at that path so later runs read the local copy. The entries are
	expanded into (codepoint, category) pairs and three rules are printed:
	"letters", "numbers" and "combining_marks".

	Raises:
		requests.HTTPError: the download answered with an error status.
		Exception: a '<..., First>' entry is not followed by its matching
			'<..., Last>' entry.
	"""

	# get unicode character database
	codepoint_file_path = os.path.join(os.path.dirname(os.path.realpath(sys.argv[0])), 'UnicodeData.txt')
	if not os.path.exists(codepoint_file_path):
		print("Couldn't find unicode database file, will download")
		response = requests.get(
			'https://www.unicode.org/Public/UCD/latest/ucd/UnicodeData.txt',
			timeout=1
		)
		# fail here instead of caching an HTTP error page as the database file
		response.raise_for_status()
		codepoint_list = response.text
		with open(codepoint_file_path, 'w', encoding='utf-8') as codepoint_file:
			codepoint_file.write(codepoint_list)
	else:
		print("Reading unicode database file into memory")
		with open(codepoint_file_path, 'r', encoding='utf-8') as codepoint_file:
			codepoint_list = codepoint_file.read()

	# parse the database file into codepoints
	# captured fields: 1 = codepoint (hex), 2 = name, 3 = category
	re_codepoint = re.compile(r'^([0-9a-fA-F]+);(.+?);([a-zA-Z]+);')
	current_range_start = -1
	codepoints = []
	for codepoint_entry in codepoint_list.split('\n'):
		match = re_codepoint.search(codepoint_entry)
		if match is None:
			if current_range_start > -1:
				raise Exception('Previous codepoint indicated the start of a range but the next one was null')
			continue
		codepoint = int(match.group(1), 16)
		entry_name = match.group(2)
		category = match.group(3)
		if current_range_start > -1:
			# a '<..., First>' entry must be closed by a '<..., Last>' entry;
			# the whole range takes the category of the closing entry
			if not entry_name.endswith(', Last>'):
				raise Exception('Previous codepoint indicated the start of a range but the next one was not the end of a range')
			codepoints.extend((cp, category) for cp in range(current_range_start, codepoint + 1))
			current_range_start = -1
		elif entry_name.endswith(', First>'):
			current_range_start = codepoint
		else:
			codepoints.append((codepoint, category))
	if current_range_start > -1:
		# only reachable when the text ends on a '<..., First>' line
		raise Exception('Last codepoint indicated the start of a range but the database ended')
	print("Parsed {} codepoints from unicode database file.".format(len(codepoints)))
	codepoints.sort(key=lambda r: r[0])

	# print categories
	print_abnf_for_categories("letters", ('Ll', 'Lm', 'Lo', 'Lt', 'Lu'), codepoints)
	print_abnf_for_categories("numbers", ('Nd', 'Nl'), codepoints)
	print_abnf_for_categories("combining_marks", ('Mn', 'Mc'), codepoints)


# Script entry point: run main(), report any Exception on stderr with its
# traceback and exit with status 1; otherwise exit normally.
if __name__ == '__main__':
	try:
		main()
	except Exception as err:
		message = 'Fatal error: [{}] {}'.format(type(err).__name__, str(err))
		print(message, file=sys.stderr)
		traceback.print_exc(file=sys.stderr)
		sys.exit(1)
	else:
		sys.exit()
	#!/usr/bin/env python3

	# dependencies:
	# pip install --upgrade requests

	import os.path
	import sys
	import re
	import requests
	import traceback


	def print_character_range(s, e, count):
		"""Print one ABNF hex value or value range, without a trailing newline.

		s and e are the first and last codepoint of the range (inclusive) and
		are written as uppercase hex after a "%x" prefix. The range collapses
		to a single value when s equals e or when e is 0.

		count is the number of ranges already printed for the current rule:
		when it is non-zero the " / " separator is emitted first, and on every
		fourth range a newline plus tab follows it to wrap the rule.
		"""
		if count > 0:
			print(" / ", end='')
			if count % 4 == 0:
				print("\n\t", end='')
		if s == e or e == 0:
			print("%x{:X}".format(s), end='')
		else:
			print("%x{:X}-{:X}".format(s, e), end='')


	def print_abnf_for_categories(name, categories, codepoints):
		"""Print an ABNF rule listing every codepoint in the given categories.

		name is the rule name, categories is a collection of category strings
		and codepoints is an iterable of (codepoint, category) pairs.
		Consecutive matching codepoints are merged into a single range, so the
		pairs are expected to arrive in ascending codepoint order. A comment
		with the total number of codepoints covered is printed after the rule.
		"""
		print("\n; unicode codepoints from categories {}".format(', '.join(categories)))
		print("{} = ".format(name), end='')
		s = -1  # start of the run being built, -1 while no run is open
		e = -1  # end of the run being built
		print_count = 0  # ranges printed so far, drives separators and wrapping
		count = 0  # codepoints covered by the ranges emitted so far
		for codepoint, category in codepoints:
			if category in categories:
				if s == -1:
					s = codepoint
					e = codepoint
				elif e == codepoint - 1:
					e = codepoint
				else:
					# gap found: flush the finished run and open a new one
					print_character_range(s, e, print_count)
					count += e - s + 1
					print_count += 1
					s = codepoint
					e = codepoint
		if s != -1:
			# flush the run that was still open when the input ended
			print_character_range(s, e, print_count)
			count += e - s + 1
		print("\n\t; {} codepoints in total\n".format(count))


	def main():
		"""Load the Unicode character database and print ABNF rules for it.

		UnicodeData.txt is looked up next to the script (resolved from
		sys.argv[0]). If it is not there it is downloaded from unicode.org and
		saved at that path so later runs read the local copy. The entries are
		expanded into (codepoint, category) pairs and three rules are printed:
		"letters", "numbers" and "combining_marks".

		Raises:
			requests.HTTPError: the download answered with an error status.
			Exception: a '<..., First>' entry is not followed by its matching
				'<..., Last>' entry.
		"""

		# get unicode character database
		codepoint_file_path = os.path.join(os.path.dirname(os.path.realpath(sys.argv[0])), 'UnicodeData.txt')
		if not os.path.exists(codepoint_file_path):
			print("Couldn't find unicode database file, will download")
			response = requests.get(
				'https://www.unicode.org/Public/UCD/latest/ucd/UnicodeData.txt',
				timeout=1
			)
			# fail here instead of caching an HTTP error page as the database file
			response.raise_for_status()
			codepoint_list = response.text
			with open(codepoint_file_path, 'w', encoding='utf-8') as codepoint_file:
				codepoint_file.write(codepoint_list)
		else:
			print("Reading unicode database file into memory")
			with open(codepoint_file_path, 'r', encoding='utf-8') as codepoint_file:
				codepoint_list = codepoint_file.read()

		# parse the database file into codepoints
		# captured fields: 1 = codepoint (hex), 2 = name, 3 = category
		re_codepoint = re.compile(r'^([0-9a-fA-F]+);(.+?);([a-zA-Z]+);')
		current_range_start = -1
		codepoints = []
		for codepoint_entry in codepoint_list.split('\n'):
			match = re_codepoint.search(codepoint_entry)
			if match is None:
				if current_range_start > -1:
					raise Exception('Previous codepoint indicated the start of a range but the next one was null')
				continue
			codepoint = int(match.group(1), 16)
			entry_name = match.group(2)
			category = match.group(3)
			if current_range_start > -1:
				# a '<..., First>' entry must be closed by a '<..., Last>' entry;
				# the whole range takes the category of the closing entry
				if not entry_name.endswith(', Last>'):
					raise Exception('Previous codepoint indicated the start of a range but the next one was not the end of a range')
				codepoints.extend((cp, category) for cp in range(current_range_start, codepoint + 1))
				current_range_start = -1
			elif entry_name.endswith(', First>'):
				current_range_start = codepoint
			else:
				codepoints.append((codepoint, category))
		if current_range_start > -1:
			# only reachable when the text ends on a '<..., First>' line
			raise Exception('Last codepoint indicated the start of a range but the database ended')
		print("Parsed {} codepoints from unicode database file.".format(len(codepoints)))
		codepoints.sort(key=lambda r: r[0])

		# print categories
		print_abnf_for_categories("letters", ('Ll', 'Lm', 'Lo', 'Lt', 'Lu'), codepoints)
		print_abnf_for_categories("numbers", ('Nd', 'Nl'), codepoints)
		print_abnf_for_categories("combining_marks", ('Mn', 'Mc'), codepoints)


	# Script entry point: run main(), report any Exception on stderr with its
	# traceback and exit with status 1; otherwise exit normally.
	if __name__ == '__main__':
		try:
			main()
		except Exception as err:
			print(
				'Fatal error: [{}] {}'.format(
					type(err).__name__,
					str(err)
				),
				file=sys.stderr
			)
			traceback.print_exc(file=sys.stderr)
			sys.exit(1)
		sys.exit()