Skip to content

Instantly share code, notes, and snippets.

@Mr0grog
Last active October 17, 2023 21:32
Show Gist options
  • Save Mr0grog/70ec66c2ed0e7ee9a5d50406534dad46 to your computer and use it in GitHub Desktop.
Save Mr0grog/70ec66c2ed0e7ee9a5d50406534dad46 to your computer and use it in GitHub Desktop.
Compare Unicode and WHATWG encoding mappings
"""
Quickie script for comparing legacy single-byte character encoding definitions
from the Unicode Consortium (found at https://unicode.org/Public/MAPPINGS/) and
the WHATWG (at https://encoding.spec.whatwg.org/#legacy-single-byte-encodings or
https://github.com/whatwg/encoding), since they differ slightly.
Typically, you'll want to download a copy of the Unicode mapping files via FTP:

    ncftpget -R ftp.unicode.org . Public/MAPPINGS
    mv MAPPINGS unicode

And the WHATWG mapping files via git:

    git clone https://github.com/whatwg/encoding.git whatwg

Then you can compare, for example, windows-1255:

    python compare.py unicode/VENDORS/MICSFT/WINDOWS/CP1255.TXT whatwg/index-windows-1255.txt

Which will output something like:
✘ Definitions for windows-1255 do not match!
Byte 129 (0x81): Unicode = point <UNDEFINED> / WHATWG = point 129 (0x0081) (control character)
Byte 138 (0x8a): Unicode = point <UNDEFINED> / WHATWG = point 138 (0x008a) (control character)
Byte 140 (0x8c): Unicode = point <UNDEFINED> / WHATWG = point 140 (0x008c) (control character)
Byte 141 (0x8d): Unicode = point <UNDEFINED> / WHATWG = point 141 (0x008d) (control character)
Byte 142 (0x8e): Unicode = point <UNDEFINED> / WHATWG = point 142 (0x008e) (control character)
Byte 143 (0x8f): Unicode = point <UNDEFINED> / WHATWG = point 143 (0x008f) (control character)
Byte 144 (0x90): Unicode = point <UNDEFINED> / WHATWG = point 144 (0x0090) (control character)
Byte 154 (0x9a): Unicode = point <UNDEFINED> / WHATWG = point 154 (0x009a) (control character)
Byte 156 (0x9c): Unicode = point <UNDEFINED> / WHATWG = point 156 (0x009c) (control character)
Byte 157 (0x9d): Unicode = point <UNDEFINED> / WHATWG = point 157 (0x009d) (control character)
Byte 158 (0x9e): Unicode = point <UNDEFINED> / WHATWG = point 158 (0x009e) (control character)
Byte 159 (0x9f): Unicode = point <UNDEFINED> / WHATWG = point 159 (0x009f) (control character)
Byte 202 (0xca): Unicode = point <UNDEFINED> / WHATWG = point 1466 (0x05ba) (HEBREW POINT HOLAM HASER FOR VAV)
"""
import re
from typing import Dict, List, Tuple
import unicodedata
# Maps a byte value to the Unicode code point it decodes to. The parsers below
# store None as the value for a byte that a mapping file lists without a code
# point.
EncodingMap = Dict[int, int]
# Matches text made up only of whitespace and/or ASCII control characters
# (0x00-0x1f), i.e. a line that carries no mapping data once any comment has
# been removed.
EMPTY_LINE = re.compile(r'^[\s\x00-\x1f]*$')
def is_control_character(point: int) -> bool:
    """
    Report whether ``point`` is one of the control characters at the start of
    the Latin-1 Supplement section (0x80 through 0x9f inclusive).

    NOTE: a strict check would be ``unicodedata.category(chr(point)) == 'Cc'``,
    but only the controls at the start of Latin-1 Supplement matter here, so
    ASCII control characters are deliberately not reported.
    """
    latin1_supplement_controls = range(0x80, 0xA0)
    return point in latin1_supplement_controls
class MappingParser:
    """
    Base class for parsing encoding mapping files.

    A mapping file is a text table with one mapping per line. Fields are
    separated by tabs: the first is the byte value, the second is the code
    point (blank when the byte is undefined), and anything after that, plus
    any trailing ``#`` comment, is treated as descriptive metadata.
    Subclasses override individual steps to handle format differences.
    """

    def parse_file(self, path: str) -> EncodingMap:
        """
        Read the file at ``path`` and return the mapping parsed from it.

        Raises:
            SyntaxError: if a line cannot be parsed or fails validation.
        """
        # Decode explicitly as UTF-8 rather than with the platform's default
        # encoding, so results do not depend on the machine's locale (mapping
        # files can contain non-ASCII characters outside the numeric fields).
        with open(path, encoding='utf-8') as file:
            return self.parse(file.read(), path)

    def parse(self, text: str, filename: str = None) -> EncodingMap:
        """
        Parse the contents of a mapping file.

        Args:
            text: Full text of the mapping file.
            filename: Optional name used only when reporting errors.

        Returns:
            Dict of byte value -> code point. The code point is ``None`` for
            bytes the file lists as undefined.

        Raises:
            SyntaxError: if a line is malformed or fails validation. The
                error carries the filename, line number, and raw line.
        """
        mapping = {}
        for fields, comment, number, line in self.each_table_line(text):
            try:
                byte_value, point, metadata = self.parse_line(fields, comment)
                self.validate_mapping(byte_value, point, metadata, number, line, filename)
                mapping[byte_value] = point
            except ValueError as error:
                raise SyntaxError(f'Error parsing mapping file: {error}', (filename, number, None, line)) from error
        return mapping

    def each_table_line(self, text: str):
        """
        Yield ``(fields, comment, line_number, raw_line)`` for each line of
        ``text`` that has data before any ``#`` comment.

        ``fields`` is the data portion split on tabs, ``comment`` is the text
        after the first ``#``, and ``line_number`` is 1-based.
        """
        # NOTE: can't use splitlines() because some of the separators it supports
        # may be content on a line. WHATWG-style mapping files only list characters
        # above 0x7f and Unicode-style files do not list the actual character (just
        # the code point as a number), so line feeds are OK.
        for number, line in enumerate(text.split('\n'), start=1):
            data, _, comment = line.partition('#')
            if not EMPTY_LINE.match(data):
                fields = data.split('\t')
                yield fields, comment, number, line

    def parse_line(self, fields: List[str], comment: str) -> Tuple[int, int, str]:
        """
        Convert one table line into ``(byte_value, point, metadata)``.

        Args:
            fields: Tab-separated fields from the data portion of the line.
            comment: Text that followed ``#`` on the line (may be empty).

        Returns:
            The byte value, the code point (``None`` when the code point
            field is blank), and the remaining fields joined with the comment.

        Raises:
            ValueError: if there are fewer than two fields or a number cannot
                be parsed.
        """
        # Fail with a ValueError (which parse() turns into a located
        # SyntaxError) instead of letting a bare IndexError escape.
        if len(fields) < 2:
            raise ValueError('expected at least two tab-separated fields (byte and code point)')
        # base=0 accepts both plain decimal and 0x-prefixed values.
        byte_value = int(fields[0], base=0)
        point = int(fields[1], base=16) if fields[1].strip() else None
        # Join outside the f-string: a backslash inside an f-string
        # replacement field is a syntax error before Python 3.12.
        extra_fields = '\t'.join(fields[2:])
        metadata = f'{extra_fields} {comment}'.strip()
        return byte_value, point, metadata

    def validate_mapping(self, byte_value: int, point: int, metadata: str, line_number: int, raw_line: str, filename: str):
        """
        Sanity-check a parsed line against its descriptive metadata.

        A line that maps to a control character must mention "control" in its
        metadata, and a line with no code point must mention "undefined"
        (both case-insensitive).

        Raises:
            SyntaxError: if the metadata does not match the mapping.
        """
        description = metadata.lower()
        if point is not None:
            if is_control_character(point) and 'control' not in description:
                raise SyntaxError(
                    f'Line maps byte to a control character (0x{point:02x}) but comment did not mention "control"',
                    (filename, line_number, None, raw_line)
                )
        elif 'undefined' not in description:
            raise SyntaxError(
                'Line maps byte undefined point, but comment did not mention "undefined"',
                (filename, line_number, None, raw_line)
            )
class WhatwgMappingParser(MappingParser):
    """
    Parser for WHATWG-style index files.

    These files only list the upper half of the byte range, indexed from
    zero, so this parser shifts each index up by 128 and fills in the lower
    half itself.
    """

    def parse(self, text: str, filename: str = None) -> EncodingMap:
        """Parse ``text`` and return a mapping that also covers bytes 0-127."""
        # WHATWG files omit the first 128 values, since they are always ASCII,
        # so start from an identity mapping for them. Entries parsed from the
        # file take precedence in the merge.
        lower_half = dict(zip(range(128), range(128)))
        upper_half = super().parse(text, filename)
        return lower_half | upper_half

    def parse_line(self, fields: List[str], comment: str) -> Tuple[int, int, str]:
        """Parse a line as the base class does, then turn its index into a byte value."""
        index, point, metadata = super().parse_line(fields, comment)
        # Index 0 in the file corresponds to byte 0x80.
        byte_value = index + 128
        return byte_value, point, metadata
class UnicodeMappingParser(MappingParser):
    """
    Parser for Unicode Consortium-style mapping files.

    Adds nothing of its own: it uses the MappingParser behavior unchanged.
    """
def pretty_code_point(point: int) -> str:
    """
    Format a code point for display.

    Returns ``'<UNDEFINED>'`` when ``point`` is ``None``. Otherwise returns
    the decimal and hexadecimal value, followed by a "(control character)"
    marker when applicable and the Unicode character name when one exists.
    """
    if point is None:
        return '<UNDEFINED>'

    pieces = [f'{point} (0x{point:04x})']
    if is_control_character(point):
        pieces.append(' (control character)')
    # Passing a default makes name() return None for unnamed characters
    # instead of raising.
    character_name = unicodedata.name(chr(point), None)
    if character_name:
        pieces.append(f' ({character_name})')
    return ''.join(pieces)
def compare_mappings(name: str, unicode: EncodingMap, whatwg: EncodingMap, ignore_control_chars=False):
    """
    Print a report of the bytes that two mappings decode differently.

    Args:
        name: Encoding name used in the printed report.
        unicode: Mapping parsed from the Unicode Consortium file.
        whatwg: Mapping parsed from the WHATWG file.
        ignore_control_chars: When true, a byte counts as matching if each
            side is either undefined or a control character.

    Every byte from 0 to 255 is checked. A header line is printed before the
    first difference, then one line per differing byte; if nothing differs a
    single "Matched" line is printed instead.
    """
    found_difference = False
    for byte in range(256):
        unicode_point = unicode.get(byte)
        whatwg_point = whatwg.get(byte)

        if whatwg_point == unicode_point:
            continue
        if (
            ignore_control_chars
            and (unicode_point is None or is_control_character(unicode_point))
            and (whatwg_point is None or is_control_character(whatwg_point))
        ):
            continue

        if not found_difference:
            print(f'✘ Definitions for {name} do not match!')
            found_difference = True
        unicode_text = pretty_code_point(unicode_point)
        whatwg_text = pretty_code_point(whatwg_point)
        print(f' Byte {byte} (0x{byte:02x}): '
              f'Unicode = point {unicode_text} / '
              f'WHATWG = point {whatwg_text}')

    if not found_difference:
        print(f'✔︎ Matched: {name}')
def compare_encoding_files(name: str, unicode_path: str, whatwg_path: str, ignore_control_chars=False):
    """
    Parse one Unicode Consortium mapping file and one WHATWG mapping file,
    then print how they compare.

    Args:
        name: Encoding name used in the printed report.
        unicode_path: Path to the Unicode Consortium mapping file.
        whatwg_path: Path to the WHATWG mapping file.
        ignore_control_chars: Passed through to ``compare_mappings``.
    """
    unicode_mapping = UnicodeMappingParser().parse_file(unicode_path)
    whatwg_mapping = WhatwgMappingParser().parse_file(whatwg_path)
    compare_mappings(
        name,
        unicode_mapping,
        whatwg_mapping,
        ignore_control_chars,
    )
def compare_chardetng(ignore_control_chars=False):
    """
    Compare a fixed list of encodings, one after another.

    The mapping files are looked up at hard-coded relative paths under
    ``unicode/`` and ``whatwg/``.
    NOTE(review): the name suggests this is the set of encodings relevant to
    chardetng -- not confirmed by anything in this file.
    """
    # (encoding name, Unicode Consortium file, WHATWG file), in report order.
    encodings = (
        ('IBM866', 'unicode/VENDORS/MICSFT/PC/CP866.TXT', 'whatwg/index-ibm866.txt'),
        ('ISO-8859-2', 'unicode/ISO8859/8859-2.TXT', 'whatwg/index-iso-8859-2.txt'),
        ('ISO-8859-4', 'unicode/ISO8859/8859-4.TXT', 'whatwg/index-iso-8859-4.txt'),
        ('ISO-8859-5', 'unicode/ISO8859/8859-5.TXT', 'whatwg/index-iso-8859-5.txt'),
        ('ISO-8859-6', 'unicode/ISO8859/8859-6.TXT', 'whatwg/index-iso-8859-6.txt'),
        ('ISO-8859-7', 'unicode/ISO8859/8859-7.TXT', 'whatwg/index-iso-8859-7.txt'),
        ('ISO-8859-8', 'unicode/ISO8859/8859-8.TXT', 'whatwg/index-iso-8859-8.txt'),
        ('ISO-8859-13', 'unicode/ISO8859/8859-13.TXT', 'whatwg/index-iso-8859-13.txt'),
        ('KOI8-U', 'unicode/VENDORS/MISC/KOI8-U.TXT', 'whatwg/index-koi8-u.txt'),
        ('windows-874', 'unicode/VENDORS/MICSFT/WINDOWS/CP874.TXT', 'whatwg/index-windows-874.txt'),
        ('windows-1250', 'unicode/VENDORS/MICSFT/WINDOWS/CP1250.TXT', 'whatwg/index-windows-1250.txt'),
        ('windows-1251', 'unicode/VENDORS/MICSFT/WINDOWS/CP1251.TXT', 'whatwg/index-windows-1251.txt'),
        ('windows-1252', 'unicode/VENDORS/MICSFT/WINDOWS/CP1252.TXT', 'whatwg/index-windows-1252.txt'),
        ('windows-1253', 'unicode/VENDORS/MICSFT/WINDOWS/CP1253.TXT', 'whatwg/index-windows-1253.txt'),
        ('windows-1254', 'unicode/VENDORS/MICSFT/WINDOWS/CP1254.TXT', 'whatwg/index-windows-1254.txt'),
        ('windows-1255', 'unicode/VENDORS/MICSFT/WINDOWS/CP1255.TXT', 'whatwg/index-windows-1255.txt'),
        ('windows-1256', 'unicode/VENDORS/MICSFT/WINDOWS/CP1256.TXT', 'whatwg/index-windows-1256.txt'),
        ('windows-1257', 'unicode/VENDORS/MICSFT/WINDOWS/CP1257.TXT', 'whatwg/index-windows-1257.txt'),
        ('windows-1258', 'unicode/VENDORS/MICSFT/WINDOWS/CP1258.TXT', 'whatwg/index-windows-1258.txt'),
    )
    for encoding_name, unicode_file, whatwg_file in encodings:
        compare_encoding_files(encoding_name, unicode_file, whatwg_file, ignore_control_chars)
# Command-line entry point.
#   no FILES   -> compare the built-in list of encodings
#   two FILES  -> compare that Unicode / WHATWG pair
#   otherwise  -> usage error
if __name__ == '__main__':
    from argparse import ArgumentParser
    from pathlib import Path

    parser = ArgumentParser(
        prog='compare',
        description='Compare Unicode Consortium vs WHATWG encoding definitions.'
    )
    parser.add_argument('--name', type=str, help='Name of encoding to compare.')
    parser.add_argument('--ignore-controls', action='store_true', help='Consider control characters and undefined mappings to be the same.')
    parser.add_argument('FILES', nargs='*', help='Path to Unicode Consortium and WHATWG mapping file (in that order).')
    args = parser.parse_args()

    if not args.FILES:
        compare_chardetng(args.ignore_controls)
    elif len(args.FILES) != 2:
        # Report through argparse so the usage text is shown and the process
        # exits with a non-zero status instead of appearing to succeed.
        parser.error('You must name two files: a Unicode mapping file and a WHATWG mapping file.')
    else:
        unicode_path, whatwg_path = args.FILES
        # WHATWG files are named "index-<encoding>.txt". Strip that prefix
        # only when it is actually present, so other filenames are not
        # truncated by a blind six-character slice.
        name = args.name or Path(whatwg_path).stem.removeprefix('index-')
        compare_encoding_files(name, unicode_path, whatwg_path, args.ignore_controls)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment