Mr0grog/compare.py

## compare.py
"""
Quickie script for comparing legacy single-byte character encoding definitions
from the Unicode Consortium (found at https://unicode.org/Public/MAPPINGS/) and
the WHATWG (at https://encoding.spec.whatwg.org/#legacy-single-byte-encodings or
https://github.com/whatwg/encoding), since they differ slightly.

Typically, you'll want to download a copy of the Unicode mapping files via FTP:

    ncftpget -R ftp.unicode.org . Public/MAPPINGS
    mv MAPPINGS unicode

And the WHATWG mapping files via git:

    git clone https://github.com/whatwg/encoding.git whatwg

Then you can compare, for example, windows-1255:

    python compare.py unicode/VENDORS/MICSFT/WINDOWS/CP1255.TXT whatwg/index-windows-1255.txt

Which will output something like:

✘ Definitions for windows-1255 do not match!
    Byte 129 (0x81): Unicode = point <UNDEFINED> / WHATWG = point 129 (0x0081) (control character)
    Byte 138 (0x8a): Unicode = point <UNDEFINED> / WHATWG = point 138 (0x008a) (control character)
    Byte 140 (0x8c): Unicode = point <UNDEFINED> / WHATWG = point 140 (0x008c) (control character)
    Byte 141 (0x8d): Unicode = point <UNDEFINED> / WHATWG = point 141 (0x008d) (control character)
    Byte 142 (0x8e): Unicode = point <UNDEFINED> / WHATWG = point 142 (0x008e) (control character)
    Byte 143 (0x8f): Unicode = point <UNDEFINED> / WHATWG = point 143 (0x008f) (control character)
    Byte 144 (0x90): Unicode = point <UNDEFINED> / WHATWG = point 144 (0x0090) (control character)
    Byte 154 (0x9a): Unicode = point <UNDEFINED> / WHATWG = point 154 (0x009a) (control character)
    Byte 156 (0x9c): Unicode = point <UNDEFINED> / WHATWG = point 156 (0x009c) (control character)
    Byte 157 (0x9d): Unicode = point <UNDEFINED> / WHATWG = point 157 (0x009d) (control character)
    Byte 158 (0x9e): Unicode = point <UNDEFINED> / WHATWG = point 158 (0x009e) (control character)
    Byte 159 (0x9f): Unicode = point <UNDEFINED> / WHATWG = point 159 (0x009f) (control character)
    Byte 202 (0xca): Unicode = point <UNDEFINED> / WHATWG = point 1466 (0x05ba) (HEBREW POINT HOLAM HASER FOR VAV)
"""

import re
from typing import Dict, List, Tuple
import unicodedata


# Maps a byte value to the Unicode code point it decodes to. The parsers in
# this file store ``None`` as the value for a byte whose table line has a
# blank code point field.
EncodingMap = Dict[int, int]

# Matches text made up only of whitespace and/or ASCII control characters
# (0x00-0x1f); used to skip lines that carry no table entry.
EMPTY_LINE = re.compile(r'^[\s\x00-\x1f]*$')


def is_control_character(point: int) -> bool:
    """
    Tell whether ``point`` is one of the control characters at the start of
    the Latin-1 Supplement section (0x80 through 0x9f).

    Technically the check should be
    ``unicodedata.category(chr(point)) == 'Cc'``, but this range is the only
    part we really care about.
    """
    latin1_controls = range(0x80, 0xa0)
    return point in latin1_controls


class MappingParser:
    """
    Base class for parsing encoding mapping files.

    Each table line holds tab-separated fields: a byte value, a hexadecimal
    code point (blank when the byte has no mapping), and optional extra
    fields, followed by an optional ``#`` comment. Subclasses override
    `parse` or `parse_line` to handle format-specific differences.
    """
    def parse_file(self, path: str) -> EncodingMap:
        """Read the mapping file at ``path`` and return its parsed mapping."""
        # Decode explicitly as UTF-8 instead of relying on the platform's
        # default encoding, which differs between systems.
        with open(path, encoding='utf-8') as file:
            return self.parse(file.read(), path)

    def parse(self, text: str, filename: str = None) -> EncodingMap:
        """
        Parse the text of a mapping file into a byte -> code point dict.

        ``filename`` is only used to label errors. Raises `SyntaxError`
        (carrying the file name, line number, and line text) when a line
        cannot be parsed or fails validation.
        """
        mapping = {}

        for fields, comment, number, line in self.each_table_line(text):
            try:
                byte_value, point, metadata = self.parse_line(fields, comment)
                self.validate_mapping(byte_value, point, metadata, number, line, filename)
                mapping[byte_value] = point
            except ValueError as error:
                raise SyntaxError(f'Error parsing mapping file: {error}', (filename, number, None, line)) from error

        return mapping

    def each_table_line(self, text: str):
        """
        Yield ``(fields, comment, number, line)`` for every line that has
        content before its first ``#``.

        ``fields`` is that content split on tabs, ``comment`` is the text
        after the ``#``, ``number`` is the 1-based line number, and ``line``
        is the complete original line.
        """
        # NOTE: can't use splitlines() because some of the separators it supports
        # may be content on a line. WHATWG-style mapping files only list characters
        # above 0x7f and Unicode-style files do not list the actual character (just
        # the code point as a number), so line feeds are OK.
        for number, line in enumerate(text.split('\n'), start=1):
            data, _, comment = line.partition('#')
            if not EMPTY_LINE.match(data):
                fields = data.split('\t')
                yield fields, comment, number, line

    def parse_line(self, fields: List[str], comment: str) -> Tuple[int, int, str]:
        """
        Convert one table line into ``(byte_value, point, metadata)``.

        ``point`` is ``None`` when the code point field is blank. ``metadata``
        is any remaining fields plus the comment, used only for validation.
        Raises `ValueError` when a field is missing or is not a valid number.
        """
        # Report short lines as ValueError so `parse` can attach the file name
        # and line number, rather than letting an IndexError escape.
        if len(fields) < 2:
            raise ValueError(f'expected at least 2 tab-separated fields but found {len(fields)}')

        byte_value = int(fields[0], base=0)
        point = int(fields[1], base=16) if fields[1].strip() else None
        # Join outside the f-string: a backslash inside a replacement field is
        # a syntax error before Python 3.12.
        extra_fields = '\t'.join(fields[2:])
        metadata = f'{extra_fields} {comment}'.strip()
        return byte_value, point, metadata

    def validate_mapping(self, byte_value: int, point: int, metadata: str, line_number: int, raw_line: str, filename: str):
        """
        Sanity-check a parsed line against its own metadata.

        Raises `SyntaxError` when the line maps to a control character
        without mentioning "control", or has no code point without
        mentioning "undefined" (both checks are case-insensitive).
        """
        if point is not None:
            if is_control_character(point) and 'control' not in metadata.lower():
                raise SyntaxError(
                    f'Line maps byte to a control character (0x{point:02x}) but comment did not mention "control"',
                    (filename, line_number, None, raw_line)
                )
        elif 'undefined' not in metadata.lower():
            raise SyntaxError(
                'Line maps byte undefined point, but comment did not mention "undefined"',
                (filename, line_number, None, raw_line)
            )


class WhatwgMappingParser(MappingParser):
    """
    Parser for WHATWG-style mapping files, which leave out the first 128
    byte values and number the remaining entries starting from zero.
    """
    def parse(self, text: str, filename: str = None) -> EncodingMap:
        """Parse ``text`` and return a mapping that also covers bytes 0-127."""
        # WHATWG files omit the first 128 values, since they are always ASCII.
        mapping = {byte: byte for byte in range(128)}
        mapping.update(super().parse(text, filename))
        return mapping

    def parse_line(self, fields: List[str], comment: str) -> Tuple[int, int, str]:
        """Parse one line, shifting its index up by 128 to get the byte value."""
        index, point, metadata = super().parse_line(fields, comment)
        byte_value = index + 128
        return byte_value, point, metadata


class UnicodeMappingParser(MappingParser):
    """
    Parser for Unicode Consortium-style mapping files. It uses the base
    class parsing behavior without any changes.
    """


def pretty_code_point(point: int) -> str:
    """
    Format a code point for display in a comparison report.

    Returns ``'<UNDEFINED>'`` when ``point`` is ``None``. Otherwise returns
    the decimal and hexadecimal value, followed by a "(control character)"
    note and/or the Unicode character name when they apply.
    """
    if point is None:
        return '<UNDEFINED>'

    parts = [f'{point} (0x{point:04x})']
    if is_control_character(point):
        parts.append('(control character)')

    # Not every code point has a name; leave the name off when there is none.
    name = unicodedata.name(chr(point), None)
    if name:
        parts.append(f'({name})')

    return ' '.join(parts)


def compare_mappings(name: str, unicode: EncodingMap, whatwg: EncodingMap, ignore_control_chars=False):
    """
    Print a report comparing two mappings for every byte value from 0 to 255.

    A heading and one line per differing byte are printed when the mappings
    disagree; otherwise a single "Matched" line is printed. When
    ``ignore_control_chars`` is true, a byte that is undefined or a control
    character on both sides is treated as matching.
    """
    def ignorable(point):
        return point is None or is_control_character(point)

    mismatch_found = False
    for byte in range(256):
        from_unicode = unicode.get(byte)
        from_whatwg = whatwg.get(byte)

        if from_whatwg == from_unicode:
            continue
        if ignore_control_chars and ignorable(from_unicode) and ignorable(from_whatwg):
            continue

        # Print the heading once, just before the first difference.
        if not mismatch_found:
            print(f'✘ Definitions for {name} do not match!')
            mismatch_found = True

        print(f'    Byte {byte} (0x{byte:02x}): '
              f'Unicode = point {pretty_code_point(from_unicode)} / '
              f'WHATWG = point {pretty_code_point(from_whatwg)}')

    if not mismatch_found:
        print(f'✔︎ Matched: {name}')


def compare_encoding_files(name: str, unicode_path: str, whatwg_path: str, ignore_control_chars=False):
    """
    Parse a Unicode-style and a WHATWG-style mapping file and print a
    comparison of the two, labeled with ``name``.
    """
    unicode_parser = UnicodeMappingParser()
    unicode_mapping = unicode_parser.parse_file(unicode_path)

    whatwg_parser = WhatwgMappingParser()
    whatwg_mapping = whatwg_parser.parse_file(whatwg_path)

    compare_mappings(name, unicode_mapping, whatwg_mapping, ignore_control_chars)


def compare_chardetng(ignore_control_chars=False):
    """
    Compare the Unicode and WHATWG mapping files for a fixed list of
    encodings, one after another.

    The files are looked up under ``unicode/`` and ``whatwg/`` relative to
    the current working directory.
    """
    encodings = (
        ("IBM866", 'unicode/VENDORS/MICSFT/PC/CP866.TXT', 'whatwg/index-ibm866.txt'),
        ("ISO-8859-2", 'unicode/ISO8859/8859-2.TXT', 'whatwg/index-iso-8859-2.txt'),
        ("ISO-8859-4", 'unicode/ISO8859/8859-4.TXT', 'whatwg/index-iso-8859-4.txt'),
        ("ISO-8859-5", 'unicode/ISO8859/8859-5.TXT', 'whatwg/index-iso-8859-5.txt'),
        ("ISO-8859-6", 'unicode/ISO8859/8859-6.TXT', 'whatwg/index-iso-8859-6.txt'),
        ("ISO-8859-7", 'unicode/ISO8859/8859-7.TXT', 'whatwg/index-iso-8859-7.txt'),
        ("ISO-8859-8", 'unicode/ISO8859/8859-8.TXT', 'whatwg/index-iso-8859-8.txt'),
        ("ISO-8859-13", 'unicode/ISO8859/8859-13.TXT', 'whatwg/index-iso-8859-13.txt'),
        ("KOI8-U", 'unicode/VENDORS/MISC/KOI8-U.TXT', 'whatwg/index-koi8-u.txt'),
        ("windows-874", 'unicode/VENDORS/MICSFT/WINDOWS/CP874.TXT', 'whatwg/index-windows-874.txt'),
        ("windows-1250", 'unicode/VENDORS/MICSFT/WINDOWS/CP1250.TXT', 'whatwg/index-windows-1250.txt'),
        ("windows-1251", 'unicode/VENDORS/MICSFT/WINDOWS/CP1251.TXT', 'whatwg/index-windows-1251.txt'),
        ("windows-1252", 'unicode/VENDORS/MICSFT/WINDOWS/CP1252.TXT', 'whatwg/index-windows-1252.txt'),
        ("windows-1253", 'unicode/VENDORS/MICSFT/WINDOWS/CP1253.TXT', 'whatwg/index-windows-1253.txt'),
        ("windows-1254", 'unicode/VENDORS/MICSFT/WINDOWS/CP1254.TXT', 'whatwg/index-windows-1254.txt'),
        ("windows-1255", 'unicode/VENDORS/MICSFT/WINDOWS/CP1255.TXT', 'whatwg/index-windows-1255.txt'),
        ("windows-1256", 'unicode/VENDORS/MICSFT/WINDOWS/CP1256.TXT', 'whatwg/index-windows-1256.txt'),
        ("windows-1257", 'unicode/VENDORS/MICSFT/WINDOWS/CP1257.TXT', 'whatwg/index-windows-1257.txt'),
        ("windows-1258", 'unicode/VENDORS/MICSFT/WINDOWS/CP1258.TXT', 'whatwg/index-windows-1258.txt'),
    )
    for name, unicode_path, whatwg_path in encodings:
        compare_encoding_files(name, unicode_path, whatwg_path, ignore_control_chars)


if __name__ == '__main__':
    # Command-line entry point: compare the two named mapping files, or run
    # the built-in list of comparisons when no files are named.
    from argparse import ArgumentParser
    from pathlib import Path

    parser = ArgumentParser(
        prog='compare',
        description='Compare Unicode Consortium vs WHATWG encoding definitions.'
    )
    parser.add_argument('--name', type=str, help='Name of encoding to compare.')
    parser.add_argument('--ignore-controls', action='store_true', help='Consider control characters and undefined mappings to be the same.')
    parser.add_argument('FILES', nargs='*', help='Path to Unicode Consortium and WHATWG mapping file (in that order).')
    args = parser.parse_args()

    if not args.FILES:
        compare_chardetng(args.ignore_controls)
    elif len(args.FILES) != 2:
        # Report through argparse so the usage text is shown and the process
        # exits with a non-zero status instead of looking like a success.
        parser.error('You must name two files: a Unicode mapping file and a WHATWG mapping file.')
    else:
        unicode_path, whatwg_path = args.FILES
        # Default the name to the WHATWG file name without its "index-"
        # prefix; a file name lacking that prefix is used as-is rather than
        # losing its first six characters.
        name = args.name or Path(whatwg_path).stem.removeprefix('index-')
        compare_encoding_files(name, unicode_path, whatwg_path, args.ignore_controls)
	"""
	Quickie script for comparing legacy single-byte character encoding definitions
	from the Unicode Consortium (found at https://unicode.org/Public/MAPPINGS/) and
	the WHATWG (at https://encoding.spec.whatwg.org/#legacy-single-byte-encodings or
	https://github.com/whatwg/encoding), since they differ slightly.

	Typically, you'll want to download a copy of the Unicode mapping files via FTP:

	ncftpget -R ftp.unicode.org . Public/MAPPINGS
	mv MAPPINGS unicode

	And the WHATWG mapping files via git:

	git clone https://github.com/whatwg/encoding.git whatwg

	Then you can compare, for example, windows-1255:

	python compare.py unicode/VENDORS/MICSFT/WINDOWS/CP1255.TXT whatwg/index-windows-1255.txt

	Which will output something like:

	✘ Definitions for windows-1255 do not match!
	Byte 129 (0x81): Unicode = point <UNDEFINED> / WHATWG = point 129 (0x0081) (control character)
	Byte 138 (0x8a): Unicode = point <UNDEFINED> / WHATWG = point 138 (0x008a) (control character)
	Byte 140 (0x8c): Unicode = point <UNDEFINED> / WHATWG = point 140 (0x008c) (control character)
	Byte 141 (0x8d): Unicode = point <UNDEFINED> / WHATWG = point 141 (0x008d) (control character)
	Byte 142 (0x8e): Unicode = point <UNDEFINED> / WHATWG = point 142 (0x008e) (control character)
	Byte 143 (0x8f): Unicode = point <UNDEFINED> / WHATWG = point 143 (0x008f) (control character)
	Byte 144 (0x90): Unicode = point <UNDEFINED> / WHATWG = point 144 (0x0090) (control character)
	Byte 154 (0x9a): Unicode = point <UNDEFINED> / WHATWG = point 154 (0x009a) (control character)
	Byte 156 (0x9c): Unicode = point <UNDEFINED> / WHATWG = point 156 (0x009c) (control character)
	Byte 157 (0x9d): Unicode = point <UNDEFINED> / WHATWG = point 157 (0x009d) (control character)
	Byte 158 (0x9e): Unicode = point <UNDEFINED> / WHATWG = point 158 (0x009e) (control character)
	Byte 159 (0x9f): Unicode = point <UNDEFINED> / WHATWG = point 159 (0x009f) (control character)
	Byte 202 (0xca): Unicode = point <UNDEFINED> / WHATWG = point 1466 (0x05ba) (HEBREW POINT HOLAM HASER FOR VAV)
	"""

	import re
	from typing import Dict, List, Tuple
	import unicodedata


	# Maps a byte value to the Unicode code point it decodes to. The parsers in
	# this file store ``None`` as the value for a byte whose table line has a
	# blank code point field.
	EncodingMap = Dict[int, int]

	# Matches text made up only of whitespace and/or ASCII control characters
	# (0x00-0x1f); used to skip lines that carry no table entry.
	EMPTY_LINE = re.compile(r'^[\s\x00-\x1f]*$')


	def is_control_character(point: int) -> bool:
	    """
	    Tell whether ``point`` is one of the control characters at the start
	    of the Latin-1 Supplement section (0x80 through 0x9f).
	    """
	    # Technically this should be:
	    #   unicodedata.category(chr(point)) == 'Cc'
	    # But really we care about the control characters at the start of Latin-1
	    # Supplement section.
	    return point in range(0x80, 0xa0)


	class MappingParser:
	    """
	    Base class for parsing encoding mapping files.

	    Each table line holds tab-separated fields: a byte value, a hexadecimal
	    code point (blank when the byte has no mapping), and optional extra
	    fields, followed by an optional ``#`` comment.
	    """
	    def parse_file(self, path: str) -> EncodingMap:
	        """Read the mapping file at ``path`` and return its parsed mapping."""
	        # Decode explicitly as UTF-8 instead of relying on the platform's
	        # default encoding, which differs between systems.
	        with open(path, encoding='utf-8') as file:
	            return self.parse(file.read(), path)

	    def parse(self, text: str, filename: str = None) -> EncodingMap:
	        """
	        Parse the text of a mapping file into a byte -> code point dict.

	        ``filename`` is only used to label errors. Raises `SyntaxError`
	        when a line cannot be parsed or fails validation.
	        """
	        mapping = {}

	        for fields, comment, number, line in self.each_table_line(text):
	            try:
	                byte_value, point, metadata = self.parse_line(fields, comment)
	                self.validate_mapping(byte_value, point, metadata, number, line, filename)
	                mapping[byte_value] = point
	            except ValueError as error:
	                raise SyntaxError(f'Error parsing mapping file: {error}', (filename, number, None, line)) from error

	        return mapping

	    def each_table_line(self, text: str):
	        """
	        Yield ``(fields, comment, number, line)`` for every line that has
	        content before its first ``#``; ``number`` is 1-based.
	        """
	        # NOTE: can't use splitlines() because some of the separators it supports
	        # may be content on a line. WHATWG-style mapping files only list characters
	        # above 0x7f and Unicode-style files do not list the actual character (just
	        # the code point as a number), so line feeds are OK.
	        for number, line in enumerate(text.split('\n'), start=1):
	            data, _, comment = line.partition('#')
	            if not EMPTY_LINE.match(data):
	                fields = data.split('\t')
	                yield fields, comment, number, line

	    def parse_line(self, fields: List[str], comment: str) -> Tuple[int, int, str]:
	        """
	        Convert one table line into ``(byte_value, point, metadata)``.

	        ``point`` is ``None`` when the code point field is blank. Raises
	        `ValueError` when a field is missing or is not a valid number.
	        """
	        # Report short lines as ValueError so `parse` can attach the file
	        # name and line number, rather than letting an IndexError escape.
	        if len(fields) < 2:
	            raise ValueError(f'expected at least 2 tab-separated fields but found {len(fields)}')

	        byte_value = int(fields[0], base=0)
	        point = int(fields[1], base=16) if fields[1].strip() else None
	        # Join outside the f-string: a backslash inside a replacement field
	        # is a syntax error before Python 3.12.
	        extra_fields = '\t'.join(fields[2:])
	        metadata = f'{extra_fields} {comment}'.strip()
	        return byte_value, point, metadata

	    def validate_mapping(self, byte_value: int, point: int, metadata: str, line_number: int, raw_line: str, filename: str):
	        """
	        Sanity-check a parsed line against its own metadata.

	        Raises `SyntaxError` when the line maps to a control character
	        without mentioning "control", or has no code point without
	        mentioning "undefined" (both checks are case-insensitive).
	        """
	        if point is not None:
	            if is_control_character(point) and 'control' not in metadata.lower():
	                raise SyntaxError(
	                    f'Line maps byte to a control character (0x{point:02x}) but comment did not mention "control"',
	                    (filename, line_number, None, raw_line)
	                )
	        elif 'undefined' not in metadata.lower():
	            raise SyntaxError(
	                'Line maps byte undefined point, but comment did not mention "undefined"',
	                (filename, line_number, None, raw_line)
	            )


	class WhatwgMappingParser(MappingParser):
	    """
	    Parser for WHATWG-style mapping files, which leave out the first 128
	    byte values and number the remaining entries starting from zero.
	    """
	    def parse(self, text: str, filename: str = None) -> EncodingMap:
	        """Parse ``text`` and return a mapping that also covers bytes 0-127."""
	        # WHATWG files omit the first 128 values, since they are always ASCII.
	        ascii_mapping = {i: i for i in range(128)}
	        return ascii_mapping | super().parse(text, filename)

	    def parse_line(self, fields: List[str], comment: str) -> Tuple[int, int, str]:
	        """Parse one line, shifting its index up by 128 to get the byte value."""
	        byte_value, point, metadata = super().parse_line(fields, comment)
	        return byte_value + 128, point, metadata


	class UnicodeMappingParser(MappingParser):
	    """
	    Parser for Unicode Consortium-style mapping files. It uses the base
	    class parsing behavior without any changes.
	    """
	    ...


	def pretty_code_point(point: int) -> str:
	    """
	    Format a code point for display in a comparison report.

	    Returns ``'<UNDEFINED>'`` when ``point`` is ``None``. Otherwise returns
	    the decimal and hexadecimal value, followed by a "(control character)"
	    note and/or the Unicode character name when they apply.
	    """
	    if point is not None:
	        text = f'{point} (0x{point:04x})'
	        if is_control_character(point):
	            text += ' (control character)'
	        # Not every code point has a name; leave it off when there is none.
	        name = unicodedata.name(chr(point), None)
	        if name:
	            text += f' ({name})'
	        return text
	    else:
	        return '<UNDEFINED>'


	def compare_mappings(name: str, unicode: EncodingMap, whatwg: EncodingMap, ignore_control_chars=False):
	    """
	    Print a report comparing two mappings for every byte value from 0 to 255.

	    A heading and one indented line per differing byte are printed when the
	    mappings disagree; otherwise a single "Matched" line is printed. When
	    ``ignore_control_chars`` is true, a byte that is undefined or a control
	    character on both sides is treated as matching.
	    """
	    same = True
	    for i in range(256):
	        unicode_point = unicode.get(i)
	        whatwg_point = whatwg.get(i)
	        matched = whatwg_point == unicode_point or (
	            ignore_control_chars
	            and (unicode_point is None or is_control_character(unicode_point))
	            and (whatwg_point is None or is_control_character(whatwg_point))
	        )
	        if not matched:
	            # Print the heading once, just before the first difference.
	            if same:
	                print(f'✘ Definitions for {name} do not match!')
	                same = False

	            print(f'    Byte {i} (0x{i:02x}): '
	                  f'Unicode = point {pretty_code_point(unicode_point)} / '
	                  f'WHATWG = point {pretty_code_point(whatwg_point)}')

	    if same:
	        print(f'✔︎ Matched: {name}')


	def compare_encoding_files(name: str, unicode_path: str, whatwg_path: str, ignore_control_chars=False):
	    """
	    Parse a Unicode-style and a WHATWG-style mapping file and print a
	    comparison of the two, labeled with ``name``.
	    """
	    unicode = UnicodeMappingParser().parse_file(unicode_path)
	    whatwg = WhatwgMappingParser().parse_file(whatwg_path)
	    compare_mappings(name, unicode, whatwg, ignore_control_chars)


	def compare_chardetng(ignore_control_chars=False):
	    """
	    Compare the Unicode and WHATWG mapping files for a fixed list of
	    encodings, one after another.

	    The files are looked up under ``unicode/`` and ``whatwg/`` relative to
	    the current working directory.
	    """
	    compare_encoding_files("IBM866", 'unicode/VENDORS/MICSFT/PC/CP866.TXT', 'whatwg/index-ibm866.txt', ignore_control_chars)
	    compare_encoding_files("ISO-8859-2", 'unicode/ISO8859/8859-2.TXT', 'whatwg/index-iso-8859-2.txt', ignore_control_chars)
	    compare_encoding_files("ISO-8859-4", 'unicode/ISO8859/8859-4.TXT', 'whatwg/index-iso-8859-4.txt', ignore_control_chars)
	    compare_encoding_files("ISO-8859-5", 'unicode/ISO8859/8859-5.TXT', 'whatwg/index-iso-8859-5.txt', ignore_control_chars)
	    compare_encoding_files("ISO-8859-6", 'unicode/ISO8859/8859-6.TXT', 'whatwg/index-iso-8859-6.txt', ignore_control_chars)
	    compare_encoding_files("ISO-8859-7", 'unicode/ISO8859/8859-7.TXT', 'whatwg/index-iso-8859-7.txt', ignore_control_chars)
	    compare_encoding_files("ISO-8859-8", 'unicode/ISO8859/8859-8.TXT', 'whatwg/index-iso-8859-8.txt', ignore_control_chars)
	    compare_encoding_files("ISO-8859-13", 'unicode/ISO8859/8859-13.TXT', 'whatwg/index-iso-8859-13.txt', ignore_control_chars)
	    compare_encoding_files("KOI8-U", 'unicode/VENDORS/MISC/KOI8-U.TXT', 'whatwg/index-koi8-u.txt', ignore_control_chars)
	    compare_encoding_files("windows-874", 'unicode/VENDORS/MICSFT/WINDOWS/CP874.TXT', 'whatwg/index-windows-874.txt', ignore_control_chars)
	    compare_encoding_files("windows-1250", 'unicode/VENDORS/MICSFT/WINDOWS/CP1250.TXT', 'whatwg/index-windows-1250.txt', ignore_control_chars)
	    compare_encoding_files("windows-1251", 'unicode/VENDORS/MICSFT/WINDOWS/CP1251.TXT', 'whatwg/index-windows-1251.txt', ignore_control_chars)
	    compare_encoding_files("windows-1252", 'unicode/VENDORS/MICSFT/WINDOWS/CP1252.TXT', 'whatwg/index-windows-1252.txt', ignore_control_chars)
	    compare_encoding_files("windows-1253", 'unicode/VENDORS/MICSFT/WINDOWS/CP1253.TXT', 'whatwg/index-windows-1253.txt', ignore_control_chars)
	    compare_encoding_files("windows-1254", 'unicode/VENDORS/MICSFT/WINDOWS/CP1254.TXT', 'whatwg/index-windows-1254.txt', ignore_control_chars)
	    compare_encoding_files("windows-1255", 'unicode/VENDORS/MICSFT/WINDOWS/CP1255.TXT', 'whatwg/index-windows-1255.txt', ignore_control_chars)
	    compare_encoding_files("windows-1256", 'unicode/VENDORS/MICSFT/WINDOWS/CP1256.TXT', 'whatwg/index-windows-1256.txt', ignore_control_chars)
	    compare_encoding_files("windows-1257", 'unicode/VENDORS/MICSFT/WINDOWS/CP1257.TXT', 'whatwg/index-windows-1257.txt', ignore_control_chars)
	    compare_encoding_files("windows-1258", 'unicode/VENDORS/MICSFT/WINDOWS/CP1258.TXT', 'whatwg/index-windows-1258.txt', ignore_control_chars)


	if __name__ == '__main__':
	    # Command-line entry point: compare the two named mapping files, or run
	    # the built-in list of comparisons when no files are named.
	    from argparse import ArgumentParser
	    from pathlib import Path

	    parser = ArgumentParser(
	        prog='compare',
	        description='Compare Unicode Consortium vs WHATWG encoding definitions.'
	    )
	    parser.add_argument('--name', type=str, help='Name of encoding to compare.')
	    parser.add_argument('--ignore-controls', action='store_true', help='Consider control characters and undefined mappings to be the same.')
	    parser.add_argument('FILES', nargs='*', help='Path to Unicode Consortium and WHATWG mapping file (in that order).')
	    args = parser.parse_args()

	    if not args.FILES:
	        compare_chardetng(args.ignore_controls)
	    elif len(args.FILES) != 2:
	        # Report through argparse so the usage text is shown and the
	        # process exits with a non-zero status.
	        parser.error('You must name two files: a Unicode mapping file and a WHATWG mapping file.')
	    else:
	        # Default the name to the WHATWG file name without its "index-"
	        # prefix; a file name lacking that prefix is used as-is.
	        name = args.name or Path(args.FILES[1]).stem.removeprefix('index-')
	        compare_encoding_files(name, args.FILES[0], args.FILES[1], args.ignore_controls)