Skip to content

Instantly share code, notes, and snippets.

@akx
Last active May 11, 2022 07:07
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save akx/edd4e2b033ebb00d283be0b6fac0bd64 to your computer and use it in GitHub Desktop.
Save akx/edd4e2b033ebb00d283be0b6fac0bd64 to your computer and use it in GitHub Desktop.
import random
from collections import Counter
# Codec names sampled at random by the test loop. Judging by the variable
# name this is presumably the set of text encodings shipped with Python 3.10
# (TODO confirm against the `codecs` documentation). Entries are kept in
# their original (lexicographic) order, grouped per line by common prefix.
py310_encodings = [
    "ascii",
    "big5", "big5hkscs",
    "cp037", "cp1006", "cp1026", "cp1125", "cp1140",
    "cp1250", "cp1251", "cp1252", "cp1253", "cp1254",
    "cp1255", "cp1256", "cp1257", "cp1258",
    "cp273", "cp424", "cp437", "cp500",
    "cp720", "cp737", "cp775",
    "cp850", "cp852", "cp855", "cp856", "cp857", "cp858",
    "cp860", "cp861", "cp862", "cp863", "cp864", "cp865", "cp866", "cp869",
    "cp874", "cp875",
    "cp932", "cp949", "cp950",
    "euc_jis_2004", "euc_jisx0213", "euc_jp", "euc_kr",
    "gb18030", "gb2312", "gbk",
    "hz",
    "iso2022_jp", "iso2022_jp_1", "iso2022_jp_2", "iso2022_jp_2004",
    "iso2022_jp_3", "iso2022_jp_ext", "iso2022_kr",
    "iso8859_10", "iso8859_11", "iso8859_13", "iso8859_14",
    "iso8859_15", "iso8859_16",
    "iso8859_2", "iso8859_3", "iso8859_4", "iso8859_5",
    "iso8859_6", "iso8859_7", "iso8859_8", "iso8859_9",
    "johab",
    "koi8_r", "koi8_t", "koi8_u",
    "kz1048",
    "latin_1",
    "mac_cyrillic", "mac_greek", "mac_iceland",
    "mac_latin2", "mac_roman", "mac_turkish",
    "ptcp154",
    "shift_jis", "shift_jis_2004", "shift_jisx0213",
    "utf_16", "utf_16_be", "utf_16_le",
    "utf_32", "utf_32_be", "utf_32_le",
    "utf_7", "utf_8", "utf_8_sig",
]
def loop(max_iterations=None, report_every=100_000):
    """Fuzz random strings through random encodings and report UTF-8 misreads.

    Each iteration builds a string of 1-10 random code points, picks a random
    name from ``py310_encodings`` and hands both to ``step``. Results are
    tallied as OK (``True``), skipped (``False``) or failed (a tuple whose
    first item is the encoding name). A two-line summary is printed every
    ``report_every`` iterations.

    Args:
        max_iterations: Stop after this many iterations. ``None`` (the
            default) runs forever, as the function always did.
        report_every: Print a summary each time the iteration count is a
            multiple of this value. Must be at least 1.

    Returns:
        A ``Counter`` mapping encoding name to number of failures. Only
        reachable when ``max_iterations`` is given.

    Raises:
        ValueError: If ``report_every`` is smaller than 1.
    """
    if report_every < 1:
        raise ValueError("report_every must be >= 1")

    n_tried = 0
    n_ok = 0
    n_skip = 0
    n_failed = 0
    # Tally failures per encoding as they happen instead of keeping every
    # failure tuple alive: the default mode never terminates, so a list of
    # results would grow without bound and be re-counted on every report.
    encs = Counter()

    while max_iterations is None or n_tried < max_iterations:
        # Any code point in 0..0x10FFFF may be drawn, surrogates included;
        # `step` reports whatever cannot be encoded as a skip.
        length = random.randint(1, 10)
        string = "".join(chr(random.randint(0, 0x110000 - 1)) for _ in range(length))
        encoding = random.choice(py310_encodings)
        n_tried += 1

        res = step(string, encoding)
        if res is True:
            n_ok += 1
        elif res is False:  # case could not be evaluated; see step()
            n_skip += 1
        else:
            n_failed += 1
            encs[res[0]] += 1

        if n_tried % report_every == 0:
            n_checked = n_ok + n_failed
            # Guard the ratio: every case so far may have been skipped.
            failures = n_failed / n_checked if n_checked else 0.0
            print(f"{n_tried=} {n_ok=} {n_skip=} | {n_failed} failed of all tests ({failures:.2%} of non-skipped)")
            print(*(f"{enc}: {n}" for (enc, n) in encs.most_common()), sep=", ")

    return encs
def step(string, encoding):
    """Check whether ``string`` encoded as ``encoding`` is misread as UTF-8.

    The string is encoded with ``encoding``; the resulting bytes are then
    decoded as UTF-8 and compared with the text the real codec gives back.

    Args:
        string: Text to encode.
        encoding: Codec name to encode (and round-trip decode) with.

    Returns:
        ``True`` if the bytes decode as UTF-8 to the same text.
        ``False`` if the case cannot be evaluated: the codec cannot
        encode/decode the string, the codec round trip does not reproduce the
        input (a "Bad encoding" line is printed), or the bytes are not valid
        UTF-8.
        A tuple ``(encoding, decoded, decoded_as_utf8)`` if the bytes are
        valid UTF-8 but decode to different text.
    """
    try:
        encoded = string.encode(encoding)
        decoded = encoded.decode(encoding)
    except UnicodeError:  # Unrepresentable, skip test
        return False

    if string != decoded:
        print(f"Bad encoding: {encoding!r} => {string!r}")
        return False

    try:
        decoded_as_utf8 = encoded.decode('utf-8')
    except UnicodeError:
        return False

    # A "*_sig" codec prepends a single BOM that a plain UTF-8 decode keeps
    # as U+FEFF. Drop only that one leading character: removing every U+FEFF
    # would also delete occurrences that belong to the input string and
    # report a spurious mismatch.
    if '_sig' in encoding and decoded_as_utf8.startswith('\ufeff'):
        decoded_as_utf8 = decoded_as_utf8[1:]

    if decoded != decoded_as_utf8:
        return (encoding, decoded, decoded_as_utf8)
    return True
def main():
    """Script entry point: start the fuzzing loop."""
    loop()
# Run the fuzzer only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment