Helps fix garbled diacritics in legacy websites. Uses the chardet module to detect each file's character encoding; accepts multiple file names and prints one table row per file.
#-*- coding: utf-8 -*-
import chardet, pathlib, sys
# Maps a fragment of an encoding name to the canonical name shown in the table.
# The three-character keys are compared with the first three characters of the
# detector's answer, so every "Win..." result is reported as Windows-1250 and
# every "ISO..." result as ISO-8859-2 (presumably because the target sites are
# Central European -- confirm before reusing elsewhere). The '1250' key can
# never equal a three-character prefix; it is kept so the table is unchanged
# for any other code that reads it.
known_enc = {'Win': 'Windows-1250', 'ISO': 'ISO-8859-2', '1250': 'Windows-1250', 'utf': 'utf8'}

# Only this many leading bytes are searched for a charset declaration.
HEADER_BYTES = 500


def normalize_encoding(found_enc) -> str:
    """Return the canonical table name for a detected encoding.

    found_enc is the detector's encoding name, or None when nothing was
    detected. A name whose first three characters are a key of ``known_enc``
    is replaced by the mapped value; any other name is returned unchanged.
    None (or an empty name) yields the placeholder ``'unknown'`` so the
    caller can always format the result as a string.
    """
    if not found_enc:
        return 'unknown'
    # The prefix comparison is deliberately case-sensitive, as before, so
    # upper-case names such as 'UTF-16' are not folded onto 'utf8'.
    return known_enc.get(found_enc[:3], found_enc)


def declared_charset(header: bytes) -> str:
    """Return the charset name declared in *header*, or '' if there is none.

    The first ``charset=`` occurrence is located case-insensitively; optional
    spaces and quotes after it are skipped, and the following run of ASCII
    letters, digits and ``-_.:`` characters is returned as written in the file.
    """
    marker = b'charset='
    pos = header.lower().find(marker)
    if pos < 0:
        return ''
    rest = header[pos + len(marker):].lstrip(b' \t"\'')
    name = bytearray()
    for byte in rest:
        ch = bytes([byte])
        # bytes.isalnum() is ASCII-only, so the collected name is pure ASCII.
        if not (ch.isalnum() or ch in b'-_.:'):
            break
        name += ch
    return name.decode('ascii')


def main(paths) -> None:
    """Print one table row per file: detected encoding and declared charset."""
    for fn in paths:
        # Read the file once and reuse the bytes for detection and header scan.
        data = pathlib.Path(fn).read_bytes()
        found_enc = normalize_encoding(chardet.detect(data)['encoding'])
        declared = declared_charset(data[:HEADER_BYTES])
        if declared:
            verdict = f' --> file defines encoding {declared}'
        else:
            verdict = ' --> file DOES NOT define encoding'
        print(f'{fn:20s} auto-detected encoding {found_enc:14s}{verdict}')


if __name__ == '__main__':
    main(sys.argv[1:])
