FilipDominec/encoding_detection_for_html.py

## encoding_detection_for_html.py
#!/usr/bin/python3
#-*- coding: utf-8 -*-
import pathlib
import re
import sys

import chardet

# Name reported for a chardet result, looked up by the FIRST THREE characters
# of chardet's answer (case-sensitive).  Consequently every 'Windows-125x'
# result is reported as Windows-1250 and every 'ISO-...' result as ISO-8859-2.
# NOTE(review): presumably deliberate for Central European input -- confirm.
# The '1250' key is four characters long, so a three-character prefix can never
# select it; it is kept only so the table itself stays unchanged.
known_enc = {
    'Win': 'Windows-1250',
    'ISO': 'ISO-8859-2',
    '1250': 'Windows-1250',
    'utf': 'utf8',
}

# Only this many leading bytes of a file are searched for a declaration.
HEADER_BYTES = 500

# 'charset=' in any letter case, an optional quote, then the encoding name.
_CHARSET_RE = re.compile(rb'charset=\s*["\']?\s*([A-Za-z0-9._:-]*)', re.IGNORECASE)


def normalize_encoding(name):
    """Return the display name for an encoding reported by chardet.

    Args:
        name: The ``'encoding'`` value from ``chardet.detect``; ``None`` when
            chardet could not make a guess (e.g. for an empty file).

    Returns:
        ``'unknown'`` for ``None``, the ``known_enc`` entry selected by the
        first three characters of ``name`` if there is one, else ``name``.
    """
    if name is None:
        return 'unknown'
    return known_enc.get(name[:3], name)


def declared_charset(header):
    """Return the encoding named by a ``charset=`` declaration in ``header``.

    Args:
        header: The leading bytes of the file.

    Returns:
        The text following the first ``charset=`` (matched case-insensitively,
        surrounding quote removed), an empty string when ``charset=`` is
        present but no name follows it, or ``None`` when ``header`` contains
        no ``charset=`` at all.
    """
    match = _CHARSET_RE.search(header)
    if match is None:
        return None
    # The pattern only admits ASCII characters, so this decode cannot fail.
    return match.group(1).decode('ascii')


def format_report(fn, detected, header):
    """Build the one-line report for a single file.

    Args:
        fn: File name as given on the command line.
        detected: Encoding name from chardet, or ``None``.
        header: Leading bytes of the file, searched for ``charset=``.

    Returns:
        The report line without a trailing newline.
    """
    line = f'{fn:20s} auto-detected encoding {normalize_encoding(detected):14s}'
    declared = declared_charset(header)
    if declared is None:
        return line + ' --> file DOES NOT define encoding'
    return line + ' --> file defines encoding ' + declared


def main(filenames):
    """Print an encoding report line for every file in ``filenames``."""
    for fn in filenames:
        # Read once; both the detector and the header scan use these bytes.
        data = pathlib.Path(fn).read_bytes()
        detected = chardet.detect(data)['encoding']
        print(format_report(fn, detected, data[:HEADER_BYTES]))


if __name__ == '__main__':
    main(sys.argv[1:])