Created
August 1, 2019 20:05
-
-
Save jonathaneunice/6c1337876fe7eb74a4ba48b57a4869a4 to your computer and use it in GitHub Desktop.
Unknown Encoding Reader
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def read_file_with_BOM_misc_encoding(filepath, encoding='utf_32_be utf_32_le utf-16le utf-16be utf-8'): | |
""" | |
Attempt to guess the proper encoding by trying them in a sequence. | |
Wider encodings should be tried before narrower. | |
""" | |
for enc in encoding.split(): | |
# try each encoding in turn, take results of first to not blow up | |
try: | |
with open(filepath, mode='r', encoding=enc) as f: | |
text = f.read() | |
if text.startswith(BOM): | |
# print('read_file_with_BOM: removing BOM!') | |
text = text[1:] | |
elif text.startswith(BADBOM): | |
# if file has a BOM that's obviously a byte-order reversal error, | |
# try again | |
raise ValueError | |
return text | |
except (UnicodeDecodeError, ValueError): | |
# print('falling back to next encoder') | |
pass | |
raise UnicodeDecodeError(f'could not read {filepath!r} with any tried encoding') |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment