Last active
March 10, 2020 05:06
-
-
Save pepoluan/f8e354c21f76e55dc50542c70fe28962 to your computer and use it in GitHub Desktop.
Simple Detection of Text File Encoding
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# This code is released to the Public Domain.
# Alternatively, you can use one of the following licenses: Unlicense OR CC0-1.0 OR WTFPL OR MIT-0 OR BSD-3-Clause
# IMPORTANT:
# Please note that this does NOT attempt to perform complex guessing, e.g. CP437 or CP850 or GB18030 or JIS or...
# Well, you get the idea.
# This function simply tries to guess the most common *Unicode* encodings, i.e., UTF-8, UTF-16, and UTF-32.
# More 'exotic' Unicode encodings such as UCS-1, UCS-2, UTF-7, etc. are NOT detected. (They would likely be
# detected as "utf-8" by this function.)
# If you need more 'advanced' detection, use the heavyweight "chardet" library instead.
def detect_enc(filename) -> str:
    """Guess the Unicode encoding of a file from its first four bytes.

    The file is opened in binary mode and only its first (up to) four bytes
    are inspected. A byte-order mark (BOM) is looked for first; if none is
    present, the position of NUL bytes is used as a heuristic.

    Args:
        filename: Path of the file to inspect (anything ``open()`` accepts).

    Returns:
        One of ``"utf-8-sig"``, ``"utf-32"``, ``"utf-16"``, ``"utf-32-be"``,
        ``"utf-32-le"``, ``"utf-16-be"``, ``"utf-16-le"`` or ``"utf-8"``.
        ``"utf-8"`` is the fallback, including for empty or very short files.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    utf16_boms = {b'\xff\xfe', b'\xfe\xff'}
    utf32_boms = {b'\xff\xfe\x00\x00', b'\x00\x00\xfe\xff'}

    with open(filename, "rb") as fdet:
        head = fdet.read(4)

    if head[:3] == b"\xEF\xBB\xBF":
        return "utf-8-sig"
    # The UTF-32 check must come before UTF-16: the UTF-32-LE BOM starts
    # with the same two bytes as the UTF-16-LE BOM.
    if head in utf32_boms:  # UTF-32 with BOM, let Python handle the BOM-stripping
        return "utf-32"
    if head[:2] in utf16_boms:  # UTF-16 with BOM, let Python handle the BOM-stripping
        return "utf-16"

    # The following cases are when no BOM is detected.
    # We need to guess based on the location of the x00 chars.
    if head[:2] == b"\x00\x00":
        return "utf-32-be"
    if head[2:4] == b"\x00\x00":
        return "utf-32-le"
    # Compare one-byte *slices*, not indexed items: indexing a bytes object
    # yields an int (which never equals b"\x00") and raises IndexError when
    # the file is shorter than the index.
    if head[0:1] == b"\x00":
        return "utf-16-be"
    if head[1:2] == b"\x00":
        return "utf-16-le"

    return "utf-8"  # Just assume UTF-8 (without BOM)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment