# pepoluan/detect_enc.py

## detect_enc.py
# This code is released to the Public Domain.
# Alternatively, you can use one of the following licenses: Unlicense OR CC0-1.0 OR WTFPL OR MIT-0 OR BSD-3-Clause

# IMPORTANT:
# Please do note that this does NOT attempt to perform complex guessing e.g. CP437 or CP850 or GB18030 or JIS or...
# Well, you get the idea.
# This function will simply try to guess the most common *Unicode* encoding, i.e., UTF-8, UTF-16, and UTF-32
# More 'exotic' Unicode encodings such as UCS-1, UCS-2, UTF-7, etc. are NOT detected. (They would likely be
# detected as "utf-8" by this function.)
# If you need more 'advanced' detection, use the heavyweight "chardet" library instead.

def detect_enc(filename):
    """Guess the Unicode encoding of a file from its first four bytes.

    The file is opened in binary mode and at most four bytes are read.
    A byte-order mark (BOM) is checked first; when none is present, the
    position of NUL bytes in the prefix is used as a heuristic.

    Args:
        filename: Path of the file to inspect; passed straight to ``open()``.

    Returns:
        One of the codec names ``"utf-8-sig"``, ``"utf-32"``, ``"utf-16"``,
        ``"utf-32-be"``, ``"utf-32-le"``, ``"utf-16-be"``, ``"utf-16-le"``
        or ``"utf-8"`` (the fallback, also returned for empty files).

    Raises:
        OSError: If the file cannot be opened or read.
    """
    utf16_boms = {b'\xff\xfe', b'\xfe\xff'}
    utf32_boms = {b'\xff\xfe\x00\x00', b'\x00\x00\xfe\xff'}
    with open(filename, "rb") as fdet:
        b = fdet.read(4)
    if b[:3] == b"\xEF\xBB\xBF":
        enc = "utf-8-sig"
    # UTF-32 must be tested before UTF-16: the UTF-32-LE BOM starts with
    # the UTF-16-LE BOM.
    elif b in utf32_boms:  # UTF-32 with BOM, let Python handle the BOM-stripping
        enc = "utf-32"
    elif b[:2] in utf16_boms:  # UTF-16 with BOM, let Python handle the BOM-stripping
        enc = "utf-16"
    # The following cases are when no BOM is detected.
    # We need to guess based on the location of the x00 chars
    elif b[:2] == b"\x00\x00":
        enc = "utf-32-be"
    elif b[2:4] == b"\x00\x00":
        enc = "utf-32-le"
    # Slices (not b[0] / b[1]) are required here: indexing a bytes object
    # yields an int, which never equals b'\x00', and indexing past the end
    # of a short or empty prefix would raise IndexError.
    elif b[0:1] == b'\x00':
        enc = "utf-16-be"
    elif b[1:2] == b'\x00':
        enc = "utf-16-le"
    else:
        enc = "utf-8"  # Just assume UTF-8 (without BOM)
    return enc
	# This code is released to the Public Domain.
	# Alternatively, you can use one of the following licenses: Unlicense OR CC0-1.0 OR WTFPL OR MIT-0 OR BSD-3-Clause

	# IMPORTANT:
	# Please do note that this does NOT attempt to perform complex guessing e.g. CP437 or CP850 or GB18030 or JIS or...
	# Well, you get the idea.
	# This function will simply try to guess the most common Unicode encoding, i.e., UTF-8, UTF-16, and UTF-32
	# More 'exotic' Unicode encodings such as UCS-1, UCS-2, UTF-7, etc. are NOT detected. (They would likely be
	# detected as "utf-8" by this function.)
	# If you need more 'advanced' detection, use the heavyweight "chardet" library instead.

	def detect_enc(filename):
		"""Guess the Unicode encoding of a file from its first four bytes.

		The file is opened in binary mode and at most four bytes are read.
		A byte-order mark (BOM) is checked first; when none is present, the
		position of NUL bytes in the prefix is used as a heuristic.

		Args:
			filename: Path of the file to inspect; passed straight to ``open()``.

		Returns:
			One of the codec names ``"utf-8-sig"``, ``"utf-32"``, ``"utf-16"``,
			``"utf-32-be"``, ``"utf-32-le"``, ``"utf-16-be"``, ``"utf-16-le"``
			or ``"utf-8"`` (the fallback, also returned for empty files).

		Raises:
			OSError: If the file cannot be opened or read.
		"""
		utf16_boms = {b'\xff\xfe', b'\xfe\xff'}
		utf32_boms = {b'\xff\xfe\x00\x00', b'\x00\x00\xfe\xff'}
		with open(filename, "rb") as fdet:
			b = fdet.read(4)
		if b[:3] == b"\xEF\xBB\xBF":
			enc = "utf-8-sig"
		# UTF-32 must be tested before UTF-16: the UTF-32-LE BOM starts with
		# the UTF-16-LE BOM.
		elif b in utf32_boms:  # UTF-32 with BOM, let Python handle the BOM-stripping
			enc = "utf-32"
		elif b[:2] in utf16_boms:  # UTF-16 with BOM, let Python handle the BOM-stripping
			enc = "utf-16"
		# The following cases are when no BOM is detected.
		# We need to guess based on the location of the x00 chars
		elif b[:2] == b"\x00\x00":
			enc = "utf-32-be"
		elif b[2:4] == b"\x00\x00":
			enc = "utf-32-le"
		# Slices (not b[0] / b[1]) are required here: indexing a bytes object
		# yields an int, which never equals b'\x00', and indexing past the end
		# of a short or empty prefix would raise IndexError.
		elif b[0:1] == b'\x00':
			enc = "utf-16-be"
		elif b[1:2] == b'\x00':
			enc = "utf-16-le"
		else:
			enc = "utf-8"  # Just assume UTF-8 (without BOM)
		return enc