Skip to content

Instantly share code, notes, and snippets.

What would you like to do?
Guess encoding by looking at the content
def guess_encoding(s):
    """Guess the character encoding of a byte string.

    The guess is made from the byte-order mark (BOM), if present, and
    otherwise from how many NUL bytes the data contains.

    Args:
        s: The raw data to inspect, as ``bytes`` or ``bytearray``. A text
            string is also accepted for backward compatibility; each
            character is then treated as one byte (characters above
            U+00FF are treated as an arbitrary non-NUL byte).

    Returns:
        A codec name: ``'utf-16'`` (UTF-16 BOM found; the codec resolves
        the byte order itself), ``'utf-16-be'`` or ``'utf-16-le'`` (no BOM
        but many NUL bytes), ``'utf-8-sig'`` (UTF-8 BOM found), or
        ``'utf-8'`` as the fallback, including for empty input.
    """
    if not isinstance(s, (bytes, bytearray)):
        # Text input: Latin-1 maps code points 0-255 to the same byte
        # values, so the BOM and NUL checks below behave identically.
        # 'replace' turns anything else into '?', preserving the length.
        s = s.encode('latin-1', 'replace')

    # UTF-16 BOM: 0xFF 0xFE is little-endian, 0xFE 0xFF is big-endian.
    # The generic 'utf-16' codec consumes the BOM and picks the byte order.
    if s.startswith((b'\xff\xfe', b'\xfe\xff')):
        return 'utf-16'

    # If more than 2/5 of the bytes are NUL, it's likely BOM-less UTF-16.
    num_zeros = s.count(b'\0')
    if 2.5 * num_zeros > len(s):
        # For mostly-ASCII text the NUL is the high byte of each 16-bit
        # unit: it sits at even offsets in big-endian data and at odd
        # offsets in little-endian data.
        if s.find(b'\0') % 2 == 0:
            return 'utf-16-be'
        return 'utf-16-le'

    # UTF-8 BOM: 0xEF 0xBB 0xBF.
    if s.startswith(b'\xef\xbb\xbf'):
        return 'utf-8-sig'
    return 'utf-8'
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment