Skip to content

Instantly share code, notes, and snippets.

@qnnnnez
Created September 2, 2019 04:03
Show Gist options
  • Save qnnnnez/5da743a7d281670f00641ff8685a11a1 to your computer and use it in GitHub Desktop.
Save qnnnnez/5da743a7d281670f00641ff8685a11a1 to your computer and use it in GitHub Desktop.
def detect_charset(encoded_text, default=None):
    """
    Guess whether a byte string is encoded as UTF-8 or GBK.

    Detection runs in three stages:

    1. Ask ``chardet``; its answer is accepted only when it is exactly
       ``'utf-8'`` or ``'gbk'``.
    2. Otherwise try to decode the data with both codecs. If exactly one
       succeeds, that codec is the answer; if neither does, the result is
       ``None``.
    3. If both succeed, feed the data to an XML parser and use the encoding
       named in the XML declaration, if there is one.

    ``chardet`` is not imported here; it must already be bound in the
    enclosing module.

    :param encoded_text: binary data to inspect (``str`` on Python 2,
        ``bytes`` on Python 3)
    :param default: value returned when both codecs can decode the data and
        no XML declaration settles the question
    :return: ``'utf-8'``, ``'gbk'``, the lower-cased encoding named in an XML
        declaration (not restricted to those two names), ``default``, or
        ``None`` when the data is valid in neither codec
    :raises ValueError: if ``encoded_text`` is not a byte string
    """
    # ``bytes`` is an alias of ``str`` on Python 2, so this guard is unchanged
    # there, while on Python 3 it accepts binary data and rejects text.
    if not isinstance(encoded_text, bytes):
        raise ValueError(u'输入类型必须为str')

    # Stage 1: the author's heuristic is that chardet is usually right when
    # it reports one of the two supported encodings.
    detected_encoding = chardet.detect(encoded_text)['encoding']
    if detected_encoding in ('utf-8', 'gbk'):
        return detected_encoding

    # Stage 2: any other answer mostly means the sample is too short to
    # classify, so fall back to trial decoding; a decode error rules a
    # codec out for certain.
    utf8_ok = _decodes_as(encoded_text, 'utf-8')
    gbk_ok = _decodes_as(encoded_text, 'gbk')
    if utf8_ok != gbk_ok:
        return 'utf-8' if utf8_ok else 'gbk'
    if not utf8_ok:
        return None

    # Stage 3: both codecs accept the data. If it is an XML document, its
    # declaration may name the encoding.
    # https://stackoverflow.com/questions/25796238/reading-xml-header-encoding
    declared = _xml_declared_encoding(encoded_text)
    if declared:
        return declared.lower()
    return default


def _decodes_as(data, codec):
    """Return True if the byte string *data* decodes cleanly with *codec*."""
    try:
        data.decode(codec)
    except UnicodeDecodeError:
        return False
    return True


def _xml_declared_encoding(data):
    """Return the encoding named in *data*'s XML declaration, or None."""
    from xml.parsers import expat

    seen = []

    def on_xml_decl(version, encoding, standalone):
        # ``encoding`` is None when the declaration has no encoding attribute.
        seen.append(encoding)

    # Overriding the document encoding with a single-byte codec means every
    # byte maps to a character, so the raw bytes can be parsed as-is. Passing
    # bytes (not a decoded unicode object) also avoids Python 2's implicit
    # ASCII encoding of unicode arguments, which fails on non-ASCII data.
    parser = expat.ParserCreate(encoding='latin-1')
    parser.XmlDeclHandler = on_xml_decl
    try:
        parser.Parse(data)
    except expat.ExpatError:
        # Not well-formed XML. A declaration reported before the error is
        # still usable, so fall through instead of discarding it.
        pass
    return seen[0] if seen else None
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment