Created
September 2, 2019 04:03
-
-
Save qnnnnez/5da743a7d281670f00641ff8685a11a1 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def detect_charset(encoded_text, default=None):
    """Guess whether a byte string is encoded as UTF-8 or GBK.

    Detection runs in three stages:

    1. Ask ``chardet``; its answer is trusted when it is ``utf-8`` or
       ``gbk``.
    2. Otherwise try decoding with both codecs. If exactly one succeeds,
       that codec is the answer; if neither succeeds the result is
       ``None``.
    3. If both succeed, feed the data to an XML parser and use the
       encoding named in the XML declaration, when there is one.

    :param encoded_text: the binary data to inspect (``str`` on Python 2,
        ``bytes`` on Python 3)
    :param default: value returned when both codecs can decode the data
        and no XML declaration settles the question
    :return: ``'utf-8'``, ``'gbk'``, the lower-cased encoding named in an
        XML declaration (not restricted to those two), ``None`` when
        neither codec can decode the data, or ``default``
    :raises ValueError: if ``encoded_text`` is not a byte string
    """
    # ``bytes`` is an alias of ``str`` on Python 2 and the real byte-string
    # type on Python 3, so this check is correct on both interpreters.
    if not isinstance(encoded_text, bytes):
        raise ValueError(u'输入类型必须为str')

    # Stage 1: ask chardet. Normalise the case so the comparison does not
    # depend on how the detector spells the name; a missing result (None)
    # simply falls through to the next stage.
    detected_encoding = (chardet.detect(encoded_text)['encoding'] or '').lower()
    if detected_encoding in ('utf-8', 'gbk'):
        # When chardet reports utf-8 or gbk it is generally right.
        return detected_encoding

    # Stage 2: anything else usually means the data is too short to
    # classify reliably, so just try to decode it; a decode error rules the
    # codec out.
    def can_decode(codec):
        try:
            encoded_text.decode(codec)
        except UnicodeDecodeError:
            return False
        return True

    utf8_ok = can_decode('utf-8')
    gbk_ok = can_decode('gbk')
    if utf8_ok != gbk_ok:
        # Exactly one codec accepted the data.
        return 'utf-8' if utf8_ok else 'gbk'
    if not utf8_ok:
        # Neither codec accepted the data.
        return None

    # Stage 3: both codecs decode the data. If it is an XML document, the
    # encoding named in its XML declaration decides.
    # https://stackoverflow.com/questions/25796238/reading-xml-header-encoding
    from xml.parsers import expat

    # latin-1 maps every byte to a code point, so this decode cannot fail.
    latin1_text = encoded_text.decode('latin-1')

    declared = []  # encodings reported by the XML declaration handler

    def xml_decl_handler(version, encoding, standalone):
        declared.append(encoding)

    parser = expat.ParserCreate(encoding='latin-1')
    parser.XmlDeclHandler = xml_decl_handler
    try:
        parser.Parse(latin1_text)
    except expat.ExpatError:
        # Not parseable as XML: nothing more to go on.
        return default
    if declared and declared[-1]:
        return declared[-1].lower()
    return default
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment