Skip to content

Instantly share code, notes, and snippets.

@tantale
Created January 31, 2016 20:28
Show Gist options
  • Save tantale/c217176eed7c6c5dad76 to your computer and use it in GitHub Desktop.
Save tantale/c217176eed7c6c5dad76 to your computer and use it in GitHub Desktop.
Detect the character encoding of the XML file
import re
XML_DECL_REGEX = r"""
    ^<\?xml\s           # w/o BOM, the XML declaration starts at the very first byte
    [^>]*?              # version info etc.; may span lines, never leaves the declaration
    encoding\s*=\s*     # encoding attribute, optional whitespace around the equals sign
    ["']                # attribute start delimiter
    (?P<encstr>         # the encoding name is captured as group "encstr"
        [^"']+          # every character that is not a delimiter (not overly exact!)
    )
    ["']                # attribute end delimiter
    [^>]*?              # optional rest (standalone declaration or whitespace)
    \?>                 # XML declaration end
"""
search_xml_decl = re.compile(XML_DECL_REGEX, re.VERBOSE).search

# Known byte order marks and the codec each one implies.  Order matters: the
# UTF-32 LE mark begins with the UTF-16 LE mark, so the longer one is tried
# first.
_BOM_ENCODINGS = (
    (b"\x00\x00\xfe\xff", "utf_32_be"),
    (b"\xff\xfe\x00\x00", "utf_32_le"),
    (b"\xfe\xff", "utf_16_be"),
    (b"\xff\xfe", "utf_16_le"),
    (b"\xef\xbb\xbf", "utf_8"),
)


def detect_xml_encoding(fp):
    """
    Attempt to detect the character encoding of an XML file.

    Detection runs in two stages on the first 2 KB of the file:

    - if the data starts with a known byte order mark (BOM), the codec
      name of the corresponding Unicode encoding is returned;
    - otherwise the XML declaration is searched for its ``encoding``
      attribute and that value is returned.  The ``<`` character has to
      be the very first one in the file for this to succeed.

    If both stages fail, None is returned.  According to XML 1.0 the
    document should be UTF-8 in that case, but that was not detected by
    the means offered here.

    The read position of ``fp`` is restored before returning, even when
    reading raises.

    :param fp: Seekable file or file-like object opened in binary mode
        (not a codec-wrapped file object).  If it yields text instead of
        bytes, BOM detection is skipped and only the XML declaration is
        searched.
    :rtype: str or None
    :return: Encoding name, or None if nothing could be detected
    """
    saved_pos = fp.tell()
    try:
        fp.seek(0)
        # Assume the XML declaration fits into the first 2 KB.  A short or
        # empty file simply yields a shorter buffer.
        head = fp.read(2048)
    finally:
        fp.seek(saved_pos)

    if isinstance(head, bytes):
        # == detection using BOM
        for bom, codec_name in _BOM_ENCODINGS:
            if head.startswith(bom):
                return codec_name
        # No BOM: assume an ASCII-compatible single-byte encoding.  The
        # pattern is a text pattern, so on Python 3 the bytes are mapped 1:1
        # to characters via latin-1 (which cannot fail).  On Python 2 bytes
        # *is* str and the buffer is searched as-is.
        if not isinstance(head, str):
            head = head.decode("latin-1")

    # == search XML declaration for encoding attribute
    match = search_xml_decl(head)
    return match.group("encstr") if match else None
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment