# tantale/detect_xml_encoding.py

## detect_xml_encoding.py
import re

XML_DECL_REGEX = r"""
^<\?xml             # w/o BOM, XML declaration starts with <?xml at the first byte
(?:(?!\?>).)+?      # some chars (version info), minimal, never crossing the closing ?>
encoding\s*=\s*     # encoding attribute begins; XML permits whitespace around =
["']                # attribute start delimiter
(?P<encstr>         # what's matched in the brackets will be named encstr
 [^"']+             # every character not delimiter (not overly exact!)
)                   # closes the brackets pair for the named group
["']                # attribute end delimiter
(?:(?!\?>).)*?      # some chars optionally (standalone decl or whitespace)
\?>                 # XML declaration end
"""

# DOTALL lets the declaration span several lines (line breaks are legal
# whitespace between its attributes).
search_xml_decl = re.compile(XML_DECL_REGEX, re.VERBOSE | re.DOTALL).search

# -- the BOMs we know, by their pattern. Longer signatures come first because
# -- the UTF-32 LE BOM begins with the UTF-16 LE BOM.
_KNOWN_BOMS = (  # byte pattern, name
    ((0x00, 0x00, 0xFE, 0xFF), "utf_32_be"),
    ((0xFF, 0xFE, 0x00, 0x00), "utf_32_le"),
    ((0xEF, 0xBB, 0xBF), "utf_8"),
    ((0xFE, 0xFF), "utf_16_be"),
    ((0xFF, 0xFE), "utf_16_le"),
)


def detect_xml_encoding(fp):
    """
    Attempts to detect the character encoding of the XML file
    given by a file object fp. fp must not be a codec wrapped file
    object!

    The return value can be:

    - if detection of the BOM succeeds, the codec name of the
      corresponding unicode charset is returned
    - if BOM detection fails, the XML declaration is searched for
      the encoding attribute and its value returned. the "<"
      character has to be the very first in the file then (it's XML
      standard after all).
    - if BOM and XML declaration fail, None is returned. According
      to XML 1.0 it should be utf_8 then, but it wasn't detected by
      the means offered here. at least one can be pretty sure that a
      character coding including most of ASCII is used :-/

    Files shorter than a BOM (including empty files) are handled and
    simply yield None when nothing can be detected. The position of fp
    is restored before returning, even if reading fails.

    :param fp: Opened, seekable file or file-like object. Binary streams
        are the intended input; streams yielding text are accepted too.
    :rtype: str or unicode
    :return: Encoding name, or None if it could not be detected
    """
    # -- go to beginning of file and get the head of it
    # -- assume XML declaration fits into the first 2 KB (*cough*)
    old_fp = fp.tell()
    try:
        fp.seek(0)
        tmp_buffer = fp.read(2048)
    finally:
        # -- never leave the caller's file position changed
        fp.seek(old_fp)

    # -- latin-1 maps every byte 1:1 onto a code point, so decoding cannot
    # -- fail and both the BOM check and the regex can work on text
    if isinstance(tmp_buffer, bytes):
        text = tmp_buffer.decode("latin-1")
    else:
        text = tmp_buffer

    # == detection using BOM
    head = tuple(ord(char) for char in text[:4])
    for signature, codec_name in _KNOWN_BOMS:
        if head[:len(signature)] == signature:
            # -- if BOM detected, we're done :-)
            return codec_name

    # -- still here? BOM detection failed.
    # --  now that BOM detection has failed we assume one byte character
    # --  encoding behaving ASCII - of course one could think of nice
    # --  algorithms further investigating on that matter, but I won't for now.

    # == search XML declaration for encoding attribute
    match = search_xml_decl(text)
    if match:
        return match.group("encstr")
    return None
	import re

	XML_DECL_REGEX = r"""
	^<\?xml             # w/o BOM, XML declaration starts with <?xml at the first byte
	(?:(?!\?>).)+?      # some chars (version info), minimal, never crossing the closing ?>
	encoding\s*=\s*     # encoding attribute begins; XML permits whitespace around =
	["']                # attribute start delimiter
	(?P<encstr>         # what's matched in the brackets will be named encstr
	 [^"']+             # every character not delimiter (not overly exact!)
	)                   # closes the brackets pair for the named group
	["']                # attribute end delimiter
	(?:(?!\?>).)*?      # some chars optionally (standalone decl or whitespace)
	\?>                 # XML declaration end
	"""

	# DOTALL lets the declaration span several lines (line breaks are legal
	# whitespace between its attributes).
	search_xml_decl = re.compile(XML_DECL_REGEX, re.VERBOSE | re.DOTALL).search

	# -- the BOMs we know, by their pattern. Longer signatures come first because
	# -- the UTF-32 LE BOM begins with the UTF-16 LE BOM.
	_KNOWN_BOMS = (  # byte pattern, name
	    ((0x00, 0x00, 0xFE, 0xFF), "utf_32_be"),
	    ((0xFF, 0xFE, 0x00, 0x00), "utf_32_le"),
	    ((0xEF, 0xBB, 0xBF), "utf_8"),
	    ((0xFE, 0xFF), "utf_16_be"),
	    ((0xFF, 0xFE), "utf_16_le"),
	)


	def detect_xml_encoding(fp):
	    """
	    Attempts to detect the character encoding of the XML file
	    given by a file object fp. fp must not be a codec wrapped file
	    object!

	    The return value can be:

	    - if detection of the BOM succeeds, the codec name of the
	      corresponding unicode charset is returned
	    - if BOM detection fails, the XML declaration is searched for
	      the encoding attribute and its value returned. the "<"
	      character has to be the very first in the file then (it's XML
	      standard after all).
	    - if BOM and XML declaration fail, None is returned. According
	      to XML 1.0 it should be utf_8 then, but it wasn't detected by
	      the means offered here. at least one can be pretty sure that a
	      character coding including most of ASCII is used :-/

	    Files shorter than a BOM (including empty files) are handled and
	    simply yield None when nothing can be detected. The position of fp
	    is restored before returning, even if reading fails.

	    :param fp: Opened, seekable file or file-like object. Binary streams
	        are the intended input; streams yielding text are accepted too.
	    :rtype: str or unicode
	    :return: Encoding name, or None if it could not be detected
	    """
	    # -- go to beginning of file and get the head of it
	    # -- assume XML declaration fits into the first 2 KB (cough)
	    old_fp = fp.tell()
	    try:
	        fp.seek(0)
	        tmp_buffer = fp.read(2048)
	    finally:
	        # -- never leave the caller's file position changed
	        fp.seek(old_fp)

	    # -- latin-1 maps every byte 1:1 onto a code point, so decoding cannot
	    # -- fail and both the BOM check and the regex can work on text
	    if isinstance(tmp_buffer, bytes):
	        text = tmp_buffer.decode("latin-1")
	    else:
	        text = tmp_buffer

	    # == detection using BOM
	    head = tuple(ord(char) for char in text[:4])
	    for signature, codec_name in _KNOWN_BOMS:
	        if head[:len(signature)] == signature:
	            # -- if BOM detected, we're done :-)
	            return codec_name

	    # -- still here? BOM detection failed.
	    # -- now that BOM detection has failed we assume one byte character
	    # -- encoding behaving ASCII - of course one could think of nice
	    # -- algorithms further investigating on that matter, but I won't for now.

	    # == search XML declaration for encoding attribute
	    match = search_xml_decl(text)
	    if match:
	        return match.group("encstr")
	    return None