# neuroticnerd/utils-unicode-xml.py

## utils-unicode-xml.py
import re
import htmlentitydefs

from lxml import etree as ET
from bs4 import UnicodeDammit


def resolve_entities(entitystring):
    """
    Replace HTML/XML character references in a string with their characters.

    Decimal (``&#38;``) and hexadecimal (``&#x26;``) numeric references are
    converted with ``unichr``; named references (``&amp;``) are looked up in
    ``htmlentitydefs.name2codepoint``.

    Credits for this function go to Fredrik Lundh
    http://effbot.org/zone/re-sub.htm#unescape-html

    :param entitystring: text that may contain entity references
    :returns: the text with every matched reference replaced
    :raises ValueError: if a numeric reference is malformed or out of range,
        or a named reference is not in ``name2codepoint``
    """
    def resolve(match):
        ref = match.group(0)
        if ref[:2] == "&#":
            # numeric reference: "&#x..." is hexadecimal, "&#..." is decimal
            try:
                if ref[:3] == "&#x":
                    return unichr(int(ref[3:-1], 16))
                return unichr(int(ref[2:-1]))
            except (ValueError, OverflowError) as e:
                # unichr() signals values too large for a C integer with
                # OverflowError rather than ValueError; report both uniformly
                raise ValueError('entity resolution error: %s' % e)
        # named reference: strip the leading "&" and trailing ";" for lookup
        try:
            return unichr(htmlentitydefs.name2codepoint[ref[1:-1]])
        except KeyError as e:
            raise ValueError('entity resolution error: %s' % e)
    return re.sub(r"&#?\w+;", resolve, entitystring)


def dammit(inputstr, guesses=None, resolve=False):
    """
    Decode a byte string of unknown encoding into a unicode object.

    unicode can be a real pain, but this can help reduce frustration

    luckily beautiful soup provides UnicodeDammit, which makes knowledgeable
    guesses or detections of what encoding a given input uses and converts
    that to unicode. this also makes use of the fact that UnicodeDammit
    should strip the BOM from the input if it is present; the BOM can crash
    a number of parsers/libraries not expecting it.

    :param inputstr: byte string to decode; a unicode object is returned as-is
    :param guesses: optional candidate encodings, passed through as
        UnicodeDammit's second positional argument
    :param resolve: when true, pass the decoded text through
        ``resolve_entities`` before returning it
    :returns: the decoded text as a unicode object
    :raises ValueError: if UnicodeDammit produced no decoded text, or if
        ``resolve_entities`` rejects an entity reference
    """
    if isinstance(inputstr, unicode):
        # UnicodeDammit ignores input that is already a unicode object
        # NOTE(review): this returns before `resolve` is consulted, so
        # entities in unicode input are never resolved -- confirm intended
        return inputstr
    if guesses:
        ud = UnicodeDammit(inputstr, guesses)
    else:
        ud = UnicodeDammit(inputstr)
    # unicode_markup is already decoded text: it must not be .decode()d a
    # second time (that implicitly ASCII-encodes it first and fails on any
    # non-ASCII character), and original_encoding may be None
    out = ud.unicode_markup
    if out is None:
        raise ValueError('unable to detect the encoding of the input')
    if resolve:
        out = resolve_entities(out)
    return out


def local(element, exact=False):
    """
    Return the local (namespace-free) name of a qualified XML node.

    The name is lowercased unless ``exact`` is true, in which case it is
    returned exactly as ``ET.QName`` reports it.
    """
    localname = ET.QName(element).localname
    return localname if exact else localname.lower()


def xmltree(xmlstring):
    """
    Parse an XML string and return it wrapped in an ElementTree object.

    *** !important: there is a bug in lxml where it will not accept unicode
    *** strings containing an XML declaration with 'charset' or 'encoding'
    *** attributes because it incorrectly autodetects the encoding. this can be
    *** circumvented by explicitly converting it to bytes first.

    *** if xmlstring is unicode, then it must be encoded in utf-8!

    Unicode input is therefore encoded to UTF-8 bytes before parsing; any
    other input is handed to the parser untouched.

    Raises ConverterError when lxml reports an XMLSyntaxError; a ValueError
    from the parser propagates unchanged.
    NOTE(review): ConverterError is not defined or imported in the visible
    part of this module -- confirm it is in scope.
    """
    if isinstance(xmlstring, unicode):
        payload = xmlstring.encode('utf-8')
    else:
        payload = xmlstring
    try:
        return ET.ElementTree(ET.fromstring(payload))
    except ET.XMLSyntaxError as e:
        raise ConverterError("PARSING FAILED: %s" % e)
	import re
	import htmlentitydefs

	from lxml import etree as ET
	from bs4 import UnicodeDammit


	def resolve_entities(entitystring):
	    """
	    Replace HTML/XML character references in a string with their characters.

	    Decimal (``&#38;``) and hexadecimal (``&#x26;``) numeric references are
	    converted with ``unichr``; named references (``&amp;``) are looked up in
	    ``htmlentitydefs.name2codepoint``.

	    Credits for this function go to Fredrik Lundh
	    http://effbot.org/zone/re-sub.htm#unescape-html

	    :param entitystring: text that may contain entity references
	    :returns: the text with every matched reference replaced
	    :raises ValueError: if a numeric reference is malformed or out of range,
	        or a named reference is not in ``name2codepoint``
	    """
	    def resolve(match):
	        ref = match.group(0)
	        if ref[:2] == "&#":
	            # numeric reference: "&#x..." is hexadecimal, "&#..." is decimal
	            try:
	                if ref[:3] == "&#x":
	                    return unichr(int(ref[3:-1], 16))
	                return unichr(int(ref[2:-1]))
	            except (ValueError, OverflowError) as e:
	                # unichr() signals values too large for a C integer with
	                # OverflowError rather than ValueError; report both uniformly
	                raise ValueError('entity resolution error: %s' % e)
	        # named reference: strip the leading "&" and trailing ";" for lookup
	        try:
	            return unichr(htmlentitydefs.name2codepoint[ref[1:-1]])
	        except KeyError as e:
	            raise ValueError('entity resolution error: %s' % e)
	    return re.sub(r"&#?\w+;", resolve, entitystring)


	def dammit(inputstr, guesses=None, resolve=False):
	    """
	    Decode a byte string of unknown encoding into a unicode object.

	    unicode can be a real pain, but this can help reduce frustration

	    luckily beautiful soup provides UnicodeDammit, which makes knowledgeable
	    guesses or detections of what encoding a given input uses and converts
	    that to unicode. this also makes use of the fact that UnicodeDammit
	    should strip the BOM from the input if it is present; the BOM can crash
	    a number of parsers/libraries not expecting it.

	    :param inputstr: byte string to decode; a unicode object is returned as-is
	    :param guesses: optional candidate encodings, passed through as
	        UnicodeDammit's second positional argument
	    :param resolve: when true, pass the decoded text through
	        ``resolve_entities`` before returning it
	    :returns: the decoded text as a unicode object
	    :raises ValueError: if UnicodeDammit produced no decoded text, or if
	        ``resolve_entities`` rejects an entity reference
	    """
	    if isinstance(inputstr, unicode):
	        # UnicodeDammit ignores input that is already a unicode object
	        # NOTE(review): this returns before `resolve` is consulted, so
	        # entities in unicode input are never resolved -- confirm intended
	        return inputstr
	    if guesses:
	        ud = UnicodeDammit(inputstr, guesses)
	    else:
	        ud = UnicodeDammit(inputstr)
	    # unicode_markup is already decoded text: it must not be .decode()d a
	    # second time (that implicitly ASCII-encodes it first and fails on any
	    # non-ASCII character), and original_encoding may be None
	    out = ud.unicode_markup
	    if out is None:
	        raise ValueError('unable to detect the encoding of the input')
	    if resolve:
	        out = resolve_entities(out)
	    return out


	def local(element, exact=False):
	    """
	    Return the local (namespace-free) name of a qualified XML node.

	    The name is lowercased unless ``exact`` is true, in which case it is
	    returned exactly as ``ET.QName`` reports it.
	    """
	    name = ET.QName(element).localname
	    if not exact:
	        name = name.lower()
	    return name


	def xmltree(xmlstring):
	    """
	    Parse an XML string and return it wrapped in an ElementTree object.

	    *** !important: there is a bug in lxml where it will not accept unicode
	    *** strings containing an XML declaration with 'charset' or 'encoding'
	    *** attributes because it incorrectly autodetects the encoding. this can be
	    *** circumvented by explicitly converting it to bytes first.

	    *** if xmlstring is unicode, then it must be encoded in utf-8!

	    Unicode input is therefore encoded to UTF-8 bytes before parsing; any
	    other input is handed to the parser untouched.

	    Raises ConverterError when lxml reports an XMLSyntaxError; a ValueError
	    from the parser propagates unchanged.
	    NOTE(review): ConverterError is not defined or imported in the visible
	    part of this module -- confirm it is in scope.
	    """
	    content = xmlstring
	    if isinstance(xmlstring, unicode):
	        content = xmlstring.encode('utf-8')
	    try:
	        return ET.ElementTree(ET.fromstring(content))
	    except ET.XMLSyntaxError as e:
	        raise ConverterError("PARSING FAILED: %s" % e)