Skip to content

Instantly share code, notes, and snippets.

@neuroticnerd
Created September 15, 2014 18:07
Show Gist options
  • Save neuroticnerd/92f737f28970b8c93c22 to your computer and use it in GitHub Desktop.
Save neuroticnerd/92f737f28970b8c93c22 to your computer and use it in GitHub Desktop.
Unicode and XML helpers
import re

try:  # Python 2
    import htmlentitydefs
except ImportError:  # Python 3 moved the module to html.entities
    import html.entities as htmlentitydefs

from bs4 import UnicodeDammit
from lxml import etree as ET
try:
    _unichr = unichr  # Python 2: chr() only produces byte strings there
except NameError:
    _unichr = chr  # Python 3: chr() already returns text

# ASCII-only spelling of the original ``\w`` class, so that Python 2 and
# Python 3 (where ``\w`` is Unicode-aware for text patterns) pick out the
# same candidate entities.  Compiled once instead of on every call.
_ENTITY_RE = re.compile(r"&#?[A-Za-z0-9_]+;")


def resolve_entities(entitystring):
    """
    Replace character/entity references in a string with their characters.

    Handles decimal references (``&#38;``), hexadecimal references
    (``&#x26;``) and named HTML entities (``&amp;``).  Text that does not
    look like an entity reference is left untouched.

    Credits for this function go to Fredrik Lundh
    http://effbot.org/zone/re-sub.htm#unescape-html

    :param entitystring: text that may contain entity references
    :returns: the text with every reference replaced by its character
    :raises ValueError: if a reference is malformed, names an unknown
        entity, or designates a code point that cannot be converted to a
        character
    """
    def resolve(match):
        text = match.group(0)
        if text[:2] == "&#":
            # numeric reference: "&#x...;" is hexadecimal, "&#...;" decimal
            if text[:3] == "&#x":
                digits, base = text[3:-1], 16
            else:
                digits, base = text[2:-1], 10
            try:
                return _unichr(int(digits, base))
            except (ValueError, OverflowError) as e:
                # an absurdly large code point raises OverflowError rather
                # than ValueError; report it like any other bad reference
                raise ValueError('entity resolution error: %s' % e)
        try:
            # named entity: strip the leading "&" and trailing ";"
            return _unichr(htmlentitydefs.name2codepoint[text[1:-1]])
        except KeyError as e:
            raise ValueError('entity resolution error: %s' % e)

    return _ENTITY_RE.sub(resolve, entitystring)
def dammit(inputstr, guesses=None, resolve=False):
    """
    Convert a byte string of unknown encoding to UTF-8.

    unicode can be a real pain, but this can help reduce frustration.
    luckily beautiful soup provides UnicodeDammit, which makes
    knowledgeable guesses or detections of what encoding a given input
    uses and converts that to unicode. this leverages that to convert the
    input to utf-8 while also making use of the fact that UnicodeDammit
    should strip the BOM from the input if it is present; the BOM can crash
    a number of parsers/libraries not expecting it.

    :param inputstr: the data to convert; input that is already a text
        (unicode) object is returned unchanged, and ``resolve`` is not
        applied to it
    :param guesses: optional encodings to try, handed to UnicodeDammit as
        its second argument
    :param resolve: when true, entity references in the decoded text are
        replaced via ``resolve_entities`` before it is encoded
    :returns: ``inputstr`` itself if it is already text, otherwise the
        decoded content re-encoded as a UTF-8 byte string
    :raises ValueError: if UnicodeDammit yields no decoded text, or if
        ``resolve`` is set and an entity cannot be resolved
    """
    try:
        text_type = unicode  # Python 2
    except NameError:
        text_type = str  # Python 3
    if isinstance(inputstr, text_type):
        # UnicodeDammit ignores input that is already a unicode object
        return inputstr
    if guesses:
        ud = UnicodeDammit(inputstr, guesses)
    else:
        ud = UnicodeDammit(inputstr)
    # unicode_markup is already decoded text: it must not be decoded a
    # second time with original_encoding (doing so breaks on any non-ASCII
    # character), it only has to be encoded to UTF-8
    text = ud.unicode_markup
    if text is None:
        # guard for the case where no decoded text is available
        raise ValueError('unable to decode input: no usable encoding found')
    if resolve:
        # resolve while the data is still text so that the substituted
        # characters are never mixed with encoded bytes
        text = resolve_entities(text)
    return text.encode('utf-8')
def local(element, exact=False):
    """
    Return the local (namespace-free) part of a qualified XML name.

    :param element: the value handed to ``ET.QName`` to obtain the name
    :param exact: keep the original casing when true; by default the name
        is converted to lowercase
    :returns: the local name, lowercased unless ``exact`` is set
    """
    localname = ET.QName(element).localname
    return localname if exact else localname.lower()
def xmltree(xmlstring):
    """
    Load an xml string into an XML tree object

    *** !important: there is a bug in lxml where it will not accept unicode
    *** strings containing an XML declaration with 'charset' or 'encoding'
    *** attributes because it incorrectly autodetects the encoding. this can be
    *** circumvented by explicitly converting it to bytes first.
    *** if xmlstring is unicode, then it must be encoded in utf-8!

    :param xmlstring: the XML document, either as a byte string or as text;
        text is encoded to UTF-8 bytes before it is parsed
    :returns: an ``ET.ElementTree`` wrapping the parsed root element
    :raises ConverterError: if lxml reports an ``XMLSyntaxError``
    :raises ValueError: propagated unchanged from the parser
    """
    try:
        text_type = unicode  # Python 2
    except NameError:
        text_type = str  # Python 3
    content = xmlstring
    if isinstance(xmlstring, text_type):
        # hand lxml bytes so it does not reject text that carries an
        # encoding declaration (see the docstring)
        content = xmlstring.encode('utf-8')
    try:
        return ET.ElementTree(ET.fromstring(content))
    except ET.XMLSyntaxError as e:
        # NOTE(review): ConverterError is neither defined nor imported in
        # the visible part of this file -- confirm it is available here
        raise ConverterError("PARSING FAILED: %s" % e)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment