Created
September 15, 2014 18:07
-
-
Save neuroticnerd/92f737f28970b8c93c22 to your computer and use it in GitHub Desktop.
Unicode and XML helpers
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import re

try:
    import htmlentitydefs  # Python 2
except ImportError:
    import html.entities as htmlentitydefs  # Python 3

from bs4 import UnicodeDammit
from lxml import etree as ET
def resolve_entities(entitystring):
    """
    Replace HTML/XML character references in ``entitystring`` with the
    characters they stand for.

    Decimal (``&#65;``) and hexadecimal (``&#x41;`` or ``&#X41;``) numeric
    references are supported, as well as named entities (``&amp;``).

    Credits for this function go to Fredrik Lundh
    http://effbot.org/zone/re-sub.htm#unescape-html

    :param entitystring: text that may contain character references
    :returns: the text with every reference replaced by its character
    :raises ValueError: if a reference uses an unknown entity name, a
        malformed number, or a code point that cannot be represented
    """
    try:
        to_char = unichr  # Python 2
    except NameError:
        to_char = chr  # Python 3

    def resolve(match):
        text = match.group(0)
        if text.startswith("&#"):
            try:
                # the hexadecimal marker may be written as 'x' or 'X'
                if text[2:3] in ("x", "X"):
                    return to_char(int(text[3:-1], 16))
                return to_char(int(text[2:-1]))
            except (ValueError, OverflowError) as e:
                # OverflowError is raised for absurdly large numbers; fold it
                # into the same ValueError callers already handle
                raise ValueError('entity resolution error: %s' % e)
        try:
            return to_char(htmlentitydefs.name2codepoint[text[1:-1]])
        except KeyError as e:
            raise ValueError('entity resolution error: %s' % e)

    return re.sub(r"&#?\w+;", resolve, entitystring)
def dammit(inputstr, guesses=None, resolve=False):
    """
    unicode can be a real pain, but this can help reduce frustration

    luckily beautiful soup provides UnicodeDammit, which makes knowledgeable
    guesses or detections of what encoding a given input uses and converts
    that to unicode. this leverages that to convert the input to utf-8 while
    also making use of the fact that UnicodeDammit should strip the BOM from
    the input if it is present; the BOM can crash a number of
    parsers/libraries not expecting it.

    :param inputstr: byte string of unknown encoding, or unicode text
    :param guesses: optional list of encodings passed to UnicodeDammit as
        its second positional argument
    :param resolve: when true, character references are expanded with
        resolve_entities() before the text is encoded
    :returns: ``inputstr`` unchanged if it is already unicode, otherwise the
        converted text encoded as utf-8 bytes
    :raises ValueError: if UnicodeDammit produced no text for the input, or
        if entity resolution fails
    """
    try:
        text_type = unicode  # Python 2
    except NameError:
        text_type = str  # Python 3

    if isinstance(inputstr, text_type):
        # UnicodeDammit ignores input that is already a unicode object
        # NOTE(review): `resolve` is not applied on this path; kept as-is so
        # existing callers see no change -- confirm whether that is intended
        return inputstr

    if guesses:
        ud = UnicodeDammit(inputstr, guesses)
    else:
        ud = UnicodeDammit(inputstr)

    # unicode_markup is already decoded text: decoding it a second time (as
    # was done previously) forces an implicit ASCII encode and blows up on
    # any non-ASCII character
    out = ud.unicode_markup
    if out is None:
        raise ValueError('unable to detect the encoding of the input')

    if resolve:
        # resolve while still text so bytes and unicode are never mixed
        out = resolve_entities(out)
    return out.encode('utf-8')
def local(element, exact=False):
    """
    Return the local (namespace-free) part of a qualified XML node name.

    The name is lowercased unless ``exact`` is true, in which case it is
    returned with its original casing.
    """
    localname = ET.QName(element).localname
    return localname if exact else localname.lower()
class ConverterError(Exception):
    """Raised by xmltree() when the XML input cannot be parsed."""


def xmltree(xmlstring):
    """
    Load an xml string into an XML tree object

    *** !important: there is a bug in lxml where it will not accept unicode
    *** strings containing an XML declaration with 'charset' or 'encoding'
    *** attributes because it incorrectly autodetects the encoding. this can be
    *** circumvented by explicitly converting it to bytes first.
    *** if xmlstring is unicode, then it must be encoded in utf-8!

    :param xmlstring: the XML document as a byte string or unicode text
    :returns: an ``ET.ElementTree`` wrapping the parsed root element
    :raises ConverterError: if lxml reports an XML syntax error
    :raises ValueError: propagated unchanged from the parser
    """
    try:
        text_type = unicode  # Python 2
    except NameError:
        text_type = str  # Python 3

    content = xmlstring
    if isinstance(xmlstring, text_type):
        # hand lxml bytes so it never sees a unicode string that carries an
        # encoding declaration
        content = xmlstring.encode('utf-8')

    try:
        return ET.ElementTree(ET.fromstring(content))
    except ET.XMLSyntaxError as e:
        raise ConverterError("PARSING FAILED: %s" % e)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment