Skip to content

Instantly share code, notes, and snippets.

@the-dan
Created August 15, 2015 11:19
Show Gist options
  • Save the-dan/63afc6fd4e4f8e7f333d to your computer and use it in GitHub Desktop.
Save the-dan/63afc6fd4e4f8e7f333d to your computer and use it in GitHub Desktop.
Tinkering with ElementTree encoding detection
# -*- coding: utf-8 -*-
from xml.etree import ElementTree as ET
import itertools
import StringIO
import traceback
# XML document template. The %s placeholder in the prolog is filled with the
# declared encoding name; the <msg> payload is non-ASCII (uppercase Cyrillic).
XML=r"""<?xml version='1.0' encoding='%s'?><response><msg>АБВГДЕЖЗСКЛМНОПРСТУФХЦЧШЩЫЪЭЬЮЯ</msg></response>"""
# XXX: Ё is left out of the payload above; for some unknown reason it isn't supported
# How the document is handed to ElementTree by the script below:
# "string" -> ET.fromstring(xml), "file" -> ET.parse() on a StringIO wrapper.
PARSING_STYLE = ["string", "file"]
# Actual in-memory form of the document: "unicode" keeps the unicode object
# untouched, any other value is passed to .encode() to produce a byte string.
BYTE_ENCODING = ["unicode", "utf-8", "windows-1251"]
# Encoding name written into the XML prolog; it is varied independently of
# BYTE_ENCODING, so declared and actual encodings may disagree.
PROLOG_ENCODING = [u"utf-8", u"windows-1251"]
def try_parse(parse, xml):
    """Parse *xml* with ElementTree and report whether it succeeded.

    Args:
        parse: Parsing style, "string" (``ET.fromstring``) or "file"
            (``ET.parse`` on a ``StringIO.StringIO`` wrapper).
        xml: The document, either a unicode object or an encoded byte string.

    Returns:
        "OK" if parsing raised nothing, "F" if it raised any ``Exception``.

    Raises:
        ValueError: If *parse* is not a known parsing style; reporting "OK"
            for a document that was never parsed would be misleading.
    """
    if parse not in ("string", "file"):
        raise ValueError("unknown parsing style: %r" % (parse,))
    try:
        if parse == "string":
            ET.fromstring(xml)
        else:
            ET.parse(StringIO.StringIO(xml))
    except Exception:
        # Parse errors and codec errors both count as a failed combination.
        # Exception (not a bare except) lets Ctrl-C and SystemExit propagate.
        return "F"
    return "OK"


if __name__ == "__main__":
    # Decode the template once; only the prolog encoding varies per iteration.
    template = unicode(XML, "utf-8")
    matrix = itertools.product(PARSING_STYLE, BYTE_ENCODING, PROLOG_ENCODING)
    for parse, byte_enc, prolog_enc in matrix:
        xml = template % (prolog_enc,)
        # "unicode" means: hand the unicode object to the parser unencoded.
        if byte_enc != u"unicode":
            xml = xml.encode(byte_enc)
        result = try_parse(parse, xml)
        # Single-argument parenthesised form prints the same text as the
        # Python 2 print statement.
        print("%s %s %s: %s" % (parse, byte_enc, prolog_enc, result))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment