Using lxml to extract the translated text from the XML returned by the Microsoft machine translation API
#!/usr/bin/python | |
from lxml import etree | |
def get_text_from_msmt_xml(xml):
    """Return only the text carried by an MS machine translation API response.

    The XML in ``xml`` is parsed with lxml and the root ``string`` element in
    the Microsoft Serialization namespace is looked up. Runs of whitespace
    inside the element's text are collapsed to single spaces, and elements
    with no text (or only whitespace) contribute nothing.

    Args:
        xml: The XML document returned by the API, as accepted by
            ``etree.fromstring``.

    Returns:
        The whitespace-normalized text, with the text of each matched
        element joined by a single space; an empty string when nothing
        matched or every match was blank.
    """
    ns_map = {'foo': 'http://schemas.microsoft.com/2003/10/Serialization/'}
    root = etree.fromstring(xml)

    pieces = []
    for node in root.xpath('/foo:string', namespaces=ns_map):
        # split() with no argument discards leading/trailing whitespace and
        # treats any run of whitespace as one separator; a missing (None)
        # or blank text therefore yields an empty word list.
        words = (node.text or '').split()
        if words:
            pieces.append(' '.join(words))
    return ' '.join(pieces)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment