Instantly share code, notes, and snippets.

Embed
What would you like to do?
#!/usr/bin/python
import os
import sys

import lxml.etree
def get_words_from_kap(node):
    """Return the list of headwords held by a <kap> node.

    The node is flattened to text with flatten_kap(), which yields either
    a single word ('foo') or several separated by commas ('foo, bar').

    The text is split on every comma, not only on the exact sequence
    ', ', because the separator in the XML may be followed by a line
    break or by no space at all. Each piece then has its whitespace
    normalised (leading/trailing removed, internal runs collapsed to a
    single space) and empty pieces are dropped.

    node -- the <kap> element, passed unchanged to flatten_kap().

    Returns a list of strings; the list is empty when the node holds no
    text.
    """
    flat_string = flatten_kap(node)
    # str.split() with no argument both trims the piece and collapses any
    # line breaks / indentation that the XML layout left inside it.
    pieces = (' '.join(piece.split()) for piece in flat_string.split(','))
    return [piece for piece in pieces if piece]
def flatten_kap(kap, root=None):
    """Flatten a <kap> node into a plain string.

    Markup of the form 'ret<tld/>ejo<fnt>Z</fnt>, ret<tld/>o' becomes the
    single string 'retetejo, reteto' (for the root 'et'):

    * <tld/> is replaced by the word root (plus its own text, if any);
    * <fnt> (source of the word) and <ofc> (officialness) are discarded;
    * <var> is handled by recursively flattening its first child, which
      is expected to be a nested <kap>;
    * the tail text following every child is kept.

    kap  -- the <kap> element to flatten; must not be None.
    root -- the string substituted for <tld/>. When omitted it is looked
            up with get_word_root(kap); the value is then handed down to
            nested <kap> nodes so the document is not searched again.

    Returns the flattened text with surrounding whitespace stripped.
    Raises AssertionError when kap is None or when a child element has a
    tag other than tld, fnt, ofc or var.
    """
    assert kap is not None, "flatten_kap() needs a <kap> node, got None"
    if root is None:
        root = get_word_root(kap)

    # Collect the fragments and join once at the end.
    parts = []
    if kap.text is not None:
        parts.append(kap.text)

    # The xml structure is a pain (offenders: nuks.xml), so every kind of
    # child is handled explicitly.
    for child in kap:
        tag = child.tag
        if callable(tag):
            # lxml reports comments and processing instructions with a
            # factory function as their tag; they carry no word text,
            # only the tail handled below.
            pass
        elif tag == 'tld':
            parts.append(root)
            if child.text is not None:
                parts.append(child.text)
        elif tag in ('fnt', 'ofc'):
            # source of the word / officialness: not interested right now
            pass
        elif tag == 'var':
            # a variant wraps another <kap>, so recurse with the same root
            parts.append(flatten_kap(child[0], root))
        else:
            # Raised explicitly so the check survives `python -O`.
            raise AssertionError("unexpected <%s> inside <kap>" % tag)

        if child.tail is not None:
            parts.append(child.tail)

    return ''.join(parts).strip()
def get_word_root(arbitrary_node):
    """Return the word root (the word without its ending) for a document.

    The root is the text of the first <rad> element found in the tree
    that arbitrary_node belongs to.

    arbitrary_node -- any element of the document; must not be None.

    Returns the text of that <rad> element (None if the element is
    empty).
    Raises IndexError when the document contains no <rad> element.
    """
    assert arbitrary_node is not None, "get_word_root() got None"
    tree = arbitrary_node.getroottree()
    # Stop at the first match instead of building a list of every <rad>.
    first_rad = next(iter(tree.iter('rad')), None)
    if first_rad is None:
        raise IndexError("document contains no <rad> element")
    return first_rad.text
def get_tree(xml_file):
    """Parse xml_file with lxml and return the resulting element tree.

    xml_file -- whatever lxml.etree.parse() accepts as its source; the
                script in this file passes a file path.

    The parser is created with load_dtd=True, presumably so that the
    entities declared in the dictionary's DTD can be resolved -- confirm
    against the data files.
    """
    dtd_loading_parser = lxml.etree.XMLParser(load_dtd=True)
    document = lxml.etree.parse(xml_file, dtd_loading_parser)
    return document
def main(path='/home/wilfred/languages/esperanto/reta_vortaro/xml'):
    """Print every headword found in the XML files of a directory.

    path -- directory whose entries are each parsed with get_tree().

    Every <drv> element of a file is one word entry; the words of its
    <kap> child are written to standard output, one per line, encoded as
    UTF-8. Entries are visited in sorted file-name order so the output is
    the same from run to run.
    """
    # Python 3 exposes the byte stream as sys.stdout.buffer; on Python 2
    # sys.stdout itself accepts byte strings.
    out = getattr(sys.stdout, 'buffer', sys.stdout)
    for file_name in sorted(os.listdir(path)):
        tree = get_tree(os.path.join(path, file_name))
        # each word is a drv
        for drv_node in tree.iter('drv'):
            for word in get_words_from_kap(drv_node.find('kap')):
                out.write(word.encode('utf8') + b'\n')


if __name__ == '__main__':
    # An optional first command-line argument overrides the default
    # directory.
    main(*sys.argv[1:2])
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment