# Wilfred/extract_words.py

## extract_words.py
#!/usr/bin/python

import os
import lxml.etree

def get_words_from_kap(node):
    """Return the list of words spelled out by a <kap> node.

    The node is flattened to plain text with flatten_kap(); the text is
    either one word ('foo') or several separated by commas ('foo, bar').

    The text is split on every comma, whatever whitespace follows it, so
    'foo,bar' or a line break after the comma separates words as well.
    Each word is stripped of surrounding whitespace, and empty pieces
    (empty text, trailing comma) are dropped instead of being returned
    as words.
    """
    flat_string = flatten_kap(node)

    stripped_pieces = [piece.strip() for piece in flat_string.split(',')]
    return [word for word in stripped_pieces if word]

def flatten_kap(kap):
    """Flatten a <kap> node into a naked, whitespace-stripped string.

    Every <tld/> child is replaced by the word root returned by
    get_word_root(); <fnt> (source of the word) and <ofc> (officialness)
    are thrown away; <var> is flattened recursively through its first
    child.  For example, with the root 'et' the markup
    'ret<tld/>ejo<fnt>Z</fnt>, ret<tld/>o' becomes 'retetejo, reteto'.

    Raises AssertionError when kap is None or when an element with any
    other tag is found inside the node.
    """
    assert kap is not None, "flatten_kap() needs a <kap> node, got None"
    root = get_word_root(kap)

    parts = []
    if kap.text is not None:
        parts.append(kap.text)

    # flatten, get all the text, throw away ofc, fnt
    # this is not simple, but the xml structure is a pain
    # offenders: nuks.xml
    # Iterate over the node itself: getchildren() is deprecated in lxml.
    for child in kap:
        tag = child.tag
        if callable(tag):
            # lxml gives comments and processing instructions a factory
            # function as tag instead of a name; they hold no word text,
            # only the tail that follows them matters
            pass
        elif tag == 'tld':
            parts.append(root)
            if child.text is not None:
                parts.append(child.text)
        elif tag in ('fnt', 'ofc'):
            # source of the word / officialness: not interested right now
            pass
        elif tag == 'var':
            # recurse; the first child of <var> is assumed to be a nested
            # <kap> (IndexError if <var> is empty, as before)
            parts.append(flatten_kap(child[0]))
        else:
            # shouldn't get here; raised explicitly so the check is not
            # stripped when running with -O
            raise AssertionError(
                "unexpected <%s> element inside <kap>" % tag)
        if child.tail is not None:
            parts.append(child.tail)

    return "".join(parts).strip()

def get_word_root(arbitrary_node):
    """Return the text of the first <rad> element of the node's document.

    arbitrary_node can be any node: the search runs over the whole tree
    the node belongs to, not just below the node.  Raises IndexError
    when the tree holds no <rad> element.
    """
    assert arbitrary_node is not None
    document = arbitrary_node.getroottree()
    rad_elements = list(document.iter('rad'))
    first_rad = rad_elements[0]
    return first_rad.text

def get_tree(xml_file):
    """Parse xml_file and return the resulting lxml tree.

    The file is read with an XMLParser created with load_dtd=True.
    """
    dtd_loading_parser = lxml.etree.XMLParser(load_dtd=True)
    document = lxml.etree.parse(xml_file, parser=dtd_loading_parser)
    return document

if __name__ == '__main__':
    # Print, one per line and encoded as UTF-8, every word found in the
    # XML files of the directory below.
    xml_dir = '/home/wilfred/languages/esperanto/reta_vortaro/xml'
    for file_name in os.listdir(xml_dir):
        document = get_tree(xml_dir + '/' + file_name)

        # each <drv> element is one word; its <kap> child spells it out
        for drv in document.iter('drv'):
            for word in get_words_from_kap(drv.find('kap')):
                # a single parenthesised argument prints exactly what the
                # bare print statement prints under Python 2
                print(word.encode('utf8'))
	#!/usr/bin/python

	import os
	import lxml.etree

	def get_words_from_kap(node):
		"""Return the list of words spelled out by a <kap> node.

		The node is flattened to plain text with flatten_kap(); the text is
		either one word ('foo') or several separated by commas ('foo, bar').

		The text is split on every comma, whatever whitespace follows it,
		each word is stripped of surrounding whitespace, and empty pieces
		(empty text, trailing comma) are dropped.
		"""
		flat_string = flatten_kap(node)

		stripped_pieces = [piece.strip() for piece in flat_string.split(',')]
		return [word for word in stripped_pieces if word]

	def flatten_kap(kap):
		"""Flatten a <kap> node into a naked, whitespace-stripped string.

		Every <tld/> child is replaced by the word root returned by
		get_word_root(); <fnt> (source of the word) and <ofc> (officialness)
		are thrown away; <var> is flattened recursively through its first
		child.  For example, with the root 'et' the markup
		'ret<tld/>ejo<fnt>Z</fnt>, ret<tld/>o' becomes 'retetejo, reteto'.

		Raises AssertionError when kap is None or when an element with any
		other tag is found inside the node.
		"""
		assert kap is not None, "flatten_kap() needs a <kap> node, got None"
		root = get_word_root(kap)

		parts = []
		if kap.text is not None:
			parts.append(kap.text)

		# flatten, get all the text, throw away ofc, fnt
		# this is not simple, but the xml structure is a pain
		# offenders: nuks.xml
		# Iterate over the node itself: getchildren() is deprecated in lxml.
		for child in kap:
			tag = child.tag
			if callable(tag):
				# lxml gives comments and processing instructions a factory
				# function as tag instead of a name; they hold no word text,
				# only the tail that follows them matters
				pass
			elif tag == 'tld':
				parts.append(root)
				if child.text is not None:
					parts.append(child.text)
			elif tag in ('fnt', 'ofc'):
				# source of the word / officialness: not interested right now
				pass
			elif tag == 'var':
				# recurse; the first child of <var> is assumed to be a nested
				# <kap> (IndexError if <var> is empty)
				parts.append(flatten_kap(child[0]))
			else:
				# shouldn't get here; raised explicitly so the check is not
				# stripped when running with -O
				raise AssertionError(
					"unexpected <%s> element inside <kap>" % tag)
			if child.tail is not None:
				parts.append(child.tail)

		return "".join(parts).strip()

	def get_word_root(arbitrary_node):
		"""Return the text of the first <rad> element of the node's document.

		arbitrary_node can be any node: the search runs over the whole tree
		the node belongs to, not just below the node.  Raises IndexError
		when the tree holds no <rad> element.
		"""
		assert arbitrary_node is not None
		tree = arbitrary_node.getroottree()
		return list(tree.iter('rad'))[0].text

	def get_tree(xml_file):
		"""Parse xml_file and return the resulting lxml tree.

		The file is read with an XMLParser created with load_dtd=True.
		"""
		parser = lxml.etree.XMLParser(load_dtd=True)
		return lxml.etree.parse(xml_file, parser)

	if __name__ == '__main__':
	path = '/home/wilfred/languages/esperanto/reta_vortaro/xml'
	for file in os.listdir(path):
	tree = get_tree(path + '/' + file)

	# each word is a drv
	for drv_node in tree.iter('drv'):
	words = get_words_from_kap(drv_node.find('kap'))
	for word in words:
	print word.encode('utf8')