Skip to content

Instantly share code, notes, and snippets.

Created June 17, 2014 07:01
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save anonymous/33b4ecbec246d774147d to your computer and use it in GitHub Desktop.
Save anonymous/33b4ecbec246d774147d to your computer and use it in GitHub Desktop.
import json
import sys
import xml.dom.minidom
class DictError(Exception):
    """Signal a dictionary entry whose structure the converter cannot handle.

    The script raises it when a single ``<p>`` pair contains more than one
    ``<l>`` or more than one ``<r>`` element.
    """
def innerText(node):
    """Return all character data found at or beneath *node*, concatenated.

    Text and CDATA-section nodes contribute their ``nodeValue``. Any other
    node contributes the text of its children, visited recursively in
    document order; nodes without such children (comments, empty elements)
    contribute an empty string.

    Args:
        node: A DOM node exposing ``nodeType``, ``nodeValue`` and
            ``childNodes`` (e.g. an ``xml.dom.minidom`` node).

    Returns:
        The concatenated text, or an empty string when there is none.
    """
    # CDATA sections hold character data too, but they report their own node
    # type and have no children, so they have to be matched explicitly or
    # their content would be dropped.
    if node.nodeType in (node.TEXT_NODE, node.CDATA_SECTION_NODE):
        return node.nodeValue
    return u''.join(innerText(child) for child in node.childNodes)
def getPos(rnode):
    """Return the ``n`` attribute of the first ``<s>`` element under *rnode*.

    Only the first ``<s>`` descendant is consulted. An empty unicode string
    is returned when there is no ``<s>`` element at all, or when that first
    element has no (or an empty) ``n`` attribute.

    NOTE(review): judging by the function name, ``n`` presumably carries the
    part-of-speech tag -- confirm against the dictionary format.
    """
    symbols = rnode.getElementsByTagName('s')
    tag = u''
    if symbols:
        # A missing or empty attribute falls through to the empty default.
        tag = symbols[0].getAttribute('n') or u''
    return tag
def collectTranslations(doc):
    """Group the right-hand options of every ``<p>`` pair by left-hand text.

    Args:
        doc: A parsed DOM document (or element) exposing
            ``getElementsByTagName``.

    Returns:
        A dict mapping the inner text of each ``<l>`` element to the set of
        ``(inner text of <r>, getPos(<r>))`` tuples seen for it.

    Raises:
        DictError: If a ``<p>`` element contains more than one ``<l>`` or
            more than one ``<r>`` element.
    """
    translations = {}  # <l> text -> set of (<r> text, <r> pos) options
    for p in doc.getElementsByTagName('p'):
        lnodes = p.getElementsByTagName('l')
        rnodes = p.getElementsByTagName('r')
        if len(lnodes) > 1:
            raise DictError('Multiple l nodes: ' + p.toxml())
        if len(rnodes) > 1:
            raise DictError('Multiple r nodes: ' + p.toxml())
        if not lnodes or not rnodes:
            # A pair with a missing side has nothing to map; skip it.
            continue
        rnode = rnodes[0]
        option = (innerText(rnode), getPos(rnode))
        translations.setdefault(innerText(lnodes[0]), set()).add(option)
    return translations


def formatDictData(dictData):
    """Render *dictData* as a JSON object with one tab-indented entry per line.

    Entries are emitted in sorted key order. Keys and values are both passed
    through ``json.dumps`` so keys are quoted and escaped, and the last entry
    carries no trailing comma; the result is therefore parseable JSON.

    Args:
        dictData: A dict mapping text keys to JSON-serializable values.

    Returns:
        The rendered text, without a trailing newline.
    """
    entries = sorted(dictData.items())
    lastIndex = len(entries) - 1
    lines = [u'{']
    for i, (k, v) in enumerate(entries):
        # Every entry except the final one is followed by a comma.
        maybeComma = u',' if i < lastIndex else u''
        lines.append(u'\t' + json.dumps(k, ensure_ascii=False) + u': '
                     + json.dumps(v, ensure_ascii=False) + maybeComma)
    lines.append(u'}')
    return u'\n'.join(lines)


if __name__ == '__main__':
    doc = xml.dom.minidom.parse('apertium-es-ca.es-ca.dix')
    dictSets = collectTranslations(doc)  # es -> set of ca options
    # Sort each option set so the output is deterministic.
    dictData = dict((k, sorted(v)) for k, v in dictSets.items())
    # Emit UTF-8 bytes explicitly: Python 3 exposes the byte stream as
    # sys.stdout.buffer, while Python 2's sys.stdout accepts bytes directly.
    out = getattr(sys.stdout, 'buffer', sys.stdout)
    out.write((formatDictData(dictData) + u'\n').encode('UTF-8'))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment