Skip to content

Instantly share code, notes, and snippets.

@fauxneticien
Last active July 13, 2017 01:08
Show Gist options
  • Save fauxneticien/ff9935d66ba27e472bd3a2c9dc22ab6d to your computer and use it in GitHub Desktop.
Save fauxneticien/ff9935d66ba27e472bd3a2c9dc22ab6d to your computer and use it in GitHub Desktop.
Parse backslash-coded lexicon using a defined grammar
#!/usr/bin/python
# Usage: pass in a grammar read from a text file, or define it chunk-by-chunk as the arguments following the output type
# python chunker.py < lexicon.txt "xml" $(cat grammar.txt)
# python chunker.py < lexicon.txt "json" "examples:{<text><translation>}" "headword:{<lx><ps><examples>*}"
import cStringIO, json, sys, xmltodict, xml.dom.minidom
from toolz.functoolz import pipe
from nltk.toolbox import ToolboxData
from xml.etree.ElementTree import ElementTree
# Convert a backslash-coded lexicon (read from STDIN) to XML or JSON on STDOUT,
# chunking each record according to the grammar given on the command line.

# Validate the command line before touching STDIN, so a bad invocation fails
# fast instead of after the whole lexicon has been read and parsed.
OUTPUT_TYPES = ("xml", "json")
if len(sys.argv) < 2 or sys.argv[1] not in OUTPUT_TYPES:
    # Diagnostics go to STDERR so they never get mixed into redirected output,
    # and the non-zero status lets calling shell scripts detect the failure.
    sys.stderr.write('Error: output type should be "xml" or "json"\n')
    sys.exit(2)

# "json" or "xml"
output_type = sys.argv[1]

# Expecting grammar definition as following arguments (one chunk rule each)
GRAMMAR = "\n".join(sys.argv[2:])

# Expecting lexicon data as redirect from STDIN. The reader instance gets its
# own name so the imported ToolboxData class is not shadowed.
toolbox_reader = ToolboxData()
toolbox_reader._file = sys.stdin

# Use StringIO to avoid having to do actual file I/O
xml_temp = cStringIO.StringIO()

# Parse lexicon using provided grammar, then 'write' to xml_temp 'file'
pipe(
    GRAMMAR,
    toolbox_reader.parse,
    ElementTree,
    lambda lexicon_tree: lexicon_tree.write(xml_temp, encoding='utf-8')
)

if output_type == "xml":
    # Request UTF-8 encoded bytes from minidom rather than a unicode object:
    # writing unicode with non-ASCII characters to a piped/redirected STDOUT
    # raises UnicodeEncodeError under Python 2.
    pretty_xml = xml.dom.minidom.parseString(xml_temp.getvalue()).toprettyxml(encoding='utf-8')
    sys.stdout.write(pretty_xml)
    # Keep the trailing newline that the former print statement appended.
    sys.stdout.write("\n")
elif output_type == "json":
    # Read in from xml_temp 'file' as dict, then dump the dict as JSON to STDOUT
    pipe(
        xml_temp.getvalue(),
        xmltodict.parse,
        lambda lexicon_dict: json.dump(lexicon_dict['toolbox_data']['record'], sys.stdout, indent=2)
    )
examples:{<text><translation>}
headword:{<lx><ps><examples>*}
\lx bonjour
\ps Exclamation
\lx hallo
\ps Exclamation
\text Wir sagen 'hallo' auf Deutsch
\translation We say hello in German
\lx auto
\ps Noun
\text Das ist ein Auto
\translation This is a car
\text Das ist mein Auto
\translation This is my car
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment