Skip to content

Instantly share code, notes, and snippets.

@varvara-l
Last active February 8, 2016 10:42
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save varvara-l/55e890aab83e00980dac to your computer and use it in GitHub Desktop.
Save varvara-l/55e890aab83e00980dac to your computer and use it in GitHub Desktop.
import io
import sys
'''
Parse TERCOM .pra files
'''
def parse_sentence(line_array):
    """Extract the fields of one TERCOM .pra record.

    Every line is expected to look like ``<field name>: <value>``.  Only the
    ``Original Hyp``, ``Original Ref``, ``Sentence ID`` and ``Alignment``
    fields are read; all other lines (including lines without a colon) are
    ignored.  If a field occurs more than once, the last occurrence wins.

    Args:
        line_array: iterable of the record's lines, without trailing newlines.

    Returns:
        A tuple ``(sentence_id, hyp, ref, tags)``.  ``hyp`` and ``ref`` are
        lists of whitespace-separated tokens.  ``tags`` has one entry per
        character of the alignment string: ``'OK'`` for a space, otherwise
        the character itself.  Fields missing from the record come back as
        an empty string / empty list.
    """
    hyp, ref = [], []
    align, sentence_id = "", ""
    for line in line_array:
        # partition() rather than find(): find() returns -1 when there is no
        # colon, which made line[:-1] look like a field name by accident.
        line_id, colon, value = line.partition(':')
        if not colon:
            continue
        if line_id == "Original Hyp":
            # value[1:] skips the single character following the colon.
            hyp = value[1:].split()
        elif line_id == "Original Ref":
            ref = value[1:].split()
        elif line_id == "Sentence ID":
            sentence_id = value[1:]
        elif line_id == "Alignment":
            # Skip two characters after the colon and drop the last one --
            # presumably the " (" and ")" wrapping the alignment string;
            # confirm against the .pra format in use.
            align = value[2:-1]
    tags = ['OK' if ch == ' ' else ch for ch in align]
    return (sentence_id, hyp, ref, tags)
def _write_record(instance, mt_file, pe_file, tags_file, tags_map):
    """Parse one record's lines and append one line to each output file.

    Raises:
        AssertionError: if the hypothesis or reference token count does not
            match the alignment.
        KeyError: if the alignment contains a character absent from
            ``tags_map``.
    """
    sent_id, hyp, ref, tags = parse_sentence(instance)
    # 'D' entries are excluded when matching against the hypothesis tokens,
    # 'I' entries when matching against the reference tokens.
    hyp_tags = [t for t in tags if t != 'D']
    ref_tags = [t for t in tags if t != 'I']
    assert len(hyp) == len(hyp_tags), \
        "Lengths mismatch: {} and {} in sentence {}".format(len(hyp), len(hyp_tags), sent_id)
    assert len(ref) == len(ref_tags), \
        "Lengths mismatch: {} and {} in sentence {}".format(len(ref), len(ref_tags), sent_id)
    # Unicode literals keep the io text streams happy on Python 2 as well.
    mt_file.write(u' '.join(hyp) + u'\n')
    pe_file.write(u' '.join(ref) + u'\n')
    tags_file.write(u' '.join([tags_map[t] for t in hyp_tags]) + u'\n')


def parse_file(file_name):
    """Convert a TERCOM .pra file into parallel .mt, .pe and .tags files.

    The input is read record by record; an empty line ends the current
    record.  For each record one line is appended to each output:

    * ``<file_name>.mt``   -- the hypothesis tokens, space-separated
    * ``<file_name>.pe``   -- the reference tokens, space-separated
    * ``<file_name>.tags`` -- one OK/BAD tag per alignment entry other
      than 'D'

    All files are read and written as UTF-8 through ``io.open`` so the
    function behaves the same on Python 2 and Python 3, and every handle is
    closed even if a record fails validation.

    Args:
        file_name: path of the .pra file; also the prefix of the outputs.

    Raises:
        AssertionError: if a record's token counts disagree with its
            alignment.
        KeyError: if an alignment character is not one of OK/S/I/D.
        IOError / OSError: if a file cannot be opened.
    """
    tags_map = {'OK': 'OK', 'S': 'BAD', 'I': 'BAD', 'D': 'BAD'}
    # The input is opened first so that a missing input file does not leave
    # empty output files behind.
    with io.open(file_name, encoding='utf-8') as source, \
            io.open(file_name + '.mt', 'w', encoding='utf-8') as mt_file, \
            io.open(file_name + '.pe', 'w', encoding='utf-8') as pe_file, \
            io.open(file_name + '.tags', 'w', encoding='utf-8') as tags_file:
        instance = []
        for line in source:
            if line == '\n':
                _write_record(instance, mt_file, pe_file, tags_file, tags_map)
                instance = []
            # rstrip rather than [:-1]: the last line of the file may have
            # no newline, and slicing would then eat a real character.
            instance.append(line.rstrip('\n'))
        # A final record that is not followed by an empty line would
        # otherwise be dropped; skip it only if it holds nothing but blanks.
        if any(text.strip() for text in instance):
            _write_record(instance, mt_file, pe_file, tags_file, tags_map)
if __name__ == "__main__":
    # The first command-line argument is the path of the .pra file to
    # convert; fail with a usage message instead of a bare IndexError.
    if len(sys.argv) < 2:
        sys.exit("Usage: {} <pra_file>".format(sys.argv[0]))
    parse_file(sys.argv[1])
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment