Skip to content

Instantly share code, notes, and snippets.

@andreasf
Created April 30, 2015 09:06
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save andreasf/af6bdc00cf5a928712b5 to your computer and use it in GitHub Desktop.
Save andreasf/af6bdc00cf5a928712b5 to your computer and use it in GitHub Desktop.
#!/usr/bin/env python
import argparse
import sys
import os
def main():
    """Translate every file of an input folder into a new output folder.

    Command-line arguments (all positional):
        phrase_table: path of the phrase table passed to ``read_table``.
        input_folder: folder whose files are translated.
        output_folder: folder to create for the results; the run is aborted
            through ``panic`` if this path already exists.

    One "." is written to stdout per translated file, with a running count
    after every 70th file, followed by a final summary line.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("phrase_table")
    parser.add_argument("input_folder")
    parser.add_argument("output_folder")
    args = parser.parse_args()

    # Refuse to write into an existing folder so earlier results are never
    # overwritten or mixed with new ones.
    if os.path.exists(args.output_folder):
        panic("output folder exists, please delete and retry")

    table = read_table(args.phrase_table)
    print("translation table contains %s elements" % len(table))

    translated = 0
    os.makedirs(args.output_folder)
    for name in os.listdir(args.input_folder):
        path = os.path.join(args.input_folder, name)
        # os.listdir also returns sub-directories and other non-regular
        # entries; those cannot be opened as text, so skip them instead of
        # crashing halfway through the run.
        if not os.path.isfile(path):
            continue
        out_path = os.path.join(args.output_folder, name)
        translate(table, path, out_path)
        translated += 1
        sys.stdout.write(".")
        if translated % 70 == 0:
            # Close the current row of dots with the running count.
            sys.stdout.write(" %03d\n" % translated)
        # Flush after every file so progress is visible immediately.
        sys.stdout.flush()
    print("\n%s files translated" % translated)
def translate(table, infn, outfn):
    """Translate one text file token by token and write the result.

    The input text is split on single space characters. Each token is
    lower-cased and stripped of surrounding whitespace, then replaced by its
    entry in ``table`` if it has one; tokens without an entry are kept in
    their lower-cased, stripped form. The resulting tokens are joined with
    single spaces and written to ``outfn``.

    Args:
        table: mapping from source token to its translation.
        infn: path of the file to read.
        outfn: path of the file to write (opened with mode "w").
    """
    # Context managers guarantee both handles are closed, even when reading
    # or writing raises.
    with open(infn) as src:
        raw_tokens = src.read().split(" ")
    tokens = [tok.lower().strip() for tok in raw_tokens]
    translated = [table.get(tok, tok) for tok in tokens]
    with open(outfn, "w") as dst:
        dst.write(" ".join(translated))
def read_table(fn):
    """Load the best translation for every single-word entry of a phrase table.

    Each non-blank line is split on " ||| " into fields: the first is the
    source phrase, the second its translation, and the third a
    space-separated list of scores. Entries whose source phrase contains a
    space are ignored. For every remaining source term, the translation
    whose third score is highest is kept; on equal scores the first one
    encountered wins.

    NOTE(review): the layout looks like a Moses phrase table, in which case
    the third score would be the direct phrase translation probability;
    confirm against the tool that produced the table.

    Args:
        fn: path of the phrase table file.

    Returns:
        dict mapping each single-word source term to its best translation.

    Raises:
        IndexError: if a non-blank, single-word line has fewer than three
            fields or fewer than three scores.
        ValueError: if the third score is not a valid float.
    """
    best = {}
    best_prob = {}
    # The context manager closes the file even when a line fails to parse.
    with open(fn) as table_file:
        for line in table_file:
            # Blank lines (e.g. a trailing one) carry no entry; skip them
            # rather than failing on the missing fields.
            if not line.strip():
                continue
            columns = line.split(" ||| ")
            term = columns[0]
            if " " in term:
                # Multi-word source phrases are not used for token lookup.
                continue
            term_trans = columns[1]
            prob = float(columns[2].split(" ")[2])
            if term not in best_prob or best_prob[term] < prob:
                best[term] = term_trans
                best_prob[term] = prob
    return best
def panic(msg):
    """Write *msg* plus a newline to stderr, then exit with status 1."""
    err = sys.stderr
    err.write(msg + "\n")
    err.flush()
    # Same effect as sys.exit(1): raises SystemExit carrying exit code 1.
    raise SystemExit(1)
# Start the translation run only when this file is executed as a script;
# importing it as a module merely defines the functions above.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment