Last active
August 29, 2015 14:00
-
-
Save nsaphra/e4b74457a91f9b5dd41f to your computer and use it in GitHub Desktop.
Generate GIZA++ input files from segmented parallel text files, with option to add onto previous input files.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python | |
import argparse
import sys
from collections import defaultdict
# Command-line interface.  The script reads a segmented parallel corpus
# (one sentence per line, tokens separated by whitespace) and emits the
# three files GIZA++ expects: <out>.sv, <out>.tv (vocabularies) and
# <out>.align (sentence-pair records).
parser = argparse.ArgumentParser(description='Generate GIZA++ input files from '
                                 'segmented parallel text files.')
parser.add_argument('-s', '--src_in', help='Source input file')
parser.add_argument('-t', '--tgt_in', help='Target input file')
# When given, the vocabularies and alignment counts from a previous run
# (files <prev_out>.sv/.tv/.align) are loaded first so word indices stay
# consistent and counts accumulate across incremental runs.
parser.add_argument('-p', '--prev_out', default=None, help='Previous output files prefix')
parser.add_argument('-o', '--out', help='Prefix for output files')
parser.add_argument('-k', '--keep_new_order', action='store_true',
                    help='Preserve order of the sentences in the new alignments at top of output.')
# Each new sentence pair is counted this many times in the output.
parser.add_argument('-m', '--multiplier', default=1, type=int,
                    help='multiply corpus size by this')
parser.add_argument('--test', action='store_true',
                    help='Print the new alignments to stdout for test data.')
args = parser.parse_args()
def vocab_from_file(fh):
    """Load a GIZA++ vocabulary file into a dict.

    Each line of *fh* has the form "index word count"; the result maps
    word -> [index, count] with both numbers converted to int.
    """
    entries = {}
    for raw in fh:
        idx, token, count = raw.strip().split()
        entries[token] = [int(idx), int(count)]
    return entries
def bitext_from_file(fh):
    """Load a GIZA++ alignment file into a defaultdict(int).

    The file is a sequence of three-line records: a count, a source
    sentence (of word indices), and a target sentence.  The result maps
    (source_line, target_line) -> count; unseen pairs default to 0.
    """
    bitext = defaultdict(int)
    count = 0
    source = ''
    for position, raw in enumerate(fh):
        text = raw.strip()
        phase = position % 3
        if phase == 0:
            count = int(text)
        elif phase == 1:
            source = text
        else:
            bitext[(source, text)] = count
    return bitext
mult = args.multiplier | |
def build_vocab(vocab, fh, multiplier=None):
    """Add every whitespace-separated token in *fh* to *vocab*.

    vocab maps word -> [index, count]; new words are assigned the next
    index after the current vocabulary size, and each occurrence adds
    *multiplier* to the word's count.

    When *multiplier* is None the module-level ``mult`` (from the
    --multiplier flag) is used, preserving the original behavior; passing
    it explicitly removes the hidden dependency on that global.

    NOTE(review): indices start at len(vocab)+1, which assumes a loaded
    vocabulary uses contiguous 1..N indices — confirm for merged files.
    """
    weight = mult if multiplier is None else multiplier
    last_ind = len(vocab)
    for line in fh:
        for tok in line.strip().split():
            if tok not in vocab:
                last_ind += 1
                vocab[tok] = [last_ind, 0]
            vocab[tok][1] += weight
def print_vocab(vocab, fh):
    """Write *vocab* to *fh* in GIZA++ format, sorted by word index.

    One line per entry: "index word count".

    Fixed: the original used the Python-2-only ``print >> fh`` chevron
    and a tuple-parameter lambda (removed by PEP 3113); ``fh.write`` and
    an item-indexing key run under both Python 2 and 3.
    """
    for word, (ind, cnt) in sorted(vocab.items(), key=lambda item: item[1][0]):
        fh.write("%d %s %d\n" % (ind, word, cnt))
# --- build vocabularies -------------------------------------------------
# Seed each vocabulary from the previous run's files (so word indices stay
# stable across incremental runs), then add the new corpus.
# Fixed: every file handle is now closed via ``with`` (the originals were
# leaked, risking unflushed output), and the previous .align file is read
# BEFORE the new one is opened for writing — the original truncated the
# output first, destroying the data when --out equals --prev_out.
if args.prev_out:
    with open(args.prev_out + '.sv') as fh:
        vocab_s = vocab_from_file(fh)
else:
    vocab_s = {}
with open(args.src_in) as fh:
    build_vocab(vocab_s, fh)

if args.prev_out:
    with open(args.prev_out + '.tv') as fh:
        vocab_t = vocab_from_file(fh)
else:
    vocab_t = {}
with open(args.tgt_in) as fh:
    build_vocab(vocab_t, fh)

# --- build alignment records --------------------------------------------
# Start from the previous run's counts, if any.
if args.prev_out:
    with open(args.prev_out + '.align') as fh:
        bitext = bitext_from_file(fh)
else:
    bitext = defaultdict(int)

with open(args.src_in) as src_fh, open(args.tgt_in) as tgt_fh, \
        open(args.out + '.align', 'w') as bitext_fh:
    for s_line, t_line in zip(src_fh, tgt_fh):
        # Map each token to its vocabulary index (KeyError here would mean
        # the vocab pass above missed a token).
        s_inds = ' '.join(str(vocab_s[s][0]) for s in s_line.strip().split())
        t_inds = ' '.join(str(vocab_t[t][0]) for t in t_line.strip().split())
        if args.test:
            # GIZA++ record: count line, source-index line, target-index line.
            sys.stdout.write('%d\n%s\n%s\n' % (mult, s_inds, t_inds))
        if args.keep_new_order:
            # Emit new pairs immediately, preserving corpus order.
            bitext_fh.write('%d\n%s\n%s\n' % (mult, s_inds, t_inds))
        else:
            # Deduplicate: identical pairs accumulate their counts.
            bitext[(s_inds, t_inds)] += mult
    # Accumulated (previous + deduplicated new) records come after any
    # kept-in-order ones.
    for (s, t), cnt in bitext.items():
        bitext_fh.write('%d\n%s\n%s\n' % (cnt, s, t))

with open(args.out + '.sv', 'w') as fh:
    print_vocab(vocab_s, fh)
with open(args.out + '.tv', 'w') as fh:
    print_vocab(vocab_t, fh)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment