Skip to content

Instantly share code, notes, and snippets.

@nsaphra
Last active August 29, 2015 14:00
Show Gist options
  • Save nsaphra/e4b74457a91f9b5dd41f to your computer and use it in GitHub Desktop.
Save nsaphra/e4b74457a91f9b5dd41f to your computer and use it in GitHub Desktop.
Generate GIZA++ input files from segmented parallel text files, with option to add onto previous input files.
#!/usr/bin/python
import argparse
from collections import defaultdict
# Command-line interface.
# --src_in, --tgt_in and --out are marked required because the script opens
# all three unconditionally; without this a missing option only surfaces later
# as a TypeError from open(None) / None + '.align'.
parser = argparse.ArgumentParser(
    description='Generate GIZA++ input files from '
                'segmented parallel text files.')
parser.add_argument('-s', '--src_in', required=True,
                    help='Source input file')
parser.add_argument('-t', '--tgt_in', required=True,
                    help='Target input file')
parser.add_argument('-p', '--prev_out', default=None,
                    help='Previous output files prefix')
parser.add_argument('-o', '--out', required=True,
                    help='Prefix for output files')
parser.add_argument('-k', '--keep_new_order', action='store_true',
                    help='Preserve order of the sentences in the new '
                         'alignments at top of output.')
parser.add_argument('-m', '--multiplier', default=1, type=int,
                    help='multiply corpus size by this')
parser.add_argument('--test', action='store_true',
                    help='Print the new alignments to stdout for test data.')
args = parser.parse_args()
def vocab_from_file(fh):
    """Load a vocabulary from lines of the form ``<index> <word> <count>``.

    This is the same layout that print_vocab writes. Blank lines are
    ignored so that a trailing empty line does not abort the load.

    Args:
        fh: Iterable of text lines (typically an open file).

    Returns:
        dict mapping each word to a two-element list ``[index, count]``
        (both ints). A list is used so the count can be updated in place.

    Raises:
        ValueError: If a non-blank line does not have exactly three
            whitespace-separated fields, or if its index or count is not
            an integer.
    """
    vocab = {}
    for line_no, line in enumerate(fh, 1):
        fields = line.split()
        if not fields:
            # Tolerate blank lines instead of failing on tuple unpacking.
            continue
        if len(fields) != 3:
            raise ValueError(
                'vocab line %d: expected "<index> <word> <count>", got %r'
                % (line_no, line))
        ind, word, cnt = fields
        vocab[word] = [int(ind), int(cnt)]
    return vocab
def bitext_from_file(fh):
    """Load sentence-pair counts from a three-lines-per-record file.

    Each record is three consecutive lines: an integer count, the source
    sentence, and the target sentence. A trailing incomplete record is
    ignored.

    Args:
        fh: Iterable of text lines (typically an open file).

    Returns:
        defaultdict(int) mapping ``(source_line, target_line)`` to the
        total count for that pair. Lines are stored stripped of
        surrounding whitespace.

    Raises:
        ValueError: If a non-blank count line is not an integer.
    """
    bitext = defaultdict(int)
    step = 0  # position inside the current record: 0=count, 1=src, 2=tgt
    curr_cnt = 0
    curr_src = ''
    for line in fh:
        line = line.strip()
        if step == 0:
            if not line:
                # A count is never blank, so a blank line here is stray
                # padding (e.g. at end of file), not part of a record.
                continue
            curr_cnt = int(line)
        elif step == 1:
            curr_src = line
        else:
            # Accumulate rather than overwrite: the same pair can occur in
            # several records, and every occurrence must be counted.
            bitext[(curr_src, line)] += curr_cnt
        step = (step + 1) % 3
    return bitext
# Weight added per occurrence (from --multiplier); read as a module-level
# global by build_vocab and by the alignment-writing code further down.
mult = args.multiplier
def build_vocab(vocab, fh, weight=None):
    """Add the tokens of a whitespace-segmented text to ``vocab`` in place.

    Unseen tokens receive the next free index; every occurrence of a token
    adds ``weight`` to its count.

    Args:
        vocab: dict mapping word -> ``[index, count]``; mutated in place.
        fh: Iterable of text lines (typically an open file).
        weight: Amount added to a token's count per occurrence. When
            omitted, the module-level ``mult`` is used.

    Returns:
        None. The result is the mutated ``vocab``.
    """
    if weight is None:
        weight = mult
    # Continue numbering after the highest index already in use. len(vocab)
    # alone is only safe when existing indices are exactly 1..N; for any
    # other numbering it would hand out an index that is already taken.
    last_ind = len(vocab)
    if vocab:
        last_ind = max(last_ind, max(entry[0] for entry in vocab.values()))
    for line in fh:
        for tok in line.split():
            if tok not in vocab:
                last_ind += 1
                vocab[tok] = [last_ind, 0]
            vocab[tok][1] += weight
def print_vocab(vocab, fh):
    """Write ``vocab`` to ``fh`` as ``<index> <word> <count>`` lines.

    Lines are ordered by ascending index.

    Args:
        vocab: dict mapping word -> ``[index, count]``.
        fh: Writable text file object; it is not closed here.
    """
    # A single-argument key function and fh.write are used instead of a
    # tuple-unpacking lambda and ``print >>``, which only parse on Python 2.
    for word, data in sorted(vocab.items(), key=lambda item: item[1][0]):
        fh.write("%d %s %d\n" % (data[0], word, data[1]))
# Build the source and target vocabularies, seeding each one from the
# previous run's .sv / .tv file when --prev_out was given.
vocab_s = {}
if args.prev_out:
    with open(args.prev_out + '.sv') as prev_fh:
        vocab_s = vocab_from_file(prev_fh)
with open(args.src_in) as src_fh:
    build_vocab(vocab_s, src_fh)

vocab_t = {}
if args.prev_out:
    with open(args.prev_out + '.tv') as prev_fh:
        vocab_t = vocab_from_file(prev_fh)
with open(args.tgt_in) as tgt_fh:
    build_vocab(vocab_t, tgt_fh)

# Load the previous sentence pairs BEFORE opening the output for writing:
# when --out and --prev_out name the same prefix, mode 'w' would truncate
# the previous .align file before it could be read.
if args.prev_out:
    with open(args.prev_out + '.align') as prev_fh:
        bitext = bitext_from_file(prev_fh)
else:
    bitext = defaultdict(int)

with open(args.out + '.align', 'w') as bitext_fh:
    with open(args.src_in) as src_fh, open(args.tgt_in) as tgt_fh:
        # zip stops at the shorter input, so unpaired trailing lines of the
        # longer file are ignored.
        for s_line, t_line in zip(src_fh, tgt_fh):
            s_ids = ' '.join(str(vocab_s[tok][0]) for tok in s_line.split())
            t_ids = ' '.join(str(vocab_t[tok][0]) for tok in t_line.split())
            if args.test:
                # Echo the new record to stdout in addition to the normal
                # output below. Parenthesised single-argument print behaves
                # the same on Python 2 and 3.
                print(str(mult))
                print(s_ids)
                print(t_ids)
            if args.keep_new_order:
                # Emit immediately so new pairs keep their input order at
                # the top of the file.
                bitext_fh.write('%s\n%s\n%s\n' % (mult, s_ids, t_ids))
            else:
                bitext[(s_ids, t_ids)] += mult

    # Previous pairs, plus the new ones when they were merged into bitext.
    for (s, t), cnt in bitext.items():
        bitext_fh.write('%s\n%s\n%s\n' % (cnt, s, t))

with open(args.out + '.sv', 'w') as vocab_fh:
    print_vocab(vocab_s, vocab_fh)
with open(args.out + '.tv', 'w') as vocab_fh:
    print_vocab(vocab_t, vocab_fh)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment