Last active
November 25, 2016 07:43
-
-
Save olejorgenb/ebb29ded97cb955cd5a5df4cb1538d3c to your computer and use it in GitHub Desktop.
Messy code that renders GIZA++ word alignments to LaTeX
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
# The generated tables require the following LaTeX preamble in the document
# that includes them: `turn` comes from rotating, `\cellcolor` from colortbl,
# `\hhline` from hhline, and `lightgray` is the colour used for one-directional
# alignments.
# NOTE(review): the variable name is a typo for "preamble" and nothing in this
# file reads it; it appears to serve as documentation only. The doubled "%%"
# is emitted literally (this raw string is never %-formatted), which LaTeX
# still treats as the start of a comment.
preamle = r"""
\usepackage{rotating} %% rotated column labels
\usepackage{colortbl} %% Colored cells
\usepackage{hhline} %% Borders that play nice with colortbl
\definecolor{lightgray}{gray}{0.8}
"""
# # Sentence pair (1) | |
# il s' agit de la même société qui a changé de propriétaires | |
# NULL ({ }) UNK ({ }) UNK ({ }) ( ({ }) this ({ 4 11 }) is ({ }) the ({ }) same ({ 6 }) agency ({ }) which ({ 8 }) has ({ }) undergone ({ 1 2 3 7 9 10 12 }) a ({ }) change ({ 5 }) of ({ }) UNK ({ }) | |
# # Sentence pair (2) | |
# UNK UNK , le propriétaire , dit que cela s' est produit si rapidement qu' il n' en connaît pas la cause exacte | |
# NULL ({ 4 }) UNK ({ 1 2 }) UNK ({ }) , ({ 3 }) the ({ }) owner ({ 5 22 23 }) , ({ 6 }) says ({ 7 8 }) it ({ }) happened ({ 10 11 12 }) so ({ 13 }) fast ({ 14 19 }) he ({ 16 }) is ({ }) not ({ 20 }) sure ({ 15 17 }) what ({ }) went ({ 18 21 }) wrong ({ 9 }) | |
# | |
# The alignment file is represented by three lines for each sentence | |
# pair. The first line is a label that can be used, e.g., as a caption | |
# for alignment visualization tools. It contains information about the | |
# sentence sequential number in the training corpus, sentence lengths, | |
# and alignment probability. The second line is the target sentence, the | |
# third line is the source sentence. Each token in the source sentence | |
# is followed by a set of zero or more numbers. These numbers represent | |
# the positions of the target words to which this source word is | |
# connected, according to the alignment. | |
import re | |
def parse_alignment_spec(source):
    """Parse the source line of a GIZA++ alignment (*.A.n) record.

    The line is a sequence of ``word ({ n1 n2 ... })`` groups, where the
    numbers are 1-based positions of the target words that ``word`` is
    aligned to.

    Args:
        source: The third line of an alignment record, as a string.

    Returns:
        A list of ``(word, indices)`` tuples in the order the words appear.
        ``indices`` is a list of 0-based word positions into the target
        sentence, meaning that ``word`` is the source of those target words.
        Text that does not match the ``word ({ ... })`` shape is ignored.
    """
    alignment = []
    for match in re.finditer(r"(\S+) \(\{ ([0-9 ]*)\}\)", source):
        word, positions = match.groups()
        # NB: the spec uses 1-based indexing; convert to 0-based here.
        alignment.append((word, [int(p) - 1 for p in positions.split()]))
    return alignment
def parse_sentence(string):
    """
    Split a plain, whitespace-separated sentence into words and wrap each
    word in the same ``(word, alignment)`` shape the alignment parser
    produces, using an empty alignment list as a placeholder.
    """
    pairs = []
    for word in string.strip().split():
        # Each word gets its own fresh (empty) alignment list.
        pairs.append((word, []))
    return pairs
def parse_alignment_file(A, limit=None):
    """Iterate over the sentence pairs of a GIZA++ alignment file.

    Each record consists of three lines: a label line, the target sentence
    and the source sentence with its alignment spec.

    Args:
        A: A text file object (anything with ``readline()``) positioned at
            the start of a record.
        limit: Maximum number of records to yield, or ``None`` for all.

    Yields:
        ``(alignment, target)`` tuples, where ``alignment`` is the parsed
        source spec (see ``parse_alignment_spec``) and ``target`` is the
        target sentence in the dummy alignment format produced by
        ``parse_sentence``.
    """
    count = 0
    label = A.readline()
    while label != "" and (limit is None or count < limit):
        target_line = A.readline()
        source_line = A.readline()
        if source_line == "":
            # EOF in the middle of a record (truncated file or a trailing
            # blank line): stop instead of yielding a bogus empty pair.
            break
        alignment = parse_alignment_spec(source_line.strip())
        yield alignment, parse_sentence(target_line.strip())
        count += 1
        label = A.readline()
def source_align_p(source, target, i, j):
    """Tell whether source word ``i`` lists target position ``j``.

    Only the source-side alignment is consulted; ``target`` is accepted so
    that all alignment predicates share one signature.
    """
    _word, linked_positions = source[i]
    is_linked = j in linked_positions
    return is_linked
def intersection_align_p(source, target, i, j):
    """Tell whether both directions agree that word ``i`` and word ``j`` are linked.

    True only when source word ``i`` lists ``j`` and target word ``j``
    lists ``i``.
    """
    _, forward = source[i]
    _, backward = target[j]
    if j not in forward:
        return False
    return i in backward
def union_align_p(source, target, i, j):
    """Tell whether at least one direction links word ``i`` and word ``j``.

    True when source word ``i`` lists ``j`` or target word ``j`` lists ``i``.
    """
    _, forward = source[i]
    _, backward = target[j]
    if j in forward:
        return True
    return i in backward
def align_type(source, target, i, j):
    """Classify the link between source word ``i`` and target word ``j``.

    Returns one of:
        "common"      - both directions contain the link
        "aligned"     - only the source side lists ``j``
        "inv_aligned" - only the target side lists ``i``
        "unaligned"   - neither side contains the link
    """
    _, forward = source[i]
    _, backward = target[j]
    # Keyed by (source lists j, target lists i).
    labels = {
        (True, True): "common",
        (True, False): "aligned",
        (False, True): "inv_aligned",
        (False, False): "unaligned",
    }
    return labels[(j in forward, i in backward)]
def alignment_color(*args):
    """Map the link type of a table cell to its LaTeX colour name.

    Takes the same arguments as ``align_type``. Links present in both
    directions are black, links present in one direction only are
    lightgray, and cells without a link get ``None`` (no colour).
    """
    link_kind = align_type(*args)
    palette = {
        "unaligned": None,
        "inv_aligned": "lightgray",
        "aligned": "lightgray",
        "common": "black",
    }
    return palette[link_kind]
def alignment_color_inv(source, target, i, j):
    """Colour a cell with the roles of the two sentences swapped.

    Calls ``alignment_color`` with ``source``/``target`` and ``i``/``j``
    exchanged.
    """
    swapped_args = (target, source, j, i)
    return alignment_color(*swapped_args)
#\begin{tabular}{c|c|c|c|c|} | |
#\multicolumn{1}{c}{\begin{turn}{90} \end{turn}} & | |
#\multicolumn{1}{c}{\begin{turn}{90} hund \end{turn}} & | |
#\multicolumn{1}{c}{\begin{turn}{90} bet \end{turn}} & | |
#\multicolumn{1}{c}{\begin{turn}{90} hund \end{turn}} & | |
#\multicolumn{1}{c}{\begin{turn}{90} \end{turn}} | |
#\tabularnewline | |
#\cline{2-5} | |
#dog & \cellcolor{black} & & & | |
#\tabularnewline | |
#\cline{2-5} | |
#bit & & & & | |
#\tabularnewline | |
#\cline{2-5} | |
#dog & & & & | |
#\tabularnewline | |
#\cline{2-5} | |
#\end{tabular} | |
# NB: \cline and \cellcolor don't play well together
# Characters with special meaning in LaTeX text and their safe replacements.
_LATEX_SPECIALS = str.maketrans({
    "\\": r"\textbackslash{}",
    "&": r"\&",
    "%": r"\%",
    "$": r"\$",
    "#": r"\#",
    "_": r"\_",
    "{": r"\{",
    "}": r"\}",
    "~": r"\textasciitilde{}",
    "^": r"\textasciicircum{}",
})


def _latex_escape(word):
    """Escape LaTeX special characters in a single word (one pass)."""
    return word.translate(_LATEX_SPECIALS)


def render_alignment_table(source, target, cell_color_fn=alignment_color):
    r"""Render an alignment matrix as a LaTeX ``tabular``.

    Rows are labelled with the words of ``source`` and columns with the
    (rotated) words of ``target``. Words are escaped so that tokens such as
    ``&``, ``%`` or ``_`` do not break the table.

    Args:
        source: List of ``(word, alignment)`` tuples, one per table row.
        target: List of ``(word, alignment)`` tuples, one per table column.
        cell_color_fn: Called as ``cell_color_fn(source, target, i, j)`` for
            row ``i`` and column ``j``; returns a LaTeX colour name, or a
            falsy value to leave the cell uncoloured.

    Returns:
        The LaTeX source of the table as a single string.
    """
    column_count = len(target)  # label column excluded

    out = [r"\begin{tabular}{r|%s}" % ("m{2ex}|" * column_count)]
    # \hhline is used instead of \cline so that the borders survive
    # \cellcolor; "~" leaves the label column without a rule.
    hline = r"\hhline{~|%s}" % ("-|" * column_count)
    row_sep = r"\tabularnewline " + hline

    # Header: \multicolumn turns off the vertical borders for these cells.
    # The first (empty) cell sits above the row-label column.
    header_labels = [""] + [_latex_escape(w) for w, _ in target]
    header = [
        r"\multicolumn{1}{c}{\begin{turn}{90} %s \end{turn}}" % (label,)
        for label in header_labels
    ]
    out.append(" & \n".join(header))
    out.append(row_sep)

    for i, (word, _) in enumerate(source):
        row_columns = [_latex_escape(word)]
        for j in range(column_count):
            color = cell_color_fn(source, target, i, j)
            row_columns.append(r"\cellcolor{%s}" % color if color else "")
        out.append(" & ".join(row_columns))
        out.append(row_sep)

    out.append(r"\end{tabular}")
    return "\n".join(out)
def pp_alignment_table(source, target, align_p=source_align_p):
    """Pretty-print an alignment matrix to stdout as plain text.

    The target words are written vertically (bottom-aligned) as column
    headers, followed by one row per source word in which ``#`` marks an
    aligned cell.

    Args:
        source: List of ``(word, alignment)`` tuples, one per row.
        target: List of ``(word, alignment)`` tuples, one per column.
        align_p: Predicate called as ``align_p(source, target, i, j)``;
            a truthy result marks cell (row ``i``, column ``j``) with ``#``.
    """
    target_words = [w for w, _ in target]
    row_format = "{:>10} " + ("{:s} " * len(target))
    row_label_padding = " " * 10

    # default=0 keeps an empty target sentence from raising ValueError.
    header_height = max(map(len, target_words), default=0)

    header_lines = []
    # depth counts characters from the end of each word; emitting the
    # largest depth first puts the last character of every word on the
    # bottom header line.
    for depth in reversed(range(header_height)):
        cells = "".join(
            " " + (w[-(depth + 1)] if depth < len(w) else " ")
            for w in target_words
        )
        header_lines.append(row_label_padding + cells)
    print("\n".join(header_lines))

    for i, (word, _) in enumerate(source):
        row = ["#" if align_p(source, target, i, j) else " "
               for j in range(len(target))]
        print(row_format.format(word, *row))
import sys | |
if __name__ == '__main__':
    # Usage:
    #   script < ALIGNMENT_FILE
    #       Render every sentence pair read from stdin to stdout.
    #   script A_TO_B_FILE B_TO_A_FILE FIGURE_PREFIX
    #       Write three .tex files per sentence pair (a-to-b, b-to-a and
    #       combined), named "<FIGURE_PREFIX>-<index>-<kind>.tex".

    def write_file(content, filename):
        """Write ``content`` to ``filename``, replacing any existing file."""
        with open(filename, "w") as out:
            out.write(content)

    if len(sys.argv) < 3:
        for i, (a, b) in enumerate(parse_alignment_file(sys.stdin)):
            print(render_alignment_table(a, b))
            # Break the line after every second table.
            if i % 2 == 1:
                print(r"\newline")
        sys.exit(0)

    if len(sys.argv) < 4:
        # Two files but no prefix: fail with a clear message instead of an
        # IndexError on sys.argv[3].
        sys.exit("usage: %s A_TO_B_FILE B_TO_A_FILE FIGURE_PREFIX"
                 % sys.argv[0])

    figure_prefix = sys.argv[3]

    with open(sys.argv[1]) as AtoB, open(sys.argv[2]) as BtoA:
        for i, (ab, ba) in enumerate(zip(
                parse_alignment_file(AtoB),
                parse_alignment_file(BtoA))):
            # Skip NULL tokens. This also makes the alignment indices correct
            # (the indices are into the target sentence which doesn't include NULL)
            a_alignment = ab[0][1:]
            b = ab[1]
            b_alignment = ba[0][1:]
            a = ba[1]

            write_file(render_alignment_table(a_alignment, b),
                       "%s-%d-a-to-b.tex" % (figure_prefix, i))
            write_file(render_alignment_table(a, b_alignment, alignment_color_inv),
                       "%s-%d-b-to-a.tex" % (figure_prefix, i))
            write_file(render_alignment_table(a_alignment, b_alignment),
                       "%s-%d-combined.tex" % (figure_prefix, i))
            # render_alignment_table(b_alignment, a_alignment)
            # render_alignment_table(a_alignment, b_alignment, intersection_align_p)
            # render_alignment_table(a_alignment, b_alignment, union_align_p)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment