Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
Messy code that renders GIZA++ word alignments to LaTeX
#!/usr/bin/env python3
# Requires the following preamble:
# LaTeX preamble that documents using the generated tables must include.
preamble = r"""
\usepackage{rotating} %% rotated column labels
\usepackage{colortbl} %% Colored cells
\usepackage{hhline} %% Borders that play nice with colortbl
\definecolor{lightgray}{gray}{0.8}
"""
# Backward-compatible alias for the original (misspelled) name.
preamle = preamble
# # Sentence pair (1)
# il s' agit de la même société qui a changé de propriétaires
# NULL ({ }) UNK ({ }) UNK ({ }) ( ({ }) this ({ 4 11 }) is ({ }) the ({ }) same ({ 6 }) agency ({ }) which ({ 8 }) has ({ }) undergone ({ 1 2 3 7 9 10 12 }) a ({ }) change ({ 5 }) of ({ }) UNK ({ })
# # Sentence pair (2)
# UNK UNK , le propriétaire , dit que cela s' est produit si rapidement qu' il n' en connaît pas la cause exacte
# NULL ({ 4 }) UNK ({ 1 2 }) UNK ({ }) , ({ 3 }) the ({ }) owner ({ 5 22 23 }) , ({ 6 }) says ({ 7 8 }) it ({ }) happened ({ 10 11 12 }) so ({ 13 }) fast ({ 14 19 }) he ({ 16 }) is ({ }) not ({ 20 }) sure ({ 15 17 }) what ({ }) went ({ 18 21 }) wrong ({ 9 })
#
# The alignment file is represented by three lines for each sentence
# pair. The first line is a label that can be used, e.g., as a caption
# for alignment visualization tools. It contains information about the
# sentence sequential number in the training corpus, sentence lengths,
# and alignment probability. The second line is the target sentence, the
# third line is the source sentence. Each token in the source sentence
# is followed by a set of zero or more numbers. These numbers represent
# the positions of the target words to which this source word is
# connected, according to the alignment.
import re
# One "word ({ p1 p2 ... })" entry of a GIZA++ alignment line: group 1 is the
# word, group 2 the (possibly empty) space-separated list of positions.
_ALIGNMENT_ENTRY_RE = re.compile(r"(\S+) \(\{ ([0-9 ]*)\}\)")


def parse_alignment_spec(source):
    """
    Parse a GIZA++ alignment (*.A.n) source spec.

    Returns a list of ``(word, positions)`` tuples, one per entry found in
    *source*, in order of appearance.  ``positions`` is a list of integer
    indices into the target sentence, i.e. the target words that ``word``
    is the source of.  Text that does not match the entry pattern is
    ignored, so an empty or malformed line yields an empty list.
    """
    return [
        # NB: the spec uses 1-based indexing; convert to 0-based.
        (word, [int(position) - 1 for position in positions.split()])
        for word, positions in _ALIGNMENT_ENTRY_RE.findall(source)
    ]
def parse_sentence(string):
    """
    Split a plain sentence on whitespace and wrap every word in the same
    ``(word, positions)`` shape used for parsed alignments, with an empty
    (dummy) position list for each word.
    """
    words = string.strip().split()
    dummy_alignment = []
    for word in words:
        # A fresh list per word so the entries never share state.
        dummy_alignment.append((word, []))
    return dummy_alignment
def parse_alignment_file(A, limit=None):
    """
    Lazily read sentence pairs from an open GIZA++ alignment file *A*.

    Records are three lines each: a label line (read and discarded), the
    target sentence, and the source alignment spec.  For every record this
    yields ``(source_alignment, target_sentence)`` as produced by
    ``parse_alignment_spec`` and ``parse_sentence``.  Iteration stops at end
    of file, or once *limit* records have been yielded when *limit* is not
    None.
    """
    yielded = 0
    while True:
        # The label line is consumed before the limit is checked.
        label = A.readline()
        if label == "":
            return
        if limit is not None and yielded >= limit:
            return
        target_line = A.readline().strip()
        source_line = A.readline().strip()
        source_alignment = parse_alignment_spec(source_line)
        yield source_alignment, parse_sentence(target_line)
        yielded += 1
def source_align_p(source, target, i, j):
    """
    Predicate: does source entry *i* list target position *j*?

    Only the source side is consulted; *target* is accepted so the function
    has the same signature as the other alignment predicates.
    """
    _word, linked_positions = source[i]
    is_linked = j in linked_positions
    return is_linked
def intersection_align_p(source, target, i, j):
    """
    Predicate: True only when both directions agree, i.e. source entry *i*
    lists position *j* and target entry *j* lists position *i*.
    """
    _, forward_links = source[i]
    _, backward_links = target[j]
    if j not in forward_links:
        return False
    return i in backward_links
def union_align_p(source, target, i, j):
    """
    Predicate: True when either direction links the pair, i.e. source entry
    *i* lists position *j* or target entry *j* lists position *i*.
    """
    _, forward_links = source[i]
    _, backward_links = target[j]
    if j in forward_links:
        return True
    return i in backward_links
def align_type(source, target, i, j):
    """
    Classify the cell (*i*, *j*) of a bidirectional alignment.

    Returns "common" when both directions link the pair, "aligned" when only
    source entry *i* lists *j*, "inv_aligned" when only target entry *j*
    lists *i*, and "unaligned" when neither does.
    """
    _, forward_links = source[i]
    _, backward_links = target[j]
    labels = {
        (True, True): "common",
        (True, False): "aligned",
        (False, True): "inv_aligned",
        (False, False): "unaligned",
    }
    return labels[(j in forward_links, i in backward_links)]
def alignment_color(*args):
    """
    Map the ``align_type`` classification of a cell to a color name.

    Arguments are forwarded unchanged to ``align_type``.  Links present in
    both directions are "black", links present in one direction only are
    "lightgray", and unlinked cells get None (no color).
    """
    shade_by_kind = dict(
        common="black",
        aligned="lightgray",
        inv_aligned="lightgray",
        unaligned=None,
    )
    kind = align_type(*args)
    return shade_by_kind[kind]
def alignment_color_inv(source, target, i, j):
    """
    Same as ``alignment_color`` but with the two sides (and their indices)
    swapped, for tables whose alignment data lives on the target side.
    """
    swapped = (target, source, j, i)
    return alignment_color(*swapped)
#\begin{tabular}{c|c|c|c|c|}
#\multicolumn{1}{c}{\begin{turn}{90} \end{turn}} &
#\multicolumn{1}{c}{\begin{turn}{90} hund \end{turn}} &
#\multicolumn{1}{c}{\begin{turn}{90} bet \end{turn}} &
#\multicolumn{1}{c}{\begin{turn}{90} hund \end{turn}} &
#\multicolumn{1}{c}{\begin{turn}{90} \end{turn}}
#\tabularnewline
#\cline{2-5}
#dog & \cellcolor{black} & & &
#\tabularnewline
#\cline{2-5}
#bit & & & &
#\tabularnewline
#\cline{2-5}
#dog & & & &
#\tabularnewline
#\cline{2-5}
#\end{tabular}
# NB: \cline and \cellcolor don't like each other
# LaTeX replacements for characters that are special in text mode.
_LATEX_SPECIAL_CHARS = {
    "\\": r"\textbackslash{}",
    "&": r"\&",
    "%": r"\%",
    "$": r"\$",
    "#": r"\#",
    "_": r"\_",
    "{": r"\{",
    "}": r"\}",
    "~": r"\textasciitilde{}",
    "^": r"\textasciicircum{}",
}


def _latex_escape(text):
    """Return *text* with LaTeX special characters replaced by safe forms."""
    # Character-by-character so replacements are never escaped a second time.
    return "".join(_LATEX_SPECIAL_CHARS.get(ch, ch) for ch in text)


def render_alignment_table(source, target, cell_color_fn=alignment_color):
    """
    Render an alignment matrix as the source of a LaTeX ``tabular``.

    *source* and *target* are lists of ``(word, positions)`` pairs.  Each
    source word becomes a row and each target word a column with a rotated
    header.  ``cell_color_fn(source, target, i, j)`` is called for every
    cell and returns a color name for ``\\cellcolor`` or a false value to
    leave the cell empty.

    Words are escaped so corpus tokens such as ``&``, ``%`` or ``_`` cannot
    break the table.  Returns the table as a single string.
    """
    out = []
    target_words = [w for w, _ in target]
    column_count = len(target)  # label column excluded
    out.append(r"\begin{tabular}{r|%s}" % ("m{2ex}|" * column_count))
    # \hhline instead of \cline: \cline gets painted over by \cellcolor.
    hline = r"\hhline{~|%s}" % ("-|" * column_count)
    row_sep = r"\tabularnewline " + hline

    # Header row; the leading empty label sits above the row-label column.
    # \multicolumn turns off the vertical borders for the header cells.
    header = [
        r"\multicolumn{1}{c}{\begin{turn}{90} %s \end{turn}}" % _latex_escape(w)
        for w in [""] + target_words
    ]
    out.append(" & \n".join(header))
    out.append(row_sep)

    # One row per source word: label cell followed by the colored cells.
    for i, (w, _) in enumerate(source):
        row_columns = [_latex_escape(w)]
        for j in range(column_count):
            color = cell_color_fn(source, target, i, j)
            row_columns.append(r"\cellcolor{%s}" % color if color else "")
        out.append(" & ".join(row_columns))
        out.append(row_sep)

    out.append(r"\end{tabular}")
    return "\n".join(out)
def pp_alignment_table(source, target, align_p=source_align_p):
    """
    Print an alignment matrix to stdout as plain text.

    Target words are written vertically as column headers (bottom-aligned),
    source words are right-aligned row labels in a 10-character column, and
    a ``#`` marks every cell for which ``align_p(source, target, i, j)`` is
    true.  For example, with target "her hund" and source "is dog"::

                     h
                   h u
                   e n
                   r d
                is   #
               dog #

    An empty *target* prints an empty header instead of raising.
    """
    row_format = "{:>10} " + ("{:s} " * len(target))
    row_label_padding = " " * 10
    target_words = [w for w, _ in target]
    # default=0 keeps an empty target sentence from raising ValueError.
    header_height = max((len(w) for w in target_words), default=0)

    header_lines = []
    # Top line first: it holds the characters furthest from each word's end.
    for depth in range(header_height - 1, -1, -1):
        line = row_label_padding
        for w in target_words:
            line += " "
            line += w[-(depth + 1)] if depth < len(w) else " "
        header_lines.append(line)
    print("\n".join(header_lines))

    for i, (w, _) in enumerate(source):
        row = ["#" if align_p(source, target, i, j) else " "
               for j in range(len(target))]
        print(row_format.format(w, *row))
import sys
if __name__ == '__main__':
    # Usage:
    #   script < ALIGNMENT_FILE            render every pair to stdout
    #   script A_TO_B B_TO_A FIGURE_PREFIX write three .tex files per pair

    def write_file(content, filename):
        """Write *content* to *filename*, replacing any existing file."""
        with open(filename, "w") as out:
            out.write(content)

    if len(sys.argv) < 3:
        # Filter mode: read one alignment file from stdin and print a table
        # per sentence pair, with a \newline after every second table.
        for i, (a, b) in enumerate(parse_alignment_file(sys.stdin)):
            print(render_alignment_table(a, b))
            if i % 2 == 1:
                print(r"\newline")
        sys.exit(0)

    if len(sys.argv) < 4:
        # Two files without an output prefix used to die with IndexError.
        sys.exit("usage: %s A_TO_B B_TO_A FIGURE_PREFIX" % sys.argv[0])

    figure_prefix = sys.argv[3]
    with open(sys.argv[1]) as AtoB, open(sys.argv[2]) as BtoA:
        for i, (ab, ba) in enumerate(zip(
                parse_alignment_file(AtoB),
                parse_alignment_file(BtoA))):
            # Skip NULL tokens. This also makes the alignment indices correct
            # (the indices are into the target sentence which doesn't include NULL)
            a_alignment = ab[0][1:]
            b = ab[1]
            b_alignment = ba[0][1:]
            a = ba[1]
            write_file(render_alignment_table(a_alignment, b),
                       "%s-%d-a-to-b.tex" % (figure_prefix, i))
            write_file(render_alignment_table(a, b_alignment, alignment_color_inv),
                       "%s-%d-b-to-a.tex" % (figure_prefix, i))
            write_file(render_alignment_table(a_alignment, b_alignment),
                       "%s-%d-combined.tex" % (figure_prefix, i))
            # render_alignment_table(b_alignment, a_alignment)
            # render_alignment_table(a_alignment, b_alignment, intersection_align_p)
            # render_alignment_table(a_alignment, b_alignment, union_align_p)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment