olejorgenb/render_giza_alignments.py

## render_giza_alignments.py
#!/usr/bin/env python3

# LaTeX preamble required by documents that include the tables generated by
# render_alignment_table(): `rotating` supplies the `turn` environment used
# for the column labels, `colortbl` supplies \cellcolor, `hhline` supplies
# \hhline, and `lightgray` is the fill colour returned by alignment_color()
# for one-directional alignments.
# The string is not referenced by the code visible in this file; copy its
# contents into the including document.
# NOTE(review): the variable name is misspelled ("preamle"); it is left
# unchanged here so that any external user of the name keeps working.
preamle = r"""
\usepackage{rotating} %% rotated column labels
\usepackage{colortbl} %% Colored cells
\usepackage{hhline}   %% Borders that play nice with colortbl
\definecolor{lightgray}{gray}{0.8}
"""


# # Sentence pair (1)
# il s' agit de la même société qui a changé de propriétaires
# NULL ({ }) UNK ({ }) UNK ({ }) ( ({ }) this ({ 4 11 }) is ({ }) the ({ }) same ({ 6 }) agency ({ }) which ({ 8 }) has ({ }) undergone ({ 1 2 3 7 9 10 12 }) a ({ }) change ({ 5 }) of ({ }) UNK ({ })
# # Sentence pair (2)
# UNK UNK , le propriétaire , dit que cela s' est produit si rapidement qu' il n' en connaît pas la cause exacte
# NULL ({ 4 }) UNK ({ 1 2 }) UNK ({ }) , ({ 3 }) the ({ }) owner ({ 5 22 23 }) , ({ 6 }) says ({ 7 8 }) it ({ }) happened ({ 10 11 12 }) so ({ 13 }) fast ({ 14 19 }) he ({ 16 }) is ({ }) not ({ 20 }) sure ({ 15 17 }) what ({ }) went ({ 18 21 }) wrong ({ 9 })
#
# The alignment file is represented by three lines for each sentence
# pair. The first line is a label that can be used, e.g., as a caption
# for alignment visualization tools.  It contains information about the
# sentence sequential number in the training corpus, sentence lengths,
# and alignment probability. The second line is the target sentence, the
# third line is the source sentence. Each token in the source sentence
# is followed by a set of zero or more numbers. These numbers represent
# the positions of the target words to which this source word is
# connected, according to the alignment.

import re

def parse_alignment_spec(source):
    """Parse the source line of a GIZA++ alignment (*.A.n) record.

    The line is a sequence of ``word ({ i j ... })`` groups, for example
    ``NULL ({ }) this ({ 4 11 }) is ({ })``.

    Args:
        source: The source-sentence line of a sentence-pair record.

    Returns:
        A list of ``(word, positions)`` tuples in sentence order.
        ``positions`` holds the 0-based indices of the target words that
        ``word`` is the source of.  Text that does not match the group
        pattern is ignored.
    """
    # NB: The spec uses 1-based indexing, so every position is shifted down
    # by one to index directly into the target word list.
    return [
        (m.group(1), [int(pos) - 1 for pos in m.group(2).split()])
        for m in re.finditer(r"(\S+) \(\{ ([0-9 ]*)\}\)", source)
    ]

def parse_sentence(string):
    """Split a plain sentence into the dummy alignment representation.

    Every whitespace-separated word becomes a ``(word, [])`` tuple, i.e. the
    same shape parse_alignment_spec() produces but with no alignment links.
    Each tuple gets its own fresh, empty list.
    """
    tokens = string.strip().split()
    return [(token, []) for token in tokens]

def parse_alignment_file(A, limit=None):
    """Iterate over the sentence pairs of a GIZA++ alignment file.

    Each record consists of three lines: a label line, the target sentence
    and the source sentence annotated with alignment sets.

    Args:
        A: A text stream providing ``readline()``.
        limit: Maximum number of records to yield; ``None`` means no limit.

    Yields:
        ``(alignment, target)`` tuples, where ``alignment`` is the result of
        parse_alignment_spec() on the source line and ``target`` is the
        result of parse_sentence() on the target line.
    """
    count = 0
    while limit is None or count < limit:
        label = A.readline()
        if label == "":
            # readline() returns the empty string only at end of file.
            return
        if not label.strip():
            # A blank line where a label is expected (e.g. a trailing
            # newline) is not a record; skipping it avoids yielding an
            # empty sentence pair.
            continue
        target = A.readline().strip()
        source = A.readline().strip()
        yield parse_alignment_spec(source), parse_sentence(target)
        count += 1

def source_align_p(source, target, i, j):
    """Return True when source word ``i`` is aligned to target position ``j``.

    Only the alignment stored on the source side is consulted; ``target`` is
    accepted so the signature matches the other alignment predicates.
    """
    _word, linked_positions = source[i]
    return j in linked_positions

def intersection_align_p(source, target, i, j):
    """Return True when the link (i, j) is present in both directions.

    That is, source word ``i`` lists target position ``j`` and target word
    ``j`` lists source position ``i``.
    """
    _, forward = source[i]
    _, backward = target[j]
    if j not in forward:
        return False
    return i in backward

def union_align_p(source, target, i, j):
    """Return True when the link (i, j) is present in at least one direction.

    That is, source word ``i`` lists target position ``j`` or target word
    ``j`` lists source position ``i``.
    """
    _, forward = source[i]
    _, backward = target[j]
    if j in forward:
        return True
    return i in backward

def align_type(source, target, i, j):
    """Classify the link between source word ``i`` and target word ``j``.

    Returns:
        ``"common"`` when both sides list each other, ``"aligned"`` when only
        the source side lists ``j``, ``"inv_aligned"`` when only the target
        side lists ``i``, and ``"unaligned"`` otherwise.
    """
    _, forward = source[i]
    _, backward = target[j]
    # Keyed by (source lists j, target lists i).
    labels = {
        (True, True): "common",
        (True, False): "aligned",
        (False, True): "inv_aligned",
        (False, False): "unaligned",
    }
    return labels[(j in forward, i in backward)]

def alignment_color(*args):
    """Map a link to the cell colour used when rendering it.

    The arguments are forwarded unchanged to align_type().  Links present in
    both directions are ``"black"``, links present in one direction only are
    ``"lightgray"``, and missing links give ``None`` (no colour).
    """
    palette = dict(
        common="black",
        aligned="lightgray",
        inv_aligned="lightgray",
        unaligned=None,
    )
    link_kind = align_type(*args)
    return palette[link_kind]

def alignment_color_inv(source, target, i, j):
    """Return alignment_color() for the link with the two sides swapped.

    Useful when the alignment data is stored on the sentence passed as
    ``target`` rather than on ``source``.
    """
    swapped = (target, source, j, i)
    return alignment_color(*swapped)


#\begin{tabular}{c|c|c|c|c|}
#\multicolumn{1}{c}{\begin{turn}{90} \end{turn}} &
#\multicolumn{1}{c}{\begin{turn}{90} hund \end{turn}} &
#\multicolumn{1}{c}{\begin{turn}{90} bet \end{turn}} &
#\multicolumn{1}{c}{\begin{turn}{90} hund \end{turn}} &
#\multicolumn{1}{c}{\begin{turn}{90} \end{turn}}
#\tabularnewline
#\cline{2-5}
#dog & \cellcolor{black} &  &  &
#\tabularnewline
#\cline{2-5}
#bit &  &  &  &
#\tabularnewline
#\cline{2-5}
#dog &  &  &  &
#\tabularnewline
#\cline{2-5}
#\end{tabular}

# NB: \cline and \cellcolor do not work well together, so the table rules are drawn with \hhline instead
# Replacements for the characters LaTeX treats specially in running text.
_LATEX_SPECIALS = str.maketrans({
    "\\": r"\textbackslash{}",
    "&": r"\&",
    "%": r"\%",
    "$": r"\$",
    "#": r"\#",
    "_": r"\_",
    "{": r"\{",
    "}": r"\}",
    "~": r"\textasciitilde{}",
    "^": r"\textasciicircum{}",
})


def _latex_escape(text):
    """Return ``text`` with LaTeX special characters escaped (single pass)."""
    return text.translate(_LATEX_SPECIALS)


def render_alignment_table(source, target, cell_color_fn=alignment_color):
    """Render an alignment matrix as a LaTeX ``tabular`` environment.

    Rows are the words of ``source`` and columns are the words of ``target``;
    the column labels are rotated by 90 degrees.  Words are escaped so that
    tokens such as ``%``, ``&`` or ``_`` cannot break the generated table.

    Args:
        source: List of ``(word, positions)`` tuples for the row sentence.
        target: List of ``(word, positions)`` tuples for the column sentence.
        cell_color_fn: Callable ``(source, target, i, j)`` returning a LaTeX
            colour name for cell (row ``i``, column ``j``), or a falsy value
            for an uncoloured cell.

    Returns:
        The LaTeX source of the table as a single string.
    """
    column_count = len(target)  # label column excluded
    # Horizontal rules are drawn with \hhline rather than \cline, because
    # \cline and \cellcolor do not work well together.
    hline = r"\hhline{~|%s}" % ("-|" * column_count)
    row_sep = r"\tabularnewline " + hline

    out = [r"\begin{tabular}{r|%s}" % ("m{2ex}|" * column_count)]

    # Header: an empty corner cell followed by one rotated label per target
    # word.  \multicolumn turns off the vertical borders for these cells.
    header = [
        r"\multicolumn{1}{c}{\begin{turn}{90} %s \end{turn}}" % label
        for label in [""] + [_latex_escape(w) for w, _ in target]
    ]
    out.append(" & \n".join(header))
    out.append(row_sep)

    for i, (word, _) in enumerate(source):
        cells = [_latex_escape(word)]
        for j in range(column_count):
            color = cell_color_fn(source, target, i, j)
            cells.append(r"\cellcolor{%s}" % color if color else "")
        out.append(" & ".join(cells))
        out.append(row_sep)

    out.append(r"\end{tabular}")
    return "\n".join(out)

def pp_alignment_table(source, target, align_p=source_align_p):
    """Print an alignment matrix as plain text.

    Target words are written vertically, bottom-aligned, above the matrix;
    each source word labels one row and ``#`` marks an aligned cell::

              h
              u
            e n
            r d
         is #
        dog   #

    Args:
        source: List of ``(word, positions)`` tuples for the row sentence.
        target: List of ``(word, positions)`` tuples for the column sentence.
        align_p: Predicate ``(source, target, i, j)`` deciding whether cell
            (row ``i``, column ``j``) is marked.
    """
    target_words = [w for w, _ in target]
    label_padding = " " * 10
    # default=0 keeps an empty target sentence from raising ValueError.
    header_height = max((len(w) for w in target_words), default=0)

    # Build the header top to bottom.  ``depth`` counts characters from the
    # end of each word, so shorter words get blank cells at the top.
    header_lines = []
    for depth in range(header_height, 0, -1):
        chars = [w[-depth] if depth <= len(w) else " " for w in target_words]
        header_lines.append(label_padding + "".join(" " + c for c in chars))

    print("\n".join(header_lines))

    row_format = "{:>10} " + ("{:s} " * len(target))
    for i, (word, _) in enumerate(source):
        marks = ["#" if align_p(source, target, i, j) else " "
                 for j in range(len(target))]
        print(row_format.format(word, *marks))


import sys

if __name__ == '__main__':
    # Usage:
    #   script < ALIGNMENT_FILE                  print one table per pair
    #   script A_TO_B_FILE B_TO_A_FILE PREFIX    write three .tex files per pair
    def write_file(content, filename):
        """Write ``content`` to ``filename``, replacing any existing file."""
        with open(filename, "w") as out:
            out.write(content)

    if len(sys.argv) < 3:
        # Fewer than two file arguments: read a single alignment file from
        # stdin and print the tables, with a line break after every second.
        for i, (a, b) in enumerate(parse_alignment_file(sys.stdin)):
            print(render_alignment_table(a, b))
            if i % 2 == 1:
                print(r"\newline")
        sys.exit(0)

    if len(sys.argv) < 4:
        # Two files but no output prefix: sys.argv[3] below would otherwise
        # fail with an IndexError.
        sys.exit("usage: %s [A_TO_B_FILE B_TO_A_FILE FIGURE_PREFIX]"
                 % sys.argv[0])

    figure_prefix = sys.argv[3]
    with open(sys.argv[1]) as AtoB, open(sys.argv[2]) as BtoA:
        # zip() stops at the shorter of the two files.
        for i, (ab, ba) in enumerate(zip(
                parse_alignment_file(AtoB),
                parse_alignment_file(BtoA))):
            # Skip NULL tokens. This also makes the alignment indices correct
            # (the indices are into the target sentence which doesn't include NULL)
            a_alignment = ab[0][1:]
            b = ab[1]
            b_alignment = ba[0][1:]
            a = ba[1]
            write_file(render_alignment_table(a_alignment, b),
                       "%s-%d-a-to-b.tex" % (figure_prefix, i)
            )
            write_file(render_alignment_table(a, b_alignment, alignment_color_inv),
                       "%s-%d-b-to-a.tex" % (figure_prefix, i)
            )
            write_file(render_alignment_table(a_alignment, b_alignment),
                       "%s-%d-combined.tex" % (figure_prefix, i)
            )
            # render_alignment_table(b_alignment, a_alignment)
            # render_alignment_table(a_alignment, b_alignment, intersection_align_p)
            # render_alignment_table(a_alignment, b_alignment, union_align_p)
	#!/usr/bin/env python3

	# LaTeX preamble required by documents that include the generated tables:
	# `rotating` supplies the `turn` environment, `colortbl` supplies
	# \cellcolor, `hhline` supplies \hhline, and `lightgray` is the fill
	# colour used for one-directional alignments.
	# NOTE(review): the variable name is misspelled ("preamle"); it is left
	# unchanged here.
	preamle = r"""
	\usepackage{rotating} %% rotated column labels
	\usepackage{colortbl} %% Colored cells
	\usepackage{hhline} %% Borders that play nice with colortbl
	\definecolor{lightgray}{gray}{0.8}
	"""


	# # Sentence pair (1)
	# il s' agit de la même société qui a changé de propriétaires
	# NULL ({ }) UNK ({ }) UNK ({ }) ( ({ }) this ({ 4 11 }) is ({ }) the ({ }) same ({ 6 }) agency ({ }) which ({ 8 }) has ({ }) undergone ({ 1 2 3 7 9 10 12 }) a ({ }) change ({ 5 }) of ({ }) UNK ({ })
	# # Sentence pair (2)
	# UNK UNK , le propriétaire , dit que cela s' est produit si rapidement qu' il n' en connaît pas la cause exacte
	# NULL ({ 4 }) UNK ({ 1 2 }) UNK ({ }) , ({ 3 }) the ({ }) owner ({ 5 22 23 }) , ({ 6 }) says ({ 7 8 }) it ({ }) happened ({ 10 11 12 }) so ({ 13 }) fast ({ 14 19 }) he ({ 16 }) is ({ }) not ({ 20 }) sure ({ 15 17 }) what ({ }) went ({ 18 21 }) wrong ({ 9 })
	#
	# The alignment file is represented by three lines for each sentence
	# pair. The first line is a label that can be used, e.g., as a caption
	# for alignment visualization tools. It contains information about the
	# sentence sequential number in the training corpus, sentence lengths,
	# and alignment probability. The second line is the target sentence, the
	# third line is the source sentence. Each token in the source sentence
	# is followed by a set of zero or more numbers. These numbers represent
	# the positions of the target words to which this source word is
	# connected, according to the alignment.

	import re

	def parse_alignment_spec(source):
	    """Parse the source line of a GIZA++ alignment (*.A.n) record.

	    The line is a sequence of ``word ({ i j ... })`` groups, for example
	    ``NULL ({ }) this ({ 4 11 }) is ({ })``.

	    Args:
	        source: The source-sentence line of a sentence-pair record.

	    Returns:
	        A list of ``(word, positions)`` tuples in sentence order.
	        ``positions`` holds the 0-based indices of the target words that
	        ``word`` is the source of.  Text that does not match the group
	        pattern is ignored.
	    """
	    # NB: The spec uses 1-based indexing, so every position is shifted
	    # down by one to index directly into the target word list.
	    return [
	        (m.group(1), [int(pos) - 1 for pos in m.group(2).split()])
	        for m in re.finditer(r"(\S+) \(\{ ([0-9 ]*)\}\)", source)
	    ]

	def parse_sentence(string):
	    """Split a plain sentence into the dummy alignment representation.

	    Every whitespace-separated word becomes a ``(word, [])`` tuple, i.e.
	    the same shape parse_alignment_spec() produces but with no alignment
	    links.  Each tuple gets its own fresh, empty list.
	    """
	    return [(token, []) for token in string.strip().split()]

	def parse_alignment_file(A, limit=None):
	    """Iterate over the sentence pairs of a GIZA++ alignment file.

	    Each record consists of three lines: a label line, the target
	    sentence and the source sentence annotated with alignment sets.

	    Args:
	        A: A text stream providing ``readline()``.
	        limit: Maximum number of records to yield; ``None`` means no limit.

	    Yields:
	        ``(alignment, target)`` tuples, where ``alignment`` is the result
	        of parse_alignment_spec() on the source line and ``target`` is the
	        result of parse_sentence() on the target line.
	    """
	    count = 0
	    while limit is None or count < limit:
	        label = A.readline()
	        if label == "":
	            # readline() returns the empty string only at end of file.
	            return
	        if not label.strip():
	            # A blank line where a label is expected (e.g. a trailing
	            # newline) is not a record; skipping it avoids yielding an
	            # empty sentence pair.
	            continue
	        target = A.readline().strip()
	        source = A.readline().strip()
	        yield parse_alignment_spec(source), parse_sentence(target)
	        count += 1

	def source_align_p(source, target, i, j):
	    """Return True when source word ``i`` is aligned to target position ``j``.

	    Only the alignment stored on the source side is consulted; ``target``
	    is accepted so the signature matches the other alignment predicates.
	    """
	    _word, linked_positions = source[i]
	    return j in linked_positions

	def intersection_align_p(source, target, i, j):
	    """Return True when the link (i, j) is present in both directions.

	    That is, source word ``i`` lists target position ``j`` and target word
	    ``j`` lists source position ``i``.
	    """
	    _, forward = source[i]
	    _, backward = target[j]
	    return j in forward and i in backward

	def union_align_p(source, target, i, j):
	    """Return True when the link (i, j) is present in at least one direction.

	    That is, source word ``i`` lists target position ``j`` or target word
	    ``j`` lists source position ``i``.
	    """
	    _, forward = source[i]
	    _, backward = target[j]
	    return j in forward or i in backward

	def align_type(source, target, i, j):
	    """Classify the link between source word ``i`` and target word ``j``.

	    Returns:
	        ``"common"`` when both sides list each other, ``"aligned"`` when
	        only the source side lists ``j``, ``"inv_aligned"`` when only the
	        target side lists ``i``, and ``"unaligned"`` otherwise.
	    """
	    _, forward = source[i]
	    _, backward = target[j]
	    aligned = j in forward
	    inv_aligned = i in backward
	    if aligned and inv_aligned:
	        return "common"
	    if aligned:
	        return "aligned"
	    if inv_aligned:
	        return "inv_aligned"
	    return "unaligned"

	def alignment_color(*args):
	    """Map a link to the cell colour used when rendering it.

	    The arguments are forwarded unchanged to align_type().  Links present
	    in both directions are ``"black"``, links present in one direction
	    only are ``"lightgray"``, and missing links give ``None`` (no colour).
	    """
	    color_map = {
	        "common": "black",
	        "aligned": "lightgray",
	        "inv_aligned": "lightgray",
	        "unaligned": None,
	    }
	    return color_map[align_type(*args)]

	def alignment_color_inv(source, target, i, j):
	    """Return alignment_color() for the link with the two sides swapped.

	    Useful when the alignment data is stored on the sentence passed as
	    ``target`` rather than on ``source``.
	    """
	    return alignment_color(target, source, j, i)


	#\begin{tabular}{c\|c\|c\|c\|c\|}
	#\multicolumn{1}{c}{\begin{turn}{90} \end{turn}} &
	#\multicolumn{1}{c}{\begin{turn}{90} hund \end{turn}} &
	#\multicolumn{1}{c}{\begin{turn}{90} bet \end{turn}} &
	#\multicolumn{1}{c}{\begin{turn}{90} hund \end{turn}} &
	#\multicolumn{1}{c}{\begin{turn}{90} \end{turn}}
	#\tabularnewline
	#\cline{2-5}
	#dog & \cellcolor{black} & & &
	#\tabularnewline
	#\cline{2-5}
	#bit & & & &
	#\tabularnewline
	#\cline{2-5}
	#dog & & & &
	#\tabularnewline
	#\cline{2-5}
	#\end{tabular}

	# NB: \cline and \cellcolor do not work well together, so the table rules are drawn with \hhline instead
	# Replacements for the characters LaTeX treats specially in running text.
	_LATEX_SPECIALS = str.maketrans({
	    "\\": r"\textbackslash{}",
	    "&": r"\&",
	    "%": r"\%",
	    "$": r"\$",
	    "#": r"\#",
	    "_": r"\_",
	    "{": r"\{",
	    "}": r"\}",
	    "~": r"\textasciitilde{}",
	    "^": r"\textasciicircum{}",
	})


	def _latex_escape(text):
	    """Return ``text`` with LaTeX special characters escaped (single pass)."""
	    return text.translate(_LATEX_SPECIALS)


	def render_alignment_table(source, target, cell_color_fn=alignment_color):
	    """Render an alignment matrix as a LaTeX ``tabular`` environment.

	    Rows are the words of ``source`` and columns are the words of
	    ``target``; the column labels are rotated by 90 degrees.  Words are
	    escaped so that tokens such as ``%``, ``&`` or ``_`` cannot break the
	    generated table.

	    Args:
	        source: List of ``(word, positions)`` tuples for the row sentence.
	        target: List of ``(word, positions)`` tuples for the column sentence.
	        cell_color_fn: Callable ``(source, target, i, j)`` returning a
	            LaTeX colour name for cell (row ``i``, column ``j``), or a
	            falsy value for an uncoloured cell.

	    Returns:
	        The LaTeX source of the table as a single string.
	    """
	    column_count = len(target)  # label column excluded
	    # Horizontal rules are drawn with \hhline rather than \cline, because
	    # \cline and \cellcolor do not work well together.  The column
	    # separators must be a plain "|" (not "\|") in the column spec.
	    hline = r"\hhline{~|%s}" % ("-|" * column_count)
	    row_sep = r"\tabularnewline " + hline

	    out = [r"\begin{tabular}{r|%s}" % ("m{2ex}|" * column_count)]

	    # Header: an empty corner cell followed by one rotated label per
	    # target word.  \multicolumn turns off the vertical borders here.
	    header = [
	        r"\multicolumn{1}{c}{\begin{turn}{90} %s \end{turn}}" % label
	        for label in [""] + [_latex_escape(w) for w, _ in target]
	    ]
	    out.append(" & \n".join(header))
	    out.append(row_sep)

	    for i, (word, _) in enumerate(source):
	        cells = [_latex_escape(word)]
	        for j in range(column_count):
	            color = cell_color_fn(source, target, i, j)
	            cells.append(r"\cellcolor{%s}" % color if color else "")
	        out.append(" & ".join(cells))
	        out.append(row_sep)

	    out.append(r"\end{tabular}")
	    return "\n".join(out)

	def pp_alignment_table(source, target, align_p=source_align_p):
	    """Print an alignment matrix as plain text.

	    Target words are written vertically, bottom-aligned, above the matrix;
	    each source word labels one row and ``#`` marks an aligned cell::

	              h
	              u
	            e n
	            r d
	         is #
	        dog   #

	    Args:
	        source: List of ``(word, positions)`` tuples for the row sentence.
	        target: List of ``(word, positions)`` tuples for the column sentence.
	        align_p: Predicate ``(source, target, i, j)`` deciding whether
	            cell (row ``i``, column ``j``) is marked.
	    """
	    target_words = [w for w, _ in target]
	    label_padding = " " * 10
	    # default=0 keeps an empty target sentence from raising ValueError.
	    header_height = max((len(w) for w in target_words), default=0)

	    # Build the header top to bottom.  ``depth`` counts characters from
	    # the end of each word, so shorter words get blank cells at the top.
	    header_lines = []
	    for depth in range(header_height, 0, -1):
	        chars = [w[-depth] if depth <= len(w) else " " for w in target_words]
	        header_lines.append(label_padding + "".join(" " + c for c in chars))

	    print("\n".join(header_lines))

	    row_format = "{:>10} " + ("{:s} " * len(target))
	    for i, (word, _) in enumerate(source):
	        marks = ["#" if align_p(source, target, i, j) else " "
	                 for j in range(len(target))]
	        print(row_format.format(word, *marks))


	import sys

	if __name__ == '__main__':
	    # Usage:
	    #   script < ALIGNMENT_FILE                  print one table per pair
	    #   script A_TO_B_FILE B_TO_A_FILE PREFIX    write three .tex files per pair
	    def write_file(content, filename):
	        """Write ``content`` to ``filename``, replacing any existing file."""
	        with open(filename, "w") as out:
	            out.write(content)

	    if len(sys.argv) < 3:
	        # Fewer than two file arguments: read a single alignment file from
	        # stdin and print the tables, with a line break after every second.
	        for i, (a, b) in enumerate(parse_alignment_file(sys.stdin)):
	            print(render_alignment_table(a, b))
	            if i % 2 == 1:
	                print(r"\newline")
	        sys.exit(0)

	    if len(sys.argv) < 4:
	        # Two files but no output prefix: sys.argv[3] below would otherwise
	        # fail with an IndexError.
	        sys.exit("usage: %s [A_TO_B_FILE B_TO_A_FILE FIGURE_PREFIX]"
	                 % sys.argv[0])

	    figure_prefix = sys.argv[3]
	    with open(sys.argv[1]) as AtoB, open(sys.argv[2]) as BtoA:
	        # zip() stops at the shorter of the two files.
	        for i, (ab, ba) in enumerate(zip(
	                parse_alignment_file(AtoB),
	                parse_alignment_file(BtoA))):
	            # Skip NULL tokens. This also makes the alignment indices correct
	            # (the indices are into the target sentence which doesn't include NULL)
	            a_alignment = ab[0][1:]
	            b = ab[1]
	            b_alignment = ba[0][1:]
	            a = ba[1]
	            write_file(render_alignment_table(a_alignment, b),
	                       "%s-%d-a-to-b.tex" % (figure_prefix, i)
	            )
	            write_file(render_alignment_table(a, b_alignment, alignment_color_inv),
	                       "%s-%d-b-to-a.tex" % (figure_prefix, i)
	            )
	            write_file(render_alignment_table(a_alignment, b_alignment),
	                       "%s-%d-combined.tex" % (figure_prefix, i)
	            )
	            # render_alignment_table(b_alignment, a_alignment)
	            # render_alignment_table(a_alignment, b_alignment, intersection_align_p)
	            # render_alignment_table(a_alignment, b_alignment, union_align_p)