# benui-dev/IBM_Model_1.pl

## IBM_Model_1.pl
use strict;
use warnings;

use Data::Dumper;


# Training corpus: a list of sentence pairs that are assumed to be
# sentence-aligned already.  Each pair is [ \@english, \@german ].
my $sentence_pairs = [
    [ [ qw( the book  ) ], [ qw( das buch ) ], ],
#    [ [ qw( the house ) ], [ qw( das haus ) ], ],
#    [ [ qw( a book    ) ], [ qw( ein buch ) ], ],
#    [ [ qw( a house   ) ], [ qw( ein haus ) ], ],
#    [ [ qw( a the book house  ) ], [ qw( ein das buch haus ) ], ],
];

# Vocabularies, generated from the corpus in first-seen order so they can
# never drift out of step with the sentence pairs enabled above.
my ( @english_words, @german_words );
{
    # The %seen hashes are only needed while collecting, so keep them scoped.
    my ( %seen_english, %seen_german );
    foreach my $sentence_pair (@$sentence_pairs) {
        my ( $english, $german ) = @$sentence_pair;
        push @english_words, grep { !$seen_english{$_}++ } @$english;
        push @german_words,  grep { !$seen_german{$_}++ } @$german;
    }
}

# Set up t uniformly
# t(e|f): translation probability of English word e given German word f,
# stored as $t->{$german}->{$english}.  Every German word starts with the
# same probability 1/|E| for each English word, so each row sums to 1
# whatever the vocabulary size (the old fixed 0.25 did not).
my $t;
foreach my $e (@english_words) {
    foreach my $g (@german_words) {
        $t->{$g}->{$e} = 1 / scalar @english_words;
    }
}


# Should loop through until Expectation converges with Maximisation
# Can use perplexity to work out when we need to stop
# Each pass is one EM iteration: the E-step collects expected (fractional)
# counts under the current t(e|f), then the M-step re-estimates t(e|f) from
# those counts.  The number of passes is fixed; no convergence test is made.
foreach my $iteration ( 1 .. 10 ) {

    # Initialise counts to zero
    # count(e|f), stored as $count->{$german}->{$english}
    my $count;
    foreach my $g (@german_words) {
        foreach my $e (@english_words) {
            $count->{$g}->{$e} = 0;
        }
    }

    # Initialise totals to zero
    # total(f)
    my %gtotal;
    foreach my $g (@german_words) {
        $gtotal{$g} = 0;
    }

    # for all sentence pairs
    foreach my $sentence_pair (@$sentence_pairs) {
        my ($english, $german) = @$sentence_pair;

        print "Training '" . join(' ', @$english) . "' and '" . join(' ', @$german) . "'\n";

        # Normalisation term for each English word: the sum of t(e|f)
        # over every German word in this sentence.
        my %etotal;
        foreach my $e (@$english) {
            $etotal{$e} = 0;
            foreach my $g (@$german) {
                $etotal{$e} += $t->{$g}->{$e};
            }
        }
        print "English total: " . Dumper(\%etotal);

        # Collect fractional counts: each (e, f) pair receives its share
        # t(e|f) / etotal(e) of the English word's probability mass.
        foreach my $e (@$english) {
            foreach my $g (@$german) {
                print $count->{$g}->{$e} . " += " . $t->{$g}->{$e} . " / " . $etotal{$e} . "\n";
                my $fraction = $t->{$g}->{$e} / $etotal{$e};
                $count->{$g}->{$e} += $fraction;
                $gtotal{$g}        += $fraction;
            }
        }
        print "Count: " . Dumper($count);
        print "gTotal: " . Dumper(\%gtotal);
    }

    # Estimate probabilities
    # Based on the counts we've seen, replace the probabilities:
    # t(e|f) = count(e|f) / total(f).  This must be an assignment, not an
    # accumulation, otherwise t grows on every pass and stops being a
    # probability.
    foreach my $g (@german_words) {

        # A German word that never occurred in the corpus collected no
        # counts; keep its previous estimate rather than dividing by zero.
        next if $gtotal{$g} == 0;

        foreach my $e (@english_words) {
            $t->{$g}->{$e} = $count->{$g}->{$e} / $gtotal{$g};
        }
    }

    print Dumper($t);

    # Pause between iterations; the line read is deliberately discarded.
    print "Hit Enter to continue...";
    my $meh = <STDIN>;

}

# Done. The loop above runs a fixed 10 iterations; convergence is assumed, not checked.
	use strict;
	use warnings;

	use Data::Dumper;


	# Training corpus: a list of sentence pairs that are assumed to be
	# sentence-aligned already.  Each pair is [ \@english, \@german ].
	my $sentence_pairs = [
	    [ [ qw( the book ) ], [ qw( das buch ) ], ],
	#    [ [ qw( the house ) ], [ qw( das haus ) ], ],
	#    [ [ qw( a book ) ], [ qw( ein buch ) ], ],
	#    [ [ qw( a house ) ], [ qw( ein haus ) ], ],
	#    [ [ qw( a the book house ) ], [ qw( ein das buch haus ) ], ],
	];

	# Vocabularies, generated from the corpus in first-seen order so they can
	# never drift out of step with the sentence pairs enabled above.
	my ( @english_words, @german_words );
	{
	    # The %seen hashes are only needed while collecting, so keep them scoped.
	    my ( %seen_english, %seen_german );
	    foreach my $sentence_pair (@$sentence_pairs) {
	        my ( $english, $german ) = @$sentence_pair;
	        push @english_words, grep { !$seen_english{$_}++ } @$english;
	        push @german_words,  grep { !$seen_german{$_}++ } @$german;
	    }
	}

	# Set up t uniformly
	# t(e|f): translation probability of English word e given German word f,
	# stored as $t->{$german}->{$english}.  Every German word starts with the
	# same probability 1/|E| for each English word, so each row sums to 1
	# whatever the vocabulary size (the old fixed 0.25 did not).
	my $t;
	foreach my $e (@english_words) {
	    foreach my $g (@german_words) {
	        $t->{$g}->{$e} = 1 / scalar @english_words;
	    }
	}


	# Should loop through until Expectation converges with Maximisation
	# Can use perplexity to work out when we need to stop
	# Each pass is one EM iteration: the E-step collects expected (fractional)
	# counts under the current t(e|f), then the M-step re-estimates t(e|f) from
	# those counts.  The number of passes is fixed; no convergence test is made.
	foreach my $iteration ( 1 .. 10 ) {

	    # Initialise counts to zero
	    # count(e|f), stored as $count->{$german}->{$english}
	    my $count;
	    foreach my $g (@german_words) {
	        foreach my $e (@english_words) {
	            $count->{$g}->{$e} = 0;
	        }
	    }

	    # Initialise totals to zero
	    # total(f)
	    my %gtotal;
	    foreach my $g (@german_words) {
	        $gtotal{$g} = 0;
	    }

	    # for all sentence pairs
	    foreach my $sentence_pair (@$sentence_pairs) {
	        my ($english, $german) = @$sentence_pair;

	        print "Training '" . join(' ', @$english) . "' and '" . join(' ', @$german) . "'\n";

	        # Normalisation term for each English word: the sum of t(e|f)
	        # over every German word in this sentence.
	        my %etotal;
	        foreach my $e (@$english) {
	            $etotal{$e} = 0;
	            foreach my $g (@$german) {
	                $etotal{$e} += $t->{$g}->{$e};
	            }
	        }
	        print "English total: " . Dumper(\%etotal);

	        # Collect fractional counts: each (e, f) pair receives its share
	        # t(e|f) / etotal(e) of the English word's probability mass.
	        foreach my $e (@$english) {
	            foreach my $g (@$german) {
	                print $count->{$g}->{$e} . " += " . $t->{$g}->{$e} . " / " . $etotal{$e} . "\n";
	                my $fraction = $t->{$g}->{$e} / $etotal{$e};
	                $count->{$g}->{$e} += $fraction;
	                $gtotal{$g}        += $fraction;
	            }
	        }
	        print "Count: " . Dumper($count);
	        print "gTotal: " . Dumper(\%gtotal);
	    }

	    # Estimate probabilities
	    # Based on the counts we've seen, replace the probabilities:
	    # t(e|f) = count(e|f) / total(f).  This must be an assignment, not an
	    # accumulation, otherwise t grows on every pass and stops being a
	    # probability.
	    foreach my $g (@german_words) {

	        # A German word that never occurred in the corpus collected no
	        # counts; keep its previous estimate rather than dividing by zero.
	        next if $gtotal{$g} == 0;

	        foreach my $e (@english_words) {
	            $t->{$g}->{$e} = $count->{$g}->{$e} / $gtotal{$g};
	        }
	    }

	    print Dumper($t);

	    # Pause between iterations; the line read is deliberately discarded.
	    print "Hit Enter to continue...";
	    my $meh = <STDIN>;

	}

	# Done. The loop above runs a fixed 10 iterations; convergence is assumed, not checked.