Skip to content

Instantly share code, notes, and snippets.

@benui-dev
Created August 15, 2010 01:18
Show Gist options
  • Save benui-dev/524931 to your computer and use it in GitHub Desktop.
Save benui-dev/524931 to your computer and use it in GitHub Desktop.
use strict;
use warnings;
use Data::Dumper;
# IBM Model 1 word-translation training with Expectation-Maximisation (EM).
#
# Given sentence-aligned English/German pairs, iteratively estimates
# t(e|f): the probability that German word f translates to English word e.
# The table is stored as $t->{$german}{$english}.

# Assume that we've got sentence-aligned stuff
my $sentence_pairs = [
    [ [ qw( the book ) ], [ qw( das buch ) ], ],
#   [ [ qw( the house ) ], [ qw( das haus ) ], ],
#   [ [ qw( a book ) ], [ qw( ein buch ) ], ],
#   [ [ qw( a house ) ], [ qw( ein haus ) ], ],
#   [ [ qw( a the book house ) ], [ qw( ein das buch haus ) ], ],
];

# Derive the vocabularies from the corpus so they can never drift out of
# sync with whichever sentence pairs are enabled above.
my ($english_vocab, $german_vocab) = collect_vocabularies($sentence_pairs);

# Set up t uniformly
# t(e|f)
my $t = uniform_translation_table($english_vocab, $german_vocab);

# Should loop through until Expectation converges with Maximisation
# Can use perplexity to work out when we need to stop
foreach my $iteration ( 1 .. 10 ) {
    $t = em_iteration($t, $sentence_pairs, $english_vocab, $german_vocab, 1);
    print Dumper($t);

    # Only pause between iterations when somebody is there to press Enter;
    # otherwise (piped or redirected input) just run straight through.
    if ( -t STDIN ) {
        print "Hit Enter to continue...";
        my $meh = <STDIN>;
    }
}
# Fixed number of iterations done; convergence is not actually checked.

# Collect the distinct words on each side of the corpus.
#
# Takes a reference to a list of [ \@english, \@german ] sentence pairs.
# Returns two array references (English words, German words), each holding
# the distinct words in order of first appearance.
sub collect_vocabularies {
    my ($pairs) = @_;

    my ( @english_words, @german_words );
    my ( %seen_english,  %seen_german );
    foreach my $pair (@$pairs) {
        my ( $english, $german ) = @$pair;
        push @english_words, grep { !$seen_english{$_}++ } @$english;
        push @german_words,  grep { !$seen_german{$_}++ } @$german;
    }
    return ( \@english_words, \@german_words );
}

# Build the initial translation table with every t(e|f) set to the same
# value, 1 / (number of English words), so each German word's distribution
# over English words sums to one.
#
# Returns a hash reference keyed as {$german}{$english}; it is empty when
# there are no English words.
sub uniform_translation_table {
    my ( $english_words, $german_words ) = @_;

    my %table;
    return \%table unless @$english_words;

    my $uniform = 1 / scalar @$english_words;
    foreach my $g (@$german_words) {
        foreach my $e (@$english_words) {
            $table{$g}{$e} = $uniform;
        }
    }
    return \%table;
}

# Run one EM iteration and return the re-estimated translation table.
#
#   $prev_t        - current table, hash ref keyed {$german}{$english}
#   $pairs         - array ref of [ \@english, \@german ] sentence pairs
#   $english_words - array ref of the English vocabulary
#   $german_words  - array ref of the German vocabulary
#   $verbose       - optional; when true, prints the intermediate counts
#
# Returns a new hash reference; $prev_t is not modified.
sub em_iteration {
    my ( $prev_t, $pairs, $english_words, $german_words, $verbose ) = @_;

    # Initialise counts to zero: count(e|f) and total(f)
    my ( %count, %gtotal );
    foreach my $g (@$german_words) {
        $gtotal{$g} = 0;
        $count{$g}{$_} = 0 for @$english_words;
    }

    # Expectation: for all sentence pairs, collect fractional counts
    foreach my $pair (@$pairs) {
        my ( $english, $german ) = @$pair;
        print "Training '" . join(' ', @$english) . "' and '" . join(' ', @$german) . "'\n"
            if $verbose;

        # Calculate total probability for each English word
        # based on t(e|f); this is the per-word normalisation factor.
        my %etotal;
        foreach my $e (@$english) {
            $etotal{$e} = 0;
            $etotal{$e} += $prev_t->{$_}{$e} for @$german;
        }
        print "English total: " . Dumper(\%etotal) if $verbose;

        foreach my $e (@$english) {
            # No probability mass means nothing to distribute (and would
            # otherwise be a division by zero).
            next unless $etotal{$e};
            foreach my $g (@$german) {
                my $fraction = $prev_t->{$g}{$e} / $etotal{$e};
                print $count{$g}{$e} . " += " . $prev_t->{$g}{$e} . " / " . $etotal{$e} . "\n"
                    if $verbose;
                $count{$g}{$e} += $fraction;
                $gtotal{$g}    += $fraction;
            }
        }
        if ($verbose) {
            print "Count: " . Dumper(\%count);
            print "gTotal: " . Dumper(\%gtotal);
        }
    }

    # Maximisation: based on the counts we've seen, set the probabilities
    # t(e|f) = count(e|f) / total(f). This REPLACES the old estimate; adding
    # to it would make the values grow past 1 on every iteration.
    my %next_t;
    foreach my $g (@$german_words) {
        foreach my $e (@$english_words) {
            # A German word that never occurred keeps its previous estimate.
            $next_t{$g}{$e} = $gtotal{$g}
                ? $count{$g}{$e} / $gtotal{$g}
                : $prev_t->{$g}{$e};
        }
    }
    return \%next_t;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment