neubig/counterrors.pl

## counterrors.pl
#!/usr/bin/perl

use strict;
use utf8;
use List::Util qw(max min);
binmode STDIN, ":utf8";
binmode STDOUT, ":utf8";

if(@ARGV != 2) {
    print STDERR "Usage: counterrors.pl REFERENCE SYSTEM\n";
    exit;
}

# return a minimum edit distance path with the following notation, plus cost
#  d=delete i=insert s=substitute e=equal
sub levenshtein {
    my ($s, $t) = @_;
    my (@sc, $m, @tc, $n, %d, %str, $i, $j, $id, $cost, $type, $aid, $bid, $cid, $c);
    @sc = split(/ /, $s);
    $m = @sc;
    @tc = split(/ /, $t);
    $n = @tc;
    # initialize
    %d = (); %str = ();
    foreach $i (0 .. $m) { $id = pack('S2', $i, 0); $d{$id} = $i; $str{$id} = 'd'x$i; }
    foreach $j (1 .. $n) { $id = pack('S2', 0, $j); $d{$id} = $j; $str{$id} = 'i'x$j; }

    foreach $i (1 .. $m) {
        foreach $j (1 .. $n) {
            if($sc[$i-1] eq $tc[$j-1]) {
                $cost = 0; $type = 'e'; # equal
            } else {
                $cost = 1.1; $type = 's'; # substitution
            }

            $aid = pack('S2', $i-1, $j); $a = $d{$aid} + 1; # deletion
            $bid = pack('S2', $i, $j-1); $b = $d{$bid} + 1; # insertion
            $cid = pack('S2', $i-1, $j-1); $c = $d{$cid} + $cost; # insertion

            $id = pack('S2', $i, $j);

            # we want matches to come at the end, so do deletions/insertions first
            if($a <= $b and $a <= $c) {
                $d{$id} = $a;
                $type = 'd';
                $str{$id} = $str{$aid}.'d';
            }
            elsif($b <= $c) {
                $d{$id} = $b;
                $type = 'i';
                $str{$id} = $str{$bid}.'i';
            }
            else {
                $d{$id} = $c;
                $str{$id} = $str{$cid}.$type;
            }

            delete $d{$cid};
            delete $str{$cid};
            # print "".$sc[$i-1]." ".$tc[$j-1]." $i $j $a $b $c $d[$id] $type\n"
        }
    }

    $id = pack('S2', $m, $n);
    return ($str{$id}, $d{$id});
}

# find which of the inputs are in error
open REF, "<:utf8", $ARGV[0] or die $!;
open TEST,    "<:utf8", $ARGV[1] or die $!;
my ($ref, $test, @refs, @tests, %errs, @hists);
while($ref = <REF> and $test = <TEST>) {
    chomp $ref; chomp $test;
    my ($hist, $score) = levenshtein($ref, $test);
    @refs = split(/ /, $ref);
    @tests = split(/ /, $test);
    @hists = split(//, $hist);
    my (@rerr, @terr);
    while(@hists) {
        my $h = shift(@hists);
        if($h eq 'e') {
            if(@rerr+@terr) {
                my $err = (@rerr?join(' ',@rerr):"NULL")."\t".(@terr?join(' ',@terr):"NULL");
                $errs{$err}++;
                @rerr = ();
                @terr = ();
            }
            shift @refs; shift @tests;
        } else {
            push @rerr, shift(@refs) if $h ne 'i';
            push @terr, shift(@tests) if $h ne 'd';
        }
    }
    if(@rerr+@terr) {
        my $err = (@rerr?join(' ',@rerr):"NULL")."\t".(@terr?join(' ',@terr):"NULL");
        $errs{$err}++;
    }
}
close REF;
close TEST;

for(sort { $errs{$b} <=> $errs{$a} } keys %errs) {
    print "$_\t$errs{$_}\n";
}
	#!/usr/bin/perl

	use strict;
	use utf8;
	use List::Util qw(max min);
	binmode STDIN, ":utf8";
	binmode STDOUT, ":utf8";

	if(@ARGV != 2) {
	print STDERR "Usage: counterrors.pl REFERENCE SYSTEM\n";
	exit;
	}

	# return a minimum edit distance path with the following notation, plus cost
	# d=delete i=insert s=substitute e=equal
	sub levenshtein {
	my ($s, $t) = @_;
	my (@sc, $m, @tc, $n, %d, %str, $i, $j, $id, $cost, $type, $aid, $bid, $cid, $c);
	@sc = split(/ /, $s);
	$m = @sc;
	@tc = split(/ /, $t);
	$n = @tc;
	# initialize
	%d = (); %str = ();
	foreach $i (0 .. $m) { $id = pack('S2', $i, 0); $d{$id} = $i; $str{$id} = 'd'x$i; }
	foreach $j (1 .. $n) { $id = pack('S2', 0, $j); $d{$id} = $j; $str{$id} = 'i'x$j; }

	foreach $i (1 .. $m) {
	foreach $j (1 .. $n) {
	if($sc[$i-1] eq $tc[$j-1]) {
	$cost = 0; $type = 'e'; # equal
	} else {
	$cost = 1.1; $type = 's'; # substitution
	}

	$aid = pack('S2', $i-1, $j); $a = $d{$aid} + 1; # deletion
	$bid = pack('S2', $i, $j-1); $b = $d{$bid} + 1; # insertion
	$cid = pack('S2', $i-1, $j-1); $c = $d{$cid} + $cost; # insertion

	$id = pack('S2', $i, $j);

	# we want matches to come at the end, so do deletions/insertions first
	if($a <= $b and $a <= $c) {
	$d{$id} = $a;
	$type = 'd';
	$str{$id} = $str{$aid}.'d';
	}
	elsif($b <= $c) {
	$d{$id} = $b;
	$type = 'i';
	$str{$id} = $str{$bid}.'i';
	}
	else {
	$d{$id} = $c;
	$str{$id} = $str{$cid}.$type;
	}

	delete $d{$cid};
	delete $str{$cid};
	# print "".$sc[$i-1]." ".$tc[$j-1]." $i $j $a $b $c $d[$id] $type\n"
	}
	}

	$id = pack('S2', $m, $n);
	return ($str{$id}, $d{$id});
	}

	# find which of the inputs are in error
	open REF, "<:utf8", $ARGV[0] or die $!;
	open TEST, "<:utf8", $ARGV[1] or die $!;
	my ($ref, $test, @refs, @tests, %errs, @hists);
	while($ref = <REF> and $test = <TEST>) {
	chomp $ref; chomp $test;
	my ($hist, $score) = levenshtein($ref, $test);
	@refs = split(/ /, $ref);
	@tests = split(/ /, $test);
	@hists = split(//, $hist);
	my (@rerr, @terr);
	while(@hists) {
	my $h = shift(@hists);
	if($h eq 'e') {
	if(@rerr+@terr) {
	my $err = (@rerr?join(' ',@rerr):"NULL")."\t".(@terr?join(' ',@terr):"NULL");
	$errs{$err}++;
	@rerr = ();
	@terr = ();
	}
	shift @refs; shift @tests;
	} else {
	push @rerr, shift(@refs) if $h ne 'i';
	push @terr, shift(@tests) if $h ne 'd';
	}
	}
	if(@rerr+@terr) {
	my $err = (@rerr?join(' ',@rerr):"NULL")."\t".(@terr?join(' ',@terr):"NULL");
	$errs{$err}++;
	}
	}
	close REF;
	close TEST;

	for(sort { $errs{$b} <=> $errs{$a} } keys %errs) {
	print "$_\t$errs{$_}\n";
	}