jimregan/expected-output.xml

## expected-output.xml
    <rule id="NA_WZAJEM" name="„na wzajem” (nawzajem)">
      <pattern>
        <token>na</token>
        <token>wzajem</token>
      </pattern>
      <message>Ten wyraz zwykle pisze się łącznie: <suggestion>\1\2</suggestion>.</message>
      <short>Prawdopodobna literówka</short>
      <example correction="nawzajem" type="incorrect">Oni kochają się <marker>na wzajem</marker>.</example>
      <example type="correct">Oni kochają się nawzajem.</example>
    </rule>

## joined.pl
#!/usr/bin/perl

# Generates XML <rule> elements for words that are wrongly written as
# several space-separated parts (for example "na wzajem" for "nawzajem").
#
# Input: one entry per line with two TAB-separated fields:
#     incorrect form (must contain a space) <TAB> example sentence
# The example sentence must contain either the incorrect form or the joined
# (correct) form; a full stop is appended to it on output.
#
# Options:
#     --lang=CODE  language of the message texts: 'pl' (default) or 'en'
#     --in=FILE    read entries from FILE instead of standard input
#     --out=FILE   write rules to FILE instead of standard output
#     --enc=NAME   encoding of the input (default UTF-8); output is UTF-8
#
# Problems with individual entries are reported on standard error and the
# entry is skipped, so the XML output never contains a half-written rule.

use warnings;
use strict;
use utf8;

use Getopt::Long;
#uncomment to use unidecode
#use Text::Unidecode;

# Text placed before the <suggestion> in each rule's <message>, per language.
my %message = (
	'pl' => 'Ten wyraz zwykle pisze się łącznie:',
	'en' => 'Did you mean',
);

# Text of each rule's <short> element, per language.
my %short = (
	'pl' => 'Prawdopodobna literówka',
	'en' => 'Possible typo',
);

# Run as a script only; this lets the file be loaded (e.g. by a test)
# without reading any input.
main() unless caller;

# Parses the command line, then converts every input line into a rule.
sub main {
	my $lang     = 'pl';
	my $input    = '';
	my $output   = '';
	my $encoding = 'UTF-8';

	GetOptions(
		'lang=s' => \$lang,
		'in=s'   => \$input,
		'out=s'  => \$output,
		'enc=s'  => \$encoding,
	) or die "Usage: $0 [--lang=pl|en] [--in=FILE] [--out=FILE] [--enc=ENCODING]\n";

	# Fail once, up front, instead of emitting empty messages for every rule.
	exists $message{$lang}
		or die "Unsupported language '$lang' (supported: "
			. join(', ', sort keys %message) . ")\n";

	my $fhin;
	if ($input ne '') {
		open($fhin, '<', $input)
			or die "Cannot open input file '$input': $!\n";
	}
	else {
		$fhin = \*STDIN;
	}

	my $fhout;
	if ($output ne '') {
		open($fhout, '>', $output)
			or die "Cannot open output file '$output': $!\n";
	}
	else {
		$fhout = \*STDOUT;
	}

	binmode($fhin, ":encoding($encoding)")
		or die "Cannot use input encoding '$encoding': $!\n";
	binmode($fhout, ":encoding(UTF-8)");
	# Error messages quote the input, which may contain non-ASCII text.
	binmode(STDERR, ":encoding(UTF-8)");

	LINE: while (my $line = <$fhin>) {
		chomp $line;
		$line =~ s/\r\z//;                  # tolerate CRLF line endings
		next LINE if $line =~ /\A\s*\z/;    # ignore blank lines

		my ($incorrect, $example) = split /\t/, $line;

		my ($rule, $error) = build_rule($incorrect, $example, $lang);
		if (defined $error) {
			print STDERR "Error: $error: $line\n";
			next LINE;
		}
		print {$fhout} $rule;
	}

	if ($output ne '') {
		# Buffered write errors only surface when the handle is closed.
		close($fhout) or die "Cannot write output file '$output': $!\n";
	}
	return;
}

# Builds the XML text of one rule.
#
# Parameters:
#     $incorrect - the incorrect form, parts separated by single spaces
#     $example   - example sentence (without final full stop) containing
#                  either the incorrect form or the joined form
#     $lang      - key into %message / %short (defaults to 'pl')
#
# Returns a two-element list: ($xml, undef) on success, or
# (undef, $reason) when no rule can be built from the entry.  Nothing is
# printed here, so a rejected entry leaves no partial output behind.
sub build_rule {
	my ($incorrect, $example, $lang) = @_;
	$lang = 'pl' unless defined $lang;

	return (undef, "unsupported language '$lang'")
		unless exists $message{$lang} && exists $short{$lang};

	# Empty parts (from doubled, leading or trailing spaces) would become
	# empty <token> elements, so they are dropped.
	my @parts = defined $incorrect
		? grep { length } split(/ /, $incorrect)
		: ();
	return (undef, 'no spaces in incorrect form') if @parts < 2;

	return (undef, 'missing example sentence')
		unless defined $example && length $example;

	# The correct form is the incorrect minus spaces
	my $correct = $incorrect;
	$correct =~ s/ //g;

	# Uncomment for unidecode
	# my $id = uc(unidecode($incorrect));

	# without unidecode, this should be ok for Polish
	# comment these two lines to use unidecode
	my $id = uc($incorrect);
	$id =~ tr/ĄĆĘŁŃÓŚŻŹ/ACELNOSZZ/;

	# Do this anyway
	$id =~ s/ /_/g;

	# Locate the phrase in the example.  \Q...\E makes the phrase match
	# literally, so characters such as "." or "(" in the input are not
	# treated as regex syntax.  The lazy prefix keeps the leftmost match.
	my ($example_incorrect, $example_correct);
	if ($example =~ /\A(.*?)(\Q$incorrect\E)(.*)\z/is) {
		my ($before, $found, $after) = ($1, $2, $3);
		$_ = _xml_escape($_) for ($before, $found, $after);
		$example_incorrect = "$before<marker>$found</marker>$after";
		$example_correct   = $before . _xml_escape($correct) . $after;
	}
	elsif ($example =~ /\A(.*?)\Q$correct\E(.*)\z/is) {
		my ($before, $after) = ($1, $2);
		$_ = _xml_escape($_) for ($before, $after);
		$example_incorrect = $before . '<marker>'
			. _xml_escape($incorrect) . '</marker>' . $after;
		$example_correct = _xml_escape($example);
	}
	else {
		return (undef,
			'example contains neither correct nor incorrect phrase');
	}

	my $xml_id        = _xml_escape($id);
	my $xml_incorrect = _xml_escape($incorrect);
	my $xml_correct   = _xml_escape($correct);

	# The suggestion joins all matched tokens: \1\2...\N for N parts.
	my $suggestion = join '', map { "\\$_" } 1 .. scalar @parts;

	my $xml = "    <rule id=\"$xml_id\" name=\"„$xml_incorrect” ($xml_correct)\">\n";
	$xml .= "      <pattern>\n";
	for my $part (@parts) {
		$xml .= '        <token>' . _xml_escape($part) . "</token>\n";
	}
	$xml .= "      </pattern>\n";
	$xml .= "      <message>$message{$lang} <suggestion>$suggestion</suggestion>.</message>\n";
	$xml .= "      <short>$short{$lang}</short>\n";
	$xml .= "      <example correction=\"$xml_correct\" type=\"incorrect\">${example_incorrect}.</example>\n";
	$xml .= "      <example type=\"correct\">${example_correct}.</example>\n";
	$xml .= "    </rule>\n";

	return ($xml, undef);
}

# Returns a copy of $text that is safe inside XML text and double-quoted
# attribute values.
sub _xml_escape {
	my ($text) = @_;
	$text =~ s/&/&amp;/g;
	$text =~ s/</&lt;/g;
	$text =~ s/>/&gt;/g;
	$text =~ s/"/&quot;/g;
	return $text;
}

## sample_input.tsv

          
na wzajem	Oni kochają się na wzajem
	<rule id="NA_WZAJEM" name="„na wzajem” (nawzajem)">
	<pattern>
	<token>na</token>
	<token>wzajem</token>
	</pattern>
	<message>Ten wyraz zwykle pisze się łącznie: <suggestion>\1\2</suggestion>.</message>
	<short>Prawdopodobna literówka</short>
	<example correction="nawzajem" type="incorrect">Oni kochają się <marker>na wzajem</marker>.</example>
	<example type="correct">Oni kochają się nawzajem.</example>
	</rule>
	#!/usr/bin/perl

	# Generates XML <rule> elements for words that are wrongly written as
	# several space-separated parts (for example "na wzajem" for "nawzajem").
	#
	# Input: one entry per line with two TAB-separated fields:
	#     incorrect form (must contain a space) <TAB> example sentence
	# The example sentence must contain either the incorrect form or the joined
	# (correct) form; a full stop is appended to it on output.
	#
	# Options:
	#     --lang=CODE  language of the message texts: 'pl' (default) or 'en'
	#     --in=FILE    read entries from FILE instead of standard input
	#     --out=FILE   write rules to FILE instead of standard output
	#     --enc=NAME   encoding of the input (default UTF-8); output is UTF-8
	#
	# Problems with individual entries are reported on standard error and the
	# entry is skipped, so the XML output never contains a half-written rule.

	use warnings;
	use strict;
	use utf8;

	use Getopt::Long;
	#uncomment to use unidecode
	#use Text::Unidecode;

	# Text placed before the <suggestion> in each rule's <message>, per language.
	my %message = (
		'pl' => 'Ten wyraz zwykle pisze się łącznie:',
		'en' => 'Did you mean',
	);

	# Text of each rule's <short> element, per language.
	my %short = (
		'pl' => 'Prawdopodobna literówka',
		'en' => 'Possible typo',
	);

	# Run as a script only; this lets the file be loaded (e.g. by a test)
	# without reading any input.
	main() unless caller;

	# Parses the command line, then converts every input line into a rule.
	sub main {
		my $lang     = 'pl';
		my $input    = '';
		my $output   = '';
		my $encoding = 'UTF-8';

		GetOptions(
			'lang=s' => \$lang,
			'in=s'   => \$input,
			'out=s'  => \$output,
			'enc=s'  => \$encoding,
		) or die "Usage: $0 [--lang=pl|en] [--in=FILE] [--out=FILE] [--enc=ENCODING]\n";

		# Fail once, up front, instead of emitting empty messages for every rule.
		exists $message{$lang}
			or die "Unsupported language '$lang' (supported: "
				. join(', ', sort keys %message) . ")\n";

		my $fhin;
		if ($input ne '') {
			open($fhin, '<', $input)
				or die "Cannot open input file '$input': $!\n";
		}
		else {
			$fhin = \*STDIN;
		}

		my $fhout;
		if ($output ne '') {
			open($fhout, '>', $output)
				or die "Cannot open output file '$output': $!\n";
		}
		else {
			$fhout = \*STDOUT;
		}

		binmode($fhin, ":encoding($encoding)")
			or die "Cannot use input encoding '$encoding': $!\n";
		binmode($fhout, ":encoding(UTF-8)");
		# Error messages quote the input, which may contain non-ASCII text.
		binmode(STDERR, ":encoding(UTF-8)");

		LINE: while (my $line = <$fhin>) {
			chomp $line;
			$line =~ s/\r\z//;                  # tolerate CRLF line endings
			next LINE if $line =~ /\A\s*\z/;    # ignore blank lines

			my ($incorrect, $example) = split /\t/, $line;

			my ($rule, $error) = build_rule($incorrect, $example, $lang);
			if (defined $error) {
				print STDERR "Error: $error: $line\n";
				next LINE;
			}
			print {$fhout} $rule;
		}

		if ($output ne '') {
			# Buffered write errors only surface when the handle is closed.
			close($fhout) or die "Cannot write output file '$output': $!\n";
		}
		return;
	}

	# Builds the XML text of one rule.
	#
	# Parameters:
	#     $incorrect - the incorrect form, parts separated by single spaces
	#     $example   - example sentence (without final full stop) containing
	#                  either the incorrect form or the joined form
	#     $lang      - key into %message / %short (defaults to 'pl')
	#
	# Returns a two-element list: ($xml, undef) on success, or
	# (undef, $reason) when no rule can be built from the entry.  Nothing is
	# printed here, so a rejected entry leaves no partial output behind.
	sub build_rule {
		my ($incorrect, $example, $lang) = @_;
		$lang = 'pl' unless defined $lang;

		return (undef, "unsupported language '$lang'")
			unless exists $message{$lang} && exists $short{$lang};

		# Empty parts (from doubled, leading or trailing spaces) would become
		# empty <token> elements, so they are dropped.
		my @parts = defined $incorrect
			? grep { length } split(/ /, $incorrect)
			: ();
		return (undef, 'no spaces in incorrect form') if @parts < 2;

		return (undef, 'missing example sentence')
			unless defined $example && length $example;

		# The correct form is the incorrect minus spaces
		my $correct = $incorrect;
		$correct =~ s/ //g;

		# Uncomment for unidecode
		# my $id = uc(unidecode($incorrect));

		# without unidecode, this should be ok for Polish
		# comment these two lines to use unidecode
		my $id = uc($incorrect);
		$id =~ tr/ĄĆĘŁŃÓŚŻŹ/ACELNOSZZ/;

		# Do this anyway
		$id =~ s/ /_/g;

		# Locate the phrase in the example.  \Q...\E makes the phrase match
		# literally, so characters such as "." or "(" in the input are not
		# treated as regex syntax.  The lazy prefix keeps the leftmost match.
		my ($example_incorrect, $example_correct);
		if ($example =~ /\A(.*?)(\Q$incorrect\E)(.*)\z/is) {
			my ($before, $found, $after) = ($1, $2, $3);
			$_ = _xml_escape($_) for ($before, $found, $after);
			$example_incorrect = "$before<marker>$found</marker>$after";
			$example_correct   = $before . _xml_escape($correct) . $after;
		}
		elsif ($example =~ /\A(.*?)\Q$correct\E(.*)\z/is) {
			my ($before, $after) = ($1, $2);
			$_ = _xml_escape($_) for ($before, $after);
			$example_incorrect = $before . '<marker>'
				. _xml_escape($incorrect) . '</marker>' . $after;
			$example_correct = _xml_escape($example);
		}
		else {
			return (undef,
				'example contains neither correct nor incorrect phrase');
		}

		my $xml_id        = _xml_escape($id);
		my $xml_incorrect = _xml_escape($incorrect);
		my $xml_correct   = _xml_escape($correct);

		# The suggestion joins all matched tokens: \1\2...\N for N parts.
		my $suggestion = join '', map { "\\$_" } 1 .. scalar @parts;

		my $xml = "    <rule id=\"$xml_id\" name=\"„$xml_incorrect” ($xml_correct)\">\n";
		$xml .= "      <pattern>\n";
		for my $part (@parts) {
			$xml .= '        <token>' . _xml_escape($part) . "</token>\n";
		}
		$xml .= "      </pattern>\n";
		$xml .= "      <message>$message{$lang} <suggestion>$suggestion</suggestion>.</message>\n";
		$xml .= "      <short>$short{$lang}</short>\n";
		$xml .= "      <example correction=\"$xml_correct\" type=\"incorrect\">${example_incorrect}.</example>\n";
		$xml .= "      <example type=\"correct\">${example_correct}.</example>\n";
		$xml .= "    </rule>\n";

		return ($xml, undef);
	}

	# Returns a copy of $text that is safe inside XML text and double-quoted
	# attribute values.
	sub _xml_escape {
		my ($text) = @_;
		$text =~ s/&/&amp;/g;
		$text =~ s/</&lt;/g;
		$text =~ s/>/&gt;/g;
		$text =~ s/"/&quot;/g;
		return $text;
	}