hoehrmann/gist:2395307

## gistfile1.pl
#!perl -w
use strict;
use warnings;
use XML::Parser;
use PerlIO::via::gzip;
use feature 'say';
use Archive::Zip;
use Image::Magick;
use Algorithm::Diff 'sdiff';
use List::Util qw/min max first/;
use YAML::XS;
use JSON;
use URI;

#
# Prints out tab-separated values with the fields: id, Unicode value,
# page number, offset left, top, right, bottom, and the image data as
# data:image/png address.
#

# http://archive.org/details/glossariumderfr01outzgoog
my $prefix     = 'glossariumderfr01outzgoog';
my $zip_path   = "${prefix}_tif.zip";
my $abbyy_path = "${prefix}_abbyy.gz";

# Generated using sth like https://gist.github.com/2313504
my $google_path = "${prefix}_google.txt";

open my $google_f, '<:utf8', $google_path or die $!;
my $google_txt = do { local $/; <$google_f>; };
my @google_words = split /\s+/, $google_txt;

my $zip = Archive::Zip->new;

$zip->read($zip_path) and die $!;

open my $f, '<:via(gzip)', $abbyy_path or die $!;

sub image_region_to_uri {

# return "";
  my ($image, %region) = @_;
  my $clone = $image->Clone();
  $clone->Extent(%region);
  $clone->Strip();
  my $png_data = $clone->ImageToBlob(magick => 'png');
  my $uri = URI->new('data:image/png,');
  $uri->data($png_data);
  return "$uri";
}

sub diff_words {
  my @words = @_;

  my @trial = sdiff \@words, \@google_words;
  my $sum_u   = grep { $_->[0] eq 'u' } @trial;
  my $middle  = 0;
  my $running = 0;

  for (my $running = 0; $running < $sum_u / 2; ++$middle) {
    $running++ if $trial[$middle]->[0] eq 'u';
  }

  my @part =
    @google_words[ max($middle - @words, 0)
    .. min($#google_words, $middle + @words) ];
  @trial = sdiff \@words, \@part;

  my $third      = int(0.33 * $#trial);
  my @lead_third = @trial[ 0 .. $third ];
  my @tail_third = @trial[ 2 * $third .. $#trial ];
  my $skip       = grep { $_->[0] eq '+' } @lead_third;
  my $cut        = grep { $_->[0] eq '+' } @tail_third;
  @part = @part[ $skip .. $#part - $cut ];

  @trial = sdiff \@words, \@part;

# find adjacent where IA has <prefix>-, <suffix> and Google
# has <prefix><suffix> as single entry and hyphenate Google.
# If IA has <prefix>- <suffix> and Google has <prefix>-<suffix>
# as single entry, split Google.

  my @new = ();
  for (my $i = 1; $i <= $#trial; ++$i) {
    my ($ia_prefix) = $trial[ $i - 1 ]->[1] =~ /^(.+)-$/;
    my ($ia_suffix) = $trial[$i]->[1];

    if (defined $ia_prefix
      and $trial[ $i - 1 ]->[2] eq "$ia_prefix$ia_suffix") {
      push @new, "$ia_prefix-";
      push @new, "$ia_suffix";

    } elsif (defined $ia_prefix
      and $trial[ $i - 1 ]->[2] eq "$ia_prefix-$ia_suffix") {
      push @new, "$ia_prefix-";
      push @new, "$ia_suffix";

    } else {
      push @new, $trial[ $i - 1 ]->[2];
      push @new, $trial[$i]->[2] if $i == $#trial;
    }
  }

  @trial = sdiff \@words, [ grep { length } @new ];
}

my $page = 0;
my $text = "";
my $cur_img;
my $p = XML::Parser->new;
my $cur_p;

$p->setHandlers(
  Start => sub {
    my ($xp, $name, %attr) = @_;
    $text = "";
    if ($name eq 'page') {
      $cur_p = { page_number => ++$page, };
      my $tiff_path = sprintf "%s_tif/%s_%04u.tif",
        $prefix, $prefix, $page;
      my $tiff_data = $zip->contents($tiff_path);

      $cur_img = Image::Magick->new(magick => 'tif');
      $cur_img->Set('monochrome' => 'True');
      $cur_img->BlobToImage($tiff_data);

    } elsif ($name eq 'par') {
      push @{ $cur_p->{pars} }, {};

    } elsif ($name eq 'line') {
      push @{ $cur_p->{pars}[-1]{lines} },
        { words => [ { chars => [] } ], };

    } elsif ($name eq 'charParams') {
      if ($attr{wordStart} eq 'true') {
        if (@{ $cur_p->{pars}[-1]{lines}[-1]{words}[-1]{chars} }) {
          push @{ $cur_p->{pars}[-1]{lines}[-1]{words} },
            { chars => [] };
        }
      }
      push @{ $cur_p->{pars}[-1]{lines}[-1]{words}[-1]{chars} }, {
        x      => $attr{l},
        y      => $attr{t},
        width  => $attr{r} - $attr{l},
        height => $attr{b} - $attr{t}
      };
    }
  },
  Char => sub {
    my ($xp, $data) = @_;
    $text .= $data;
  },
  End => sub {
    my ($xp, $name) = @_;
    if ($name eq 'charParams' and $text =~ /\s+/) {

      # treat white space as word boundary
      pop @{ $cur_p->{pars}[-1]{lines}[-1]{words}[-1]{chars} };
      push @{ $cur_p->{pars}[-1]{lines}[-1]{words} }, { chars => [] };
    } elsif ($name eq 'charParams') {
      my $char = $cur_p->{pars}[-1]{lines}[-1]{words}[-1]{chars}[-1];
      my $uri  = image_region_to_uri(
        $cur_img,
        x      => $char->{x},
        y      => $char->{y},
        width  => $char->{width},
        height => $char->{height}
      );
      $char->{scan} = "$uri";
      $char->{code} = ord $text;
      $char->{char} = $text;
    }
    if ($name eq 'line') {
      my $line = $cur_p->{pars}[-1]{lines}[-1];
      $line->{words} =
        [ grep { @{ $_->{chars} } } @{ $line->{words} } ];
      for my $w (@{ $line->{words} }) {
        $w->{word} = join '', map { $_->{char} } @{ $w->{chars} };
        $w->{x}    = min map      { $_->{x} } @{ $w->{chars} };
        $w->{y}    = min map      { $_->{y} } @{ $w->{chars} };
        $w->{width} =
          max(map { $_->{x} + $_->{width} } @{ $w->{chars} }) - $w->{x};
        $w->{height} =
          max(map { $_->{y} + $_->{height} } @{ $w->{chars} }) -
          $w->{y};

        my $uri = image_region_to_uri(
          $cur_img,
          x      => $w->{x},
          y      => $w->{y},
          width  => $w->{width},
          height => $w->{height},
        );
        $w->{scan} = "$uri";
      }
    }
    if ($name eq 'page') {

      warn "$page\n";

#return unless $page == 40;

      my @words = map { @{ $_->{words} // [] } }    # @@@
        map { @{ $_->{lines} // [] } }              # @@@
        @{ $cur_p->{pars} };

      my @literals = map { $_->{word} // "" } @words;
      my @trial    = diff_words(@literals);
      my $index    = 0;

      for (@trial) {
        if ($_->[0] eq '+' or not length $_->[1]) {
          push @{ $words[$index]->{google_words} }, $_->[2];
          next;
        }
        push @{ $words[ $index++ ]->{google_words} }, $_->[2];
      }

      my $json = JSON->new->ascii(1)->pretty(1)->encode($cur_p);
      my $filename =
        sprintf("xnew-%s-page%04u.json.gz", $prefix, $page);
      open my $f, '>:via(gzip)', $filename or die $!;
      print $f $json;
      close $f;

#      die;

    }
  }
);

$p->parse($f);
	#!perl -w
	use strict;
	use warnings;
	use XML::Parser;
	use PerlIO::via::gzip;
	use feature 'say';
	use Archive::Zip;
	use Image::Magick;
	use Algorithm::Diff 'sdiff';
	use List::Util qw/min max first/;
	use YAML::XS;
	use JSON;
	use URI;

	#
	# Prints out tab-separated values with the fields: id, Unicode value,
	# page number, offset left, top, right, bottom, and the image data as
	# data:image/png address.
	#

	# http://archive.org/details/glossariumderfr01outzgoog
	my $prefix = 'glossariumderfr01outzgoog';
	my $zip_path = "${prefix}_tif.zip";
	my $abbyy_path = "${prefix}_abbyy.gz";

	# Generated using sth like https://gist.github.com/2313504
	my $google_path = "${prefix}_google.txt";

	open my $google_f, '<:utf8', $google_path or die $!;
	my $google_txt = do { local $/; <$google_f>; };
	my @google_words = split /\s+/, $google_txt;

	my $zip = Archive::Zip->new;

	$zip->read($zip_path) and die $!;

	open my $f, '<:via(gzip)', $abbyy_path or die $!;

	sub image_region_to_uri {

	# return "";
	my ($image, %region) = @_;
	my $clone = $image->Clone();
	$clone->Extent(%region);
	$clone->Strip();
	my $png_data = $clone->ImageToBlob(magick => 'png');
	my $uri = URI->new('data:image/png,');
	$uri->data($png_data);
	return "$uri";
	}

	sub diff_words {
	my @words = @_;

	my @trial = sdiff \@words, \@google_words;
	my $sum_u = grep { $_->[0] eq 'u' } @trial;
	my $middle = 0;
	my $running = 0;

	for (my $running = 0; $running < $sum_u / 2; ++$middle) {
	$running++ if $trial[$middle]->[0] eq 'u';
	}

	my @part =
	@google_words[ max($middle - @words, 0)
	.. min($#google_words, $middle + @words) ];
	@trial = sdiff \@words, \@part;

	my $third = int(0.33 * $#trial);
	my @lead_third = @trial[ 0 .. $third ];
	my @tail_third = @trial[ 2 * $third .. $#trial ];
	my $skip = grep { $_->[0] eq '+' } @lead_third;
	my $cut = grep { $_->[0] eq '+' } @tail_third;
	@part = @part[ $skip .. $#part - $cut ];

	@trial = sdiff \@words, \@part;

	# find adjacent where IA has <prefix>-, <suffix> and Google
	# has <prefix><suffix> as single entry and hyphenate Google.
	# If IA has <prefix>- <suffix> and Google has <prefix>-<suffix>
	# as single entry, split Google.

	my @new = ();
	for (my $i = 1; $i <= $#trial; ++$i) {
	my ($ia_prefix) = $trial[ $i - 1 ]->[1] =~ /^(.+)-$/;
	my ($ia_suffix) = $trial[$i]->[1];

	if (defined $ia_prefix
	and $trial[ $i - 1 ]->[2] eq "$ia_prefix$ia_suffix") {
	push @new, "$ia_prefix-";
	push @new, "$ia_suffix";

	} elsif (defined $ia_prefix
	and $trial[ $i - 1 ]->[2] eq "$ia_prefix-$ia_suffix") {
	push @new, "$ia_prefix-";
	push @new, "$ia_suffix";

	} else {
	push @new, $trial[ $i - 1 ]->[2];
	push @new, $trial[$i]->[2] if $i == $#trial;
	}
	}

	@trial = sdiff \@words, [ grep { length } @new ];
	}

	my $page = 0;
	my $text = "";
	my $cur_img;
	my $p = XML::Parser->new;
	my $cur_p;

	$p->setHandlers(
	Start => sub {
	my ($xp, $name, %attr) = @_;
	$text = "";
	if ($name eq 'page') {
	$cur_p = { page_number => ++$page, };
	my $tiff_path = sprintf "%s_tif/%s_%04u.tif",
	$prefix, $prefix, $page;
	my $tiff_data = $zip->contents($tiff_path);

	$cur_img = Image::Magick->new(magick => 'tif');
	$cur_img->Set('monochrome' => 'True');
	$cur_img->BlobToImage($tiff_data);

	} elsif ($name eq 'par') {
	push @{ $cur_p->{pars} }, {};

	} elsif ($name eq 'line') {
	push @{ $cur_p->{pars}[-1]{lines} },
	{ words => [ { chars => [] } ], };

	} elsif ($name eq 'charParams') {
	if ($attr{wordStart} eq 'true') {
	if (@{ $cur_p->{pars}[-1]{lines}[-1]{words}[-1]{chars} }) {
	push @{ $cur_p->{pars}[-1]{lines}[-1]{words} },
	{ chars => [] };
	}
	}
	push @{ $cur_p->{pars}[-1]{lines}[-1]{words}[-1]{chars} }, {
	x => $attr{l},
	y => $attr{t},
	width => $attr{r} - $attr{l},
	height => $attr{b} - $attr{t}
	};
	}
	},
	Char => sub {
	my ($xp, $data) = @_;
	$text .= $data;
	},
	End => sub {
	my ($xp, $name) = @_;
	if ($name eq 'charParams' and $text =~ /\s+/) {

	# treat white space as word boundary
	pop @{ $cur_p->{pars}[-1]{lines}[-1]{words}[-1]{chars} };
	push @{ $cur_p->{pars}[-1]{lines}[-1]{words} }, { chars => [] };
	} elsif ($name eq 'charParams') {
	my $char = $cur_p->{pars}[-1]{lines}[-1]{words}[-1]{chars}[-1];
	my $uri = image_region_to_uri(
	$cur_img,
	x => $char->{x},
	y => $char->{y},
	width => $char->{width},
	height => $char->{height}
	);
	$char->{scan} = "$uri";
	$char->{code} = ord $text;
	$char->{char} = $text;
	}
	if ($name eq 'line') {
	my $line = $cur_p->{pars}[-1]{lines}[-1];
	$line->{words} =
	[ grep { @{ $_->{chars} } } @{ $line->{words} } ];
	for my $w (@{ $line->{words} }) {
	$w->{word} = join '', map { $_->{char} } @{ $w->{chars} };
	$w->{x} = min map { $_->{x} } @{ $w->{chars} };
	$w->{y} = min map { $_->{y} } @{ $w->{chars} };
	$w->{width} =
	max(map { $_->{x} + $_->{width} } @{ $w->{chars} }) - $w->{x};
	$w->{height} =
	max(map { $_->{y} + $_->{height} } @{ $w->{chars} }) -
	$w->{y};

	my $uri = image_region_to_uri(
	$cur_img,
	x => $w->{x},
	y => $w->{y},
	width => $w->{width},
	height => $w->{height},
	);
	$w->{scan} = "$uri";
	}
	}
	if ($name eq 'page') {

	warn "$page\n";

	#return unless $page == 40;

	my @words = map { @{ $_->{words} // [] } } # @@@
	map { @{ $_->{lines} // [] } } # @@@
	@{ $cur_p->{pars} };

	my @literals = map { $_->{word} // "" } @words;
	my @trial = diff_words(@literals);
	my $index = 0;

	for (@trial) {
	if ($_->[0] eq '+' or not length $_->[1]) {
	push @{ $words[$index]->{google_words} }, $_->[2];
	next;
	}
	push @{ $words[ $index++ ]->{google_words} }, $_->[2];
	}

	my $json = JSON->new->ascii(1)->pretty(1)->encode($cur_p);
	my $filename =
	sprintf("xnew-%s-page%04u.json.gz", $prefix, $page);
	open my $f, '>:via(gzip)', $filename or die $!;
	print $f $json;
	close $f;

	# die;

	}
	}
	);

	$p->parse($f);