Skip to content

Instantly share code, notes, and snippets.

@hoehrmann
Created April 5, 2012 19:40
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save hoehrmann/2313504 to your computer and use it in GitHub Desktop.
Save hoehrmann/2313504 to your computer and use it in GitHub Desktop.
Extract tabular data from PDFs (after using pdftohtml -c -xml)
#!perl -w
use strict;
use warnings;
use XML::LibXML;
use List::Util qw/max/;
use Math::Trig qw/:pi deg2rad rad2deg/;
# Reconstruct table rows from positioned text fragments in an XML document
# (presumably the output of `pdftohtml -c -xml` -- confirm against your
# input). Every <text> element on a <page> becomes a node with its trimmed
# text and its left/top coordinates. Nodes are linked to their nearest
# neighbour to the right and below, and each chain of right-neighbours is
# printed as one tab-separated line.
#
# Usage: script.pl [file.xml]    (defaults to wurdlist.xml)
my $file = @ARGV ? $ARGV[0] : 'wurdlist.xml';
my $d = XML::LibXML->load_xml(location => $file);

# textContent yields decoded character strings; encode them explicitly on
# output so non-ASCII text is written consistently and without
# "Wide character" warnings.
binmode STDOUT, ':encoding(UTF-8)';

# Angular tolerances, computed once instead of on every node pair.
my $row_tolerance    = deg2rad(5);     # max deviation from "straight right"
my $column_tolerance = deg2rad(3);     # max deviation from "straight down"
my $straight_down    = deg2rad(90);    # direction of increasing 'top'

foreach my $page ($d->findnodes('//page')) {

    # One node per non-empty <text> element: d = text, x = left, y = top.
    my @nodes = grep { length $_->{d} } map {
        my $text = $_->textContent;
        $text =~ s/\s+/ /g;              # collapse every whitespace run
        $text =~ s/^\s+|\s+$//g;         # trim both ends
        +{                               # leading + forces an anonymous hash
            d => $text,
            x => $_->getAttribute('left'),
            y => $_->getAttribute('top'),
        };
    } $page->findnodes('text');

    # Link every node to its nearest neighbour to the right (r) and
    # below (b). This compares all pairs, so it is quadratic per page.
    foreach my $node (@nodes) {
        foreach my $other (@nodes) {
            next if $other == $node;     # a node is never its own neighbour

            my $angle = atan2(
                $other->{y} - $node->{y},
                $other->{x} - $node->{x},
            );

            # Roughly on the same line and strictly to the right:
            # keep the candidate with the smallest x.
            if ($node->{x} < $other->{x} and abs($angle) < $row_tolerance) {
                $node->{r} = $other
                    if !defined $node->{r} or $other->{x} < $node->{r}{x};
            }

            # Roughly straight below: keep the candidate with the smallest y.
            if (abs($angle - $straight_down) < $column_tolerance) {
                $node->{b} = $other
                    if !defined $node->{b} or $other->{y} < $node->{b}{y};
            }
        }
    }

    # Record back-links. Only 'left' is consulted below (to find row
    # starts); 'top' is stored the same way but not used for output.
    for my $node (@nodes) {
        push @{ $node->{b}{top} },  $node if $node->{b};
        push @{ $node->{r}{left} }, $node if $node->{r};
    }

    # A node that is nobody's right-neighbour starts a row; follow its
    # r-chain and print the cells separated by tabs.
    for my $start (grep { !$_->{left} } @nodes) {
        my @cells;
        for (my $here = $start; $here; $here = $here->{r}) {
            push @cells, $here->{d};
        }
        print join("\t", @cells), "\n";
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment