Skip to content

Instantly share code, notes, and snippets.

@fujiwara
Created May 15, 2009 12:03
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save fujiwara/112184 to your computer and use it in GitHub Desktop.
Save fujiwara/112184 to your computer and use it in GitHub Desktop.
use Web::Scraper;
use strict;
use Data::Dumper;
use URI;
use YAML;
use utf8;
use Encode qw/ encode_utf8 decode_utf8 /;
# Scraper applied to a single table row: gathers every <td> of that row
# into the list "td". 'RAW' is presumably Web::Scraper's "give me the cell's
# markup" extractor (to_text strips tags from these values afterwards) --
# confirm against the Web::Scraper documentation.
my $row_scraper = scraper {
    process 'td', 'td[]' => 'RAW';
};

# Page-level scraper: every <tr> inside a table with class "wikitable" is
# handed to the row scraper, and the per-row results are collected under "tr".
my $s = scraper {
    process 'table.wikitable tr', 'tr[]' => $row_scraper;
};
# Scrape one comparison page per quantity name and collect the results in a
# hash keyed by that name, then print the whole structure as UTF-8 encoded YAML.
my @quantities = qw/ 数 長さ 面積 体積 /;
my $all = {};
foreach my $quantity (@quantities) {
    my $table = make($quantity);
    $all->{$quantity} = $table;
}
my $yaml = Dump($all);
print encode_utf8($yaml);
# to_text($html)
#
# Reduce the markup of one table cell to plain text.
#
#   * <span ...> and </span> tags are dropped (their content is kept);
#   * every <sup>N</sup> whose content is an optionally negative integer is
#     rewritten as "eN", so "10<sup>-3</sup>" becomes "10e-3";
#   * bracketed runs such as "[1]" are removed;
#   * all remaining tags are stripped.
#
# Returns the cleaned string. The caller's argument is not modified.
sub to_text {
    my ($html) = @_;
    my $text = $html;

    $text =~ s{</?span.*?>}{}g;

    # /g matters here: a cell may hold several exponents (e.g. a range).
    # Without it only the first one is converted and the others lose their
    # <sup> tags in the final strip, fusing exponent and mantissa ("103").
    $text =~ s{<sup>(-?\d+)</sup>}{e$1}g;

    # Must run before the generic tag strip so that bracketed text wrapped in
    # markup (e.g. <a>[1]</a>) disappears together with its brackets.
    $text =~ s{\[.+?\]}{}g;

    $text =~ s{<.*?>}{}g;

    return $text;
}
# make($name)
#
# Scrape http://ja.wikipedia.org/wiki/<name>の比較 with the file-level scraper
# $s and group the table rows by their "order" cell.
#
# Returns a hash reference: order text => array reference of
# { value => ..., description => ... } hashes, in the order the rows appear.
#
# Rows are interpreted by their number of <td> cells:
#   5 cells: order = cell 0, value = cell 2, description = cell 3
#   4 cells: order = cell 0, value = cell 3, description = cell 4
#   2 cells: value = cell 0, description = cell 1; the order of the most
#            recent 4- or 5-cell row is reused (presumably rows continuing a
#            rowspan'd order cell -- confirm against the page layout)
# Rows with any other cell count, or without a td list, are skipped.
#
# NOTE(review): a 4-cell row has no cell 4, so its description is always
# undef. The 4- and 5-cell mappings look like they may be swapped; verify
# against the actual tables before changing.
sub make {
    my ($name) = @_;

    my $page    = URI->new("http://ja.wikipedia.org/wiki/${name}の比較");
    my $scraped = $s->scrape($page);

    my %entries_for;

    # Declared outside the loop on purpose: $order has to survive from one
    # row to the next so that 2-cell rows can inherit it.
    my ($order, $value, $description);

    ROW:
    foreach my $row ( @{ $scraped->{tr} } ) {
        my $cells = $row->{td};
        next ROW unless ref($cells) eq 'ARRAY';

        my $count = scalar @{$cells};
        next ROW unless $count == 5 || $count == 4 || $count == 2;

        my @text = map { to_text($_) } @{$cells};
        if ( $count == 2 ) {
            ($value, $description) = @text;
        }
        elsif ( $count == 5 ) {
            ($order, $value, $description) = @text[ 0, 2, 3 ];
        }
        else {
            # Index 4 is past the end of a 4-element list, so $description
            # ends up undef here (see NOTE(review) above).
            ($order, $value, $description) = @text[ 0, 3, 4 ];
        }

        push @{ $entries_for{$order} }, {
            value       => $value,
            description => $description,
        };
    }

    return \%entries_for;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment