Skip to content

Instantly share code, notes, and snippets.

@lopnor
Created March 19, 2009 07:24
Show Gist options
  • Save lopnor/81633 to your computer and use it in GitHub Desktop.
Save lopnor/81633 to your computer and use it in GitHub Desktop.
#!/usr/bin/perl
use strict;
use warnings;
use FindBin;
use Web::Scraper;
use YAML;
use URI;
# Optional first command-line argument: path to a saved copy of the page.
my $file = shift;

# Result key for each td column of a data row, in column order.
my @headers = qw(id rep name_anno docomo kddi softbank google);

# Cell parsers keyed by the CSS class of the td they handle.  The class
# attribute comes from the scraped page, so it is looked up in this fixed
# table rather than being used as a symbolic sub name: page content can only
# ever select one of these parsers.
my %parser_for = (
    id            => \&id,
    rep           => \&rep,
    name_anno     => \&name_anno,
    round_trip    => \&round_trip,
    text_fallback => \&text_fallback,
    fallback      => \&fallback,
);

# Scraper that visits every tr in the document.  Rows whose td count differs
# from the number of headers contribute nothing; every other row becomes a
# hash mapping each header name to the parsed cell, and the collected rows
# are returned as the 'list' result.
my $scraper = scraper {
    process '//tr' => 'list[]' => sub {
        my $row = shift;

        my @cols = $row->look_down(_tag => 'td');
        return unless @cols == @headers;

        my $hash = {};
        for my $i (0 .. $#headers) {
            my $item   = $cols[$i];
            my $class  = $item->attr('class');
            my $parser = defined $class ? $parser_for{$class} : undef;

            # An unknown or missing class was already fatal with the symbolic
            # call; fail with a message that names the offending cell.
            if (!$parser) {
                my $shown = defined $class ? "'$class'" : '(none)';
                die "No parser for td class $shown in column '$headers[$i]'\n";
            }

            my $item_hash = $parser->($item);

            # Record which kind of cell produced this entry.
            $item_hash->{$class} = 1;
            $hash->{ $headers[$i] } = $item_hash;
        }
        return $hash;
    };
    result 'list';
};
# Choose the scraper input: the contents of the file named on the command
# line when one was given, otherwise the published page's URL.
my $target;
if (defined $file && length $file) {
    # Fail loudly on an unreadable path instead of handing undef to the
    # scraper.
    # NOTE(review): the file is read without an encoding layer -- confirm
    # the saved page's encoding if non-ASCII text matters.
    open my $fh, '<', $file or die "Cannot open '$file' for reading: $!\n";
    $target = do { local $/; <$fh> };    # slurp the whole file
    close $fh;
}
else {
    $target = URI->new('http://www.unicode.org/~scherer/emoji4unicode/20090206/utc_pdf.html');
}

# Scrape the rows and emit them as YAML on STDOUT.
my $list = $scraper->scrape($target);
print Dump($list);
# Parse an "id" cell.
# Takes the cell element and returns a hash reference whose single key,
# internal_id, holds the cell's text.
sub id {
    my ($cell) = @_;
    my %parsed = (internal_id => $cell->as_text);
    return \%parsed;
}
# Parse a "rep" cell.
# Takes the cell element and returns a hash reference with two keys:
#   status  - text of the first descendant span with class "status", or
#             undef when the cell has no such span
#   unicode - the first "U+XXXX" (upper-case hex) token in the cell's text,
#             or undef when there is none
sub rep {
    my $item = shift;

    # look_down yields nothing when no element matches; guard before calling
    # a method on the result so a cell without a status span is not fatal.
    my $status_span = $item->look_down(_tag => 'span', class => 'status');
    my $status      = $status_span ? $status_span->as_text : undef;

    my ($unicode) = $item->as_text =~ /(U\+[0-9A-F]+)/;

    return {
        status  => $status,
        unicode => $unicode,
    };
}
# Parse a "name_anno" cell.
# Takes the cell element and returns a hash reference whose single key,
# content, holds the cell's text.
sub name_anno {
    my ($cell) = @_;
    my %parsed;
    $parsed{content} = $cell->as_text;
    return \%parsed;
}
# Parse a "round_trip" cell.
# Takes the cell element and scans each of its child nodes (text nodes
# directly, element nodes via as_text) for identifiers.  When several nodes
# match the same pattern, the last match wins.
#
# Returns a hash reference with:
#   unicode - last "U+XXXX" token found (always present, may be undef)
#   char_id - last "#<digits>" / "#Exp?<digits>" token, only when found
#   oldid   - last "#old<digits>" token, only when found
#   image   - src attribute of the first descendant img, only when true
sub round_trip {
    my $item = shift;

    my ($id, $oldid, $unicode);
    for my $node ($item->content_list) {
        my $content = ref $node ? $node->as_text : $node;

        # NOTE(review): the "." after "Exp" is unescaped and so matches any
        # single character -- confirm whether a literal dot was intended.
        $content =~ /(#(?:Exp.|)\d+)/ and $id      = $1;
        $content =~ /(#old\d+)/       and $oldid   = $1;
        $content =~ /(U\+[0-9A-F]+)/  and $unicode = $1;
    }

    # Declare unconditionally: "my ... if COND" is documented as undefined
    # behaviour, so the condition lives in the initialiser instead.
    my $image = $item->look_down(_tag => 'img');
    my $src   = $image ? $image->attr('src') : undef;

    return {
        $id ? (char_id => $id) : (),
        unicode => $unicode,
        $src ? (image => $src) : (),
        $oldid ? (oldid => $oldid) : (),
    };
}
# Parse a "text_fallback" cell.
# Takes the cell element and returns a hash reference whose single key,
# text, holds the cell's text.
sub text_fallback {
    my ($cell) = @_;
    my %parsed = (text => $cell->as_text);
    return \%parsed;
}
# Parse a "fallback" cell.
# Takes the cell element, passes it to round_trip, and returns whatever
# round_trip returns.  Only the first argument is forwarded.
sub fallback {
    my ($item) = @_;
    return round_trip($item);
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment