Skip to content

Instantly share code, notes, and snippets.

@kkobayashi
Created May 6, 2012 08:12
Show Gist options
  • Save kkobayashi/2621015 to your computer and use it in GitHub Desktop.
Save kkobayashi/2621015 to your computer and use it in GitHub Desktop.
use strict;
use warnings;
use Web::Scraper;
use URI;
use YAML::Syck;
use utf8;
# Per the YAML::Syck documentation, ImplicitUnicode makes Load flag valid
# UTF-8 strings as Unicode and makes Dump return Unicode strings.
$YAML::Syck::ImplicitUnicode = 1;

# Listing pages to scrape:
#   List: Anime (finished / reruns) - Syoboi Calendar
#   http://cal.syoboi.jp/list?cat=10
#   List: Anime - Syoboi Calendar
#   http://cal.syoboi.jp/list?cat=1
my @uri_list = qw(http://cal.syoboi.jp/list?cat=10 http://cal.syoboi.jp/list?cat=1);

# Cache file: read here if it exists, rewritten at the end of the script.
my $yaml_file = 'syoboi.yaml';

# Previously saved entries, looked up by tid further down.
my $yaml = (-f $yaml_file) ? LoadFile($yaml_file) : {};

# An empty file, or one whose top level is not a mapping, would make the
# later $yaml->{$tid} lookups fail; start from an empty cache instead.
unless (ref($yaml) eq 'HASH') {
    print STDERR "ignoring $yaml_file: top level is not a mapping\n"
        if defined $yaml;
    $yaml = {};
}

# Entries that will be written back to $yaml_file, keyed by tid.
my $result = {};

# Rows collected from all listing pages.
my @list = ();
### $yaml
# Scraper for one listing page. Every row of table#TitleList becomes a hash
# with: title (text of the link in cell 1), url (that link's href), and
# begin / end / tid (text of cells 2, 3 and 4). Built once and reused for
# every listing URL.
my $list_scraper = scraper {
    process 'table#TitleList tbody tr', 'h[]' => scraper {
        process '//td[1]/a', 'title' => 'TEXT';
        process '//td[1]/a', 'url'   => '@href';
        process '//td[2]',   'begin' => 'TEXT';
        process '//td[3]',   'end'   => 'TEXT';
        process '//td[4]',   'tid'   => 'TEXT';
    };
    result 'h';
};

foreach my $u (@uri_list) {
    ### $u
    print STDERR "scraping $u ...\n";
    my $info = $list_scraper->scrape(URI->new($u));
    ### $info

    # When no row matched, the result is not an array reference and
    # dereferencing it under "use strict" dies with an opaque message.
    # Abort with an explicit one instead; this still happens before the
    # cache file is rewritten at the end of the script.
    die "no rows matched 'table#TitleList tbody tr' on $u\n"
        unless ref($info) eq 'ARRAY';

    push(@list, @$info);

    # Pause between requests to the site.
    sleep 1;
}
# Scraper for a title's detail page: collects the text of every a.keyword
# link inside 'table.cast table.data td' (presumably the cast list -- confirm
# against the page markup). Built once and reused for every entry.
my $cast_scraper = scraper {
    process 'table.cast table.data td a.keyword', 'v[]' => 'TEXT';
    result 'v';
};

# Build $result keyed by tid: reuse cached entries, fetch the detail page for
# everything else, then write the whole map back to $yaml_file.
foreach my $entry (@list) {
    ### $entry
    my $tid = $entry->{tid};

    # Rows lacking a tid cell or a link cannot be keyed or fetched; skipping
    # them avoids undef-key warnings and a pointless scrape attempt.
    unless (defined $tid && length $tid && defined $entry->{url}) {
        print STDERR " skipping row without tid or url\n";
        next;
    }

    # A cached entry only counts as complete once it carries a 'v' key.
    # Entries written by earlier runs always have it (possibly undef), while
    # entries whose detail fetch failed below do not, so those get retried.
    my $cached = $yaml->{$tid};
    if (ref($cached) eq 'HASH' && exists $cached->{v}) {
        print STDERR " tid $tid found\n";
        # Fresh listing fields override the cached ones; 'v' is kept.
        $result->{$tid} = { %$cached, %$entry };
        next;
    }

    print STDERR " tid $tid not found. scraping $entry->{url} ...\n";
    $result->{$tid} = $entry;

    # scrape() fetches only when handed a URI object (other scalars are
    # taken as HTML content), so normalise the link the same way the listing
    # loop does. The eval keeps one failed request from aborting the run and
    # discarding everything gathered so far.
    my $v;
    my $ok  = eval { $v = $cast_scraper->scrape(URI->new($entry->{url})); 1 };
    my $err = $ok ? undef : ($@ || "unknown error\n");
    ### $v

    # Pause between requests to the site, whether or not this one succeeded.
    sleep 1;

    if (defined $err) {
        chomp $err;
        print STDERR " failed to scrape $entry->{url}: $err\n";
        next;    # no 'v' key stored, so the next run tries again
    }

    $result->{$tid}->{v} = $v;
}
DumpFile($yaml_file, $result);
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment