Skip to content

Instantly share code, notes, and snippets.

@xaicron
Created April 2, 2009 14:25
Show Gist options
  • Save xaicron/89215 to your computer and use it in GitHub Desktop.
Save xaicron/89215 to your computer and use it in GitHub Desktop.
# 偉大なるしょこたんが今日までにどれだけBlog記事を書いたか調べるスクリプト
use strict;
use warnings;
use Web::Scraper;
use LWP::UserAgent;
use URI;
use Encode;
use 5.0100;
# Count the distinct blog entries reachable from the blog's archive listing:
#   1. fetch an archive page and cut out its ARCHIVE_BODY block,
#   2. follow every archive link found in that block,
#   3. collect the entry links on each of those pages,
#   4. print the de-duplicated entry URLs with a running number.

# Root of the blog; entry URLs are recognised below as "<base_url><digits>/".
my $base_url = 'http://blog.excite.co.jp/shokotan/';

# Archive page used as the starting point.
# NOTE(review): presumably any month works because the archive index is
# rendered on every monthly page -- confirm against the live site.
my $month_url = $base_url . 'm1900-01-01/';

my $res = LWP::UserAgent->new->get($month_url);
die "Failed to fetch $month_url: " . $res->status_line
    unless $res->is_success;

my $content = decode_utf8 $res->content;

# NOTE(review): '.' does not match newlines here (no /s), so the block is only
# found when it sits on a single line -- confirm this matches the page markup.
my ($archive) = $content =~ m#<div class=ARCHIVE_BODY>(.*?)</div>#i;
die "No ARCHIVE_BODY block found in $month_url"
    unless defined $archive;

# The scraper does not depend on the page being visited, so build it once.
# The XPath is tied to the current page layout of the blog.
my $scraper = scraper {
    process '/html/body/div[3]/div[3]/div/div[2]/p/a', 'list[]' => [ '@href', sub { $_->as_string } ];
};

my @contents;

# Consume the archive links one by one; each successful substitution removes
# the link it just matched, so the loop ends when none are left.
while ($archive =~ s#<a href=([^<>]+)/>##i) {
    # Copy the capture right away so later regex operations cannot clobber it.
    my $archive_link = $1;
    say $archive_link;

    my $uri    = URI->new($archive_link);
    my $result = $scraper->scrape($uri) or die 'Oops!';

    # Keep only links to individual entries. \Q...\E makes the base URL match
    # literally (its dots would otherwise match any character).
    push @contents, grep { m#\Q$base_url\E\d+/# } @{ $result->{list} // [] };
}

# Drop duplicates while keeping first-seen order.
my %seen;
my @unique_entries = grep { !$seen{$_}++ } @contents;

my $count = 0;
for my $list (@unique_entries) {
    # Pass the URL as an argument instead of interpolating it into the format
    # string: a '%' inside a URL must not be parsed as a format directive.
    printf "%05d => %s\n", ++$count, $list;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment