Skip to content

Instantly share code, notes, and snippets.

@fuba
Created August 13, 2009 16:36
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save fuba/167287 to your computer and use it in GitHub Desktop.
Save fuba/167287 to your computer and use it in GitHub Desktop.
known bug: サンプル画像が無いアイテムの画像が出せない、gifとjpgの判定は表紙画像だけでできるので要修正
#!/usr/bin/perl
use strict;
use warnings;
use utf8;
use DateTime;
use DateTime::Format::Strptime qw( strptime );
use Encode qw( decode encode );
use Time::ParseDate;
use URI;
use Web::Scraper 0.22;
use YAML;
my $t = parsedate(shift || 'today');
my $dt = DateTime->from_epoch(epoch => $t, time_zone => 'Asia/Tokyo');
my $ss = scraper {
process '//table[@class="list_desc_table"]//table[@class="list_desc_innertable_text"]', 'entry[]' => scraper {
process '//tr[1]/td/font', title => 'text';
process '//tr[2]/td/font/a', author => 'text';
process '//tr[3]/td',
link => ['text', \&mk_link];
process '//tr[4]/td/a', 'tags[]' => 'text';
process '//tr[5]/td/a[1]',
body => ['@href', \&mk_body],
process '//tr[last()-1]/td/a',
date => ['text', \&mk_date];
result qw( title author link body tags date );
};
result qw( entry );
};
$ss->user_agent->env_proxy;
my $sig = scraper {
process '//table[@class="list_desc_table"]//table[@class="list_desc_innertable_text"]', 'entry[]' => scraper {
process '//tr[1]/td/font', title => 'text';
process '//tr[2]/td/font/a', author => 'text';
process '//tr[3]/td',
link => ['text', \&mk_link];
process '//tr[4]/td/a', 'category' => 'text';
process '//tr[5]/td/a', 'tags[]' => 'text';
process '//tr[6]/td/a[1]',
body => ['@href', \&mk_body],
process '//tr[last()-1]/td/a',
date => ['text', \&mk_date];
result qw( title author link body category tags date );
};
result qw( entry );
};
$sig->user_agent->env_proxy;
my $entry = [
@{ &parse($ss, $dt, '同人誌') || [] },
@{ &parse($sig, $dt, '同人ソフト') || [] },
@{ &parse($sig, $dt, '同人グッズ') || [] },
];
for my $e (@$entry) {
unshift @{$e->{tags}}, $e->{category} if $e->{category};
delete $e->{category} if defined $e->{category};
}
my $dt_title = $dt->strftime('%Y年%m月%d日');
binmode STDOUT, ":utf8";
print Dump +{
title => "メロンブックス通信販売 新着リスト ($dt_title)",
link => 'http://shop.melonbooks.co.jp/shop/list/',
entry => $entry,
};
sub parse {
my ($scraper, $dt, $genre) = @_;
my @items;
my $result_num = 0;
my $offset = 10;
for (my $dispstart = 0; $dispstart < 300; $dispstart += $offset) {
my $url = URI->new('http://shop.melonbooks.co.jp/shop/list/');
my %query = (
'CR[]' => [18, 15, 0],
DA => 'dispstart',
'LA' => $dt->ymd,
SC => $dispstart, # オフセット
G => $genre, # ジャンル
E => '', # 在庫切れも含む ? '' : 'ON'
'ARRIVAL[]' => ['first', 'reall'],
ST => 0,
O => 'maker', #表示順
P => $offset, # 表示件数
DS => 'desc',
RATED => 18, # これを渡すと18禁認証スキップ
);
$url->query_form(%query);
my $res = $scraper->user_agent->get($url);
unless ($res->is_success) {
die "GET $url failed: " . $res->status_line;
}
my $content = decode('utf-8', $res->content);
if ($content =~ m|\(\d+~\d+件/全(\d+)件\)|m) {
$result_num = $1;
}
my $data = $scraper->scrape($content);
push @items, @{ $data || [] };
last if ($dispstart + $offset > $result_num);
}
wantarray ? @items : \@items;
}
sub mk_link {
my $link = URI->new('http://shop.melonbooks.co.jp/shop/list/ID/'.$_);
$link->as_string;
}
sub mk_body {
my $key = $_;
if ($key =~ /(\d+)\w\.(\w+)$/) {
my $id = $1;
my $type = $2;
join '', map {
($type eq 'jpg')
? qq!<img src="http://shop.melonbooks.co.jp/img2/m/${id}${_}.jpg">!
: qq!<img src="http://shop.melonbooks.co.jp/img/${id}${_}.gif">!
} ('', 'a', 'b');
}
}
sub mk_date { eval { strptime('%Y年%m月%d日', $_)->ymd } }
sub as_string { $_->as_string }
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment