fuba/shop_melonbooks_co_jp_tsuhan_20090814.pl

## shop_melonbooks_co_jp_tsuhan_20090814.pl
#!/usr/bin/perl
use strict;
use warnings;
use utf8;

use DateTime;
use DateTime::Format::Strptime qw( strptime );
use Encode qw( decode encode );
use Time::ParseDate;
use URI;
use Web::Scraper 0.22;
use YAML;

my $t = parsedate(shift || 'today');
my $dt = DateTime->from_epoch(epoch => $t, time_zone => 'Asia/Tokyo');

my $ss = scraper {
    process '//table[@class="list_desc_table"]//table[@class="list_desc_innertable_text"]', 'entry[]' => scraper {
        process '//tr[1]/td/font', title => 'text';
        process '//tr[2]/td/font/a', author => 'text';
        process '//tr[3]/td',
            link => ['text', \&mk_link];
        process '//tr[4]/td/a', 'tags[]' => 'text';
        process '//tr[5]/td/a[1]',
            body => ['@href', \&mk_body],
        process '//tr[last()-1]/td/a',
            date => ['text', \&mk_date];
        result qw( title author link body tags date );
    };
    result qw( entry );
};
$ss->user_agent->env_proxy;

my $sig = scraper {
    process '//table[@class="list_desc_table"]//table[@class="list_desc_innertable_text"]', 'entry[]' => scraper {
        process '//tr[1]/td/font', title => 'text';
        process '//tr[2]/td/font/a', author => 'text';
        process '//tr[3]/td',
            link => ['text', \&mk_link];
        process '//tr[4]/td/a', 'category' => 'text';
        process '//tr[5]/td/a', 'tags[]' => 'text';
        process '//tr[6]/td/a[1]',
            body => ['@href', \&mk_body],
        process '//tr[last()-1]/td/a',
            date => ['text', \&mk_date];
        result qw( title author link body category tags date );
    };
    result qw( entry );
};
$sig->user_agent->env_proxy;

my $entry = [
    @{ &parse($ss,  $dt, '同人誌')     || [] },
    @{ &parse($sig, $dt, '同人ソフト') || [] },
    @{ &parse($sig, $dt, '同人グッズ') || [] },
];

for my $e (@$entry) {
    unshift @{$e->{tags}}, $e->{category} if $e->{category};
    delete $e->{category} if defined $e->{category};
}

my $dt_title = $dt->strftime('%Y年%m月%d日');
binmode STDOUT, ":utf8";
print Dump +{
    title => "メロンブックス通信販売 新着リスト ($dt_title)",
    link  => 'http://shop.melonbooks.co.jp/shop/list/',
    entry => $entry,
};

sub parse {
    my ($scraper, $dt, $genre) = @_;

    my @items;
    my $result_num = 0;
    my $offset = 10;
    for (my $dispstart = 0; $dispstart < 300; $dispstart += $offset) {
        my $url = URI->new('http://shop.melonbooks.co.jp/shop/list/');
        my %query = (
            'CR[]'      =>  [18, 15, 0],
            DA          =>  'dispstart',
            'LA'        => $dt->ymd,
            SC          => $dispstart, # オフセット
            G           => $genre, # ジャンル
            E           => '', # 在庫切れも含む ? '' : 'ON'
            'ARRIVAL[]' => ['first', 'reall'],
            ST          => 0,
            O           => 'maker', #表示順
            P           => $offset, # 表示件数
            DS          => 'desc',
            RATED       => 18, # これを渡すと18禁認証スキップ
        );

        $url->query_form(%query);

        my $res = $scraper->user_agent->get($url);
        unless ($res->is_success) {
            die "GET $url failed: " . $res->status_line;
        }

        my $content = decode('utf-8', $res->content);

        if ($content =~ m|\(\d+～\d+件/全(\d+)件\)|m) {
            $result_num = $1;
        }

        my $data = $scraper->scrape($content);
        push @items, @{ $data || [] };

        last if ($dispstart + $offset > $result_num);
    }

    wantarray ? @items : \@items;
}

sub mk_link {
    my $link = URI->new('http://shop.melonbooks.co.jp/shop/list/ID/'.$_);
    $link->as_string;
}

sub mk_body {
    my $key = $_;
    if ($key =~ /(\d+)\w\.(\w+)$/) {
        my $id = $1;
        my $type = $2;
        join '', map {
            ($type eq 'jpg')
                ? qq!<img src="http://shop.melonbooks.co.jp/img2/m/${id}${_}.jpg">!
                : qq!<img src="http://shop.melonbooks.co.jp/img/${id}${_}.gif">!
        } ('', 'a', 'b');
    }
}

sub mk_date { eval { strptime('%Y年%m月%d日', $_)->ymd } }

sub as_string { $_->as_string }
	#!/usr/bin/perl
	use strict;
	use warnings;
	use utf8;

	use DateTime;
	use DateTime::Format::Strptime qw( strptime );
	use Encode qw( decode encode );
	use Time::ParseDate;
	use URI;
	use Web::Scraper 0.22;
	use YAML;

	my $t = parsedate(shift \|\| 'today');
	my $dt = DateTime->from_epoch(epoch => $t, time_zone => 'Asia/Tokyo');

	my $ss = scraper {
	process '//table[@class="list_desc_table"]//table[@class="list_desc_innertable_text"]', 'entry[]' => scraper {
	process '//tr[1]/td/font', title => 'text';
	process '//tr[2]/td/font/a', author => 'text';
	process '//tr[3]/td',
	link => ['text', \&mk_link];
	process '//tr[4]/td/a', 'tags[]' => 'text';
	process '//tr[5]/td/a[1]',
	body => ['@href', \&mk_body],
	process '//tr[last()-1]/td/a',
	date => ['text', \&mk_date];
	result qw( title author link body tags date );
	};
	result qw( entry );
	};
	$ss->user_agent->env_proxy;

	my $sig = scraper {
	process '//table[@class="list_desc_table"]//table[@class="list_desc_innertable_text"]', 'entry[]' => scraper {
	process '//tr[1]/td/font', title => 'text';
	process '//tr[2]/td/font/a', author => 'text';
	process '//tr[3]/td',
	link => ['text', \&mk_link];
	process '//tr[4]/td/a', 'category' => 'text';
	process '//tr[5]/td/a', 'tags[]' => 'text';
	process '//tr[6]/td/a[1]',
	body => ['@href', \&mk_body],
	process '//tr[last()-1]/td/a',
	date => ['text', \&mk_date];
	result qw( title author link body category tags date );
	};
	result qw( entry );
	};
	$sig->user_agent->env_proxy;

	my $entry = [
	@{ &parse($ss, $dt, '同人誌') \|\| [] },
	@{ &parse($sig, $dt, '同人ソフト') \|\| [] },
	@{ &parse($sig, $dt, '同人グッズ') \|\| [] },
	];

	for my $e (@$entry) {
	unshift @{$e->{tags}}, $e->{category} if $e->{category};
	delete $e->{category} if defined $e->{category};
	}

	my $dt_title = $dt->strftime('%Y年%m月%d日');
	binmode STDOUT, ":utf8";
	print Dump +{
	title => "メロンブックス通信販売新着リスト ($dt_title)",
	link => 'http://shop.melonbooks.co.jp/shop/list/',
	entry => $entry,
	};

	sub parse {
	my ($scraper, $dt, $genre) = @_;

	my @items;
	my $result_num = 0;
	my $offset = 10;
	for (my $dispstart = 0; $dispstart < 300; $dispstart += $offset) {
	my $url = URI->new('http://shop.melonbooks.co.jp/shop/list/');
	my %query = (
	'CR[]' => [18, 15, 0],
	DA => 'dispstart',
	'LA' => $dt->ymd,
	SC => $dispstart, # オフセット
	G => $genre, # ジャンル
	E => '', # 在庫切れも含む ? '' : 'ON'
	'ARRIVAL[]' => ['first', 'reall'],
	ST => 0,
	O => 'maker', #表示順
	P => $offset, # 表示件数
	DS => 'desc',
	RATED => 18, # これを渡すと18禁認証スキップ
	);

	$url->query_form(%query);

	my $res = $scraper->user_agent->get($url);
	unless ($res->is_success) {
	die "GET $url failed: " . $res->status_line;
	}

	my $content = decode('utf-8', $res->content);

	if ($content =~ m\|\(\d+～\d+件/全(\d+)件\)\|m) {
	$result_num = $1;
	}

	my $data = $scraper->scrape($content);
	push @items, @{ $data \|\| [] };

	last if ($dispstart + $offset > $result_num);
	}

	wantarray ? @items : \@items;
	}

	sub mk_link {
	my $link = URI->new('http://shop.melonbooks.co.jp/shop/list/ID/'.$_);
	$link->as_string;
	}

	sub mk_body {
	my $key = $_;
	if ($key =~ /(\d+)\w\.(\w+)$/) {
	my $id = $1;
	my $type = $2;
	join '', map {
	($type eq 'jpg')
	? qq!<img src="http://shop.melonbooks.co.jp/img2/m/${id}${_}.jpg">!
	: qq!<img src="http://shop.melonbooks.co.jp/img/${id}${_}.gif">!
	} ('', 'a', 'b');
	}
	}

	sub mk_date { eval { strptime('%Y年%m月%d日', $_)->ymd } }

	sub as_string { $_->as_string }