Skip to content

Instantly share code, notes, and snippets.

@mtsukamoto
Created June 11, 2016 17:16
Show Gist options
  • Save mtsukamoto/5476b7c0c4c32d88be5547c023a221aa to your computer and use it in GitHub Desktop.
Save mtsukamoto/5476b7c0c4c32d88be5547c023a221aa to your computer and use it in GitHub Desktop.
リスト中の食べログURLを取得し「店名」「営業時間」「禁煙・喫煙」などを出力
use strict;
use warnings;
use utf8;
use Encode;
use Web::Scraper;
use URI;
my ($opts, @urls) = &parse_args(@ARGV);
my $scraper = scraper {
process '//table[@class="rst-data"]//tr', 'entry[]' => scraper {
process '//th', 'key' => 'TEXT';
process '//td', 'value' => 'TEXT';
};
};
my $shops = [];
my @fields = @{$opts->{'-f'}};
foreach my $url (@urls) {
print "[fetch] $url\n";
my $uri = URI->new($url);
my $res = $scraper->scrape($uri);
my $shop = { 'URL' => $url, map { $_ => '' } @fields };
$shop->{'URL'} = $url if (exists($shop->{'URL'}));
foreach my $entry (@{$res->{'entry'}}) {
next unless (exists($shop->{$entry->{'key'}}));
$shop->{$entry->{'key'}} = $entry->{'value'};
}
push(@$shops, $shop);
}
foreach my $shop (@$shops) {
my @lines = map { sprintf("[%s] %s", $_, $shop->{$_}) } grep { exists($shop->{$_}) } @fields;
my $result = join("\n", @lines);
print encode('utf8', "----\n$result\n\n");
}
sub parse_args {
my @args = @_;
# 引数
# ・URLリスト(第一引数、または最終引数)
# ・-f 出力フィールド(デフォルトは店名、営業時間、定休日、予算(お店より)、予算(ユーザーより)、禁煙・喫煙、URL)
my ($list, $switch, $opts) = (undef, undef, {});
$list = (not @args) ? undef : ($args[0] !~ /^-/) ? shift(@args) : ($args[-1] !~ /^-/) ? pop(@args) : undef;
foreach (@args) {
next unless (defined($_) && length($_));
if (/^-/) {
$switch = $_;
$opts->{$switch} ||= [];
} elsif ($switch) {
push(@{$opts->{$switch}}, $_);
}
}
$opts->{'-f'} ||= [qw(店名 営業時間 定休日 予算(お店より) 予算(ユーザーより) 禁煙・喫煙 URL)];
die qq(usage: $0 URLLIST [-f FIELD1 FIELD2 ...]) unless ($list);
my @urls = ();
if (-f $list) {
open(my $fh, '<', $list) || die qq(Can't open "$list", $!);
push(@urls, <$fh>);
close($fh);
} elsif ($list =~ /https?:\/\/tabelog.com\//i) {
push(@urls, $list);
}
@urls = grep { /https?:\/\/tabelog.com\//i } map { s/^\s+|\s+$//gs; $_ } @urls;
die qq(No tabelog url found in "$list") unless (@urls);
return $opts, @urls;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment