Skip to content

Instantly share code, notes, and snippets.

@onishi
Created July 12, 2024 23:31
Show Gist options
  • Save onishi/12d0fe59618c174eeaa666128be2158e to your computer and use it in GitHub Desktop.
Save onishi/12d0fe59618c174eeaa666128be2158e to your computer and use it in GitHub Desktop.
#!/usr/bin/env perl
use strict;
use warnings;
use utf8;
binmode STDOUT, ':utf8';
binmode STDERR, ":utf8";
use Encode;
use LWP::Simple 'get';
use HTML::TreeBuilder::XPath;
use Data::Dumper 'Dumper';
use YAML 'Dump';
# Scrape information about the summer 2024 anime season from Akiba Souken
# (https://akiba-souken.com/anime/summer/) and print it to STDOUT as
# tab-separated text: one row per title, with one column per known streaming
# service followed by the title's theme songs.
my $url = 'https://akiba-souken.com/anime/summer/';

# Streaming services that get their own output column, in column order.
my $known_streaming = [qw/Netflix Hulu dアニメストア Amazonプライム・ビデオ ABEMA/, 'DMM TV'];

my $tree = tree($url);

# Each title is rendered in its own box, e.g. <div class="itemBox" id="036">.
my @items = $tree->findnodes('//div[contains(@class,"itemBox")]');
warn "No itemBox entries found at $url\n" unless @items;

my $max_song_count = 0;    # widest theme-song list; sizes the header row
my @result;

ITEM:
for my $item (@items) {
    # Relative XPath (".//") keeps every lookup inside this item's subtree,
    # so the element does not have to be cloned into a standalone tree first.
    my ($heading) = $item->findnodes('.//h2');
    if (!defined $heading) {
        warn "Skipping an itemBox without an <h2> title\n";
        next ITEM;
    }
    my $title = $heading->as_text;

    # Theme songs: strip the leading "主題歌" (theme song) label, then split
    # on bracketed markers such as 【...】, which separate the entries.
    my @songs;
    my ($music_node) = $item->findnodes('.//dl[contains(@class,"music")]');
    if (defined $music_node) {
        my $text = $music_node->as_text;
        $text =~ s{^\s*主題歌}{}m;
        # split yields an empty leading field (and possibly blank ones);
        # keep only fragments that carry visible text.
        @songs = grep { /\S/ } split /【.+?】/, $text;
    }
    $max_song_count = @songs if @songs > $max_song_count;

    # Streaming: only schedule tables that mention 配信 (streaming) and do
    # not mention 地上波 (terrestrial broadcast) are considered.
    my %streaming;
    TABLE:
    for my $table ($item->findnodes('.//div[contains(@class,"schedule")]//table')) {
        my $table_text = $table->as_text;
        next TABLE if $table_text =~ /地上波/;
        next TABLE unless $table_text =~ /配信/;

        for my $cell ($table->findnodes('.//td')) {
            # Only cells containing 年 (year) are used; presumably these are
            # the ones that carry a start date -- confirm against the page.
            next unless $cell->as_text =~ /年/;

            # The first two <span> texts are taken as service name and value.
            my ($service, $start) = map { $_->as_text } $cell->findnodes('.//span');
            next unless defined $service;    # no <span>: nothing to key on
            $streaming{$service} = $start;
        }
    }

    push @result, {
        title     => $title,
        music     => \@songs,
        streaming => \%streaming,
    };
}

# Header row: one "主題歌" column per song of the widest row (at least one).
# The newline is printed separately so no trailing tab is emitted.
my @song_headers = ('主題歌') x ($max_song_count || 1);
print join("\t", 'タイトル', @$known_streaming, @song_headers), "\n";

for my $entry (@result) {
    my $streaming = $entry->{streaming};
    my @cells = (
        $entry->{title},
        (map { $streaming->{$_} // '' } @$known_streaming),
        @{ $entry->{music} },
    );
    print join("\t", @cells), "\n";
}
# Build an XPath-capable HTML tree.
#
# Takes a single argument, which may be:
#   * an HTML::Element object (or any subclass, e.g. an existing tree):
#     a clone of it is returned;
#   * a string starting with "http://" or "https://": treated as a URL and
#     fetched with LWP::Simple's get();
#   * any other string: treated as HTML content.
#
# Returns an HTML::TreeBuilder::XPath object.
# Dies if no argument is given or if the URL cannot be fetched.
sub tree {
    my $args = shift;

    die "tree(): no URL, HTML content or element given\n" unless defined $args;

    # Use blessed/isa rather than comparing ref() to one class name, so that
    # subclasses of HTML::Element (such as parsed trees) are recognised too.
    require Scalar::Util;
    if (Scalar::Util::blessed($args) && $args->isa('HTML::Element')) {
        return $args->clone;
    }

    my $content = $args;
    if ($args =~ m{\Ahttps?://}) {
        # get() returns undef on failure; stop here instead of parsing
        # nothing and silently yielding an empty tree.
        $content = get($args);
        die "tree(): failed to fetch $args\n" unless defined $content;
    }

    my $tree = HTML::TreeBuilder::XPath->new;
    $tree->parse_content($content);

    # The freshly parsed tree is already owned by the caller alone; cloning
    # it again would only duplicate the whole document.
    return $tree;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment