Last active
October 13, 2016 06:37
-
-
Save ywindish/98894fd94f8cf0ee0825f0bd6c6a9586 to your computer and use it in GitHub Desktop.
TeianReader
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/perl | |
use strict; | |
use warnings; | |
use utf8; | |
use LWP::UserAgent; | |
use HTML::TreeBuilder; | |
use Encode; | |
use Data::Dumper; | |
# Site root and the path of the paginated proposal list; the page number
# is appended to $url_list to form the list URL.
my $url_base = 'http://hiroba.dqx.jp';
my $url_list = '/sc/forum/pretop/0/0/1/page/';

# Zero-origin page number from the command line; defaults to page 0.
my $page = shift @ARGV || 0;

# The page number is concatenated into the URL, so accept digits only.
if ($page !~ /\A[0-9]+\z/) {
    print STDERR "invalid page number. usage: ".$0." <page_number_0_origin>\n";
    exit 1;
}

my $tree = get_content_tree($url_base . $url_list . $page);
unless ($tree) {
    # Report failures on STDERR and with a non-zero status so that shell
    # callers can detect them.
    print STDERR "cannot connect to hiroba page. usage: ".$0." <page_number_0_origin>\n";
    exit 1;
}

# Text of the first descendant of $node whose class attribute is exactly
# $class, or '' when no such element exists.
my $text_in = sub {
    my ($node, $class) = @_;
    my $found = $node->look_down('class', $class);
    return $found ? $found->as_text : '';
};

# Absolute URL built from the href of the first <a> under $node, or ''
# when there is no anchor or the anchor has no href.
my $link_in = sub {
    my ($node) = @_;
    my $anchor = $node->find('a');
    my $href   = $anchor ? $anchor->attr('href') : undef;
    return defined $href ? $url_base . $href : '';
};

my @items = $tree->look_down('class', 'thread_article');

ITEM:
for my $item (@items) {
    my $user   = $item->look_down('class', 'strongLnk');
    my $thread = $item->look_down('class', 'txt_thread');

    # Without these two elements there is nothing meaningful to print.
    unless ($user && $thread) {
        warn "skipping an article without user or thread markup\n";
        next ITEM;
    }

    my $user_url   = $link_in->($user);
    my $user_name  = $user->as_text;
    my $user_id    = $text_in->($item, 'txt_id');
    my $url_detail = $link_in->($thread);
    my $title      = $thread->as_text;
    my $comment    = $text_in->($item, 'txt_comment');

    # Feedback ("kansou") count; not present on support ("ouen") entries,
    # in which case an empty string is printed.
    my $reaction_count = $text_in->($item, 'commentCnt txt_good');

    # If the comment was truncated in the list, fetch the detail page to
    # get the full text.
    if ($comment =~ /\.\.\.$/ && $url_detail ne '') {
        my $detail_tree = get_content_tree($url_detail);
        if ($detail_tree) {
            my $body = $detail_tree->look_down('class', 'threadCtt');
            $comment = $body->as_text if $body;
            # Release the per-item tree instead of accumulating one per
            # truncated comment.
            $detail_tree->delete;
        }
        else {
            # A failed detail request should not abort the whole listing.
            warn "cannot fetch detail page $url_detail; keeping truncated comment\n";
        }
    }

    print "$user_name $user_id\n";
    print "$user_url\n";
    print "$title\n";
    print "$url_detail\n";
    print "$comment\n";
    print "$reaction_count\n";
    print "\n";
}

$tree->delete;
# Fetch $url over HTTP and parse the response body into an HTML tree.
#
# Parameters:
#   $url - absolute URL of the page to fetch.
#
# Returns an HTML::TreeBuilder object on success. On a failed request
# (no response or a non-success status) returns nothing, which is undef
# in scalar context.
#
# NOTE(review): the body is taken with ->content, i.e. without charset
# decoding, so text read from the tree is whatever bytes the server sent.
# Confirm this is what callers expect before switching to
# ->decoded_content.
sub get_content_tree {
    my ($url) = @_;

    my $ua  = LWP::UserAgent->new('agent' => "TeianReader/1.0");
    my $res = $ua->get($url);
    return unless $res && $res->is_success;

    my $content = $res->content;
    my $tree    = HTML::TreeBuilder->new;
    #$tree->no_space_compacting(1);
    $tree->parse($content);
    # Signal end of document; without this the parser may still be holding
    # buffered input that never makes it into the tree.
    $tree->eof;

    return $tree;
}
1; | |
=head1 NAME

teian_reader.pl

=head1 SYNOPSIS

    perl teian_reader.pl
    perl teian_reader.pl 0    (same as above)
    perl teian_reader.pl 2

=head1 DESCRIPTION

Scrapes the Teian Hiroba (提案広場, "proposal plaza") forum pages.

see: http://search.cpan.org/~petek/HTML-Tree-3.23/lib/HTML/Element.pm

=cut
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment