Skip to content

Instantly share code, notes, and snippets.

@ywindish
Last active October 13, 2016 06:37
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save ywindish/98894fd94f8cf0ee0825f0bd6c6a9586 to your computer and use it in GitHub Desktop.
Save ywindish/98894fd94f8cf0ee0825f0bd6c6a9586 to your computer and use it in GitHub Desktop.
TeianReader
#!/usr/bin/perl
use strict;
use warnings;
use utf8;
use LWP::UserAgent;
use HTML::TreeBuilder;
use Encode;
use Data::Dumper;
# Site root and the path of the listing page; the zero-based page number is
# appended to $url_list to form the listing URL.
my $url_base = 'http://hiroba.dqx.jp';
my $url_list = '/sc/forum/pretop/0/0/1/page/';

# Page number from the command line; defaults to page 0 when omitted.
my $page = shift(@ARGV) || 0;
if ($page !~ /\A[0-9]+\z/) {
    # The value is spliced into a URL, so accept only a plain non-negative integer.
    print STDERR "invalid page number: $page. usage: ".$0." <page_number_0_origin>\n";
    exit 1;
}

my $tree = get_content_tree($url_base . $url_list . $page);
unless (defined $tree) {
    # Report the failure on STDERR with a non-zero status so that callers
    # (shells, cron, pipelines) can tell the run did not succeed.
    print STDERR "cannot connect to hiroba page. usage: ".$0." <page_number_0_origin>\n";
    exit 1;
}

# Text content of an element, or '' when the element was not found.
my $text_of = sub {
    my ($node) = @_;
    return defined $node ? $node->as_text : '';
};

# Absolute URL of the first <a> inside an element, or '' when the element,
# the anchor or its href attribute is missing.
my $link_of = sub {
    my ($node) = @_;
    my $anchor = defined $node   ? $node->find('a')       : undef;
    my $href   = defined $anchor ? $anchor->attr('href')  : undef;
    return defined $href ? $url_base . $href : '';
};

my @items = $tree->look_down('class', 'thread_article');
for my $item (@items) {
    # Each lookup may come back empty if the page layout differs from what
    # is expected; missing pieces are printed as empty strings instead of
    # aborting the whole run.
    my $user         = $item->look_down('class', 'strongLnk');
    my $id_node      = $item->look_down('class', 'txt_id');
    my $thread       = $item->look_down('class', 'txt_thread');
    my $comment_node = $item->look_down('class', 'txt_comment');

    my $user_url   = $link_of->($user);
    my $user_name  = $text_of->($user);
    my $user_id    = $text_of->($id_node);
    my $url_detail = $link_of->($thread);
    my $title      = $text_of->($thread);
    my $comment    = $text_of->($comment_node);

    # Reaction ("kansou") count; this element is absent on "ouen" posts,
    # in which case an empty string is printed.
    my $commentCnt     = $item->look_down('class', 'commentCnt txt_good');
    my $reaction_count = $text_of->($commentCnt);

    if ($comment =~ /\.\.\.$/ && $url_detail ne '') {
        # The listing truncated the comment, so fetch the detail page for
        # the full text. If that fetch or lookup fails, keep the truncated
        # text rather than dying.
        my $detail_tree = get_content_tree($url_detail);
        if (defined $detail_tree) {
            my $body = $detail_tree->look_down('class', 'threadCtt');
            $comment = $body->as_text if defined $body;
            # Release the detail tree now; one is created per truncated item.
            $detail_tree->delete;
        }
    }

    print "$user_name $user_id\n";
    print "$user_url\n";
    print "$title\n";
    print "$url_detail\n";
    print "$comment\n";
    print "$reaction_count\n";
    print "\n";
}

# Release the listing tree once every item has been printed.
$tree->delete;
# Fetch a URL over HTTP and parse the response body into an HTML tree.
#
# Arguments:
#   $url - the URL to fetch.
#
# Returns:
#   An HTML::TreeBuilder object on success, or nothing (undef in scalar
#   context) when the request did not succeed. Callers may call ->delete
#   on the returned tree once they are finished with it.
sub get_content_tree {
    my ($url) = @_;

    my $ua  = LWP::UserAgent->new('agent' => "TeianReader/1.0");
    my $res = $ua->get($url);
    return unless $res && $res->is_success;

    # NOTE(review): content() is the raw, undecoded response body; the
    # parsed text is therefore printed as the bytes the server sent.
    my $content = $res->content;

    my $tree = HTML::TreeBuilder->new;
    #$tree->no_space_compacting(1);
    $tree->parse($content);
    # parse() only feeds a chunk to the parser; eof() must follow it so that
    # buffered text is flushed and the tree is finalized before it is queried.
    $tree->eof;
    return $tree;
}
1;
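
=head1 NAME

teian_reader.pl - scrape the suggestion forum (提案広場) on hiroba.dqx.jp

=head1 SYNOPSIS

    perl teian_reader.pl
    perl teian_reader.pl 0    (same as above)
    perl teian_reader.pl 2

The optional argument is the zero-based page number of the listing.

=head1 DESCRIPTION

提案広場をスクレイピングするよ

(Scrapes the suggestion forum and prints, for each thread on the page, the
poster, the title, the thread URL, the comment text and the reaction count.)

see: http://search.cpan.org/~petek/HTML-Tree-3.23/lib/HTML/Element.pm

=cut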
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment