Skip to content

Instantly share code, notes, and snippets.

@kkobayashi
Created December 1, 2012 13:31
Show Gist options
  • Save kkobayashi/4182255 to your computer and use it in GitHub Desktop.
# -*- encoding:utf-8 -*-
use strict;
use warnings;
use utf8;
use Path::Class qw(file);
use List::Util qw(first);
use List::MoreUtils qw(uniq);
use LWP::UserAgent;
use Web::Scraper;
use Encode;
use Getopt::Std;

# Command-line options:
#   -w FILE  read names from FILE (UTF-8, one per line) instead of scraping Wikipedia
#   -s FILE  read a cached 2ch subject.txt (Shift_JIS) instead of fetching it
#   -h       show usage and exit
my $opt = {};
getopts("w:s:h", $opt);
usage() if $opt->{h};

my (@name_list, $subject);
if($opt->{w}){
    # tr/// strips both CR and LF so CRLF and LF files work alike.
    @name_list = map{ tr/\x0A\x0D//d; decode('utf8', $_) } file($opt->{w})->slurp;
}
else{
    @name_list = get_names_from_wikipedia();
}
if($opt->{s}){
    $subject = decode('shiftjis', file($opt->{s})->slurp);
}
else{
    $subject = get_subject_txt();
}

# Escape regex metacharacters so a name containing ".", "(" etc. matches
# literally, and drop empty entries (an empty alternative would match
# everywhere). Previously the raw names were interpolated unescaped.
my @patterns = map { quotemeta } grep { length } @name_list;
die "no names to search for\n" unless @patterns;
my $names_regexp = '(' . join('|', @patterns) . ')';

# Print each name that appears anywhere in the thread-title list, once.
print encode('utf8', "$_\n") foreach uniq ($subject =~ m/$names_regexp/g);
#
# get_subject_txt - get UTF-8 decoded contents of 2ch "声優個人" BBS subject.txt
#
#
# get_subject_txt - fetch the 2ch "声優個人" board's subject.txt (thread title
# list), cache a raw copy to ./subject.txt, and return its contents decoded
# from Shift_JIS. Dies on any HTTP or file error.
#
sub get_subject_txt{
    my $url_bbsmenu = 'http://menu.2ch.net/bbsmenu.html';

    ## 1. get bbsmenu
    my $request = HTTP::Request->new(GET => $url_bbsmenu);
    $request->accept_decodable; ## advertise gzip; decoded_content inflates it
    my $ua = LWP::UserAgent->new;
    $ua->timeout(10);
    $ua->agent("Monazilla/1.00"); ## 2ch-browser should be set UA as 'Monazilla'
    print STDERR "getting " . $request->uri . " ...\n";
    my $response = $ua->request($request);
    die "failed to get $url_bbsmenu: " . $response->status_line . "\n"
        unless $response->is_success;

    ## 2. scrape bbsmenu and find '声優個人' board
    my $board = scraper {
        process 'a', 'board[]' => {
            url => '@href',
            name => 'TEXT',
        };
        result 'board';
    }->scrape($response);
    my $va = first { $_->{name} =~ /声優個人/ } @$board;
    ## first() returns undef when no board matched; fail loudly instead of
    ## dereferencing undef below.
    die "board '声優個人' not found in bbsmenu\n" unless defined $va;

    ## 3. get thread title list (subject.txt lives directly under the board URL)
    $request->uri($va->{url} . 'subject.txt');
    print STDERR "getting " . $request->uri . " ...\n";
    $response = $ua->request($request);
    die "failed to get " . $request->uri . ": " . $response->status_line . "\n"
        unless $response->is_success;

    ## cache a raw copy so later runs can use -s subject.txt
    open my $fh, '>', 'subject.txt' or die "open subject.txt: $!";
    print $fh $response->decoded_content;
    close $fh or die "close subject.txt: $!";

    return decode("shiftjis", $response->decoded_content);
}
#
# get_names_from_wikipedia
#
#
# get_names_from_wikipedia - scrape Japanese voice-actress names from the
# Wikipedia category index pages, cache them (UTF-8, one per line) to
# ./va_list_wikipedia.txt, and return the deduplicated list.
#
sub get_names_from_wikipedia{
    my $base_uri = 'http://ja.wikipedia.org/wiki/Category:%E6%97%A5%E6%9C%AC%E3%81%AE%E5%A5%B3%E6%80%A7%E5%A3%B0%E5%84%AA'; # for women
    # my $base_uri = 'http://ja.wikipedia.org/wiki/Category:%E6%97%A5%E6%9C%AC%E3%81%AE%E7%94%B7%E6%80%A7%E5%A3%B0%E5%84%AA'; # for men

    ## collect the per-kana index pages from the category's table of contents
    my $uri_list = scraper {
        process '//table[@class="toc plainlinks"]/tr/td/a', 'list[]' => '@href';
        result 'list';
    }->scrape(URI->new($base_uri));
    die "no index pages found at $base_uri\n" unless $uri_list && @$uri_list;

    my @name_list = uniq map{
        sleep 1; # throttle requests to avoid hammering Wikipedia
        print STDERR "scraping $_ ... \n";
        my $names = scraper {
            # Link text minus any trailing " (...)" disambiguation suffix.
            process '//div[@id="mw-pages"]//li/a', 'names[]' => ['TEXT', sub {s/ \(.+//;} ];
            result 'names';
        }->scrape(URI->new($_));    # arrow syntax, not indirect "new URI(...)"
        $names ? @$names : ();
    } @$uri_list;

    ## cache the list so later runs can pass it back in with -w
    open my $fh, '>', 'va_list_wikipedia.txt' or die "open va_list_wikipedia.txt: $!";
    print $fh encode('utf8', "$_\n") foreach(@name_list);
    close $fh or die "close va_list_wikipedia.txt: $!";

    return @name_list;
}
#
# usage - print command-line help for this script, then exit.
#
sub usage{
    use File::Basename;
    my $script = basename($0);
    # One print call, three lines; output is identical to printing each line.
    print "usage: $script [-w wikipedia_list] [-s subject.txt]\n",
          " -w Wikipedia voice-actress names\n",
          " -s 2ch BBS subject.txt\n";
    exit;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment