Created
December 1, 2012 13:31
-
-
Save kkobayashi/4182255 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- encoding:utf-8 -*- | |
use strict; | |
use warnings; | |
use utf8; | |
use Path::Class qw(file); | |
use List::Util qw(first); | |
use List::MoreUtils qw(uniq); | |
use LWP::UserAgent; | |
use Web::Scraper; | |
use Encode; | |
use Getopt::Std; | |
my $opt = {};
getopts("w:s:h", $opt);
usage() if $opt->{h};

my (@name_list, $subject);

# 1. Name list: read from a local UTF-8 file (-w) or scrape Wikipedia.
if ($opt->{w}) {
    # strip CR/LF before decoding each line
    @name_list = map { tr/\x0A\x0D//d; decode('utf8', $_) } file($opt->{w})->slurp;
}
else {
    @name_list = get_names_from_wikipedia();
}

# 2. Thread titles: read a local Shift_JIS subject.txt (-s) or fetch from 2ch.
if ($opt->{s}) {
    $subject = decode('shiftjis', file($opt->{s})->slurp);
}
else {
    $subject = get_subject_txt();
}

# Build one capturing alternation over all names. quotemeta each name so
# regex metacharacters in the scraped data cannot break (or subvert) the
# pattern; drop empty entries, which would make the alternation match anywhere.
my $names_regexp = '(' . join('|', map { quotemeta } grep { length } @name_list) . ')';

# Print each distinct name that appears in at least one thread title.
print encode('utf8', "$_\n") foreach uniq($subject =~ m/$names_regexp/g);
# | |
# get_subject_txt - get UTF-8 decoded contents of 2ch "声優個人" BBS subject.txt | |
# | |
sub get_subject_txt{
    my $url_bbsmenu = 'http://menu.2ch.net/bbsmenu.html';

    ## 1. get bbsmenu
    my $request = HTTP::Request->new(GET => $url_bbsmenu);
    $request->accept_decodable;       ## advertise gzip support
    my $ua = LWP::UserAgent->new;
    $ua->timeout(10);
    $ua->agent("Monazilla/1.00");     ## 2ch expects browsers to identify as 'Monazilla'
    print STDERR "getting " . $request->uri . " ...\n";
    my $response = $ua->request($request);
    die "failed to get $url_bbsmenu: " . $response->status_line . "\n"
        unless $response->is_success;

    ## 2. scrape bbsmenu and find the '声優個人' board
    my $board = scraper {
        process 'a', 'board[]' => {
            url  => '@href',
            name => 'TEXT',
        };
        result 'board';
    }->scrape($response);
    my $va = first { $_->{name} =~ /声優個人/ } @$board;
    die "board '声優個人' not found in bbsmenu\n" unless $va;

    ## 3. get the thread title list (subject.txt)
    $request->uri($va->{url} . 'subject.txt');
    print STDERR "getting " . $request->uri . " ...\n";
    $response = $ua->request($request);
    die "failed to get subject.txt: " . $response->status_line . "\n"
        unless $response->is_success;

    # charset => 'none': undo the transfer encoding (gzip) only, keeping the
    # raw Shift_JIS bytes, so the decode() below is the single place where
    # charset decoding happens.
    my $raw = $response->decoded_content(charset => 'none');

    ## cache a local copy (raw bytes) for later reuse via -s
    open my $fh, '>', 'subject.txt' or die "cannot write subject.txt: $!";
    binmode $fh;                      # raw bytes, no layer munging
    print {$fh} $raw;
    close $fh or die "cannot close subject.txt: $!";

    return decode("shiftjis", $raw);
}
# | |
# get_names_from_wikipedia | |
# | |
#
# get_names_from_wikipedia - scrape voice-actress names from the Japanese
# Wikipedia category pages; returns the decoded name list and caches it
# (UTF-8, one name per line) in va_list_wikipedia.txt.
#
sub get_names_from_wikipedia{
    my $base_uri = 'http://ja.wikipedia.org/wiki/Category:%E6%97%A5%E6%9C%AC%E3%81%AE%E5%A5%B3%E6%80%A7%E5%A3%B0%E5%84%AA'; # for women
#   my $base_uri = 'http://ja.wikipedia.org/wiki/Category:%E6%97%A5%E6%9C%AC%E3%81%AE%E7%94%B7%E6%80%A7%E5%A3%B0%E5%84%AA'; # for men

    ## collect the per-initial index pages of the category
    my $uri_list = scraper {
        process '//table[@class="toc plainlinks"]/tr/td/a', 'list[]' => '@href';
        result 'list';
    }->scrape(URI->new($base_uri));

    ## visit each index page and pull out the member names
    my @name_list = uniq map {
        sleep 1;    # throttle requests so we don't hammer Wikipedia
        print STDERR "scraping $_ ... \n";
        my $page_names = scraper {
            # strip a trailing disambiguation suffix such as " (声優)"
            process '//div[@id="mw-pages"]//li/a', 'names[]' => ['TEXT', sub { s/ \(.+//; }];
            result 'names';
        }->scrape(URI->new($_));    # URI->new, not indirect 'new URI'
        $page_names ? @$page_names : ();
    } @$uri_list;

    ## cache the list so later runs can use -w instead of re-scraping
    open my $fh, '>', 'va_list_wikipedia.txt'
        or die "cannot write va_list_wikipedia.txt: $!";
    print {$fh} encode('utf8', "$_\n") foreach @name_list;
    close $fh or die "cannot close va_list_wikipedia.txt: $!";

    return @name_list;
}
# Print a usage summary for the script and terminate.
sub usage{
    use File::Basename;
    my $prog = basename($0);
    print "usage: $prog [-w wikipedia_list] [-s subject.txt]\n",
          "  -w Wikipedia voice-actress names\n",
          "  -s 2ch BBS subject.txt\n";
    exit;
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment