Skip to content

Instantly share code, notes, and snippets.

@en45masao
Created January 10, 2011 05:31
Show Gist options
  • Save en45masao/772414 to your computer and use it in GitHub Desktop.
Save en45masao/772414 to your computer and use it in GitHub Desktop.
Yahoo! Japanの英検過去問題をダウンロードしてCSVファイル化するスクリプト
#!/usr/bin/perl
use strict;
use warnings;
use utf8;
use WWW::Mechanize;
use Text::CSV_XS;
use Web::Scraper;
# Command-line arguments, read positionally: the grade is mandatory, the
# year is the optional second argument and the round number the optional
# third one (so a round can only be given together with a year).
my ($grade, $year, $sq) = @ARGV;

# The trailing newline keeps die from appending "at <file> line <n>." to
# what is a plain usage message.
die "Usage: $0 <1|P1|2|P2|3|4|5> [yyyy [1|2|3]]\n" unless defined $grade;

# Omitted filters become empty strings so that they can be interpolated
# into the link-matching patterns without "uninitialized" warnings.
$year = '' unless defined $year;
$sq   = '' unless defined $sq;
# Scraper applied to each page reached through a "daily_a.html" link.
# It produces a hash reference with these keys:
#   year, sq, no - digits captured from the <h2> inside div#answer
#   question     - text of the first <dt> under div#a-question
#   choices      - array reference with the text of each <li> choice
#   answer       - text of the second <dd> inside dl#a-match
my $scraper = scraper {
    # Each heading filter returns the captured digits, or undef when the
    # heading does not match; returning $1 unconditionally would hand back
    # whatever an earlier successful match left behind.
    process '//div[@id="answer"]/h2', 'year' => ['TEXT', sub { return m/([0-9]+)年度/ ? $1 : undef; }];
    process '//div[@id="answer"]/h2', 'sq' => ['TEXT', sub { return m/第([0-9]+)回/ ? $1 : undef; }];
    process '//div[@id="answer"]/h2', 'no' => ['TEXT', sub { return m/\(([0-9]+)\)/ ? $1 : undef; }];
    process '//div[@id="a-question"]/dl[1]/dt', 'question' => 'TEXT';
    process '//div[@id="a-question"]/dl[1]/dd/ul/li', 'choices[]' => 'TEXT';
    process '//dl[@id="a-match"]/dd[2]', 'answer' => 'TEXT';
};
# Main crawl: start from the index page, and for every test set whose link
# text and URL match the requested grade/year/round, walk its pages and
# write one CSV file per set.
my $mech = WWW::Mechanize->new();
$mech->get('http://stepup.yahoo.co.jp/english/eiken/minitest.html');
my $csv = Text::CSV_XS->new({binary => 1});

# The command-line values are matched literally (\Q...\E), so a stray regex
# metacharacter in an argument cannot change what the patterns select.
my $title_re = qr/\Q${year}\E年度第\Q${sq}\E/;
my $url_re   = qr(^http://stepup\.yahoo\.co\.jp/.*gr=\Q${grade}\E);

foreach my $top ($mech->find_all_links(text_regex => $title_re, url_abs_regex => $url_re)) {
    # Use the absolute URL: by the time the second set is fetched, the
    # browser's current page is no longer the index page, so a relative
    # href would be resolved against the wrong base.
    $mech->get($top->url_abs());
    $mech->follow_link(text => '1', url_regex => qr/.*daily_a\.html.*/);

    # The first scraped page both names the output file and supplies the
    # first CSV row, so it is scraped only once.
    my $scrap = $scraper->scrape($mech->content, $mech->uri);
    my $filename = "eiken_${grade}-$scrap->{year}-$scrap->{sq}.csv";
    open my $file, '>:encoding(cp932)', $filename
        or die "Cannot open $filename: $!"; # force output in CP932
    print "$filename: Now scraping...\n";

    for (;;) {
        $csv->combine($scrap->{year}, $scrap->{sq}, $scrap->{no}, $scrap->{question}, @{$scrap->{choices}}, $scrap->{answer})
            or die "Cannot build CSV row for $filename: " . scalar $csv->error_diag();
        print {$file} $csv->string() . "\n";
        sleep 1; # pause between requests

        # Stop when the page offers no "next question" link.
        my $link = $mech->find_link(text => '次の問題', url_regex => qr/.*daily_q\.html.*/);
        last unless defined $link;
        $mech->get($link);
        sleep 1;
        $mech->follow_link(text => '1', url_regex => qr/.*daily_a\.html.*/);
        $scrap = $scraper->scrape($mech->content, $mech->uri);
    }

    # Buffered write errors only surface at close time, so check it.
    close $file or die "Cannot close $filename: $!";
}
=head1 NAME
makecsv_yahooeiken - A script to download "Yahoo! Japan EIKEN Daily Test" as CSV files.
=head1 SYNOPSIS
makecsv_yahooeiken <1|P1|2|P2|3|4|5> [yyyy [1|2|3]]
=head1 EXAMPLES
perl makecsv_yahooeiken P1
perl makecsv_yahooeiken P1 2009
perl makecsv_yahooeiken P1 2009 3
=head1 DESCRIPTION
このスクリプトを使って、Yahoo! Japanの英検デイリーミニテスト<L<http://stepup.yahoo.co.jp/english/eiken/>>の問題をCSVファイルに書き出すことができます。
=head1 AUTHOR
en45masao
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment