Skip to content

Instantly share code, notes, and snippets.

@satojkovic
Created September 26, 2010 15:50
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save satojkovic/598048 to your computer and use it in GitHub Desktop.
Save satojkovic/598048 to your computer and use it in GitHub Desktop.
#! /usr/bin/perl
use warnings;
use strict;
use URI;
use Web::Scraper;
use LWP::UserAgent;
use File::Basename;
## Collect the URL of the PDF page of every paper in the proceedings.
## Each <ol> on the index page is stored as one entry of "sessions"; each
## <li> inside it is stored as one entry of "papers", carrying the text,
## href and name attribute taken from an <a> element in that item.
my $index_url = URI->new('http://bmvc10.dcs.aber.ac.uk/proc/conference/');
my $papers    = scraper {
    process 'ol', 'sessions[]' => scraper {
        process 'li', 'papers[]' => scraper {
            process 'a',
                title => 'TEXT',
                link  => '@href',
                id    => '@name';
        };
    };
};
my $res = $papers->scrape($index_url);
## Scraper applied to each individual paper page to find the URL of the PDF
## itself. The href matched by '//tr[3]//a[1]' is stored as "pdf" and the one
## matched by '//tr[3]//a[2]' as "abst" (presumably the abstract link --
## confirm against the page layout).
my $pdfs = scraper {
    process '//tr[3]//a[1]', pdf  => '@href';
    process '//tr[3]//a[2]', abst => '@href';
};
## Download the PDF of every paper found on the index page.
##
## For each paper the per-paper page is scraped for the PDF link, the PDF is
## saved into the current directory, and the file name plus the PDF and
## abstract URLs are printed as a progress report. A paper that cannot be
## scraped or downloaded is reported on STDERR and skipped, so one bad entry
## does not abort the whole run.
my $ua = LWP::UserAgent->new;

# Characters that are unsafe in a file name: path separators, characters
# rejected by common file systems, and control characters. A title that
# contains any of them is not used as the file name.
my $unsafe_in_filename = qr{[\\/:*?"<>|[:cntrl:]]};

for my $session ( @{ $res->{sessions} || [] } ) {
    PAPER:
    for my $paper ( @{ $session->{papers} || [] } ) {
        # A list item without an <a> element yields no link to follow.
        my $link = $paper->{link};
        if ( !defined $link ) {
            warn "Skipping a list item without a link\n";
            next PAPER;
        }

        # Scrape the paper's own page. scrape() may die when the page cannot
        # be fetched, so it is wrapped in eval to keep the loop going.
        my $page = eval { $pdfs->scrape( URI->new($link) ) };
        if ( !$page || !defined $page->{pdf} ) {
            my $reason = $@ || "no PDF link found\n";
            warn "Skipping " . $link . ": " . $reason;
            next PAPER;
        }
        my $pdf_url = $page->{pdf};

        # Save under the paper title when it is usable as a file name;
        # otherwise fall back to the basename of the PDF URL
        # (e.g. paperXX.pdf, where XX is the paper number).
        my $title = defined $paper->{title} ? $paper->{title} : '';
        my $filename
            = ( length($title) && $title !~ $unsafe_in_filename )
            ? $title . ".pdf"
            : basename($pdf_url);

        # An HTTP error shows up in the status; a failure while writing the
        # file is reported by LWP through the X-Died response header even
        # when the HTTP status itself indicates success.
        my $response = $ua->get( $pdf_url, ':content_file' => $filename );
        my $died     = $response->header('X-Died');
        if ( !$response->is_success || defined $died ) {
            my $why = defined $died ? $died : $response->status_line;
            warn "Failed to save " . $pdf_url . " as " . $filename . ": "
                . $why . "\n";
            next PAPER;
        }

        # Progress report.
        my $abst_url = defined $page->{abst} ? $page->{abst} : '';
        print $filename . "\n";
        print "\t" . $pdf_url . "\n";
        print "\t" . $abst_url . "\n";
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment