Last active
August 29, 2015 14:00
-
-
Save thinkhy/11027824 to your computer and use it in GitHub Desktop.
Download SHARE handouts automatically.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/perl -w
##################################################################
#
# Code on github: https://gist.github.com/thinkhy/11027824
#
##################################################################
use strict;
use charnames qw(:full);

use HTML::LinkExtor;
use LWP::Simple;
use LWP::UserAgent;    # used directly by extract_link()
use URI::Escape;
use URI::URL;
binmode STDOUT, ':utf8';

# ------------------------------------------------------------------
# Main script.
#
# Usage: perl <this script> [conference_number [index_file]]
#
# Steps:
#   1. Collect every <a href="..."> target on the conference's
#      uploadlistall.html page.
#   2. Save that page, unmodified, as the index file.
#   3. For each link: download *.pdf files with wget, and save the
#      '<div class="content">' part of Session*.html pages with their
#      PDF links rewritten to bare local file names.  Files that already
#      exist in the current directory are skipped.
# ------------------------------------------------------------------
my $conferenceNumber = $ARGV[0] || 122;             # default conference number
my $indexFile        = $ARGV[1] || "index.html";    # default index file name

print "Conference number: $conferenceNumber\n";
print "Index file: $indexFile\n";

my $url = "https://share.confex.com/share/$conferenceNumber/webprogram/uploadlistall.html";

print "Get content and extrac link from $url\n";
my $ref_links = extract_link($url, "", "a", "href");

my $indexOriginalContent = get($url);
print "=============================================\n";
if (defined $indexOriginalContent) {
    print $indexOriginalContent;

    # get() hands back decoded text, so encode it explicitly on the way out
    # (STDOUT already has a UTF-8 layer).
    open my $out, ">:encoding(UTF-8)", $indexFile
        or die "Failed to open $indexFile: $!\n";
    print {$out} $indexOriginalContent;
    close $out or die "Failed to write $indexFile: $!\n";
}
else {
    # Keep going: the links were already collected by extract_link().
    warn "Failed to fetch $url; $indexFile not written\n";
}

# NOTE(review): the saved index keeps its remote links; only the session
# pages written below get their PDF links rewritten to local names.

my $cnt = 0;    # number of PDF downloads attempted
for my $link (@$ref_links) {
    my $target = "$link";    # links are URL objects; work on the plain string
    print "Processing URL: $target\n";

    # Last path component when the link is a session page or a PDF.
    my ($html) = $target =~ m{https://.*/(Session.*?\.html)}i;
    my ($pdf)  = $target =~ m{https://.*/(.*?\.pdf)}i;

    # Derive local file names only for the kind of link that matched.
    if (defined $pdf) {
        $pdf = uri_unescape($pdf);
        $pdf = FilterName($pdf);
    }
    if (defined $html) {
        $html = uri_unescape($html);
        $html = FilterName($html);
    }

    if ($pdf and !-e "./$pdf") {
        print "pdf: $pdf\n";
        $cnt++;
        print $pdf . "\n";

        # List form: the file name and URL come from a remote page and must
        # never be interpreted by a shell.  Certificate checking stays
        # disabled, as before.
        my $status = system('wget', '--no-check-certificate', '-O', $pdf, $target);
        if ($status != 0) {
            warn "wget failed for $target (status $status)\n";

            # wget -O creates the output file even when the download fails;
            # remove it so the existence check above retries on the next run.
            unlink $pdf;
        }
    }

    if ($html and !-e "./$html") {
        print "html: $html\n";
        print "GET $target\n";

        my $content = get($target);
        if (!defined $content) {
            warn "Failed to fetch $target\n";
            next;
        }

        # Keep everything from the content div up to the media div.
        my ($abstract) = $content =~ m#(<div class="content">.*?)<div class="media">#si;
        if (!defined $abstract) {
            warn "No abstract section found in $target\n";
            next;
        }

        # Point PDF links at the local copy.  [^"]* keeps each match inside a
        # single href value even when several links share one line.
        # NOTE(review): percent-escapes in the href are left as they are, so a
        # name that FilterName only changes after unescaping may not match the
        # downloaded file name - confirm whether that matters.
        $abstract =~ s{href="[^"]*/([^"]*?\.pdf)"}{'href="' . FilterName($1) . '"'}ige;

        print "Write to ./$html\n";
        open my $fd, ">:encoding(UTF-8)", "./$html"
            or die "Failed to open ./$html: $!\n";
        print {$fd} $abstract;
        close $fd or die "Failed to write ./$html: $!\n";
    }
}
print "Count: $cnt\n";
# extract_link($url, $base, $mytag, $attr_name)
#
# Fetch $url and return a reference to an array of absolute URLs built from
# the $attr_name attribute of every <$mytag> element reported by
# HTML::LinkExtor.
#
#   $url       - page to fetch
#   $base      - base URL for making links absolute; when empty, the base
#                of the HTTP response is used instead
#   $mytag     - tag name to keep, e.g. "a"
#   $attr_name - link attribute to collect, e.g. "href"
#
# Dies when the user agent cannot be created or the request fails.
sub extract_link {
    my ($url, $base, $mytag, $attr_name) = @_;

    $base = '' unless defined $base;

    # NOTE(review): dropping the trailing slash changes how relative links
    # resolve against a caller-supplied base - confirm this is intended.
    $base =~ s/\/$//;

    my $ua = LWP::UserAgent->new
        or die "Cannot create LWP::UserAgent\n";

    # Collect matching attribute values as the document is parsed.  This has
    # to be an anonymous sub: a named sub nested here would stay bound to the
    # lexicals of the first extract_link() call only.
    my @links;
    my $collect = sub {
        my ($tag, %attr) = @_;
        return if $tag ne $mytag;                    # not the requested tag
        return unless defined $attr{$attr_name};     # tag lacks the attribute
        push @links, $attr{$attr_name};
        return;
    };

    # The base is not known yet (it may differ from $url after redirects),
    # so links are made absolute after the response has arrived.
    my $parser = HTML::LinkExtor->new($collect);

    # Request the document and parse it chunk by chunk as it arrives.
    my $res = $ua->request(
        HTTP::Request->new(GET => $url),
        sub { $parser->parse($_[0]) },
    );
    $parser->eof;    # tell the parser the document is complete

    $res->is_success
        or die "Failed to fetch $url: " . $res->status_line . "\n";

    # Expand all collected links to absolute URLs.
    $base = $res->base unless $base;
    my @absolute = map { url($_, $base)->abs } @links;

    return \@absolute;
}
# FilterName($filename)
#
# Return a copy of $filename that is safe to use as a local file name:
# surrounding whitespace is removed, "C++"/"c++" becomes "CPP", "z/OS" or
# "z\OS" becomes "zOS", and every remaining character from the set
# : + * \ / ? " < > | is replaced by a single space.
#
# An undefined argument yields the empty string.
sub FilterName {
    my $filename = shift;

    # The link loop calls this for URLs that matched neither pattern; answer
    # with a false value instead of raising "uninitialized" warnings.
    return '' unless defined $filename;

    # Trim leading and trailing whitespace.
    $filename =~ s/^\s+//;
    $filename =~ s/\s+$//;

    # C++ ==> CPP
    $filename =~ s/[Cc]\+\+/CPP/g;

    # z/OS ==> zOS
    $filename =~ s/z[\/\\]OS/zOS/g;

    # Windows filename convention: blank out characters it does not allow.
    $filename =~ s/[:+\*\\\/\?"<>|]/ /g;

    return $filename;
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment