Skip to content

Instantly share code, notes, and snippets.

@thinkhy
Last active August 29, 2015 14:00
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save thinkhy/11027824 to your computer and use it in GitHub Desktop.
Download SHARE handouts automatically.
#!/usr/bin/perl -w
##################################################################
#
# Code on github: https://gist.github.com/thinkhy/11027824
#
##################################################################
use strict;
use warnings;

use charnames qw(:full);
use HTML::LinkExtor;
use HTTP::Request;
use LWP::Simple;
use LWP::UserAgent;
use URI::Escape;
use URI::URL;
binmode STDOUT, ':utf8';
# Conference number of the SHARE event to fetch (122 = 2014).
# Override with the first command-line argument.
my $conferenceNumber = 122;
my $indexFile = "index.html";
if ($ARGV[0])
{
$conferenceNumber = $ARGV[0];
}
if ($ARGV[1])
{
$indexFile = $ARGV[1];
}
print "Conference number: $conferenceNumber\n";
print "Index file: $indexFile\n";
my $url = "https://share.confex.com/share/$conferenceNumber/webprogram/uploadlistall.html";
my $base = "https://share.confex.com/share/$conferenceNumber/webprogram/";
print "Get content and extract link from $url\n";
# Collect every <a href="..."> link on the upload-list page.
my $ref_links = extract_link($url, "", "a", "href");
# LWP::Simple::get returns undef on failure -- check before using the content.
my $indexOriginalContent = get($url);
defined $indexOriginalContent or die "Failed to fetch $url\n";
print "=============================================\n";
print $indexOriginalContent;
# Save a pristine copy of the index page locally.
open my $out, ">", $indexFile or die "Failed to open $indexFile: $!\n";
print $out $indexOriginalContent;
close $out or die "Failed to close $indexFile: $!\n";
# Rewrite absolute links into local, Windows-safe filenames
# (same FilterName() mapping used when the files are downloaded below).
my $indexContent = uri_unescape($indexOriginalContent);
$indexContent =~ s/href=".*\/(.*?\.pdf)"/my $newName=FilterName($1);my $tmp=qq(href="$newName");$tmp/ige;
$indexContent =~ s/href=".*\/(Session.*?\.html)"/my $newName=FilterName($1);my $tmp=qq(href="$newName");$tmp/ige;
#$indexContent =~ s/$base//sg;
# NOTE(review): @sessions and the rewritten $indexContent are not used later
# in this file -- kept for compatibility in case a caller/chunk outside this
# view relies on them.
my @sessions = $indexContent =~ /href="(Session\d+\.html)"/ig;
my $cnt = 0;
# Walk every harvested link; download PDFs and save session-abstract pages.
foreach my $link (@$ref_links)
{
print "Processing URL: $link\n";
# Extract a candidate local filename: either a SessionNNN.html page
# or a .pdf handout. At most one of the two will match.
my ($html) = $link =~ m{https://.*/(Session.*?\.html)}i;
my ($pdf)  = $link =~ m{https://.*/(.*?\.pdf)}i;
# Decode %xx escapes and make the name Windows-safe; skip undefs so we
# do not operate on uninitialized values.
$pdf  = FilterName(uri_unescape($pdf))  if defined $pdf;
$html = FilterName(uri_unescape($html)) if defined $html;
# Download the handout unless a copy already exists on disk.
if ($pdf and !-e "./$pdf")
{
print "pdf: $pdf\n";
$cnt++;
print $pdf."\n";
# List form of system() bypasses the shell, so URL/filename content
# cannot be interpreted as shell metacharacters.
system("wget", "--no-check-certificate", "-O", $pdf, $link) == 0
    or warn "wget failed for $link\n";
}
# Save the session abstract as a local HTML fragment.
if ($html and !-e "./$html")
{
print "html: $html\n";
print "GET $link\n";
my $content = get($link);
unless (defined $content)
{
warn "Failed to fetch $link\n";
next;
}
# Keep only the abstract <div>, dropping the media section onward.
my ($abstract) = $content =~ m#(<div class="content">.*?)<div class="media">#si;
unless (defined $abstract)
{
warn "No abstract section found in $link\n";
next;
}
#$abstract =~ s/$base//sg;
# Point handout links at the locally saved PDF filenames.
$abstract =~ s/href=".*\/(.*?\.pdf)"/my $newName=FilterName($1);my $tmp=qq(href="$newName");$tmp/ige;
print "Write to ./$html\n";
open my $fd, ">", "./$html" or die "Failed to open ./$html: $!";
print $fd $abstract;
close $fd or die "Failed to close ./$html: $!";
}
}
print "Count: $cnt\n";
# extract_link($url, $base, $tag, $attr)
#
# Fetch $url and return a reference to the list of absolute URLs found in
# attribute $attr of every <$tag ...> element (e.g. "a"/"href").
# $base overrides the base URL used for relative-link expansion; pass ""
# to use the base reported by the HTTP response.
#
# NOTE: the empty prototype "()" has been removed -- this sub is called
# with four arguments, and the prototype would reject them if the
# definition ever preceded the call site.
sub extract_link
{
my ($url, $base, $mytag, $attr_name) = @_;
$base =~ s/\/$//g; # normalize: drop any trailing slash
# Was "dir $!" (a typo) -- would crash with "Undefined subroutine"
# exactly when construction failed.
my $ua = LWP::UserAgent->new or die "LWP::UserAgent->new failed: $!";
# Collect matching attribute values. An anonymous closure is used instead
# of a named nested sub: a named sub would not share the enclosing
# lexicals ($mytag, @links) reliably ("will not stay shared").
my @links = ();
my $collect = sub {
my ($tag, %attr) = @_;
return if $tag ne $mytag; # only the requested element type
push(@links, $attr{$attr_name});
};
# Make the parser. Unfortunately, we don't know the base yet
# (it might be different from $url)
my $p = HTML::LinkExtor->new($collect);
# Request the document and parse it as chunks arrive.
my $res = $ua->request(HTTP::Request->new(GET => $url),
sub {$p->parse($_[0])}) or die $!;
# Expand all collected URLs to absolute ones.
$base or $base = $res->base;
@links = map { url($_, $base)->abs } @links;
return \@links;
}
# FilterName($raw) -> $clean
#
# Turn a raw (URL-derived) filename into one that is legal on Windows:
# trims surrounding whitespace, rewrites "C++"/"c++" as "CPP" and
# "z/OS" (or "z\OS") as "zOS", then blanks out every character that
# Windows forbids in filenames. Returns the cleaned name.
sub FilterName {
    my ($name) = @_;
    # Strip leading and trailing whitespace.
    $name =~ s/\A\s+//;
    $name =~ s/\s+$//;
    # Rewrite tokens whose punctuation would otherwise be destroyed by
    # the character sweep below.
    $name =~ s/[Cc]\+\+/CPP/g;        # C++ -> CPP
    $name =~ s{z[/\\]OS}{zOS}g;       # z/OS -> zOS
    # Replace every character Windows disallows in filenames with a space.
    $name =~ tr{:+*\\/?"<>|}{ }s ? () : ();
    return $name;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment