thinkhy/gethandouts.pl

## gethandouts.pl
#!/usr/bin/perl -w
##################################################################
#
# Code on github: https://gist.github.com/thinkhy/11027824
#
##################################################################
use LWP::Simple;
use HTML::LinkExtor;
use URI::URL;
use URI::Escape;
use charnames qw(:full);
use strict;

binmode STDOUT, ':utf8';

# Usage: gethandouts.pl [conferenceNumber] [indexFile]
#
# Mirrors the handouts of one conference hosted on share.confex.com into
# the current directory:
#   1. saves the "uploadlistall.html" page as the index file,
#   2. downloads every linked PDF with wget,
#   3. saves the abstract part of every linked Session page.
# Files that already exist locally are left alone, so the script can be
# re-run without downloading them again.

my $conferenceNumber = 122;    # default conference number
my $indexFile = "index.html";

$conferenceNumber = $ARGV[0] if $ARGV[0];
$indexFile        = $ARGV[1] if $ARGV[1];

print "Conference number: $conferenceNumber\n";
print "Index file: $indexFile\n";

my $url = "https://share.confex.com/share/$conferenceNumber/webprogram/uploadlistall.html";
# Not used by the code below; kept for the (disabled) "strip base URL" step.
my $base = "https://share.confex.com/share/$conferenceNumber/webprogram/";

print "Get content and extrac link from $url\n";

my $ref_links = extract_link($url, "", "a", "href");

# get() returns undef on any failure; without the index there is nothing
# sensible to save, so stop with a clear message.
my $indexOriginalContent = get($url);
defined $indexOriginalContent
    or die "Failed to download $url\n";

print "=============================================\n";
print $indexOriginalContent;
open my $out, ">", $indexFile or die "Failed to open $indexFile: $!\n";
print $out $indexOriginalContent;
close $out or die "Failed to write $indexFile: $!\n";

# Rewrite remote PDF / Session links to the local file names.  [^"]* keeps
# each match inside a single href attribute (a greedy .* could swallow
# several links that share one line).
# NOTE(review): this rewritten copy is computed but never written anywhere;
# the index file saved above still contains the original remote links.
# Confirm which of the two was meant to be saved.
my $indexContent = uri_unescape($indexOriginalContent);
$indexContent =~ s{href="[^"]*/([^"/]*\.pdf)"}{'href="' . FilterName($1) . '"'}ige;
$indexContent =~ s{href="[^"]*/(Session[^"/]*\.html)"}{'href="' . FilterName($1) . '"'}ige;

my $cnt = 0;    # number of PDF downloads attempted
foreach my $link (@$ref_links)
{
    next unless defined $link;

    print "Processing URL: $link\n";
    my ($html) = $link =~ m{https://.*/(Session.*?\.html)}i;
    my ($pdf)  = $link =~ m{https://.*/(.*?\.pdf)}i;

    # Only the capture that matched is decoded and cleaned; the other one
    # stays undef and its branch below is skipped.
    $pdf  = FilterName(uri_unescape($pdf))  if defined $pdf;
    $html = FilterName(uri_unescape($html)) if defined $html;

    if ($pdf and !-e "./$pdf")
    {
        print "pdf:  $pdf\n";
        $cnt++;

        print $pdf."\n";
        # List form of system(): the file name comes from a remote page, so
        # it must never be interpreted by a shell.
        system('wget', '--no-check-certificate', '-O', $pdf, "$link") == 0
            or warn "wget failed for $link (status $?)\n";
    }

    if ($html and !-e "./$html")
    {
        print "html: $html\n";
        print "GET $link\n";
        my $content = get($link);
        unless (defined $content)
        {
            warn "Failed to download $link\n";
            next;
        }

        # Keep everything from the content <div> up to the media <div>.
        my ($abstract) = $content =~ m#(<div class="content">.*?)<div class="media">#si;
        unless (defined $abstract)
        {
            # Do not create an empty file: the -e test above would then
            # prevent any later retry.
            warn "No abstract found in $link\n";
            next;
        }

        # Point PDF links inside the abstract at the local copies.
        $abstract =~ s{href="[^"]*/([^"/]*\.pdf)"}{'href="' . FilterName($1) . '"'}ige;

        print "Write to ./$html\n";

        open my $fd, ">", "./$html" or die "Failed to open ./$html: $!\n";
        print $fd $abstract;
        close $fd or die "Failed to write ./$html: $!\n";
    }
}

print "Count: $cnt\n";

# extract_link($url, $base, $mytag, $attr_name)
#
# Fetches $url and collects, for every <$mytag> element reported by
# HTML::LinkExtor, the value of its $attr_name attribute.
#
#   $url       - page to download
#   $base      - base URL used to make the links absolute; when empty the
#                base of the HTTP response is used instead
#   $mytag     - tag name to look at, such as "a" or "form"
#   $attr_name - link attribute to collect, such as "href"
#
# Returns a reference to an array of absolute URLs (empty when the page
# could not be fetched; a warning is printed in that case).
#
# No prototype is declared: the sub takes four arguments, so an empty "()"
# prototype would reject every call compiled after the definition.
sub extract_link
{
    my ($url, $base, $mytag, $attr_name) = @_;

    $base = '' unless defined $base;
    $base =~ s/\/$//;

    # Normally already loaded through LWP::Simple; stated here because this
    # sub uses the class directly.
    require LWP::UserAgent;
    my $ua = LWP::UserAgent->new or die "Cannot create LWP::UserAgent: $!";

    my @links;

    # An anonymous sub is used on purpose: a named sub nested here would
    # capture only the variables of the first call of extract_link.
    my $collect = sub {
        my ($tag, %attr) = @_;
        return if $tag ne $mytag;
        push @links, $attr{$attr_name} if defined $attr{$attr_name};
    };

    # Make the parser.  The base is not known yet (it might be different
    # from $url), so links are made absolute after the download.
    my $p = HTML::LinkExtor->new($collect);

    # Request the document and parse it as it arrives.
    my $res = $ua->request(HTTP::Request->new(GET => $url),
                           sub { $p->parse($_[0]) });
    $p->eof;    # flush whatever the parser still has buffered

    warn "Failed to fetch $url: ", $res->status_line, "\n"
        unless $res->is_success;

    # Expand all collected URLs to absolute ones.
    $base ||= $res->base;
    @links = map { url($_, $base)->abs } @links;

    return \@links;
}


# FilterName($filename)
#
# Turns a name taken from a URL into one that can be used as a local file
# name:
#   - leading and trailing whitespace is removed,
#   - "C++" / "c++" becomes "CPP" and "z/OS" / "z\OS" becomes "zOS",
#   - every remaining character from the set  : + * \ / ? " < > |
#     is replaced by a space.
#
# Returns the cleaned name.  An undefined argument yields the empty string
# without raising "uninitialized value" warnings.
sub FilterName {
    my $filename = shift;

    # Callers may hand over undef (for example an unmatched regex capture).
    return '' unless defined $filename;

    # Trim surrounding whitespace.
    $filename =~ s/^\s+//;
    $filename =~ s/\s+$//;

    # C++ ==> CPP  (must run before "+" is blanked out below)
    $filename =~ s/[Cc]\+\+/CPP/g;

    # z/OS ==> zOS  (must run before "/" and "\" are blanked out below)
    $filename =~ s/z[\/\\]OS/zOS/g;

    # Windows filename convention: blank out the reserved characters.
    $filename =~ tr{:+*\\/?"<>|}{ };

    return $filename;
}
	#!/usr/bin/perl -w
	##################################################################
	#
	# Code on github: https://gist.github.com/thinkhy/11027824
	#
	##################################################################
	use LWP::Simple;
	use HTML::LinkExtor;
	use URI::URL;
	use URI::Escape;
	use charnames qw(:full);
	use strict;

	binmode STDOUT, ':utf8';

	# Usage: gethandouts.pl [conferenceNumber] [indexFile]
	#
	# Mirrors the handouts of one conference hosted on share.confex.com into
	# the current directory:
	#   1. saves the "uploadlistall.html" page as the index file,
	#   2. downloads every linked PDF with wget,
	#   3. saves the abstract part of every linked Session page.
	# Files that already exist locally are left alone, so the script can be
	# re-run without downloading them again.

	my $conferenceNumber = 122;    # default conference number
	my $indexFile = "index.html";

	$conferenceNumber = $ARGV[0] if $ARGV[0];
	$indexFile        = $ARGV[1] if $ARGV[1];

	print "Conference number: $conferenceNumber\n";
	print "Index file: $indexFile\n";

	my $url = "https://share.confex.com/share/$conferenceNumber/webprogram/uploadlistall.html";
	# Not used by the code below; kept for the (disabled) "strip base URL" step.
	my $base = "https://share.confex.com/share/$conferenceNumber/webprogram/";

	print "Get content and extrac link from $url\n";

	my $ref_links = extract_link($url, "", "a", "href");

	# get() returns undef on any failure; without the index there is nothing
	# sensible to save, so stop with a clear message.
	my $indexOriginalContent = get($url);
	defined $indexOriginalContent
	    or die "Failed to download $url\n";

	print "=============================================\n";
	print $indexOriginalContent;
	open my $out, ">", $indexFile or die "Failed to open $indexFile: $!\n";
	print $out $indexOriginalContent;
	close $out or die "Failed to write $indexFile: $!\n";

	# Rewrite remote PDF / Session links to the local file names.  The
	# patterns need their quantifiers: with a bare "." they could only match
	# a one-character path.  [^"]* keeps each match inside a single href
	# attribute.
	# NOTE(review): this rewritten copy is computed but never written anywhere;
	# the index file saved above still contains the original remote links.
	# Confirm which of the two was meant to be saved.
	my $indexContent = uri_unescape($indexOriginalContent);
	$indexContent =~ s{href="[^"]*/([^"/]*\.pdf)"}{'href="' . FilterName($1) . '"'}ige;
	$indexContent =~ s{href="[^"]*/(Session[^"/]*\.html)"}{'href="' . FilterName($1) . '"'}ige;

	my $cnt = 0;    # number of PDF downloads attempted
	foreach my $link (@$ref_links)
	{
	    next unless defined $link;

	    print "Processing URL: $link\n";
	    # ".*" / ".*?" are required here; a single "." never matches a real
	    # URL, so no file would ever be downloaded.
	    my ($html) = $link =~ m{https://.*/(Session.*?\.html)}i;
	    my ($pdf)  = $link =~ m{https://.*/(.*?\.pdf)}i;

	    # Only the capture that matched is decoded and cleaned; the other one
	    # stays undef and its branch below is skipped.
	    $pdf  = FilterName(uri_unescape($pdf))  if defined $pdf;
	    $html = FilterName(uri_unescape($html)) if defined $html;

	    if ($pdf and !-e "./$pdf")
	    {
	        print "pdf: $pdf\n";
	        $cnt++;

	        print $pdf."\n";
	        # List form of system(): the file name comes from a remote page,
	        # so it must never be interpreted by a shell.
	        system('wget', '--no-check-certificate', '-O', $pdf, "$link") == 0
	            or warn "wget failed for $link (status $?)\n";
	    }

	    if ($html and !-e "./$html")
	    {
	        print "html: $html\n";
	        print "GET $link\n";
	        my $content = get($link);
	        unless (defined $content)
	        {
	            warn "Failed to download $link\n";
	            next;
	        }

	        # Keep everything from the content <div> up to the media <div>.
	        my ($abstract) = $content =~ m#(<div class="content">.*?)<div class="media">#si;
	        unless (defined $abstract)
	        {
	            # Do not create an empty file: the -e test above would then
	            # prevent any later retry.
	            warn "No abstract found in $link\n";
	            next;
	        }

	        # Point PDF links inside the abstract at the local copies.
	        $abstract =~ s{href="[^"]*/([^"/]*\.pdf)"}{'href="' . FilterName($1) . '"'}ige;

	        print "Write to ./$html\n";

	        open my $fd, ">", "./$html" or die "Failed to open ./$html: $!\n";
	        print $fd $abstract;
	        close $fd or die "Failed to write ./$html: $!\n";
	    }
	}

	print "Count: $cnt\n";

	# extract_link($url, $base, $mytag, $attr_name)
	#
	# Fetches $url and collects, for every <$mytag> element reported by
	# HTML::LinkExtor, the value of its $attr_name attribute.
	#
	#   $url       - page to download
	#   $base      - base URL used to make the links absolute; when empty the
	#                base of the HTTP response is used instead
	#   $mytag     - tag name to look at, such as "a" or "form"
	#   $attr_name - link attribute to collect, such as "href"
	#
	# Returns a reference to an array of absolute URLs (empty when the page
	# could not be fetched; a warning is printed in that case).
	#
	# No prototype is declared: the sub takes four arguments, so an empty "()"
	# prototype would reject every call compiled after the definition.
	sub extract_link
	{
	    my ($url, $base, $mytag, $attr_name) = @_;

	    $base = '' unless defined $base;
	    $base =~ s/\/$//;

	    # Normally already loaded through LWP::Simple; stated here because
	    # this sub uses the class directly.
	    require LWP::UserAgent;
	    my $ua = LWP::UserAgent->new or die "Cannot create LWP::UserAgent: $!";

	    my @links;

	    # An anonymous sub is used on purpose: a named sub nested here would
	    # capture only the variables of the first call of extract_link.
	    my $collect = sub {
	        my ($tag, %attr) = @_;
	        return if $tag ne $mytag;
	        push @links, $attr{$attr_name} if defined $attr{$attr_name};
	    };

	    # Make the parser.  The base is not known yet (it might be different
	    # from $url), so links are made absolute after the download.
	    my $p = HTML::LinkExtor->new($collect);

	    # Request the document and parse it as it arrives.
	    my $res = $ua->request(HTTP::Request->new(GET => $url),
	                           sub { $p->parse($_[0]) });
	    $p->eof;    # flush whatever the parser still has buffered

	    warn "Failed to fetch $url: ", $res->status_line, "\n"
	        unless $res->is_success;

	    # Expand all collected URLs to absolute ones.
	    $base ||= $res->base;
	    @links = map { url($_, $base)->abs } @links;

	    return \@links;
	}


	# FilterName($filename)
	#
	# Turns a name taken from a URL into one that can be used as a local file
	# name:
	#   - leading and trailing whitespace is removed,
	#   - "C++" / "c++" becomes "CPP" and "z/OS" / "z\OS" becomes "zOS",
	#   - every remaining character from the set  : + * \ / ? " < > |
	#     is replaced by a space.
	#
	# Returns the cleaned name.  An undefined argument yields the empty string
	# without raising "uninitialized value" warnings.
	sub FilterName {
	    my $filename = shift;

	    # Callers may hand over undef (for example an unmatched regex capture).
	    return '' unless defined $filename;

	    # Trim surrounding whitespace.
	    $filename =~ s/^\s+//;
	    $filename =~ s/\s+$//;

	    # C++ ==> CPP  (must run before "+" is blanked out below)
	    $filename =~ s/[Cc]\+\+/CPP/g;

	    # z/OS ==> zOS  (must run before "/" and "\" are blanked out below)
	    $filename =~ s/z[\/\\]OS/zOS/g;

	    # Windows filename convention: blank out the reserved characters.
	    $filename =~ tr{:+*\\/?"<>|}{ };

	    return $filename;
	}