Skip to content

Instantly share code, notes, and snippets.

@guipn
Created March 27, 2011 00:30
Show Gist options
  • Save guipn/888789 to your computer and use it in GitHub Desktop.
Save guipn/888789 to your computer and use it in GitHub Desktop.
Script to download from a webpage all files the extensions of which are among the ones listed by the user.
#
# Downloads all files directly linked within a webpage that are of a given list of extensions.
#
# The webpage in question is to be provided as the first argument for the program.
# The formats shall be specified as a single, second argument by the user, like so: "pdf txt tar.gz"
#
# If a third argument is given, it will be used as a target folder in which to save downloaded files.
#
# example: getfiles.pl http://google.com "gif js"
#
# gdjs
use warnings;
use strict;
use WWW::Mechanize;
usage() unless @ARGV > 1;
my $targetfile = shift;
my @extensions = split " ", shift;
my $targetfolder = shift;
## setup destination folder, if any ##
$targetfolder = $targetfolder ? "$targetfolder/" : "";
mkdir($targetfolder) unless -e $targetfolder;
## extra configs ##
my $verbose = 1;
my %mechconf = (
autocheck => 0,
agent => 'downloader',
stack_depth => 0, # don't keep history in memory
quiet => 1,
onerror => undef,
timeout => 3, # s
);
my $client = WWW::Mechanize->new(%mechconf);
my $urlregex = buildurlregex();
## this is where it starts ##
msg("[$targetfile]");
$client->get($targetfile);
error( $client->status() ) unless $client->success();
my @todownload = $client->find_all_links( url_regex => qr/$urlregex/ );
print "\n\n";
for (@todownload)
{
my $filename = (split "/", $_->url)[-1];
print "\n\tGetting '$filename'";
$client->get($_->url, ':content_file' => $targetfolder.$filename);
print "\t\tfailed (", $client->status(), ")!\n" unless $client->success();
}
print "\n\n\tDone.\n\n";
sub usage
{
die "\n\tUsage: $0 [uri] \"ext1 ext2 ...\" [localfolder?]\n\n";
}
## builds "(ext1|ext2|...)" to be used as an argument to Mech. ##
sub buildurlregex
{
my $ret = join "|", @extensions;
return "($ret)";
}
sub msg
{
return unless $verbose;
print "\n\t", shift;
}
sub error
{
my $errorcode = shift;
die "Operation aborted.\n\t\tError $errorcode reported.";
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment