Skip to content

Instantly share code, notes, and snippets.

@jukworks
Created February 11, 2014 11:54
Show Gist options
  • Save jukworks/8933501 to your computer and use it in GitHub Desktop.
Save jukworks/8933501 to your computer and use it in GitHub Desktop.
# This Perl script fetches image, video, and zip files from the given URL, descending into the subdirectories it links to.
use strict;
use warnings;
use autodie;
use File::Basename qw(dirname);
use File::Path qw(make_path);
use Getopt::Long;
use Math::Random qw/random_exponential/;
use URI;
use WWW::Mechanize;
# Command-line options and their defaults.
my $url      = '';           # start URL (required)
my $interval = 5;            # mean pause between downloads, in seconds
my $save     = 'fetched';    # local directory that receives the files
my $test     = '';           # true => dry run: list targets, download nothing

# parse arguments
GetOptions(
    "url=s"      => \$url,
    "interval=i" => \$interval,
    "save=s"     => \$save,
    "test"       => \$test,
) or usage();

# check if the given URL is valid (it must carry an explicit scheme)
unless ( URI->new( $url, "http" )->scheme() ) {
    print STDERR "ERROR! Invalid URL: ", $url, "\n\n";
    usage();
}

# "=i" happily accepts negative integers, but a negative mean delay is
# meaningless; reject it here instead of failing later in the middle of a run.
if ( $interval < 0 ) {
    print STDERR "ERROR! Invalid interval: ", $interval, "\n\n";
    usage();
}
# let's get started
# The target is treated as a directory, so make sure it ends with a slash.
$url = "$url/" if $url !~ m{/$};

print "Given URL: $url\n";
print "Interval : $interval\n";
print "Save Dir : $save\n";

# Create the download directory, except on a dry run or when it already exists.
if ( !$test && !-d $save ) {
    mkdir $save;
}

# One Mechanize instance is reused for every request; it reports progress
# and identifies itself with the 'Windows Mozilla' agent alias.
my $mech = WWW::Mechanize->new();
$mech->agent_alias('Windows Mozilla');
$mech->show_progress(1);
$mech->get($url);

# Links of the page currently being processed, plus the two work queues:
# directories still to visit and files found on the current page.
my @links = $mech->links();
my @dir_queue;
my @file_queue;
# Breadth-first crawl.  For every page: classify its links into sub-directories
# (queued for a later visit) and media files (downloaded right away), then move
# on to the next queued directory until none are left.
#
# Everything is confined to the directory of the start page: a link is only
# followed or downloaded when its resolved absolute URL lies below $root, and
# the local path is the part of the URL after $root.

# Directory of the page Mechanize actually ended up on for the start URL.
my $start = URI->new( $mech->uri );
$start->query(undef);
$start->fragment(undef);
( my $root = $start->canonical->as_string ) =~ s{[^/]*\z}{};

# Extensions that mark a link as a downloadable file (the dot is required).
my $file_re = qr{\.(?:avi|bmp|gif|jpg|jpeg|mov|mp4|mpg|mpeg|png|zip)\z}i;

# Visited/queued directories and already queued files, keyed by absolute URL,
# so that "./", "../" or cyclic links cannot make the crawl loop forever and
# no file is fetched twice.
my %seen_dir = ( $root => 1 );
my %seen_file;

while (1) {
    for my $link (@links) {
        my $href = $link->url;
        next unless defined $href && length $href;

        # Resolve against the page's base instead of concatenating strings,
        # so absolute and root-relative hrefs are handled correctly.
        my $abs_uri = URI->new_abs( $href, $link->base );
        $abs_uri->fragment(undef);
        my $abs = $abs_uri->canonical->as_string;

        # Stay below the start directory.
        next unless index( $abs, $root ) == 0;
        my $rel = substr( $abs, length $root );

        # Absolute hrefs are not dot-normalised by URI; never let a ".."
        # segment reach the local path and escape the save directory.
        next if $rel =~ m{(?:\A|/)\.\.(?:/|\z)};

        if ( $abs =~ m{/\z} ) {
            push @dir_queue, $abs unless $seen_dir{$abs}++;
        }
        elsif ( $abs =~ $file_re ) {
            push @file_queue, $abs unless $seen_file{$abs}++;
        }
    }

    # One exponentially distributed pause (mean: $interval seconds) per file.
    # Declared unconditionally: "my ... unless COND" is undefined behaviour.
    my @delays;
    @delays = Math::Random::random_exponential( scalar @file_queue, $interval )
        unless $test;

    for my $target (@file_queue) {
        my $save_to = $save . '/' . substr( $target, length $root );
        print "Downloading: $target => $save_to\n";
        next if $test;

        # A page may link straight to a file in a directory that has not
        # been visited (and therefore not created) yet.
        make_path( dirname($save_to) );

        $mech->get($target);
        $mech->save_content($save_to);

        my $sleep_time = int( shift @delays );
        if ( $sleep_time > 0 ) {
            print "Sleeping: ${sleep_time}s\n";
            sleep($sleep_time);
        }
    }

    last unless @dir_queue;
    my $next = shift @dir_queue;

    unless ($test) {
        # Mirror the remote directory locally (without the trailing slash).
        ( my $local_dir = $save . '/' . substr( $next, length $root ) )
            =~ s{/+\z}{};
        make_path($local_dir);
    }

    print "The next directory: ", $next, "\n";
    $mech->get($next);
    @links      = $mech->links;
    @file_queue = ();
}
# Print the command-line help and terminate the script.
#
# Takes no arguments and never returns.  It is only reached when the command
# line is invalid, so the help goes to STDERR and the process exits with a
# non-zero status, letting callers (shells, cron, make) detect the failure.
sub usage {
    print STDERR <<HELLO;
Usage: fetch_all_files.pl -url [URL] -interval [Interval] -save [Save Directory] -test
-url : the target URL (REQUIRED)
-interval : the average time interval (second) (default: 5s)
-save : the directory name for storing files (default: fetched)
-test : the script will not download files (just print out the target files)
HELLO
    exit 1;
}
__END__
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment