Skip to content

Instantly share code, notes, and snippets.

@kimmel
Created October 15, 2013 16:36
Show Gist options
  • Save kimmel/6994542 to your computer and use it in GitHub Desktop.
Save kimmel/6994542 to your computer and use it in GitHub Desktop.
Collect a numbered series of pages
#!/usr/bin/env perl
use v5.14;
use warnings;
use autodie qw( :all );
use utf8::all;
use List::Util qw( shuffle );
use WWW::Mechanize;
#use Data::Show;
# Download every page of the numbered series that is not yet on disk.
#
# Takes no arguments. For each ID in the hard-coded range the URL
# "<base_url><id>.html" is fetched and written to "<id>.html" in the
# current directory. IDs whose file already exists with a non-zero size
# are skipped, so re-running the script only fetches what is still
# missing. The remaining IDs are visited in random order with a random
# pause after each download.
sub main {
    # Prefix prepended to "<id>.html" to form the request URL. It is empty
    # here and has to be filled in before the script can fetch anything.
    my $base_url = '';
    my $ua = 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27';
    my @initial_numbers = 9997 .. 18428;

    # Keep only the IDs that still need fetching. -s is false both for a
    # missing file and for an empty one. Building the list with grep means
    # it starts out empty; the previous "my @keep = [];" put an array
    # reference into the list, which was then requested as a bogus
    # "ARRAY(0x...).html" page.
    my @keep = grep {
        my $filename = $_ . '.html';
        !-s $filename;
    } @initial_numbers;

    # Randomise the order in which the pages are requested.
    my @page_numbers = shuffle @keep;

    # autocheck is enabled so that a failed request raises an error.
    my $mech = WWW::Mechanize->new( 'autocheck' => 1, 'agent' => $ua, );

    foreach my $id (@page_numbers) {
        my $filename = $id . '.html';

        # :content_file asks for the response body to be written straight
        # to the named file.
        $mech->get( $base_url . $filename, ':content_file' => $filename );
        say 'Saved ' . $filename;

        # Pause for a random whole number of seconds from 0 to 14.
        sleep int( rand(15) );
    }

    return;
}
# Run main() only when this file is executed directly; when it is loaded
# from other code, caller() is true and nothing is run.
if ( !caller() ) {
    main();
}

# A true final value, so that loading the file with require succeeds.
1;
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment