Skip to content

Instantly share code, notes, and snippets.

@philchristensen
Created October 29, 2010 17:11
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save philchristensen/653918 to your computer and use it in GitHub Desktop.
Save philchristensen/653918 to your computer and use it in GitHub Desktop.
A script to download Wikipedia image galleries
#!/usr/bin/env perl
use strict;
use warnings;
use WWW::Mechanize;
# Download every full-resolution image linked from a Wikimedia category page.
#
# Usage: script.pl <category-page-url>
#
# For each link on the category page whose URL matches /Image:/, the image
# description page is loaded, its 'Full resolution' link is located, and the
# target is downloaded into the current directory with curl. Progress and
# errors are written to STDERR. The script exits non-zero when no URL is
# given, when the category page cannot be loaded, or when curl fails.

# One user-agent string shared by the Mechanize browser and by curl, so both
# identify themselves identically to the server.
my $user_agent = 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)';

# autocheck is disabled on purpose: with it enabled, get() die()s on any HTTP
# error, which makes the success() checks below unreachable and lets a single
# broken image page abort the whole run.
my $mech = WWW::Mechanize->new(autocheck => 0);
$mech->agent($user_agent);

my $category_url = $ARGV[0];
unless ($category_url) {
    print STDERR "You must specify a Wikimedia Category page\n";
    exit(1);    # nothing to do without a URL
}

print STDERR "Loading $category_url...\n";
$mech->get($category_url);
unless ($mech->success()) {
    print STDERR "Couldn't load category page...\n";
    exit(1);
}
print STDERR "Loaded category page...\n";

# Absolute URLs already handed to curl, so an image reachable through
# several links is downloaded only once.
my %fetched_urls;

# NOTE(review): only links containing "Image:" are followed; pages that use
# a different namespace prefix would need this pattern extended -- confirm
# against the target wiki.
my @links = $mech->find_all_links(url_regex => qr/Image\:/);

LINK:
foreach my $link (@links) {
    # url_abs() is resolved against the page the link was found on, so it
    # stays correct even though each get() moves the browser to a new page.
    my $page_url = $link->url_abs()->as_string();
    $mech->get($page_url);
    unless ($mech->success()) {
        print STDERR "Skipping $page_url (HTTP status ", $mech->status(), ")\n";
        next LINK;
    }

    my $image_link = $mech->find_link(text => 'Full resolution');
    next LINK unless $image_link;

    # curl needs a complete URL; the href on the page may be relative or
    # protocol-relative.
    my $url = $image_link->url_abs()->as_string();
    next LINK if $fetched_urls{$url}++;

    print STDERR "Downloading $url\n";

    # List-form system() bypasses the shell, so characters in a URL scraped
    # from a remote page (&, ;, quotes, ...) cannot be interpreted as shell
    # syntax.
    system('curl', '-A', $user_agent, '-O', $url);

    if ($? == -1) {
        print STDERR "\nfailed to execute: $!\n";
        exit(1);
    }
    elsif ($? & 127) {
        printf STDERR "\nchild died with signal %d, %s coredump\n",
            ($? & 127), ($? & 128) ? 'with' : 'without';
        exit(1);
    }
    elsif ($?) {
        printf STDERR "\nchild exited with value %d\n", $? >> 8;
        exit(1);
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment