Skip to content

Instantly share code, notes, and snippets.

@langthom
Created August 11, 2016 22:45
Show Gist options
  • Save langthom/8ac09efc279fddeada867b0a477a75e3 to your computer and use it in GitHub Desktop.
Save langthom/8ac09efc279fddeada867b0a477a75e3 to your computer and use it in GitHub Desktop.
simple crawler for getting xkcd comics
#!/usr/bin/env perl
# A simple crawler for getting the nice comics from xkcd.com
#(c) Thomas Lang, 2016
#
# Yes, this is damn non-performant, but if it works, it ain't stupid.
use warnings;
use strict;
use LWP::Simple;
use XML::Twig;
# Command-line arguments (positional, taken from @ARGV exactly as given;
# they are not validated at this point):
#
#   minIdx  = minimum image number, e.g. 1615
#   maxIdx  = maximum image number, e.g. 1618
#   verbose = indicator whether progress messages should be printed:
#             "y" (for yes) prints them, anything else prints nothing
my ($minIdx, $maxIdx, $verbose) = @ARGV;
# The verbose flag is optional. Default it to "n" so that later string
# comparisons against "y" never see an undefined value, which would emit
# "Use of uninitialized value" warnings under 'use warnings'.
$verbose = "n" unless defined $verbose;
# Downloads the image of a single xkcd comic into the current directory.
#
# How it works:
#   1. The comic page http://www.xkcd.com/<idx>/ is stored in a scratch
#      file "./<idx>", read back line by line, and the scratch file is
#      removed again right away.
#   2. Line 64 of the page is taken; it is expected to consist of the
#      comic's <img> element. This is brittle by design: it depends on
#      the exact layout of the xkcd page.
#   3. That line is parsed with XML::Twig and the 'src' attribute of the
#      root <img> element is read.
#   4. The image is downloaded and stored under the last path component
#      of its URL (e.g. "foo.png").
#
# Parameters:
#   $idx - comic number; interpolated into the page URL and scratch name.
# Returns:
#   1 on success.
# Dies:
#   with a descriptive message if the page or the image cannot be fetched,
#   the scratch file cannot be read, the page has fewer than 64 lines, or
#   line 64 is not a well-formed <img> element with a usable 'src'.
sub getImg {
    my $idx = shift;
    my $uri = "http://www.xkcd.com/$idx/";
    my $fil = "./$idx";

    # The file-level $verbose may be undefined when the flag was omitted;
    # guard the comparison so no "uninitialized" warning is emitted.
    my $chatty = defined $verbose && $verbose eq "y";

    # getstore() returns the HTTP status code; anything but 2xx means the
    # scratch file does not hold the comic page.
    my $pageStatus = getstore($uri, $fil);
    if (!is_success($pageStatus)) {
        unlink $fil;    # may not exist; failure to remove is irrelevant
        die "Error: could not fetch '$uri' (HTTP status $pageStatus)\n";
    }

    open(my $pageFh, '<', $fil)
        or die "Error: cannot open '$fil' for reading: $!\n";
    my @lines = <$pageFh>;
    close $pageFh;
    unlink $fil;        # scratch file is no longer needed from here on

    # Only line 64 (index 63) is of interest.
    my $line = $lines[63];
    die "Error: '$uri' has only " . scalar(@lines)
        . " line(s), expected the <img> element on line 64\n"
        unless defined $line;

    # Extract the image location. The line is parsed directly from memory
    # instead of being written back to disk first.
    my $src;
    my $twig = XML::Twig->new(
        twig_roots => {
            '/img' => sub {
                $src = $_->att('src');
            },
        },
    );
    eval { $twig->parse($line); 1 }
        or die "Error: line 64 of '$uri' is not a well-formed element: $@";
    die "Error: no <img> element with a 'src' attribute on line 64 of '$uri'\n"
        unless defined $src && length $src;

    # Strip a leading '//' (protocol-relative URL), optionally preceded by
    # an explicit http:/https: scheme, leaving "host/path".
    (my $hostPath = $src) =~ s{\A(?:https?:)?//}{};

    # Store the image under the last path component only, so the local
    # file name can never contain directory separators.
    (my $img = $hostPath) =~ s{\A.*/}{};
    die "Error: cannot derive a file name from image URL '$src'\n"
        unless length $img;

    print "Getting image '$img' ... " if $chatty;
    my $imgStatus = getstore("http://$hostPath", $img);
    if (!is_success($imgStatus)) {
        print "failed.\n" if $chatty;
        die "Error: could not download 'http://$hostPath' "
            . "(HTTP status $imgStatus)\n";
    }
    print "done.\n" if $chatty;

    return 1;
}
# Main program: check the requested range, then fetch every comic in it.
#
# Both bounds must be plain non-negative integers. Anything else,
# including missing arguments, aborts with a usage message instead of
# producing warnings and requesting nonsensical URLs.
for my $bound ($minIdx, $maxIdx) {
    die "Usage: $0 <minIdx> <maxIdx> [y]\n"
        unless defined $bound && $bound =~ /\A[0-9]+\z/;
}

# A failure for one comic must not prevent the remaining ones from being
# fetched, so every call is isolated. Failures are reported on STDERR and
# counted so the script still ends with a non-zero exit status.
my $failed = 0;
foreach my $ix ($minIdx .. $maxIdx) {
    next if eval { getImg($ix); 1 };
    warn "Skipping comic $ix: $@";
    $failed++;
}
die "Error: $failed comic(s) could not be downloaded\n" if $failed;
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment