Skip to content

Instantly share code, notes, and snippets.

@ajmontag
Created May 22, 2013 16:03
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save ajmontag/5628758 to your computer and use it in GitHub Desktop.
Save ajmontag/5628758 to your computer and use it in GitHub Desktop.
Scrapes random Wikipedia pages and reports how long ago each page was last edited.
#!/usr/bin/perl
use strict;
use warnings;
use WWW::Curl::Easy;
use DateTime::Format::Strptime;
use IO::Handle qw( ); # For autoflush
# Flush every result line as soon as it is printed, so the output can be
# piped or watched live while the endless loop below keeps running.
STDOUT->autoflush(1);

# Entry point for each scrape. The loop below expects this URL to answer with
# a redirect whose Location header names the article to fetch next.
my $rand_url = "http://en.wikipedia.org/wiki/Special:Random";
my $next_url = $rand_url;

# Parser for the footer timestamp, e.g. "19 February 2013 at 21:50".
# NOTE(review): the parsed time is pinned to a fixed -0600 offset, while $now
# below is whatever DateTime->now returns. Confirm the offset matches the zone
# the site renders its timestamps in, otherwise day counts may be off by one.
my $Strp = DateTime::Format::Strptime->new(
    pattern   => '%d %B %Y at %H:%M',
    time_zone => '-0600',
);

# Reference instant for the age calculation. It is captured once at start-up,
# so reported ages are relative to launch time, not to each individual fetch.
my $now = DateTime->now;

my $curl = WWW::Curl::Easy->new;

# Include the response headers in the captured output; the loop below reads
# the Location header out of it to follow redirects by hand.
$curl->setopt(CURLOPT_HEADER, 1);

# A filehandle, reference to a scalar or reference to a typeglob can be used here.
my $response_body;
$curl->setopt(CURLOPT_WRITEDATA, \$response_body);
# Main scrape loop: runs forever, issuing one HTTP request per iteration.
#
# Redirects are followed by hand so that the final article URL is known when
# its page is fetched:
#   * transport error -> report it, pause briefly, start over from $rand_url;
#   * 3xx             -> use the Location header as the next URL to fetch;
#   * 200             -> extract the "last modified" footer timestamp and
#                        print "<age in days>\t<url>" on STDOUT, then request
#                        a new random page;
#   * anything else   -> warn on STDERR and request a new random page.
while (1) {
    $curl->setopt(CURLOPT_URL, $next_url);

    # Discard the previous response before issuing the next request.
    $response_body = "";

    # Starts the actual request
    my $retcode = $curl->perform;

    if ($retcode != 0) {
        # Error code, type of error, error message
        print STDERR "An error happened: $retcode ".$curl->strerror($retcode)." ".$curl->errbuf."\n";

        # Start over from a known-good URL and back off for a moment, so a
        # persistent failure (network down, bad redirect target) does not turn
        # into a tight retry loop.
        $next_url = $rand_url;
        sleep 1;
        next;
    }

    my $response_code = $curl->getinfo(CURLINFO_HTTP_CODE);

    if ($response_code >= 300 && $response_code < 400) {
        # Any redirect class response (301, 302, 303, 307, 308, ...) carries
        # the target in its Location header. Header names are
        # case-insensitive, and the line ending may be CRLF or bare LF.
        if ($response_body =~ /\nLocation:[ \t]*([^\r\n]+)\r?\n/i) {
            $next_url = $1;
        }
        else {
            print STDERR "WARN unable to parse next url\n";
            $next_url = $rand_url;
        }
    }
    elsif ($response_code == 200) {
        # Non-greedy capture: stop at the first period after the timestamp
        # rather than at the last period on the line.
        if ($response_body =~ /<li id="footer-info-lastmod"> This page was last modified on (.+?)\./) {
            my $stamp = $1;

            # parse_datetime yields undef when the text does not fit the
            # pattern; guard against that instead of dying on a method call.
            my $dt = $Strp->parse_datetime($stamp);
            if (defined $dt) {
                my $days = $dt->delta_days($now)->in_units('days');
                print STDOUT "$days\t$next_url\n";
            }
            else {
                print STDERR "WARN unable to parse a timestamp from [$stamp]\n";
            }
        }
        else {
            print STDERR "WARN unable to parse a timestamp\n";
        }

        # get another random page
        $next_url = $rand_url;
    }
    else {
        print STDERR "WARN unacceptable http response_code $response_code\n";
        $next_url = $rand_url;
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment