Skip to content

Instantly share code, notes, and snippets.

@spikeheap
Created November 23, 2013 14:36
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save spikeheap/7615234 to your computer and use it in GitHub Desktop.
Save spikeheap/7615234 to your computer and use it in GitHub Desktop.
A scraper to retrieve the NICE pathways in XML format, for research purposes.
use warnings;
use strict;
use LWP::UserAgent;
use File::Basename;
use Time::HiRes qw(usleep);
use CGI qw/escape/;
# Create an LWP User-Agent object for sending HTTP requests.
my $ua = LWP::UserAgent->new;
my $root_url = "http://pathways.nice.org.uk/";
my $outputDir = "output";
mkdir $outputDir;
my $content = getContent($root_url);
my $processedURLs = {};
#print "$content\n";
my @matches = $content =~ /<a href=["'](\/pathways\/.*?)["']/g;
for my $match (@matches){
print "Base URL: $match\n";
getXMLFileAndProcessChildren($match);
}
sub getXMLFileAndProcessChildren{
my $url = $_[0];
# Only handle the URL once
if(!defined $processedURLs->{$url}){
$processedURLs->{$url} = 1;
my $pageContent = getContent($root_url.$url);
my @xmlFiles = $pageContent =~ /["']([a-zA-Z0-9_\/-]*?.xml)["']/g;
for my $xmlFile (@xmlFiles){
my $xmlFileName = "$root_url/$xmlFile";
if(!defined $processedURLs->{$xmlFileName}){
$processedURLs->{$xmlFileName} = 1;
my $xmlContent = getContent($xmlFileName);
my $outputFileName = basename($xmlFileName);
print "\t$outputFileName\n";
# Dump the output to a file
open (XMLOUTPUT, ">>$outputDir/$outputFileName");
print XMLOUTPUT $xmlContent;
close (XMLOUTPUT);
# And then let's look for more content to slurp!
my @nestedXMLFiles = $xmlContent =~ /["']([a-zA-Z0-9_\/-]*?.xml)["']/g;
for my $nestedFile (@nestedXMLFiles){
#print "\t\t$nestedFile\n";
getXMLFileAndProcessChildren($nestedFile);
}
}
}
}else{
#print "\t\tSkipping URL $url\n";
}
}
sub getContent{
my $urlToGet = $_[0];
# Let's be nice and wait for a small amount of time before each request, so we don't hammer the web server
my $waitTimeMillis = rand()*10000; # between 0 and 10 seconds
usleep(1000 * $waitTimeMillis);
# Create an HTTP request object for this URL.
my $request = HTTP::Request->new(GET => $urlToGet);
# This HTTP header is required. The server outputs garbage if
# it's not present.
$request->push_header('Content-Type' => 'text/html');
# Send the request and check for an error from the server.
my $response = $ua->request($request);
print "\t\tError ".$response->code.": $urlToGet\n" if !$response->is_success;
return $response->content
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment