Created
July 19, 2011 10:56
-
-
Save dracos/1091967 to your computer and use it in GitHub Desktop.
BBC News front page scraper
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/pkg/bin/perl -w
#
# BBC News front page scraper.
#
# Fetches the current BBC News front page, adds a <base> tag and a disclaimer
# banner, and compares it with the most recently stored snapshot. If the two
# differ, the page and a diff are stored under a YYYY/MM/DD/ hierarchy, and
# the file 'latest' is updated to hold the path of the new snapshot.

use strict;
use POSIX qw(strftime);
use Text::Diff;

# Every path used below is relative to the archive directory, so refuse to
# carry on if we cannot get there; otherwise snapshots would be written into
# whatever the current directory happens to be.
# NOTE(review): the path is relative, so presumably the script is started
# from the account's home directory (e.g. by cron) - confirm.
my $archive_dir = 'web/dracos.co.uk/public_html/work/bbc-news-archive/';
chdir $archive_dir or die "Cannot chdir to $archive_dir: $!\n";

# Fetch current page; nothing to do if the fetch produced nothing.
my $web = get_bbc_news();
exit unless $web;

# Add disclaimer header. The <base> tag makes relative links in the stored
# copy resolve against the live site.
$web =~ s{(<head[^>]*>)}{$1<base href="http://www.bbc.co.uk/news/">};
my $timestr = strftime('%H:%M on %d %B %Y', localtime());
$web =~ s{(<body.*?>)}{$1<div id="MSdisclaimer" style="border: solid 1px black; padding: 5px; margin: 5px;">This is <em>not</em> the official BBC News site; it is just a prototype implementing a <a href="http://backstage.bbc.co.uk/">backstage.bbc.co.uk</a> idea.<br>This page is an archive of the BBC News front page as it was at <strong>$timestr</strong>.</div>};

# Fetch old latest: 'latest' contains the path of the newest snapshot.
my $latest = read_file('latest') || '';
$latest =~ s/^\s+//;
$latest =~ s/\s+$//;

# On the very first run there is no previous snapshot, so do not try to read
# a file with an empty name.
my $latest_file = '';
if (length $latest) {
    $latest_file = read_file($latest) || '';
}

# Check if new one is different, store if so
if (my $diff = different($web, $latest_file)) {
    makepath('%Y', '%m', '%d');
    my $date     = strftime('%Y/%m/%d/%H.%M', localtime());
    my $path     = $date . '.html';
    my $pathdiff = $date . '.diff';
    output($path, $web);
    output('latest', $path);
    output($pathdiff, $diff);
}
# Compare a freshly fetched page with the previously stored one.
#
# Both pages are first passed through cleanup() so that only the main content
# is compared, not the surrounding header and footer.
#
# Parameters:
#   $new - HTML of the page just fetched
#   $old - HTML of the last stored snapshot (may be the empty string)
#
# Returns the result of Text::Diff's diff() in 'OldStyle' format, going from
# the old content to the new. The caller uses this value both as a truth test
# (were there changes?) and as the text stored in the .diff file.
sub different {
    my ($new, $old) = @_;
    $new = cleanup($new);
    $old = cleanup($old);
    # Explicit parentheses so the call parses the same way whether or not
    # diff's prototype is visible at compile time.
    my $diff = diff(\$old, \$new, { STYLE => 'OldStyle' });
    return $diff;
}
# Reduce a page to the part that is worth comparing between snapshots.
# This could do more.
#
# Parameters:
#   $s - HTML of a page
#
# Returns a copy of $s with
#   * everything up to and including the text '<div id="main-content"'
#     removed (the match is greedy, so if the marker occurs more than once
#     the last occurrence wins; the rest of that opening tag is kept), and
#   * everything from the first '<div id="most-popular-promotion"' to the
#     end of the string removed.
# Text that contains neither marker is returned unchanged.
sub cleanup {
    my ($s) = @_;
    # /s lets '.' match newlines, so each substitution spans multiple lines.
    $s =~ s#^.*<div id="main-content"##s;          # Header
    $s =~ s#<div id="most-popular-promotion".*##s; # Footer
    return $s;
}
# Create a hierarchy to store the snapshots, and include script to do the
# front end display.
#
# Parameters:
#   @bits - strftime() format strings, one per directory level, outermost
#           first (e.g. '%Y', '%m', '%d')
#
# For each level, the directory is created if it does not exist yet and an
# index.php that includes bbcnews/list.php is written into it via output().
# Nothing useful is returned.
sub makepath {
    my @bits = @_;

    # Sample the clock once, so that all levels describe the same instant
    # even if the call straddles midnight, a month end or a year end.
    my @now = localtime();

    my @path;
    foreach my $bit (@bits) {
        push @path, strftime($bit, @now);
        my $dir = join('/', @path);

        # The directory usually exists already from an earlier run; only
        # report a problem when it is missing and cannot be created.
        if (!-d $dir) {
            mkdir($dir) or warn "Cannot create directory $dir: $!\n";
        }

        output($dir . '/index.php', '<? include "bbcnews/list.php"; ?>');
    }
    return;
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment