Skip to content

Instantly share code, notes, and snippets.

Created Jul 19, 2011
What would you like to do?
BBC News front page scraper
#!/usr/pkg/bin/perl -w
use strict;
use POSIX qw(strftime);
use Text::Diff;
# Main flow: fetch the current front page, stamp it with a disclaimer, and
# archive it (plus a diff against the previous snapshot) if it has changed.

# Every path below is relative to the archive directory. Stop if it is
# missing rather than scattering snapshots into the current directory.
chdir 'web/' or die "Cannot chdir to web/: $!";

# Fetch current page
my $web = get_bbc_news();
exit unless $web;

# Add disclaimer header
# NOTE(review): both href attributes below are empty in this copy of the
# script -- presumably the URLs were lost; confirm against the original.
$web =~ s{(<head[^>]*>)}{$1<base href="">};
my $timestr = strftime('%H:%M on %d %B %Y', localtime());
$web =~ s{(<body.*?>)}{$1<div id="MSdisclaimer" style="border: solid 1px black; padding: 5px; margin: 5px;">This is <em>not</em> the official BBC News site; it is just a prototype implementing a <a href=""></a> idea.<br>This page is an archive of the BBC News front page as it was at <strong>$timestr</strong>.</div>};

# Fetch old latest: the file 'latest' holds the relative path of the most
# recent snapshot, which is then loaded for comparison.
my $latest = read_file('latest') || '';
$latest =~ s/^\s+//;
$latest =~ s/\s+$//;
# On a first run there is no pointer yet; do not try to read a file named ''.
my $latest_file = $latest ne '' ? ( read_file($latest) || '' ) : '';

# Check if new one is different, store if so
if (my $diff = different($web, $latest_file)) {
    makepath('%Y', '%m', '%d');
    my $date     = strftime('%Y/%m/%d/%H.%M', localtime());
    my $path     = $date . '.html';
    my $pathdiff = $date . '.diff';
    output($path, $web);        # the snapshot itself
    output('latest', $path);    # repoint 'latest' at the new snapshot
    output($pathdiff, $diff);   # what changed since the previous snapshot
}

# Compare two versions of the page, ignoring the parts cleanup() strips.
#
#   $new - HTML of the freshly fetched page
#   $old - HTML of the previously stored page ('' if there is none)
#
# Returns the old-style diff text produced by Text::Diff's diff(), evaluated
# in scalar context. The caller treats a true value as "the page changed".
sub different {
    my ($new, $old) = @_;
    $new = cleanup($new);
    $old = cleanup($old);
    my $diff = diff \$old, \$new, { STYLE => 'OldStyle' };
    return $diff;
}

# This could do more
#
# Reduce a page to the part worth comparing.
#
#   $s - HTML text of a page
#
# Removes everything up to and including the last occurrence of the
# '<div id="main-content"' marker, and everything from the first
# '<div id="most-popular-promotion"' marker onwards. A marker that is not
# present leaves that end of the text untouched. Returns the trimmed text;
# an undefined argument yields ''.
sub cleanup {
    my ($s) = @_;
    return '' unless defined $s;
    $s =~ s#\A.*<div id="main-content"##s;             # Header
    $s =~ s#<div id="most-popular-promotion".*##s;     # Footer
    return $s;
}

# Create a hierarchy to store the snapshots, and include script to do the front end display
#
# Arguments: a list of strftime() format strings, one per directory level,
# outermost first (the caller passes '%Y', '%m', '%d').
#
# For each level, the formatted components so far are joined with '/' and an
# index.php that includes bbcnews/list.php is written there via output().
# Nothing here calls mkdir; presumably output() creates missing directories
# -- TODO confirm.
#
# NOTE(review): the closing braces were missing from the reviewed copy; the
# output() call is placed inside the loop (one index.php per level) on the
# strength of the description above -- confirm against the original.
sub makepath {
    my @bits = @_;
    my @now  = localtime();    # read the clock once so all levels agree
    my @path;
    foreach my $bit (@bits) {
        push @path, strftime($bit, @now);
        output(join('/', @path) . '/index.php', '<? include "bbcnews/list.php"; ?>');
    }
    return;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment