Skip to content

Instantly share code, notes, and snippets.

@dracos
Created Jul 19, 2011
Embed
What would you like to do?
BBC News front page scraper
#!/usr/pkg/bin/perl -w
use strict;
use POSIX qw(strftime);
use Text::Diff;
# Work inside the archive directory; every path used below is relative to it.
# Stop if it cannot be entered, so that snapshots are never written into
# whatever directory the script happened to be started from.
chdir 'web/dracos.co.uk/public_html/work/bbc-news-archive/'
    or die "Cannot chdir to archive directory: $!";

# Fetch current page; give up quietly if the fetch returned a false value.
my $web = get_bbc_news();
exit unless $web;

# Sample the clock once and use it for both the disclaimer text and the
# snapshot file name, so the two always describe the same moment.
my @snapshot_time = localtime();

# Point relative links at the real site.
$web =~ s{(<head[^>]*>)}{$1<base href="http://www.bbc.co.uk/news/">};

# Add disclaimer header
my $timestr = strftime('%H:%M on %d %B %Y', @snapshot_time);
$web =~ s{(<body.*?>)}{$1<div id="MSdisclaimer" style="border: solid 1px black; padding: 5px; margin: 5px;">This is <em>not</em> the official BBC News site; it is just a prototype implementing a <a href="http://backstage.bbc.co.uk/">backstage.bbc.co.uk</a> idea.<br>This page is an archive of the BBC News front page as it was at <strong>$timestr</strong>.</div>};

# Fetch old latest: 'latest' is rewritten below with the path of the newest
# snapshot, so its trimmed content names the file to compare against.
my $latest = read_file('latest') || '';
$latest =~ s/^\s+//;
$latest =~ s/\s+$//;

# Only follow the pointer when there is one (there is none on a first run);
# an empty name can never be a snapshot, so do not try to read it.
my $latest_file = length($latest) ? (read_file($latest) || '') : '';

# Check if new one is different, store if so
if (my $diff = different($web, $latest_file)) {
    makepath('%Y', '%m', '%d');
    my $date     = strftime('%Y/%m/%d/%H.%M', @snapshot_time);
    my $path     = $date . '.html';
    my $pathdiff = $date . '.diff';
    output($path, $web);         # the snapshot itself
    output('latest', $path);     # remember it as the newest snapshot
    output($pathdiff, $diff);    # what changed since the previous one
}
# Compare two versions of the page, ignoring the parts cleanup() removes.
#   $new - freshly fetched page
#   $old - previously stored page (may be empty)
# Returns whatever Text::Diff's diff() yields for the two cleaned strings in
# 'OldStyle' format; the caller treats a false value as "no change".
sub different {
    my ($new, $old) = @_;

    my $fresh  = cleanup($new);
    my $stored = cleanup($old);

    # Old text first, new text second, so the diff reads as old -> new.
    my $delta = diff(\$stored, \$fresh, { STYLE => 'OldStyle' });
    return $delta;
}
# Reduce a page to the part worth comparing. This could do more.
#   - drops everything up to and including the last '<div id="main-content"'
#   - drops everything from the first '<div id="most-popular-promotion"' on
# Takes the page text and returns the trimmed copy; the argument itself is
# left untouched.
sub cleanup {
    my ($html) = @_;

    # Topicalise the copy so both substitutions apply to it in turn.
    for ($html) {
        s{^.*<div id="main-content"}{}s;              # Header
        s{<div id="most-popular-promotion".*}{}s;     # Footer
    }

    return $html;
}
# Create a hierarchy to store the snapshots, and include script to do the front end display
#   @_ - strftime format strings, one per directory level (e.g. '%Y', '%m', '%d')
# Each level is created under the previous one, relative to the current
# directory, and gets an index.php that includes the listing script.
# Dies if a directory is missing and cannot be created.
sub makepath {
    my @bits = @_;

    # Sample the clock once so every level describes the same moment; calling
    # localtime() per level could mix components from either side of a date
    # boundary.
    my @now = localtime();

    my @path;
    foreach my $bit (@bits) {
        push @path, strftime($bit, @now);
        my $dir = join('/', @path);

        # An already-existing directory is the normal case and not an error;
        # anything else that stops mkdir is.
        mkdir($dir) or -d $dir or die "Cannot create directory '$dir': $!";

        output($dir . '/index.php', '<? include "bbcnews/list.php"; ?>');
    }
    return;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment