Skip to content

Instantly share code, notes, and snippets.

@kurtraschke
Created November 22, 2010 02:43
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save kurtraschke/709447 to your computer and use it in GitHub Desktop.
Save kurtraschke/709447 to your computer and use it in GitHub Desktop.
NRC ENR Atom feed generator
#!/usr/local/bin/perl -T
#NRC ENR Atom feed generator
#Kurt Raschke
#kurt@kurtraschke.com
#Resources:
#http://linuxgazette.net/108/oregan2.html
#http://linuxgazette.net/109/oregan.html
#http://ttul.org/repos/cpan/trunk/Kwiki-Atom/lib/Kwiki/Atom.pm
#http://japhy.perlmonk.org/sexeger/sexeger.html
#http://www.ahinea.com/en/tech/perl-unicode-struggle.html
#Copyright (c) 2005 Kurt Raschke
#Permission is hereby granted, free of charge, to any person obtaining a
#copy of this software and associated documentation files (the "Software"),
#to deal in the Software without restriction, including without limitation
#the rights to use, copy, modify, merge, publish, distribute, sublicense,
#and/or sell copies of the Software, and to permit persons to whom the
#Software is furnished to do so, subject to the following conditions:
#The above copyright notice and this permission notice shall be included in
#all copies or substantial portions of the Software.
#THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
#IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
#FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
#THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
#LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
#FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
#DEALINGS IN THE SOFTWARE.
use strict;
use XML::Atom::Feed;
use XML::Atom::Entry;
use XML::Atom::Link;
use XML::Atom::Person;
use XML::Atom::Content;
use LWP::Simple;
use HTML::Entities;
use HTML::TokeParser::Simple;
use HTML::TableExtract;
use DateTime;
use Encode;
use Cache::FileCache;
use CGI qw/:standard/;
use CGI::Carp qw(fatalsToBrowser);
# Everything printed to STDOUT passes through a UTF-8 encoding layer.
binmode(STDOUT, ":encoding(UTF-8)");

# Generated feeds are kept in a file cache for an hour, keyed by source URL,
# so that repeated requests do not re-fetch and re-parse the NRC page.
my $cache = Cache::FileCache->new({"namespace" => "nrc-atom", "default_expires_in" => 3600});
$cache->Purge();

# Default report page; callers may select another one with the "url"
# CGI parameter.
my $url = "http://www.nrc.gov/reading-rm/doc-collections/event-status/event/en.html";
if (my $purl = param('url'))
{
    # Only pages below the NRC event-report directory are accepted.  The dots
    # in the host name are escaped so they match literally (an unescaped "."
    # would also accept look-alike hosts such as "www-nrc.gov"), the whole
    # value must match, and the URL is taken from the capture so that it is
    # untainted when running under -T.
    if ($purl =~ m{\A(http://www\.nrc\.gov/reading-rm/doc-collections/event-status/event\S*)\z})
    {
        $url = $1;
    }
    else
    {
        die "The URL parameter given ($purl) is invalid and will not be parsed.";
    }
}

my $feed = $cache->get($url);
if (not defined $feed) {
    my $page = get($url);

    # LWP::Simple::get returns undef when the fetch fails.  Stop here rather
    # than building (and caching for an hour) a feed from an empty page.
    die "Unable to retrieve $url" unless defined $page;

    $page = Encode::decode('iso-8859-1', $page);
    $feed = parsepage($page);
    $cache->set($url, $feed);
}

print "Content-Type: application/atom+xml\r\n\r\n";
print $feed;
# parsepage($html)
#
# Builds an Atom 0.3 feed from the (already decoded) HTML of an NRC Event
# Notification Report page and returns the feed serialized as XML.
#
# The page is scanned for anchors whose name attribute looks like "enNNNNN".
# Everything from the first <table> after such an anchor up to the last
# </TABLE> before the next <a> tag becomes the HTML content of one entry.
# Author, notification date and notification time are pulled out of that
# HTML with regular expressions; the title is the first line of the first
# cell of the last table in it.
#
# Uses the file-level $url for the feed link and the per-entry links.
# Entries whose notification date/time cannot be parsed are skipped with a
# warning instead of aborting the whole feed.
sub parsepage
{
    my $toparse = shift(@_);
    my $stream = HTML::TokeParser::Simple->new(\$toparse);

    my $atom = XML::Atom::Feed->new;
    $atom->title('NRC: Current Event Notification Report');
    $atom->tagline('Event Notification Report from the U.S. Nuclear Regulatory Commission Operations Center');
    my $link = XML::Atom::Link->new;
    $link->type('text/html');
    $link->rel('alternate');
    $link->href($url);
    $atom->add_link($link);
    $atom->version('0.3');
    $atom->modified(DateTime->now()->iso8601().'Z');

    ENTRY:
    while (my $tag = $stream->get_tag('a'))
    {
        my $enrid = $tag->return_attr("name");
        next ENTRY unless defined $enrid && $enrid =~ /en\d{5}/;

        # The report body starts at the next table.  If the document ends
        # first there is nothing left to turn into an entry.
        $tag = $stream->get_tag('table');
        last ENTRY unless defined $tag;

        # Collect the raw HTML up to (not including) the next <a> tag, or up
        # to the end of the document if no further anchor follows.
        my $content_st = "";
        while (defined $tag && !$tag->is_tag('a'))
        {
            $content_st .= $tag->as_is;
            $tag = $stream->get_token;
        }
        # Hand the anchor back so the outer loop sees it on its next pass.
        $stream->unget_token($tag) if defined $tag;

        # Drop whatever trails the last closing </TABLE>.
        my $table_end = rindex($content_st, '</TABLE>');
        if ($table_end >= 0)
        {
            $content_st = substr($content_st, 0, $table_end + length('</TABLE>'));
        }

        # Capture in list context so that a failed match yields undef
        # instead of a stale $1 left over from an earlier match.
        my ($author_st) = $content_st =~ /NRC Notified By: (.*?)<BR>/;
        my $author = XML::Atom::Person->new;
        $author->name(defined $author_st ? $author_st : 'Unknown');

        # Digits only, so trailing text on the same line (for example a
        # time-zone marker after the time) cannot leak into the fields.
        my ($month, $day, $year) = $content_st =~ m{Notification Date:\s*(\d+)/(\d+)/(\d+)};
        my ($hour, $minute)      = $content_st =~ m{Notification Time:\s*(\d+):(\d+)};

        # Notification times are interpreted as US Eastern time.  DateTime
        # dies on impossible values, so trap that and skip just this entry.
        my $dt;
        if (defined $year && defined $minute)
        {
            $dt = eval {
                DateTime->new( year      => $year,
                               month     => $month,
                               day       => $day,
                               hour      => $hour,
                               minute    => $minute,
                               second    => 0,
                               time_zone => 'America/New_York',
                             );
            };
        }
        unless (defined $dt)
        {
            warn "Skipping $enrid: notification date/time missing or invalid";
            next ENTRY;
        }
        $dt->set_time_zone('UTC');

        # Title: first line of the first cell of the last depth-0 table.
        my $te = HTML::TableExtract->new;
        $te->parse($content_st);
        my @tc = $te->tables();
        my $last_table = @tc ? $te->table(0, $#tc) : undef;
        my $title = '';
        if (defined $last_table)
        {
            my @rows = $last_table->rows;
            my $cell = (@rows && defined $rows[0]) ? $rows[0][0] : undef;
            if (defined $cell)
            {
                my @lines = split("\n", $cell);
                $title = $lines[0] if defined $lines[0];
                # Strip a trailing CR or other whitespace; unlike chop this
                # never removes a real character from the title.
                $title =~ s/\s+\z//;
            }
        }

        my $en = $enrid;
        $en =~ s/en(\d{5})/EN #$1/;

        my $entry = XML::Atom::Entry->new;
        $entry->title($en.": ".$title);

        my $itemlink = XML::Atom::Link->new;
        $itemlink->type('text/html');
        $itemlink->rel('alternate');
        $itemlink->href($url.'#'.$enrid);
        $entry->add_link($itemlink);
        $entry->author($author);

        # The id must stay the same every time the feed is generated, so it
        # is derived from the notification date rather than from today's
        # date (which would give every entry a new id each day).
        $entry->id('tag:nrc.gov,'.$dt->ymd('-').':'.$enrid);
        $entry->issued($dt->iso8601().'Z');
        $entry->modified($dt->iso8601().'Z');

        # Attach the HTML as a single text node marked mode="escaped"; the
        # text node class depends on which XML backend XML::Atom is using.
        my $content = XML::Atom::Content->new;
        my $text = ($content->LIBXML) ? 'XML::LibXML::Text' : 'XML::XPath::Node::Text';
        my $elem = $content->elem;
        $content->type('text/html');
        $elem->appendChild($text->new($content_st));
        $elem->setAttribute('mode', 'escaped');
        $entry->content($content);

        $atom->add_entry($entry);
    }

    return $atom->as_xml;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment