Skip to content

Instantly share code, notes, and snippets.

@jikamens
Last active December 11, 2015 16:08
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save jikamens/4625596 to your computer and use it in GitHub Desktop.
Save jikamens/4625596 to your computer and use it in GitHub Desktop.
#!/usr/bin/perl
# CGI script for generating an RSS feed of the comment on a Hacker
# News posting.
#
# Copyright (c) 2013 Jonathan Kamens <jik@kamens.us>.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# General Public License for more details.
#
# See <http://www.gnu.org/licenses/>.
#
# Published at <https://gist.github.com/4625596>.
use strict;
use warnings;
use CGI ':standard';
use CGI::Carp 'fatalsToBrowser';
use Data::Dumper;
# Alas, the server I'm using this on has only perl 5.8. If you have
# the newer Date::Manip::Date, you can update the script to use it by
# searching for "DMD" and making the appropriate edits.
# DMD
# use Date::Manip::Date;
use Date::Manip;
use HTML::TreeBuilder;
use LWP::UserAgent;
use XML::RSS;
my $base = 'https://news.ycombinator.com';
my $title_limit = 80;
# Set this to undef if you don't want to cache dates. Caching dates
# is necessary because the dates we get back from HN are vague, e.g.,
# "4 hours ago".
my $date_cache_file = '/var/www/tmp/hn-comment-feed.cache';
my $date_cache_timeout = 60 * 60 * 24 * 31; # 1 month
my $now = time();
my %date_cache;
&read_date_cache();
my $id = param('id');
die "No post id specified\n" if (! $id);
die "Bad id '$id'\n" if ($id !~ /^\d+$/);
my $ua = LWP::UserAgent->new();
my $url = "$base/item?id=$id";
my $response = $ua->get($url);
die "Failed to fetch $url\n" if (! $response->is_success);
my $tb = HTML::TreeBuilder->new();
$tb->parse_content($response->decoded_content());
my @comments;
my $title = $tb->find_by_tag_name('title')->as_text();
die "No title\n" if (! $title);
$title = "Comment feed for: $title";
my(@elements) = $tb->descendants();
foreach my $element (@elements) {
my $class = $element->attr('class');
if ($class and $class eq 'comhead') {
while (@comments and ! $comments[-1]{'comment'}) {
pop @comments;
}
push(@comments, {'comhead' => $element});
}
if ($class and $class eq 'comment') {
next if (! @comments);
$comments[-1]{'comment'} = $element;
}
}
# If there are no comments yet, there will be one comhead with no
# associated comment (because the HN formatting asininely uses the
# "comhead" class for both the site name in the post header and for
# comment headers. Hey, HN maintainers, learn how to use CSS properly!
if (@comments == 1 and ! $comments[0]->{'comment'}) {
@comments = ();
}
foreach my $comment (@comments) {
@elements = $comment->{'comhead'}->content_list();
# Structure of comhead is link to user, post time, link to comment
die "Bad comhead\n" if (@elements != 3);
my($author_tag) = $elements[0];
my($age) = $elements[1];
my($comment_link) = $elements[2];
my($author_link) = $author_tag->attr('href');
die "Bad author link\n" if (! $author_link);
$author_link = "$base/$author_link";
@elements = $author_tag->content_list();
die "Bad author tag\n" if (@elements != 1);
my($author_html) = ref $elements[0] ? $elements[0]->as_HTML() : $elements[0];
my($author_name) = ref $elements[0] ? $elements[0]->as_text() : $elements[0];
$comment_link = $comment_link->attr('href');
die "Bad comment link\n" if (! $comment_link);
$comment_link = "$base/$comment_link";
die "Bad comment age\n" if (ref $age);
die "Bad comment age\n" if ($age !~ s/\s*\|\s*$//);
# DMD
# my $date = Date::Manip::Date->new($age);
# die "Bad comment age\n" if ($date->err);
# $date->convert('UTC');
# $date = $date->printf('%O+00:00');
my $date = ParseDate($age);
die "Bad comment age\n" if (! $date);
$date = Date_ConvTZ($date, '', 'UTC');
$date = UnixDate($date, '%O+00:00');
$date = &get_date_cache($comment_link, $date);
@elements = $comment->{'comment'}->content_list();
my $content = '';
for (@elements) {
if (ref) {
$content .= $_->as_HTML();
}
else {
$content .= $_;
}
}
my($title) = $comment->{'comment'}->as_text();
if (length($title) > $title_limit) {
$title = substr($title, 0, $title_limit-3) . '...';
}
$comment->{'author_link'} = $author_link;
$comment->{'author_html'} = $author_html;
$comment->{'author_name'} = $author_name;
$comment->{'age'} = $age;
$comment->{'date'} = $date;
$comment->{'comment_link'} = $comment_link;
$comment->{'content'} = $content;
$comment->{'title'} = $title;
}
&write_date_cache();
@comments = sort { $b->{'date'} cmp $a->{'date'} } @comments;
my $rss = XML::RSS->new();
$rss->channel(title => $title, link => $url);
for (@comments) {
my $creator_blob = "<a href='$_->{author_link}'>$_->{author_html}</a>";
$rss->add_item
(
title => $_->{'title'},
link => $_->{'comment_link'},
description => $_->{'content'} . "<p>(by $creator_blob, $_->{'age'})</p>",
dc => {
date => $_->{'date'},
creator => $_->{'author_name'},
});
}
print header('application/rss+xml');
print $rss->as_string;
$tb->delete();
sub read_date_cache {
return if (! $date_cache_file);
return if (! -f $date_cache_file);
open(CACHE ,'<', $date_cache_file) or die "Error reading date cache\n";
while (<CACHE>) {
chomp;
my($url, $date, $last_used) = split;
$date_cache{$url} = {
date => $date,
last_used => $last_used
};
}
close(CACHE) or die "Error reading date cache\n";
}
sub get_date_cache {
my($url, $date) = @_;
$date_cache{$url}{'last_used'} = $now;
if ($date_cache{$url}{'date'}) {
return $date_cache{$url}{'date'};
}
else {
return $date_cache{$url}{'date'} = $date;
}
}
sub write_date_cache {
return if (! $date_cache_file);
my $new = "$date_cache_file.new";
open(CACHE, '>', $new) or die "Error writing date cache\n";
foreach my $url (keys %date_cache) {
next if ($now - $date_cache{$url}->{'last_used'} > $date_cache_timeout);
print(CACHE "$url $date_cache{$url}{'date'} $date_cache{$url}{'last_used'}\n")
or die "Error writing date cache\n";
}
close(CACHE) or die "Error writing date cache\n";
rename($new, $date_cache_file) or die "Error writing date cache\n";
}
# CHANGES
#
# 2013/02/01 - Don't barf on postings with no comments.
# 2013/01/24 - Creator name in the XML should not contain HTML.
# 2013/01/24 - Creator name in the XML should not be a link.
# 2013/01/24 - Initial release.
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment