jikamens/hn-comment-feed.cgi

## hn-comment-feed.cgi
#!/usr/bin/perl

# CGI script for generating an RSS feed of the comment on a Hacker
# News posting.
#
# Copyright (c) 2013 Jonathan Kamens <jik@kamens.us>.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# General Public License for more details.
#
# See <http://www.gnu.org/licenses/>.
#
# Published at <https://gist.github.com/4625596>.


use strict;
use warnings;

use CGI ':standard';
use CGI::Carp 'fatalsToBrowser';
use Data::Dumper;
# Alas, the server I'm using this on has only perl 5.8. If you have
# the newer Date::Manip::Date, you can update the script to use it by
# searching for "DMD" and making the appropriate edits.
# DMD
# use Date::Manip::Date;
use Date::Manip;
use HTML::TreeBuilder;
use LWP::UserAgent;
use XML::RSS;

my $base = 'https://news.ycombinator.com';
my $title_limit = 80;
# Set this to undef if you don't want to cache dates. Caching dates
# is necessary because the dates we get back from HN are vague, e.g.,
# "4 hours ago".
my $date_cache_file = '/var/www/tmp/hn-comment-feed.cache';
my $date_cache_timeout = 60 * 60 * 24 * 31; # 1 month

my $now = time();

my %date_cache;
&read_date_cache();

my $id = param('id');
die "No post id specified\n" if (! $id);
die "Bad id '$id'\n" if ($id !~ /^\d+$/);

my $ua = LWP::UserAgent->new();

my $url = "$base/item?id=$id";
my $response = $ua->get($url);
die "Failed to fetch $url\n" if (! $response->is_success);

my $tb = HTML::TreeBuilder->new();
$tb->parse_content($response->decoded_content());

my @comments;

my $title = $tb->find_by_tag_name('title')->as_text();
die "No title\n" if (! $title);
$title = "Comment feed for: $title";

my(@elements) = $tb->descendants();
foreach my $element (@elements) {
    my $class = $element->attr('class');
    if ($class and $class eq 'comhead') {
	while (@comments and ! $comments[-1]{'comment'}) {
	    pop @comments;
	}
	push(@comments, {'comhead' => $element});
    }
    if ($class and $class eq 'comment') {
	next if (! @comments);
	$comments[-1]{'comment'} = $element;
    }
}

# If there are no comments yet, there will be one comhead with no
# associated comment (because the HN formatting asininely uses the
# "comhead" class for both the site name in the post header and for
# comment headers. Hey, HN maintainers, learn how to use CSS properly!
if (@comments == 1 and ! $comments[0]->{'comment'}) {
    @comments = ();
}

foreach my $comment (@comments) {
    @elements = $comment->{'comhead'}->content_list();
    # Structure of comhead is link to user, post time, link to comment
    die "Bad comhead\n" if (@elements != 3);

    my($author_tag) = $elements[0];
    my($age) = $elements[1];
    my($comment_link) = $elements[2];

    my($author_link) = $author_tag->attr('href');
    die "Bad author link\n" if (! $author_link);
    $author_link = "$base/$author_link";

    @elements = $author_tag->content_list();
    die "Bad author tag\n" if (@elements != 1);
    my($author_html) = ref $elements[0] ? $elements[0]->as_HTML() : $elements[0];
    my($author_name) = ref $elements[0] ? $elements[0]->as_text() : $elements[0];

    $comment_link = $comment_link->attr('href');
    die "Bad comment link\n" if (! $comment_link);
    $comment_link = "$base/$comment_link";

    die "Bad comment age\n" if (ref $age);
    die "Bad comment age\n" if ($age !~ s/\s*\|\s*$//);
    # DMD
    # my $date = Date::Manip::Date->new($age);
    # die "Bad comment age\n" if ($date->err);
    # $date->convert('UTC');
    # $date = $date->printf('%O+00:00');
    my $date = ParseDate($age);
    die "Bad comment age\n" if (! $date);
    $date = Date_ConvTZ($date, '', 'UTC');
    $date = UnixDate($date, '%O+00:00');

    $date = &get_date_cache($comment_link, $date);

    @elements = $comment->{'comment'}->content_list();
    my $content = '';
    for (@elements) {
	if (ref) {
	    $content .= $_->as_HTML();
	}
	else {
	    $content .= $_;
	}
    }

    my($title) = $comment->{'comment'}->as_text();
    if (length($title) > $title_limit) {
	$title = substr($title, 0, $title_limit-3) . '...';
    }

    $comment->{'author_link'} = $author_link;
    $comment->{'author_html'} = $author_html;
    $comment->{'author_name'} = $author_name;
    $comment->{'age'} = $age;
    $comment->{'date'} = $date;
    $comment->{'comment_link'} = $comment_link;
    $comment->{'content'} = $content;
    $comment->{'title'} = $title;
}

&write_date_cache();

@comments = sort { $b->{'date'} cmp $a->{'date'} } @comments;

my $rss = XML::RSS->new();

$rss->channel(title => $title, link => $url);

for (@comments) {
    my $creator_blob = "<a href='$_->{author_link}'>$_->{author_html}</a>";

    $rss->add_item
	(
	 title       => $_->{'title'},
	 link        => $_->{'comment_link'},
	 description => $_->{'content'} . "<p>(by $creator_blob, $_->{'age'})</p>",
	 dc => {
	     date    => $_->{'date'},
	     creator => $_->{'author_name'},
	 });
}

print header('application/rss+xml');
print $rss->as_string;

$tb->delete();

sub read_date_cache {
    return if (! $date_cache_file);
    return if (! -f $date_cache_file);
    open(CACHE ,'<', $date_cache_file) or die "Error reading date cache\n";
    while (<CACHE>) {
	chomp;
	my($url, $date, $last_used) = split;
	$date_cache{$url} = {
	    date => $date,
	    last_used => $last_used
	    };
    }
    close(CACHE) or die "Error reading date cache\n";
}

sub get_date_cache {
    my($url, $date) = @_;
    $date_cache{$url}{'last_used'} = $now;
    if ($date_cache{$url}{'date'}) {
	return $date_cache{$url}{'date'};
    }
    else {
	return $date_cache{$url}{'date'} = $date;
    }
}

sub write_date_cache {
    return if (! $date_cache_file);
    my $new = "$date_cache_file.new";
    open(CACHE, '>', $new) or die "Error writing date cache\n";
    foreach  my $url (keys %date_cache) {
	next if ($now - $date_cache{$url}->{'last_used'} > $date_cache_timeout);
	print(CACHE "$url $date_cache{$url}{'date'} $date_cache{$url}{'last_used'}\n")
	    or die "Error writing date cache\n";
    }
    close(CACHE) or die "Error writing date cache\n";
    rename($new, $date_cache_file) or die "Error writing date cache\n";
}

# CHANGES
#
# 2013/02/01 - Don't barf on postings with no comments.
# 2013/01/24 - Creator name in the XML should not contain HTML.
# 2013/01/24 - Creator name in the XML should not be a link.
# 2013/01/24 - Initial release.
	#!/usr/bin/perl

	# CGI script for generating an RSS feed of the comment on a Hacker
	# News posting.
	#
	# Copyright (c) 2013 Jonathan Kamens <jik@kamens.us>.
	#
	# This program is free software: you can redistribute it and/or modify
	# it under the terms of the GNU General Public License as published by
	# the Free Software Foundation, either version 3 of the License, or
	# (at your option) any later version.
	#
	# This program is distributed in the hope that it will be useful, but
	# WITHOUT ANY WARRANTY; without even the implied warranty of
	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
	# General Public License for more details.
	#
	# See <http://www.gnu.org/licenses/>.
	#
	# Published at <https://gist.github.com/4625596>.


	use strict;
	use warnings;

	use CGI ':standard';
	use CGI::Carp 'fatalsToBrowser';
	use Data::Dumper;
	# Alas, the server I'm using this on has only perl 5.8. If you have
	# the newer Date::Manip::Date, you can update the script to use it by
	# searching for "DMD" and making the appropriate edits.
	# DMD
	# use Date::Manip::Date;
	use Date::Manip;
	use HTML::TreeBuilder;
	use LWP::UserAgent;
	use XML::RSS;

	my $base = 'https://news.ycombinator.com';
	my $title_limit = 80;
	# Set this to undef if you don't want to cache dates. Caching dates
	# is necessary because the dates we get back from HN are vague, e.g.,
	# "4 hours ago".
	my $date_cache_file = '/var/www/tmp/hn-comment-feed.cache';
	my $date_cache_timeout = 60 * 60 * 24 * 31; # 1 month

	my $now = time();

	my %date_cache;
	&read_date_cache();

	my $id = param('id');
	die "No post id specified\n" if (! $id);
	die "Bad id '$id'\n" if ($id !~ /^\d+$/);

	my $ua = LWP::UserAgent->new();

	my $url = "$base/item?id=$id";
	my $response = $ua->get($url);
	die "Failed to fetch $url\n" if (! $response->is_success);

	my $tb = HTML::TreeBuilder->new();
	$tb->parse_content($response->decoded_content());

	my @comments;

	my $title = $tb->find_by_tag_name('title')->as_text();
	die "No title\n" if (! $title);
	$title = "Comment feed for: $title";

	my(@elements) = $tb->descendants();
	foreach my $element (@elements) {
	my $class = $element->attr('class');
	if ($class and $class eq 'comhead') {
	while (@comments and ! $comments[-1]{'comment'}) {
	pop @comments;
	}
	push(@comments, {'comhead' => $element});
	}
	if ($class and $class eq 'comment') {
	next if (! @comments);
	$comments[-1]{'comment'} = $element;
	}
	}

	# If there are no comments yet, there will be one comhead with no
	# associated comment (because the HN formatting asininely uses the
	# "comhead" class for both the site name in the post header and for
	# comment headers. Hey, HN maintainers, learn how to use CSS properly!
	if (@comments == 1 and ! $comments[0]->{'comment'}) {
	@comments = ();
	}

	foreach my $comment (@comments) {
	@elements = $comment->{'comhead'}->content_list();
	# Structure of comhead is link to user, post time, link to comment
	die "Bad comhead\n" if (@elements != 3);

	my($author_tag) = $elements[0];
	my($age) = $elements[1];
	my($comment_link) = $elements[2];

	my($author_link) = $author_tag->attr('href');
	die "Bad author link\n" if (! $author_link);
	$author_link = "$base/$author_link";

	@elements = $author_tag->content_list();
	die "Bad author tag\n" if (@elements != 1);
	my($author_html) = ref $elements[0] ? $elements[0]->as_HTML() : $elements[0];
	my($author_name) = ref $elements[0] ? $elements[0]->as_text() : $elements[0];

	$comment_link = $comment_link->attr('href');
	die "Bad comment link\n" if (! $comment_link);
	$comment_link = "$base/$comment_link";

	die "Bad comment age\n" if (ref $age);
	die "Bad comment age\n" if ($age !~ s/\s\\|\s$//);
	# DMD
	# my $date = Date::Manip::Date->new($age);
	# die "Bad comment age\n" if ($date->err);
	# $date->convert('UTC');
	# $date = $date->printf('%O+00:00');
	my $date = ParseDate($age);
	die "Bad comment age\n" if (! $date);
	$date = Date_ConvTZ($date, '', 'UTC');
	$date = UnixDate($date, '%O+00:00');

	$date = &get_date_cache($comment_link, $date);

	@elements = $comment->{'comment'}->content_list();
	my $content = '';
	for (@elements) {
	if (ref) {
	$content .= $_->as_HTML();
	}
	else {
	$content .= $_;
	}
	}

	my($title) = $comment->{'comment'}->as_text();
	if (length($title) > $title_limit) {
	$title = substr($title, 0, $title_limit-3) . '...';
	}

	$comment->{'author_link'} = $author_link;
	$comment->{'author_html'} = $author_html;
	$comment->{'author_name'} = $author_name;
	$comment->{'age'} = $age;
	$comment->{'date'} = $date;
	$comment->{'comment_link'} = $comment_link;
	$comment->{'content'} = $content;
	$comment->{'title'} = $title;
	}

	&write_date_cache();

	@comments = sort { $b->{'date'} cmp $a->{'date'} } @comments;

	my $rss = XML::RSS->new();

	$rss->channel(title => $title, link => $url);

	for (@comments) {
	my $creator_blob = "<a href='$_->{author_link}'>$_->{author_html}</a>";

	$rss->add_item
	(
	title => $_->{'title'},
	link => $_->{'comment_link'},
	description => $_->{'content'} . "<p>(by $creator_blob, $_->{'age'})</p>",
	dc => {
	date => $_->{'date'},
	creator => $_->{'author_name'},
	});
	}

	print header('application/rss+xml');
	print $rss->as_string;

	$tb->delete();

	sub read_date_cache {
	return if (! $date_cache_file);
	return if (! -f $date_cache_file);
	open(CACHE ,'<', $date_cache_file) or die "Error reading date cache\n";
	while (<CACHE>) {
	chomp;
	my($url, $date, $last_used) = split;
	$date_cache{$url} = {
	date => $date,
	last_used => $last_used
	};
	}
	close(CACHE) or die "Error reading date cache\n";
	}

	sub get_date_cache {
	my($url, $date) = @_;
	$date_cache{$url}{'last_used'} = $now;
	if ($date_cache{$url}{'date'}) {
	return $date_cache{$url}{'date'};
	}
	else {
	return $date_cache{$url}{'date'} = $date;
	}
	}

	sub write_date_cache {
	return if (! $date_cache_file);
	my $new = "$date_cache_file.new";
	open(CACHE, '>', $new) or die "Error writing date cache\n";
	foreach my $url (keys %date_cache) {
	next if ($now - $date_cache{$url}->{'last_used'} > $date_cache_timeout);
	print(CACHE "$url $date_cache{$url}{'date'} $date_cache{$url}{'last_used'}\n")
	or die "Error writing date cache\n";
	}
	close(CACHE) or die "Error writing date cache\n";
	rename($new, $date_cache_file) or die "Error writing date cache\n";
	}

	# CHANGES
	#
	# 2013/02/01 - Don't barf on postings with no comments.
	# 2013/01/24 - Creator name in the XML should not contain HTML.
	# 2013/01/24 - Creator name in the XML should not be a link.
	# 2013/01/24 - Initial release.