Skip to content

Instantly share code, notes, and snippets.

@kimarx
Created October 27, 2010 17:02
Show Gist options
  • Save kimarx/649457 to your computer and use it in GitHub Desktop.
Save kimarx/649457 to your computer and use it in GitHub Desktop.
#!/usr/bin/perl
### Last Update: 2010-10-28 01:32+09:00.
### smart_nytfeed.pl: This script fetches a given Atom feed of NYT and
### reconstructs a smarter RSS feed file from it. However, for the time
### being, it does not fetch any articles from NYT's blogs.
### Kim, Yi-Chul < kimarx@gmail.com >
use strict;
use warnings;
use utf8;
use LWP::Simple;
use XML::Simple;
use Encode;
use HTML::TreeBuilder;
use XML::RSS;
use Data::Dumper;
### Configure the feed URI which you want this script to fetch in the
### following variable.
my $feed_uri = 'http://feeds.nytimes.com/nyt/rss/Books';

## Fetch and encode!
## LWP::Simple's get() returns undef when the download fails, so stop here
## with a clear message instead of passing undef to the encoder and parser.
my $atom_feed = get( $feed_uri );
die "Could not fetch feed: $feed_uri\n" unless defined $atom_feed;
$atom_feed = encode( 'utf8', $atom_feed );

## Channel-level metadata, the article links, and the dates embedded in them.
my ( $header_title, $header_link, $header_language ) = get_atom_header( $atom_feed );
my @feed_links  = get_all_links( $atom_feed );
my @nyt_a_dates = make_date_list( @feed_links );
my $l_ref = \@feed_links;
my $d_ref = \@nyt_a_dates;

## Download every article and extract its title and body.
my ( $t_ref, $b_ref ) = wget_and_parse_articles( @feed_links );

## Write the rebuilt feed.
create_rss(
    $header_title,
    $header_link,
    $header_language,
    $t_ref,
    $l_ref,
    $b_ref,
    $d_ref,
);
### Build an RSS 1.0 document from the collected data and save it as
### ./nyt.rdf.
###
### Positional arguments:
###   $rss_title, $rss_link, $rss_language
###                        - channel title, channel link and dc:language
###   $rss_item_titles_ref - array ref of item titles
###   $rss_item_links_ref  - array ref of item links (determines item count)
###   $rss_item_bodies_ref - array ref of item descriptions
###   $rss_item_dates_ref  - array ref of item dates; accepted but not
###                          written out (the dc:date line below is disabled)
### Titles, links and bodies are read in parallel by index.
### Returns whatever XML::RSS's save() returns.
sub create_rss
{
    my ( $rss_title,
         $rss_link,
         $rss_language,
         $rss_item_titles_ref,
         $rss_item_links_ref,
         $rss_item_bodies_ref,
         $rss_item_dates_ref ) = @_;

    # Pass the version as a string: the numeric literal 1.0 stringifies to
    # "1", while XML::RSS documents its versions as strings such as '1.0'.
    my $rss = XML::RSS->new( version => '1.0' );

    $rss->channel(
        title => $rss_title,
        link  => $rss_link,
        dc    => {
            language => $rss_language,
        },
    );

    # Walk the parallel arrays through their references; no copies needed.
    for my $n ( 0 .. $#{ $rss_item_links_ref } ) {
        $rss->add_item(
            'title'       => $rss_item_titles_ref->[$n],
            'link'        => $rss_item_links_ref->[$n],
            'description' => $rss_item_bodies_ref->[$n],
            # 'dc:date' => $rss_item_dates_ref->[$n],
        );
    }

    return $rss->save( "./nyt.rdf" );
}
# my @nyt_a_titles = @$t_ref;
# my @nyt_a_bodies = @$b_ref;
# print Dumper @nyt_a_titles;
# print Dumper @nyt_a_bodies;
### Pull the "YYYY/MM/DD" date out of each given URI and return the dates,
### in input order, rewritten as "YYYY-MM-DD". A URI that carries no such
### date contributes nothing to the returned list.
sub make_date_list
{
    my @uris = @_;
    my @dates;

    for my $uri ( @uris ) {
        next unless $uri =~ m{(\d{4}/\d{2}/\d{2})};
        my $date = $1;
        ## Replace '/' with '-'.
        $date =~ tr{/}{-};
        push @dates, $date;
    }

    return @dates;
}
### XML parser: hand the given XML text to XML::Simple's XMLin (default
### options) and return the resulting data structure.
sub xml_parser
{
    my ( $atom ) = @_;
    my $parsed = XML::Simple->new->XMLin( $atom );
    return $parsed;
}
### Parse the feed text and return the channel's title, link and language
### as a three-element list, in that order.
sub get_atom_header
{
    my ( $xml_content ) = @_;
    my $channel = xml_parser( $xml_content )->{channel};
    return ( $channel->{title}, $channel->{link}, $channel->{language} );
}
### Parse the feed text and return the list of original article links
### (the 'pheedo:origLink' of each item) that point to a dated
### www.nytimes.com article, with any "?partner=..." suffix removed.
### Items lacking a usable link are skipped.
sub get_all_links
{
    my $atom_content = shift;
    my $data = xml_parser( $atom_content );
    my $all_items = $data->{channel}->{item};

    # Without ForceArray, XML::Simple represents a lone <item> as a hash ref
    # rather than a one-element array ref; normalise both shapes to a list.
    my @items = ref($all_items) eq 'ARRAY' ? @{ $all_items }
              : ref($all_items) eq 'HASH'  ? ( $all_items )
              :                              ();

    my @all_links;
    for my $item ( @items ) {
        next unless ref($item) eq 'HASH';
        my $original_link = $item->{'pheedo:origLink'};

        # Skip items with no link; an empty element is parsed as a reference
        # (empty hash), which is not a usable link either.
        next unless defined $original_link && !ref $original_link;

        $original_link =~ s|\?partner=.*||; # Remove unnecessary characters.

        # Dots are escaped so that only the literal host name matches.
        next unless $original_link =~ m{www\.nytimes\.com/\d{4}/\d{2}/\d{2}};
        push @all_links, $original_link;
    }
    return @all_links;
}
### Download each given article URI, extract its title and body, and remove
### the downloaded file afterwards.
### Returns two array refs: ( \@titles, \@bodies ), in input order.
sub wget_and_parse_articles
{
    my @nyt_links = @_;
    my ( @titles, @bodies );

    for my $nyt_link ( @nyt_links ) {
        my $f_path = save_nyt_article_by_wget( $nyt_link );
        my ( $a_title, $a_body ) = nyt_html_parser( $f_path );
        push @titles, $a_title;
        push @bodies, $a_body;

        # Remove the temporary file with unlink instead of a shell "rm", so
        # the feed-derived file name is never interpreted by a shell.
        unlink $f_path or warn "Could not remove $f_path: $!\n";
    }

    return ( \@titles, \@bodies );
}
### Parse a given HTML content of NYT.
### Takes the path of a saved article page and returns ( $title, $body ):
### the text of the first <h1> inside the element with id="article", and
### the HTML of every <p> inside that element joined by single spaces.
### If the file cannot be opened or has no id="article" element, a warning
### is issued and two empty strings are returned; a missing <h1> yields an
### empty title.
sub nyt_html_parser
{
    my $html_file_path = shift;

    # Keep the root in its own variable so the whole tree can be deleted
    # at the end, not just the article subtree.
    my $root = HTML::TreeBuilder->new;

    # parse_file returns undef when the file cannot be opened.
    if ( !defined $root->parse_file( $html_file_path ) ) {
        warn "Could not parse $html_file_path: $!\n";
        $root->delete;
        return ( q{}, q{} );
    }

    my $article = $root->look_down( 'id', 'article' );
    if ( !defined $article ) {
        warn "No element with id=\"article\" in $html_file_path\n";
        $root->delete;
        return ( q{}, q{} );
    }

    ## The title of this article.
    my $heading = $article->find( 'h1' );
    my $article_title = defined $heading ? $heading->as_text : q{};

    my @paragraphs;
    foreach my $ptag ( $article->find( 'p' ) ) {
        push @paragraphs, encode( 'utf8', $ptag->as_HTML );
    }
    my $article_body = join q{ }, @paragraphs;

    $root->delete;
    return ( $article_title, $article_body );
}
### Apparently no means other than wget can fetch NYT articles.
### Downloads the given article URI into the current directory, named after
### the last path segment of the URI, and returns the path of that file.
### A failed download only produces a warning; the path is still returned.
sub save_nyt_article_by_wget
{
    my $article_uri = shift;
    my $article_file_name = get_html_file_name( $article_uri );
    my $article_file_path = './' . $article_file_name;

    # List-form system() bypasses the shell, so characters such as '&', '?'
    # or ';' in the feed-supplied URI cannot be interpreted as shell syntax.
    # The '--' keeps a URI that starts with '-' from being read as an option.
    my $status = system( 'wget', '-q', '-O', $article_file_path, '--', $article_uri );
    warn "wget failed for $article_uri (status $status)\n" if $status != 0;

    return $article_file_path;
}
### Return the last '/'-separated segment of the given URI, i.e. the file
### name part. Trailing slashes are ignored by split, so the last non-empty
### segment is returned.
sub get_html_file_name
{
    my ( $raw_uri ) = @_;
    my $last_segment = ( split m{/}, $raw_uri )[-1];
    return $last_segment;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment