Created
October 27, 2010 17:02
-
-
Save kimarx/649457 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/perl
### Last Update: 2010-10-28 01:32+09:00.
### smart_nytfeed.pl: This script fetches a given Atom feed of NYT and
### reconstructs a smarter RSS feed file from it; however, for the time being,
### it does not fetch any articles from NYT's blogs.
### Kim, Yi-Chul < kimarx@gmail.com >
use strict;
use warnings;
use utf8;
use LWP::Simple;
use XML::Simple;
use Encode;
use HTML::TreeBuilder;
use XML::RSS;
use Data::Dumper;

### Configure the feed URI which you want this script to fetch to the following
### variable.
my $feed_uri = 'http://feeds.nytimes.com/nyt/rss/Books';

## Fetch the feed and re-encode the text to UTF-8 bytes before parsing.
my $atom_feed = encode( 'utf8', get($feed_uri) );

## Channel-level metadata plus the per-item parallel lists the RSS file needs.
my ( $header_title, $header_link, $header_language ) = get_atom_header($atom_feed);
my @feed_links  = get_all_links($atom_feed);
my @nyt_a_dates = make_date_list(@feed_links);
my ( $t_ref, $b_ref ) = wget_and_parse_articles(@feed_links);

create_rss(
    $header_title, $header_link, $header_language,
    $t_ref, \@feed_links, $b_ref, \@nyt_a_dates,
);
### Build an RSS 1.0 file ("./nyt.rdf") from the channel header values and
### the parallel per-item lists (titles, links, bodies, dates).  The dates
### are accepted but not yet emitted (the dc:date line stays disabled, as in
### the original feed logic).
sub create_rss
{
    my ( $rss_title, $rss_link, $rss_language,
         $rss_item_titles_ref, $rss_item_links_ref,
         $rss_item_bodies_ref, $rss_item_dates_ref ) = @_;

    my $rss = XML::RSS->new( version => 1.0 );
    $rss->channel(
        title => $rss_title,
        link  => $rss_link,
        dc    => { language => $rss_language },
    );

    ## The link list drives the item count; the other lists are parallel.
    foreach my $n ( 0 .. $#{$rss_item_links_ref} ) {
        $rss->add_item(
            'title'       => $rss_item_titles_ref->[$n],
            'link'        => $rss_item_links_ref->[$n],
            'description' => $rss_item_bodies_ref->[$n],
            # 'dc:date'   => $rss_item_dates_ref->[$n],
        );
    }

    $rss->save( "./nyt.rdf" );
}
# Debugging leftovers: dump the parsed titles/bodies when troubleshooting.
# my @nyt_a_titles = @$t_ref;
# my @nyt_a_bodies = @$b_ref;
# print Dumper @nyt_a_titles;
# print Dumper @nyt_a_bodies;
### Extract the YYYY/MM/DD date embedded in each NYT article URI and return
### the dates reformatted as YYYY-MM-DD.  URIs with no embedded date are
### skipped, so the result may be shorter than the input list.
sub make_date_list
{
    my @nyt_uris = @_;

    my @nyt_dates;
    foreach my $nyt_uri (@nyt_uris) {
        ## Capture the date instead of using $&, which imposes a global
        ## regex performance penalty on perls before 5.20.
        if ( $nyt_uri =~ m{( \d{4} / \d{2} / \d{2} )}x ) {
            ## Single pass: reformat immediately rather than looping twice.
            ( my $date = $1 ) =~ tr{/}{-};
            push @nyt_dates, $date;
        }
    }
    return @nyt_dates;
}
### XML Parser: feed the raw feed text through XML::Simple and return the
### resulting nested data structure.
sub xml_parser
{
    my ($atom) = @_;
    return XML::Simple->new->XMLin($atom);
}
### Pull the channel-level title, link and language out of the parsed feed
### and return them as a three-element list.
sub get_atom_header
{
    my ($xml_content) = @_;
    my $channel = xml_parser($xml_content)->{channel};
    ## Hash slice returns the three values in this exact order.
    return @{$channel}{qw(title link language)};
}
### Collect the original (non-Pheedo) article links from the feed, keeping
### only genuine dated NYT article URIs.
sub get_all_links
{
    my ($atom_content) = @_;
    my $data      = xml_parser($atom_content);
    my $all_items = $data->{channel}->{item};

    my @all_links;
    foreach my $item (@$all_items) {
        my $original_link = $item->{'pheedo:origLink'};
        ## Some items may lack an origLink element; skip them instead of
        ## triggering uninitialized-value warnings below.
        next unless defined $original_link;
        $original_link =~ s|\?partner=.*||;    # Remove unnecessary characters.
        ## Escape the literal dots: the original bare '.' would have
        ## matched any character (e.g. "wwwXnytimesXcom").
        if ( $original_link =~ m{www\.nytimes\.com/\d{4}/\d{2}/\d{2}} ) {
            push @all_links, $original_link;
        }
    }
    return @all_links;
}
### Download each article, parse out its title and body, and return two
### parallel array refs (titles, bodies).  Each temporary HTML file is
### removed after it has been parsed.
sub wget_and_parse_articles
{
    my @nyt_links = @_;

    my ( @titles, @bodies );
    foreach my $nyt_link (@nyt_links) {
        my $f_path = save_nyt_article_by_wget($nyt_link);
        my ( $a_title, $a_body ) = nyt_html_parser($f_path);
        push @titles, $a_title;
        push @bodies, $a_body;
        ## unlink avoids spawning a shell per file; the original
        ## system("rm $f_path") would also break on unusual path characters.
        unlink $f_path or warn "Could not remove $f_path: $!";
    }
    return ( \@titles, \@bodies );
}
### Parse a given HTML content of NYT.
### Returns (title, body): the <h1> text inside the element with
### id="article", and that element's <p> tags re-serialized as UTF-8 HTML,
### joined with single spaces.
sub nyt_html_parser
{
    my ($html_file_path) = @_;

    my $tree = HTML::TreeBuilder->new;
    $tree->parse_file($html_file_path);

    ## Keep the root handle so the whole tree can be freed at the end;
    ## the original code deleted only the "article" subtree, leaking the
    ## rest of the parsed document.
    my $article = $tree->look_down( 'id', 'article' );
    unless ( defined $article ) {
        $tree->delete;
        die "No element with id=\"article\" found in $html_file_path\n";
    }

    ## Guard against pages without an <h1> rather than dying with a
    ## cryptic method-on-undef error.
    my $h1            = $article->find('h1');
    my $article_title = defined $h1 ? $h1->as_text : '';

    my @paragraphs;
    foreach my $ptag ( $article->find('p') ) {
        push @paragraphs, encode( 'utf8', $ptag->as_HTML );
    }
    my $article_body = "@paragraphs";

    $tree->delete;
    return ( $article_title, $article_body );
}
### Any means, save for wget, presumably, can not fetch articles of NYT.
### Download one article into the current directory with wget and return
### the path of the saved file.
sub save_nyt_article_by_wget
{
    my ($article_uri) = @_;
    my $article_file_path = './' . get_html_file_name($article_uri);

    ## List-form system bypasses the shell, so metacharacters in the URI
    ## cannot be interpreted as shell syntax (the original interpolated
    ## the URI into a single shell command string).
    my $status = system( 'wget', '-q', $article_uri, '-O', $article_file_path );
    warn 'wget failed for ' . $article_uri . ' (exit ' . ( $status >> 8 ) . ")\n"
        if $status != 0;

    return $article_file_path;
}
### Return the last path component of a URI (the bare HTML file name).
sub get_html_file_name
{
    my ($raw_uri) = @_;
    ## Take the final field of the slash-split URI directly.
    return ( split m{/}, $raw_uri )[-1];
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment