Skip to content

Instantly share code, notes, and snippets.

@yseto
Last active December 11, 2015 01:18
Show Gist options
  • Save yseto/4522221 to your computer and use it in GitHub Desktop.
Save yseto/4522221 to your computer and use it in GitHub Desktop.
中国新聞のRSSを生成するスクリプト あんたぁ、中国新聞読みたいんじゃろう。じゃけどRSSがないけえ、生成するんよ。 記事を取得してRSSの中身に出すようにしました。
#!/usr/bin/env perl
#
# Copyright 2013 yseto
# Released under the MIT License - Please reuse change and share
#
use strict;
use warnings;
use utf8;
use Web::Scraper;
use URI;
use Encode qw/encode_utf8/;
use DateTime;
use DateTime::Format::Mail;
use Text::Xslate;
use FindBin;
use Digest::MD5 qw(md5_hex);
use IO::All;
# Index page that lists the news articles. Article links are resolved
# against this URL elsewhere in the script.
my $base = "http://www.chugoku-np.co.jp/News/";

# Cache directory, kept next to this script.
my $dir = "$FindBin::Bin/cache/";

# Create the cache directory on first run. Fail loudly here instead of
# later, when the first cache file cannot be written. The second -d test
# tolerates another process creating the directory in the meantime.
unless ( -d $dir ) {
    mkdir($dir)
        or -d $dir
        or die "Cannot create cache directory $dir: $!\n";
}

# Cache lifetime in seconds (three days).
my $expire = 86400 * 3;
# Scraper for a single list entry: picks up the anchor's href and text
# and hands them back under the keys "link" and "title".
my $entry = scraper {
    process( 'a', 'link' => '@href', 'title' => 'text' );
    result( 'title', 'link' );
};

# Scraper for the index page: applies $entry to every <li> of the news
# list and collects the results in "items".
my $scraper = scraper {
    process(
        '//div[@class="newsList2 clearfix ofh"]/ul/li',
        'items[]' => $entry,
    );
    result('items');
};

# Fetch and scrape the index page once; the template iterates over this.
my $index_uri = URI->new($base);
my $result    = $scraper->scrape($index_uri);
# pubdate($link)
#
# Derive a publication date from an article link.
#
# Links of the form "...TnYYYYMMDDnnnn.html" carry the date in the file
# name; the trailing four digits are not used for the date. The result is
# midnight of that day in the Asia/Tokyo time zone.
#
# Parameters:
#   $link - article link (string, or anything that stringifies to one).
#
# Returns:
#   A DateTime object. When the link is undefined, does not match the
#   pattern, or contains an impossible date, the current time
#   (DateTime->now) is returned instead.
sub pubdate {
    my ($link) = @_;

    if ( defined $link
        and $link =~ /Tn(\d{4})(\d{2})(\d{2})\d{4}\.html/ )
    {
        # Copy the captures right away; they are clobbered by the next
        # successful match anywhere.
        my ( $year, $month, $day ) = ( $1, $2, $3 );

        # DateTime->new throws on values such as month 13 or day 00.
        # A malformed link must not abort the whole feed, so trap that
        # and fall through to the default below.
        local $@;
        my $dt = eval {
            DateTime->new(
                year      => $year,
                month     => $month,
                day       => $day,
                hour      => 0,
                minute    => 0,
                second    => 0,
                time_zone => 'Asia/Tokyo',
            );
        };
        return $dt if defined $dt;
    }

    return DateTime->now();
}
# description($tn)
#
# Return the article body text for the link $tn, using a file cache in
# $dir so that each article is downloaded at most once.
#
# Parameters:
#   $tn - article link as found on the index page.
#
# Returns:
#   The article text as a UTF-8 encoded byte string, or "" when the link
#   is empty, the article is older than $expire seconds, the download
#   fails, or no text block is found on the page.
#
# Side effects:
#   Reads, writes and deletes cache files under $dir; sleeps one second
#   before every download.
sub description {
    my ($tn) = @_;

    return "" unless defined $tn and length $tn;

    # The cache file is named after the MD5 hex digest of the link.
    my $cache_file = $dir . md5_hex($tn);

    # Articles past the expiry window get no description, and any cache
    # file left over for them is removed.
    if ( ( time() - pubdate($tn)->epoch ) > $expire ) {
        unlink $cache_file if -f $cache_file;
        return "";
    }

    # Serve from the cache when possible (IO::All slurp into $cached).
    if ( -f $cache_file ) {
        my $cached < io($cache_file);
        return $cached;
    }

    # Pause between downloads so the remote server is not hammered.
    sleep 1;

    my $url       = URI->new_abs( $tn, $base )->as_string;
    my $d_scraper = scraper {
        process '//div[@class="txt"]', 'content' => 'TEXT';
        result 'content';
    };

    # A single article that cannot be fetched must not take the whole
    # feed down: report it and carry on with an empty description.
    my $text;
    my $ok = eval {
        $text = $d_scraper->scrape( URI->new($url) );
        1;
    };
    if ( !$ok ) {
        warn "description: could not fetch $url: $@";
        return "";
    }

    # Nothing scraped (for example the page layout changed): do not cache
    # the miss, so the next run can try again.
    return "" unless defined $text;

    my $content = encode_utf8($text);

    # Store the encoded text for later runs (IO::All write).
    $content > io($cache_file);

    return $content;
}
# Template engine with the helper functions the feed template calls:
#   p(link) - publication date of the link, formatted by
#             DateTime::Format::Mail
#   t(text) - text encoded as UTF-8 bytes
#   g(link) - link made absolute against $base
#   d(link) - article body text (see description())
#
# NOTE(review): t() and d() hand back UTF-8 bytes, and the rendered feed
# is passed through encode_utf8 once more when it is printed - confirm
# that the two encodings do not stack.
my $tx = Text::Xslate->new(
    function => {
        p => sub {
            my ($link) = @_;
            return DateTime::Format::Mail->format_datetime(
                pubdate($link) );
        },
        t => sub {
            my ($text) = @_;
            return encode_utf8($text);
        },
        g => sub {
            my ($link) = @_;
            return URI->new_abs( $link, $base )->as_string;
        },
        d => \&description,
    },
);
# RSS 2.0 feed template in Text::Xslate Kolon syntax.
#
# Variables expected by the template:
#   $result - list of entries, each with "title" and "link"
#   $now    - channel publication date, already formatted
#
# The string is single-quoted (q{}), so Perl interpolates nothing here and
# Kolon treats text outside <: ... :> as literal. Every variable therefore
# has to be written as <: $name :>.
#
# <link> and <description> are included in the channel because RSS 2.0
# requires them; each item also carries a <link> next to its <guid>.
my $template = q{<?xml version="1.0" encoding="UTF-8"?>
<rss version="2.0">
<channel>
<title>中国新聞 自家RSS</title>
<link>http://www.chugoku-np.co.jp/News/</link>
<description>中国新聞 自家RSS</description>
<language>ja</language>
<pubDate><: $now :></pubDate>
: for $result -> $data {
<item>
<title><: t($data.title) :></title>
<link><: g($data.link) :></link>
<pubDate><: p($data.link) :></pubDate>
<guid><: g($data.link) :></guid>
<description><: d($data.link) :></description>
</item>
: }
</channel>
</rss>
};
# Channel timestamp: the current time in the Asia/Tokyo zone, formatted
# by DateTime::Format::Mail.
my $now = DateTime::Format::Mail->format_datetime(
    DateTime->now()->set_time_zone('Asia/Tokyo') );

# Render the feed from the scraped entries and write it to STDOUT as
# UTF-8.
my $feed = $tx->render_string(
    $template,
    {
        result => $result,
        now    => $now,
    }
);
print encode_utf8($feed);
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment