Created
September 7, 2010 16:55
-
-
Save satojkovic/568655 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#! /usr/bin/perl | |
use strict; | |
use warnings; | |
use XML::RSS; | |
use LWP::Simple; | |
use URI; | |
use Encode; | |
## Read the list of feed URLs from a file.
##
## Arguments:
##   $fn - path of a text file holding one feed URL per line.
## Returns:
##   A reference to an array of the URLs, in file order. Surrounding
##   whitespace (including a trailing CR) is stripped and blank lines
##   are skipped.
## Dies:
##   If the file cannot be opened; the message names the file and the
##   OS error.
sub get_feed_url {
    my $fn = shift;

    my @feedlist = ();
    open(my $fh, "<", $fn) or die "Can't open $fn: $!";

    # Read into a lexical instead of the global $_, so the caller's $_
    # is never clobbered.
    while (my $line = <$fh>) {
        # Trim both ends; this removes the newline and also any CR left
        # by CRLF line endings, which would otherwise stay in the URL.
        $line =~ s/\A\s+//;
        $line =~ s/\s+\z//;

        # A blank line is not a URL; skipping it keeps the caller from
        # trying to fetch an empty address.
        next if $line eq '';

        push(@feedlist, $line);
    }
    close $fh;

    return \@feedlist;
}
# Path of the feed list: the first command-line argument when one is
# given, otherwise the default location next to the script's directory.
my $feedlist = get_feed_url(@ARGV ? $ARGV[0] : '../feedlist.txt');

## Debug check: print every URL that was read.
# my $cnt = @{$feedlist};
# for(my $i=0; $i<$cnt; $i++) {
#     print $feedlist->[$i] . "\n";
# }
## Print one feed item's title and its tag-stripped description.
##
## Arguments:
##   $item - hash reference with 'title' and 'description' entries.
## Output:
##   Writes "| <title>\n<body>\n\n", UTF-8 encoded, to the currently
##   selected output handle.
## Returns:
##   The result of printf (true on success). No word list is built
##   here yet, despite the function's name.
sub get_words {
    my $item = shift;

    # An item may lack a title or a description; treat a missing field
    # as an empty string so neither the substitution nor the encoding
    # operates on undef.
    my $title = defined $item->{title}       ? $item->{title}       : '';
    my $body  = defined $item->{description} ? $item->{description} : '';

    # Strip markup. The negated character class also matches newlines,
    # so a tag that is wrapped across several lines is removed as well.
    $body =~ s/<[^>]*>//g;

    # Diagnostic output of what was extracted.
    return printf("| %s\n%s\n\n", encode_utf8($title), encode_utf8($body));
}
## Fetch one feed and pass each of its items to get_words.
##
## Arguments:
##   $url - address of the RSS feed to download.
## Dies:
##   With "Can't GET <url>" when the download returns nothing usable.
## Note:
##   No counting is done here yet and no meaningful value is returned.
sub get_word_counts {
    my $url = shift;

    # Download the raw feed document.
    my $content = get(URI->new($url));
    die "Can't GET $url\n" unless $content;

    # Parse the document as RSS.
    my $feed = XML::RSS->new();
    $feed->parse($content);

    # Hand every item over for word extraction.
    foreach my $entry (@{ $feed->{'items'} }) {
        get_words($entry);
    }
}
## Visit every URL in the feed list and extract the words of each feed.
my $words = sub {
    my ($urls) = @_;

    for my $feed_url (@{$urls}) {
        get_word_counts($feed_url);
    }
};
$words->($feedlist);
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment