Created
October 23, 2012 09:01
-
-
Save nichtich/3937752 to your computer and use it in GitHub Desktop.
Fetch collection of Thesaurus description from http://thesaurusportal.blogspot.com
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/perl | |
use v5.14; | |
use XML::Atom::Feed; | |
use XML::Atom::Util; | |
use XML::LibXML; | |
use HTML::Parser; | |
use HTML::TreeBuilder::XPath; | |
use URI; | |
my $url = URI->new('http://thesaurusportal.blogspot.de/atom.xml?max-results=500'); | |
#$url = 'atom.xml'; | |
my $feed = XML::Atom::Feed->new($url); | |
binmode *STDOUT, 'utf8'; | |
foreach my $entry ($feed->entries) { | |
my $title = $entry->title; | |
my $author = $entry->author->name; | |
my $content = $entry->content->body; | |
my $html = HTML::TreeBuilder::XPath->new->parse_content( $content ); | |
my ($link) = map { $_->href } | |
grep { $_->rel eq 'alternate' } $entry->link; | |
my @categories = map { $_->term } $entry->category; | |
my $published = XML::Atom::Util::iso2dt($entry->published); | |
my ($img) = grep { $_ !~ qr{googleusercontent.com/tracker} } | |
$html->findvalues('/html/body/div//img/@src'); | |
# clean up HTML | |
$_->detach for $html->findnodes('//br'); | |
$_->detach for $html->findnodes('//a[@imageanchor]'); | |
$_->detach for $html->findnodes('//img'); | |
$_->replace_with_content for $html->findnodes('//span'); | |
$_->attr('style',undef) for $html->findnodes('//*[@style]'); | |
$_->attr('target',undef) for $html->findnodes('//a[@target]'); | |
foreach my $div ($html->findnodes('/html/body/*')) { | |
$div->detach unless $div->findvalue('.'); # ignore empty divs | |
} | |
my $body = join "\n", map { ref $_ ? $_->as_HTML('<>&',' ') : $_ } | |
$html->content->[1]->content_list; | |
say $title; | |
say $link; | |
say $author; | |
say $published; | |
say join ', ', @categories; | |
say $img; | |
say $body; | |
say "----"; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment