Skip to content

Instantly share code, notes, and snippets.

@nichtich
Created October 23, 2012 09:01
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save nichtich/3937752 to your computer and use it in GitHub Desktop.
Save nichtich/3937752 to your computer and use it in GitHub Desktop.
Fetch collection of Thesaurus description from http://thesaurusportal.blogspot.com
#!/usr/bin/perl
use v5.14;
use XML::Atom::Feed;
use XML::Atom::Util;
use XML::LibXML;
use HTML::Parser;
use HTML::TreeBuilder::XPath;
use URI;
my $url = URI->new('http://thesaurusportal.blogspot.de/atom.xml?max-results=500');
#$url = 'atom.xml';
my $feed = XML::Atom::Feed->new($url);
binmode *STDOUT, 'utf8';
foreach my $entry ($feed->entries) {
my $title = $entry->title;
my $author = $entry->author->name;
my $content = $entry->content->body;
my $html = HTML::TreeBuilder::XPath->new->parse_content( $content );
my ($link) = map { $_->href }
grep { $_->rel eq 'alternate' } $entry->link;
my @categories = map { $_->term } $entry->category;
my $published = XML::Atom::Util::iso2dt($entry->published);
my ($img) = grep { $_ !~ qr{googleusercontent.com/tracker} }
$html->findvalues('/html/body/div//img/@src');
# clean up HTML
$_->detach for $html->findnodes('//br');
$_->detach for $html->findnodes('//a[@imageanchor]');
$_->detach for $html->findnodes('//img');
$_->replace_with_content for $html->findnodes('//span');
$_->attr('style',undef) for $html->findnodes('//*[@style]');
$_->attr('target',undef) for $html->findnodes('//a[@target]');
foreach my $div ($html->findnodes('/html/body/*')) {
$div->detach unless $div->findvalue('.'); # ignore empty divs
}
my $body = join "\n", map { ref $_ ? $_->as_HTML('<>&',' ') : $_ }
$html->content->[1]->content_list;
say $title;
say $link;
say $author;
say $published;
say join ', ', @categories;
say $img;
say $body;
say "----";
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment