Skip to content

Instantly share code, notes, and snippets.

@blech
Last active August 29, 2015 20:44
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save blech/43126 to your computer and use it in GitHub Desktop.
Save blech/43126 to your computer and use it in GitHub Desktop.
Fetch a Vox post as XML
#!/usr/bin/perl
use strict;
use warnings;
no warnings 'redefine';
use XML::Atom::Client;
use XML::XPath;
# Credentials handed to the Atom client below; fill these in before running.
my $username = 'email-address-here';
my $password = 'password-here';

# A single client shared by every request made in get_entries_for_feed().
my $api = XML::Atom::Client->new;
$api->username($username);
$api->password($password);

# Directory for entries. Only create it when it is missing, and stop right
# away if that fails, rather than discovering the problem later as a
# "Can't open" error after feeds have already been downloaded.
unless (-d 'post') {
    mkdir('post') or die "Can't create directory 'post': $!\n";
}

# replace 'blech' with your user name here
get_entries_for_feed("http://blech.vox.com/library/posts/atom-full.xml");
# Fetch an Atom feed and save each of its entries as an XML file, then
# continue with the feed named by the rel="next" link until there is none.
#
# Parameters:
#   $FeedURI - URI of the first feed page to fetch.
#
# For every <entry> the id is taken from the service.edit link, the entry is
# re-fetched from the asset service URL, and the resulting XML is written to
# a file whose name is derived from the entry's rel="alternate" link
# (everything up to the last "/post" becomes "post", ".html" becomes ".xml").
#
# Uses the file-level $api client for all requests.
#
# Dies if a feed or entry cannot be fetched, or if an output file cannot be
# written. Entries lacking the expected links, id, or file name are skipped
# with a warning.
sub get_entries_for_feed {
    my $FeedURI = shift;

    # Remember visited pages so a feed whose "next" link points back at an
    # earlier page cannot loop forever.
    my %seen_page;

    # Follow the "next" links in a loop instead of recursing, so a long
    # chain of pages does not build an equally deep call stack.
    while (defined $FeedURI && !$seen_page{$FeedURI}++) {
        print "Fetching '$FeedURI'\n\n";

        my $feed = $api->getFeed($FeedURI);
        if (!$feed) {
            my $error = $api->errstr;
            print "failure\n";
            die "Can't fetch feed '$FeedURI': "
                . (defined $error ? $error : 'unknown error') . "\n";
        }

        # now use XPath to parse, then loop through, each entry
        my $xp      = XML::XPath->new($feed->as_xml);
        my $entries = $xp->find('//entry');

      ENTRY:
        foreach my $entry ($entries->get_nodelist) {

            # get the service.edit link's href (for the ID)
            my ($edit_link)
                = $entry->find('link[@rel="service.edit"]')->get_nodelist;
            if (!$edit_link) {
                warn "Skipping entry without a service.edit link\n";
                next ENTRY;
            }
            my $edit_href = $edit_link->getAttribute('href');

            # Only trust the capture when the match succeeded; otherwise $1
            # would still hold the value from an earlier match.
            my $id;
            if (defined $edit_href && $edit_href =~ m/id=([0-9a-f]+)/) {
                $id = $1;
            }
            if (!defined $id) {
                warn "Skipping entry: no id found in service.edit link\n";
                next ENTRY;
            }

            # use the service asset URL from the "proper" Atom API, since
            # that returns a much nicer XML document
            # NOTE(review): the 6p00... path segment looks account-specific;
            # confirm it matches the account being fetched.
            my $entry_uri
                = "http://www.vox.com/services/atom/svc=asset/6p00c2251d985ff219/$id";
            my $entry_feed = $api->getFeed($entry_uri);
            if (!$entry_feed) {
                my $error = $api->errstr;
                print "failure\n";
                die "Can't fetch entry '$entry_uri': "
                    . (defined $error ? $error : 'unknown error') . "\n";
            }

            # output entry XML
            my $entry_xml = $entry_feed->as_xml;
            my ($alt_link) = XML::XPath->new($entry_xml)
                ->find('//link[@rel="alternate"]')->get_nodelist;
            my $href = $alt_link ? $alt_link->getAttribute('href') : undef;
            if (!defined $href) {
                warn "Skipping entry $id: no alternate link\n";
                next ENTRY;
            }

            $href =~ s!.*/post!post!;
            # Swap only the trailing extension, not an "html" that happens
            # to appear earlier in the name.
            $href =~ s!\.html\z!.xml!;

            # The name comes from remote data: accept only a plain file
            # directly inside post/ so nothing is written elsewhere.
            if ($href !~ m{\Apost/[^/]+\z} || $href =~ m{\Apost/\.\.?\z}) {
                warn "Skipping entry $id: unexpected file name '$href'\n";
                next ENTRY;
            }

            # Three-argument open with a lexical handle: the name can never
            # be interpreted as an open mode.
            open(my $out, '>', $href) or die "Can't open $href: $!\n";
            print {$out} $entry_xml;
            # Buffered write errors only surface at close.
            close($out) or die "Can't write $href: $!\n";
            print "Fetched '$href'\n";
        }
        print "\n";

        # now find URL for next links and continue with that page
        my ($next_link) = $xp->find('//link[@rel="next"]')->get_nodelist;
        $FeedURI = $next_link ? $next_link->getAttribute('href') : undef;
    }

    return;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment