Created
March 1, 2013 20:43
-
-
Save jesboat/5067634 to your computer and use it in GitHub Desktop.
Convert an HTML dump of an amazon.com wishlist to a CSV.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/perl
# Convert a saved HTML dump of an amazon.com wishlist into CSV on STDOUT.
#
# Usage: <script> wishlist.html
use strict;
use warnings;
# UTF-8 layers for files opened in this lexical scope and for the std streams.
use open qw(:utf8 :std);
use HTML::TreeBuilder;
use XML::XPath;
use HTML::Entities;
use Text::CSV;

# Exactly one argument is expected: the path of the saved wishlist page.
if (@ARGV != 1) {
    die "Usage: $0 wishlist.html\n";
}

# Parse the (possibly sloppy) HTML into a tree, then re-serialise it as XML so
# that it can be queried with XPath.
# NOTE(review): the HTML::TreeBuilder documentation says a *filename* given to
# new_from_file is read in binary mode; if the saved page is UTF-8, non-ASCII
# text may need the file opened with an :encoding(UTF-8) layer and the handle
# passed instead -- confirm against a real dump.
my $tree = HTML::TreeBuilder->new_from_file($ARGV[0]);
my $xp = XML::XPath->new(xml => $tree->as_XML);

# binary => 1 so that fields may contain non-ASCII characters.
my $csv = Text::CSV->new({binary => 1});
# Concatenate the values of a list of nodes into one tidy string.
#
# Arguments: zero or more objects that respond to ->getValue (the callers in
#            this file pass XML::XPath text nodes).
# Returns:   the concatenated values with HTML entities decoded, leading and
#            trailing whitespace removed, and every run of whitespace
#            collapsed to a single space.  An empty argument list yields ''.
sub join_text {
    my @chunks = @_;

    # join() yields '' for an empty list, so the calls below never see undef
    # (which would raise "uninitialized value" warnings).
    my $text = join '', map { $_->getValue } @chunks;

    # Called in void context, decode_entities modifies its argument in place.
    decode_entities($text);

    $text =~ s/^\s+//;
    $text =~ s/\s+$//;
    $text =~ s/\s+/ /g;

    return $text;
}
# Emit one CSV row on STDOUT for a single wishlist item.
#
# Argument: $item - the XPath node for one item (the caller passes each tbody
#           whose class is 'itemWrapper').
# Output:   first field is the item's link (empty if none was found); the
#           remaining fields are the text of each child element of the item's
#           'lineItemGroup'.  Text inside an 'authorPart' element becomes its
#           own field, with a leading "by " removed.
# Diagnostics go to STDERR so that STDOUT stays valid CSV.
# Relies on the file-level $xp (XML::XPath) and $csv (Text::CSV) objects.
sub for_item {
    my ($item) = @_;

    my ($info) = $xp->findnodes(q{.//*[@class='lineItemGroup'][1]}, $item);

    # Without a group node there is nothing to scope the queries below to, so
    # skip the item rather than hand an undefined context to findnodes.
    if (!defined $info) {
        warn "no lineItemGroup found for item; skipping\n";
        return;
    }

    my @fields;

    # Link column.  Extract the URLs up front so the warning can list them
    # instead of interpolating node objects.
    my @links = map { $_->getData } $xp->findnodes(q{.//a/@href}, $info);
    if (@links) {
        if (@links > 1) {
            warn "Multiple links available: @links; picking first.\n";
        }
        push @fields, $links[0];
    }
    else {
        # Report on STDERR (STDOUT carries the CSV) and keep the link column
        # present so that later columns do not shift left.
        warn "no link available\n";
        push @fields, '';
    }

    # Info columns: one (or two, when an author part exists) per child element.
    for my $line ($xp->findnodes(q{./*}, $info)) {
        my @author_nodes
            = $xp->findnodes(q{.//*[@class='authorPart']//text()}, $line);

        # Author text is recognised by value, not by node identity, when it is
        # filtered out of the line's remaining text below.
        my %is_author_text = map { ( $_->getValue => 1 ) } @author_nodes;

        if (@author_nodes) {
            my $author = join_text(@author_nodes);
            $author =~ s/^by //;
            push @fields, $author;
        }

        my @texts = $xp->findnodes(q{.//text()}, $line);
        push @fields,
            join_text(grep { !$is_author_text{ $_->getValue } } @texts);
    }

    # $csv was created without an eol setting, so terminate the row by hand.
    $csv->print(\*STDOUT, \@fields);
    print "\n";

    return;
}
# Emit one CSV row for every wishlist item (tbody.itemWrapper) in the document.
for my $item_node ($xp->findnodes(q{//tbody[@class='itemWrapper']})) {
    for_item($item_node);
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment