Skip to content

Instantly share code, notes, and snippets.

@jesboat
Created March 1, 2013 20:43
Show Gist options
  • Save jesboat/5067634 to your computer and use it in GitHub Desktop.
Save jesboat/5067634 to your computer and use it in GitHub Desktop.
Convert an HTML dump of an Amazon.com wishlist to a CSV.
#!/usr/bin/perl
use strict;
use warnings;
use open qw(:utf8 :std);
use HTML::TreeBuilder;
use XML::XPath;
use HTML::Entities;
use Text::CSV;
# Exactly one command-line argument is expected: the saved wishlist HTML file.
die "Usage: $0 wishlist.html\n" unless @ARGV == 1;

my ($html_path) = @ARGV;

# Parse the HTML, then re-serialise the tree as XML so that it can be
# queried with XPath expressions by the subs below.
my $tree = HTML::TreeBuilder->new_from_file($html_path);
my $xp   = XML::XPath->new(xml => $tree->as_XML);

# CSV writer shared by the row-emitting code; created with binary => 1.
my $csv = Text::CSV->new({ binary => 1 });
# Concatenate the text of several nodes into one tidy string.
#
# Arguments: a list of objects that respond to ->getValue (the callers in
#            this file pass the text nodes returned by $xp->findnodes).
# Returns:   the concatenated values, passed through decode_entities, with
#            leading and trailing whitespace removed and every internal run
#            of whitespace collapsed to a single space.  An empty argument
#            list yields the empty string.
sub join_text {
    my @chunks = @_;

    # join() produces '' for an empty list, so the substitutions below never
    # run on undef (which would emit "uninitialized value" warnings).
    my $text = join '', map { $_->getValue } @chunks;

    # Called in void context so that $text is decoded in place.
    decode_entities($text);

    $text =~ s/^\s+//;
    $text =~ s/\s+$//;
    $text =~ s/\s+/ /g;

    return $text;
}
# Emit one CSV row on STDOUT for a single wishlist entry.
#
# Argument: $item - the XPath node of one entry (the driver loop in this file
#           passes each tbody element whose class is 'itemWrapper').
#
# Row layout:
#   1. the first href found inside the entry's first 'lineItemGroup'
#      element, or an empty field when there is no link;
#   2. for every child element of that group: the author text (only when an
#      'authorPart' element is present; a leading "by " is removed),
#      followed by the remaining text of that child.
#
# Relies on the file-level $xp (XML::XPath document) and $csv (Text::CSV
# writer).  All diagnostics go to STDERR so they never end up in the CSV.
sub for_item {
    my ($item) = @_;

    my ($info) = $xp->findnodes(q{.//*[@class='lineItemGroup'][1]}, $item);
    if (!defined $info) {
        # NOTE(review): with an undefined context node XML::XPath presumably
        # falls back to a document-wide context, which would pull in data
        # from other items -- confirm.  Skipping is the safe choice.
        warn "no lineItemGroup found for item; skipping\n";
        return;
    }

    my @fields;

    # First column: the item's link.
    my @hrefs = map { $_->getData } $xp->findnodes(q{.//a/@href}, $info);
    if (@hrefs > 1) {
        warn "Multiple links available: @hrefs; picking first.\n";
    }
    if (@hrefs) {
        push @fields, $hrefs[0];
    }
    else {
        # Keep the link column in place so later columns do not shift left.
        warn "no link available\n";
        push @fields, '';
    }

    # Remaining columns: the item's descriptive lines.
    for my $line ($xp->findnodes(q{./*}, $info)) {
        my @author_nodes
            = $xp->findnodes(q{.//*[@class='authorPart']//text()}, $line);
        my %is_author_text = map { ($_->getValue => 1) } @author_nodes;

        if (@author_nodes) {
            my $author = join_text(@author_nodes);
            $author =~ s/^by //;
            push @fields, $author;
        }

        # Author text was emitted above, so drop it here.  The filter works
        # on the text value, so any other text node with an identical value
        # is dropped as well.
        my @texts = $xp->findnodes(q{.//text()}, $line);
        push @fields,
            join_text(grep { !$is_author_text{ $_->getValue } } @texts);
    }

    $csv->print(\*STDOUT, \@fields)
        or die 'failed to write CSV row: ' . $csv->error_diag() . "\n";
    print "\n";

    return;
}
# Driver: locate every wishlist entry (tbody elements whose class is
# 'itemWrapper') and hand each one to for_item in turn.
my @entries = $xp->findnodes(q{//tbody[@class='itemWrapper']});
foreach my $entry (@entries) {
    for_item($entry);
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment