Skip to content

Instantly share code, notes, and snippets.

@brianjmiller
Created January 25, 2012 19:52
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save brianjmiller/1678212 to your computer and use it in GitHub Desktop.
Save brianjmiller/1678212 to your computer and use it in GitHub Desktop.
data extraction using HTML::DOM
#!/usr/bin/perl
use strict;
use warnings;
use Data::Dumper;
use DBI;
use HTML::DOM;
# Extract structured product data (name, tabs, actions) from an HTML column
# of a database table and dump the resulting structure to STDOUT.

# RaiseError makes DBI throw on failure, so the calls below need no
# individual error checks.
# NOTE(review): "sslmode" looks like a PostgreSQL-style DSN option; confirm
# that DBD::mysql actually honours it.
my $dbh = DBI->connect(
    'dbi:mysql:dbname=...;host=localhost;sslmode=disable',
    '...',
    '...',
    {
        RaiseError => 1,
    },
);

# Query selecting the rows to process; its single placeholder is filled
# from @bind.
my $st = q{
SELECT
*
FROM
table
WHERE
some_field = ?
};
my @bind = qw( some_value );

# One parser object is reused for every product; open() resets it each time.
my $dom = HTML::DOM->new;

# Result accumulator: one hashref per product under the "products" key.
my $struct = {
    products => [],
};

# Slice => {} returns each row as a hashref keyed by column name.
my $products = $dbh->selectall_arrayref($st, { Slice => {} }, @bind);

# Every row is in memory now, so the connection is no longer needed.
$dbh->disconnect;

for my $product (@$products) {

    # Each product is processed inside its own eval so that one malformed
    # document only produces a warning instead of aborting the whole run.
    # The trailing "1" lets success be detected without relying on $@.
    my $built = eval {
        my $ref = {
            product_id => $product->{id},
        };

        # Registered before parsing: a product that fails part-way still
        # shows up in the dump with whatever was collected up to that point.
        push @{ $struct->{products} }, $ref;

        # A NULL column is treated as an empty document, which then fails
        # the h1 check below with a meaningful message.
        my $html = $product->{html_field};
        $html = '' unless defined $html;

        # close() tells the parser that the input is complete, so it must
        # run before the tree is queried. Doing it here also guarantees it
        # is not skipped when one of the checks below dies.
        $dom->open;
        $dom->write($html);
        $dom->close;

        # The product name is the text of the document's only h1.
        my $h1_list  = $dom->getElementsByTagName('h1');
        my $h1_count = $h1_list->length;
        if ($h1_count == 1) {
            $ref->{product_name} = $h1_list->item(0)->as_text;
        }
        elsif ($h1_count == 0) {
            die "No product name h1 detected\n";
        }
        else {
            die "Cannot handle multiple h1s\n";
        }

        # Every element with class "tab" is a link to a tab body; tabs are
        # numbered from 0 in document order.
        my $display_order = 0;
        for my $tab_link ($dom->getElementsByClassName('tab')) {
            my $tab_ref = {
                display_order => $display_order++,
            };
            push @{ $ref->{tabs} }, $tab_ref;

            # The tab title is the text of the first span inside the link.
            my ($span) = $tab_link->getElementsByTagName('span');
            defined $span
                or die "Tab link has no span holding its title\n";
            $tab_ref->{tab_title} = $span->as_text;

            # The link must be a pure fragment ("#id") naming the element
            # that holds the tab body.
            my $href = $tab_link->href;
            $href = '' unless defined $href;
            if ($href =~ /\A#(.+)\z/) {
                my $tab_id = $1;

                my $tab = $dom->getElementById($tab_id);
                defined $tab
                    or die "Tab link points at a missing element: #$tab_id\n";

                my $tab_html = $tab->innerHTML;
                if (defined $tab_html and $tab_html ne '') {
                    # NOTE(review): as written, the replacement text equals
                    # the matched text, so this leaves $tab_html unchanged.
                    # Possibly the pattern was meant to match entity-encoded
                    # quotes -- confirm against the stored HTML.
                    $tab_html =~ s/{include file="((?!").*)"}/{include file="$1"}/sg;
                }
                $tab_ref->{tab_html} = $tab_html;

                #
                # check to see if this tab has store_options (actions)
                #
                # NOTE(review): the pattern expects backslash-escaped quotes
                # (\') around the argument -- verify that is how the stored
                # HTML is encoded.
                for my $action_link ($tab->getElementsByClassName('green_link')) {
                    my $action_href = $action_link->href;
                    next unless defined $action_href;
                    if ($action_href =~ /javascript:store_options\(\\'(.+)\\'\)/) {
                        $ref->{actions}->{$1} = 1;
                    }
                }
            }
            else {
                die "Unrecognized href as tab link: $href\n";
            }
        }

        1;
    };

    if (!$built) {
        # Copy $@ immediately; it is a global that any later eval clobbers.
        my $error = $@;
        $error = "unknown error\n" unless defined $error and length $error;
        warn "Failed to build product info ($product->{id}): $error\n";
    }
}

print Dumper($struct);
__END__
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment