Skip to content

Instantly share code, notes, and snippets.

@marcioferreira
Created February 2, 2015 14:51
Show Gist options
  • Save marcioferreira/4d893a82be285f5b3066 to your computer and use it in GitHub Desktop.
Save marcioferreira/4d893a82be285f5b3066 to your computer and use it in GitHub Desktop.
#!/usr/bin/env perl
use use qw( common::sense YADA Web::Scraper DDP );
# Address of the shop's product sitemap.
my $sitemap = 'http://www.tricae.com.br/sitemapProducts.xml';

# Scraper that collects the text of every <loc> element into the 'url' list.
my $dom_sitemap = scraper {
    process 'loc', 'url[]' => 'text';
};

# Disabled: download the sitemap and extract its URLs into $loc->{url}.
#my ($loc) = $dom_sitemap->scrape( URI->new($sitemap) );
# Matches a pt-BR thousands separator: a dot followed by exactly three digits
# (the dot in "1.299,90"). It has to be removed before the decimal comma is
# turned into a dot, otherwise "1.299,90" would become the malformed "1.299.90".
my $brl_thousands_sep = qr/\.(?=\d{3}(?!\d))/;

# Scraper for a single product page. The resulting hash has the keys:
#   name, brand, description, resume  - plain text
#   categories                        - list of breadcrumb link texts
#   rating                            - list of raw class attributes (see FIXME)
#   images                            - list of { normal, zoom, thumb }
#   price                             - { from, price, split }
#   _esp                              - list of { fdl, val }, one per table row
# The filter subs below edit the scraped text in place through $_.
my $dom = scraper {
    process 'h1',            'name'  => 'TEXT';
    process 'div.txt-brand', 'brand' => 'TEXT';

    # Every breadcrumb link except the one inside the first <li>.
    process '.breadcrumb li:not(:first-child) a', 'categories[]' => 'TEXT';

    process '//div[@property="gr:description"]', 'description' => 'TEXT';
    process 'div.subtit-product',                'resume'      => 'TEXT';

    # FIXME: this stores the element's whole class attribute, not a rating value.
    process '.header-product .box-rating', 'rating[]' => '@class';#FIXME

    process '#productMoreImagesList li', 'images[]' => scraper {
        # NOTE(review): './/' is presumably meant to address the <li> itself so
        # that its data-* attributes can be read - confirm that it matches.
        process './/', 'normal' => '@data-image-product';
        process './/', 'zoom'   => '@data-image-big';
        process 'img', 'thumb'  => '@alt';
    };

    process 'div.box-price', 'price' => scraper {
        # Only the "R$" marker is removed; the amount keeps the page's own
        # formatting (comma as decimal separator).
        process 'span.price-old', 'from' => [ 'TEXT', sub { s/R\$//g; } ];

        # Convert "1.299,90" to "1299.90": drop thousands dots first, then
        # turn the decimal comma into a dot.
        process '.richsnippets.price b', 'price' => [
            'TEXT',
            sub { s/$brl_thousands_sep//g; s/,/./g; }
        ];

        # Same numeric normalisation, then keep only digits, dots and the
        # letter x (either case).
        process '.plots span', 'split' => [
            'TEXT',
            sub { s/$brl_thousands_sep//g; s/,/./g; s/[^x0-9\.]//gi }
        ];
    };

    # One entry per attribute-table row: first cell is the field name,
    # second cell is its value.
    process '#prd-attributes-box table tr', '_esp[]' => scraper {
        process '//td[1]', 'fdl' => 'text';
        process '//td[2]', 'val' => 'text';
    };

    # Disabled rules for customer comments and recommendation:
    # process '.item-opinion', 'comments[]' => scraper{
    # process 'span:first-child', 'comment' => 'text';
    # process '.box-rating', 'rating' => '@class';
    # };
    # process '', 'recommendation' => 'TEXT';
};
# Product pages to download and scrape.
my @product_urls = qw(
http://www.tricae.com.br/carrinho-de-bebe-de-bebe-bye-bye-cinza-e-azul-burigotto-11084.html
http://www.tricae.com.br/guittara-sonora-carros-2-elka--6753.html
);

# Queue the URLs, run the callback on each finished download and block until
# the queue is done.
# NOTE(review): 'max' is presumably the number of parallel transfers - confirm
# against the YADA documentation.
YADA->new(
    http_response => 1,
    max           => 10,
    common_opts   => { FOLLOWLOCATION => 1 },
)->append(
    # Alternative source: every URL listed in the sitemap.
    #[ @{ $loc->{url} } ] => sub {
    [@product_urls] => sub {
        my ($worker) = @_;

        # Ignore downloads that did not succeed.
        return unless $worker->response->is_success;

        my $product = $dom->scrape( $worker->response->decoded_content );

        # Concatenating with an empty string stores the final URL as a string.
        $product->{url} = $worker->final_url . q{};

        # Turn the scraped { fdl, val } rows into a field => value hash and
        # drop the intermediate list.
        for my $row ( @{ $product->{_esp} } ) {
            $product->{especification}{ $row->{fdl} } = $row->{val};
        }
        delete $product->{_esp};

        # Dump the assembled record.
        p $product;
    }
)->wait;
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment