Skip to content

Instantly share code, notes, and snippets.

@marcioferreira
Last active December 28, 2015 02:59
Show Gist options
  • Save marcioferreira/7431697 to your computer and use it in GitHub Desktop.
Save marcioferreira/7431697 to your computer and use it in GitHub Desktop.
#!/usr/bin/env perl
package Store::Americanas::Category::Spider;
use Log::Log4perl qw(:easy);
use Time::Out qw(timeout);
use use qw/ common::sense DDP Moo MooX::late
YADA Web::Scraper URI WWW::UserAgent::Random
Redis
Parallel::ForkManager List::MoreUtils DateTime DateTime::Format::Strptime
/;
# Options hash handed to YADA->new when a batch of URLs is fetched.
# NOTE(review): http_response/max/common_opts are presumably interpreted by
# YADA (AnyEvent::Net::Curl::Queued) -- confirm their meaning in its docs.
# NOTE(review): localhost:9050 looks like a local SOCKS proxy (e.g. Tor) --
# confirm against the deployment.
has conf => (
    is      => 'rw',
    isa     => 'HashRef',
    default => sub {
        # Options placed under common_opts; a random browser user agent is
        # picked each time this default is built.
        my %curl_opts = (
            FOLLOWLOCATION => 1,
            PROXY          => 'localhost:9050',
            PROXYTYPE      => Net::Curl::Easy::CURLPROXY_SOCKS4,
            USERAGENT      => WWW::UserAgent::Random::rand_ua("browsers"),
        );
        return {
            http_response => 1,
            max           => 4,
            common_opts   => \%curl_opts,
        };
    },
);

# Web::Scraper object used to extract data from each downloaded page.
has dom => (
    is  => 'rw',
    isa => 'Web::Scraper',
);
# Fetch and scrape one batch of URLs under a fixed time budget.
#
# Parameters:
#   $self - the spider object; supplies conf (passed to YADA->new) and dom
#           (the scraper applied to each successful response).
#   $pool - array reference holding the URLs of this batch.
#
# Returns the list of URLs whose responses were successful and were handed to
# the scraper before the time budget ran out (possibly empty).
sub run_with_timeout {
    my ($self, $pool) = @_;
    my @proc;

    p $pool;    # debug dump of the batch about to be fetched

    # Time::Out runs the callback with $pool as its argument and a limit of 9
    # (seconds, per the Time::Out documentation).
    timeout 9, $pool => sub {
        my ($batch) = @_;
        YADA->new( $self->conf )->append( [@$batch] => sub {
            my ($me) = @_;
            return if not $me->response->is_success;
            # NOTE(review): the scraped structure is computed but not stored
            # or returned anywhere yet.
            my $res = $self->dom->scrape( $me->response->decoded_content );
            # NOTE(review): dereferences initial_url as a scalar reference;
            # presumably a URI object -- confirm before switching to a method.
            push @proc, ${ $me->initial_url };
        })->wait;
    };

    # Capture $@ immediately: it is a global that any later eval can clobber.
    # Time::Out leaves a true value in $@ when the time budget was exceeded.
    my $timed_out = $@;

    if ($timed_out) {
        # Two separate statements: the former `say "OK" && p@proc` parsed as
        # say("OK" && p(@proc)) and therefore never printed "OK".
        say "OK";
        p @proc;
    }
    else {
        say "Done!";
    }

    # Hand back what was processed (previously the sub's own arguments were
    # returned by mistake).
    return @proc;
}
# Read the URL list from the DATA section and scrape it in forked batches.
#
# URLs are grouped ten at a time; each group is processed in a child process
# (Parallel::ForkManager constructed with 14) via run_with_timeout. After
# launching a child the parent pauses 30 seconds before taking the next group.
# The sub finishes once wait_all_children has returned.
#
# Parameters:
#   $self - the spider object.
sub run {
    my ($self) = @_;

    # One URL per line; blank lines are dropped so that no empty URL is ever
    # queued for download.
    chomp( my @url = <DATA> );
    @url = grep { /\S/ } @url;

    my $fkm = Parallel::ForkManager->new(14);
    my $it  = List::MoreUtils::natatime( 10, @url );

    while ( my @pool = $it->() ) {
        # start() is true in the parent (see Parallel::ForkManager docs). The
        # pause is an explicit statement so that a sleep() returning 0 can no
        # longer let the parent fall through into the child-only code below.
        if ( $fkm->start ) {
            sleep 30;
            next;
        }

        # Child process: handle this batch, then exit through finish().
        $self->run_with_timeout( \@pool );
        $fkm->finish;
    }

    $fkm->wait_all_children;
}
# Current time in the America/Sao_Paulo time zone.
#
# Parameters:
#   $self  - the spider object (not used by this method).
#   $regex - optional pattern; when true, every match of it is removed from
#            the timestamp before it is returned.
#
# Returns the DateTime value as built when no (true) pattern is given;
# otherwise the same variable after the global substitution was applied to it.
sub now {
    my $self    = shift;
    my $pattern = shift;

    my $stamp = DateTime->now->set_time_zone("America/Sao_Paulo");
    return $stamp unless $pattern;

    $stamp =~ s/$pattern//g;
    return $stamp;
}
1;
# Script entry point: build the spider with its page scraper and start it.
#
# Scraper rules:
#   n      - text of 'div.rSearch p', reduced to the number that follows
#            "total de".
#   link[] - href of every 'div.productImg .url' element, reduced to the part
#            starting at "www".
#
# Each filter returns its capture only when the pattern actually matched.
# Returning $1 unconditionally could leak a stale capture from an enclosing
# scope whenever the match failed; now a failed match yields undef instead.
Store::Americanas::Category::Spider
    ->new(
        dom => scraper {
            process 'div.rSearch p', 'n' => ['TEXT', sub { /total\s+de\s+(\d+)/ ? $1 : undef }];
            process 'div.productImg .url', 'link[]' => ['@href', sub { /(www.*)/ ? $1 : undef }];
        })->run;
__DATA__
www.americanas.com.br/sublinha/351252/moveis-e-decoracao/vela-e-porta-vela/casticais
www.americanas.com.br/linha/314051/moda-e-acessorios/sutias
www.americanas.com.br/sublinha/263308/tv-e-home-theater/dvd-player/gravador-de-dvd
www.americanas.com.br/sublinha/324877/livros/informatica/certificacoes
www.americanas.com.br/sublinha/341350/moveis-e-decoracao/box-bau/box-bau-queen
www.americanas.com.br/sublinha/285013/utilidades-domesticas/aparelho-de-jantar/acima-de-42-pecas
www.americanas.com.br/sublinha/332993/livros/ensino-de-linguas-estrangeiras/polones
www.americanas.com.br/sublinha/324899/livros/livro-infantil/quadrinhos-infantis
www.americanas.com.br/sublinha/227423/cds-e-dvds-musicais/cds-importados/black-music
www.americanas.com.br/sublinha/314074/moda-e-acessorios/lingerie-para-gestante-e-pos-parto/sutias-para-gestantes
www.americanas.com.br/linha/341382/esporte-e-lazer/jogos-de-mesa-e-salao
www.americanas.com.br/linha/262876/tv-e-home-theater/acessorios-3d
www.americanas.com.br/linha/282715/moveis-e-decoracao/sofa-cama
www.americanas.com.br/sublinha/341422/esporte-e-lazer/jogos-de-mesa-e-salao/poker
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment