Simple web crawler/scraper implemented using Web::Scraper & YADA: it fetches a seed list of pages in parallel, scrapes the title and links out of each page, and recursively enqueues same-host links no more than three path segments deep.

yada-crawler.pl
Perl
#!/usr/bin/env perl
use 5.016;
use common::sense;
use utf8::all;
 
# Use fast binary libraries
use EV;
use Web::Scraper::LibXML;
use YADA 0.039;
 
YADA->new(
    common_opts => {
        # Available opts @ http://curl.haxx.se/libcurl/c/curl_easy_setopt.html
        encoding        => '',
        followlocation  => 1,
        maxredirs       => 5,
    }, http_response => 1, max => 4,
)->append([qw[
    http://sysd.org/page/1/
    http://sysd.org/page/2/
    http://sysd.org/page/3/
]] => sub {
    my ($self) = @_;
    return if $self->has_error
        or not $self->response->is_success
        or not $self->response->content_is_html;

    # Declare the scraper once and then reuse it
    state $scraper = scraper {
        process q(html title), title => q(text);
        process q(a), q(links[]) => q(@href);
    };

    # Employ amazing Perl (en|de)coding powers to handle HTML charsets
    my $doc = $scraper->scrape(
        $self->response->decoded_content,
        $self->final_url,
    );

    printf qq(%-64s %s\n), $self->final_url, $doc->{title};

    # Enqueue links from the parsed page
    $self->queue->prepend([
        grep {
            $_->can(q(host)) and $_->scheme =~ m{^https?$}x
                and $_->host eq $self->initial_url->host
                and (grep { length } $_->path_segments) <= 3
        } @{$doc->{links} // []}
    ] => __SUB__);
})->wait;
__DATA__
Featured at:
http://blogs.perl.org/users/stas/2013/02/web-scraping-with-modern-perl-part-2---speed-edition.html
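
To try the crawler locally, one way to do it (a minimal sketch, assuming cpanm is available and Perl >= 5.16; the module list is inferred from the use statements above, with HTML::TreeBuilder::LibXML added since Web::Scraper::LibXML needs it as its backend):

    cpanm common::sense utf8::all EV HTML::TreeBuilder::LibXML Web::Scraper YADA
    perl yada-crawler.pl

Each successfully fetched HTML page prints its final URL and title; max => 4 caps the number of concurrent transfers. Note that __SUB__ (available since Perl 5.16, hence the use 5.016 line) refers to the currently executing callback, which is how every scraped page re-enqueues the same handler for the links it discovers.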
