Simple web crawler/scraper implemented using Mojolicious
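The crawler is built on Mojo::UserAgent's non-blocking interface: passing a callback to get() schedules the request on the Mojo::IOLoop event loop instead of blocking, and the callback receives the finished transaction. A minimal sketch of that pattern, before the full script below (the URL is only a placeholder):

#!/usr/bin/env perl
use 5.010;
use strict;
use warnings;

use Mojo::IOLoop;
use Mojo::UserAgent;

my $ua = Mojo::UserAgent->new;

# Non-blocking request: the callback gets the user agent and the transaction
$ua->get('http://example.com/' => sub {
    my (undef, $tx) = @_;
    say $tx->res->dom->at('title')->text;
    Mojo::IOLoop->stop;
});

# Nothing happens until the event loop runs
Mojo::IOLoop->start;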

mojo-crawler.pl
#!/usr/bin/env perl
use 5.010;
use open qw(:locale);
use strict;
use utf8;
use warnings qw(all);
 
use Mojo::UserAgent;
 
# FIFO queue
my @urls = map { Mojo::URL->new($_) } qw(
    http://sysd.org/page/1/
    http://sysd.org/page/2/
    http://sysd.org/page/3/
);
 
# Limit parallel connections to 4
my $max_conn = 4;
 
# User agent following up to 5 redirects
my $ua = Mojo::UserAgent
    ->new(max_redirects => 5)
    ->detect_proxy;
 
# Keep track of active connections
my $active = 0;
 
# A recurring timer with a 0-second interval fires on every tick of the
# event loop, topping the pool of active connections back up to $max_conn
Mojo::IOLoop->recurring(
    0 => sub {
        for ($active + 1 .. $max_conn) {

            # Dequeue or halt if there are no active crawlers anymore
            return ($active or Mojo::IOLoop->stop)
                unless my $url = shift @urls;

            # Fetch non-blocking just by adding
            # a callback and marking as active
            ++$active;
            $ua->get($url => \&get_callback);
        }
    }
);
 
# Start event loop if necessary
Mojo::IOLoop->start unless Mojo::IOLoop->is_running;
 
sub get_callback {
    my (undef, $tx) = @_;

    # Deactivate
    --$active;

    # Parse only OK HTML responses
    return
        if not $tx->res->is_status_class(200)
        or $tx->res->headers->content_type !~ m{^text/html\b}ix;

    # Request URL
    my $url = $tx->req->url;

    say $url;
    parse_html($url, $tx);

    return;
}
 
sub parse_html {
    my ($url, $tx) = @_;

    say $tx->res->dom->at('html title')->text;

    # Extract and enqueue URLs
    for my $e ($tx->res->dom('a[href]')->each) {

        # Validate href attribute
        my $link = Mojo::URL->new($e->{href});
        next if 'Mojo::URL' ne ref $link;

        # "normalize" link
        $link = $link->to_abs($tx->req->url)->fragment(undef);
        next unless grep { $link->protocol eq $_ } qw(http https);

        # Don't go deeper than /a/b/c
        next if @{$link->path->parts} > 3;

        # Access every link only once
        state $uniq = {};
        ++$uniq->{$url->to_string};
        next if ++$uniq->{$link->to_string} > 1;

        # Don't visit other hosts
        next if $link->host ne $url->host;

        push @urls, $link;
        say " -> $link";
    }
    say '';

    return;
}
__DATA__
Featured at:
http://blogs.perl.org/users/stas/2013/01/web-scraping-with-modern-perl-part-1.html
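To try the crawler (assuming Mojolicious is installed, e.g. from CPAN), save it as mojo-crawler.pl and run it with perl; it prints each crawled URL, the page title, and the links it enqueues. Because of the host check (next if $link->host ne $url->host) it never leaves the host(s) of the seed URLs, so pointing it at another site only means changing the seed list, for example (example.com is just a placeholder):

my @urls = map { Mojo::URL->new($_) } qw(
    https://example.com/
);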
