Simple web crawler/scraper implemented using Mojolicious
#!/usr/bin/env perl
use 5.010;
use open qw(:locale);
use strict;
use utf8;
use warnings qw(all);

use Mojo::UserAgent;

# FIFO queue
my @urls = map { Mojo::URL->new($_) } qw(
    http://sysd.org/page/1/
    http://sysd.org/page/2/
    http://sysd.org/page/3/
);

# Limit parallel connections to 4
my $max_conn = 4;

# User agent following up to 5 redirects
my $ua = Mojo::UserAgent->new(max_redirects => 5);
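# Pick up proxy servers from the HTTP_PROXY/HTTPS_PROXY environment variables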
$ua->proxy->detect;

# Keep track of active connections
my $active = 0;

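# An interval of 0 makes this recurring timer fire on every event loop tick,
# refilling the connection pool up to $max_conn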
Mojo::IOLoop->recurring(
    0 => sub {
        for ($active + 1 .. $max_conn) {

            # Dequeue or halt if there are no active crawlers anymore
            return ($active or Mojo::IOLoop->stop)
                unless my $url = shift @urls;

            # Fetch non-blocking just by adding
            # a callback and marking as active
            ++$active;
            $ua->get($url => \&get_callback);
        }
    }
);

# Start event loop if necessary
Mojo::IOLoop->start unless Mojo::IOLoop->is_running;

sub get_callback {
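    # The callback receives the Mojo::UserAgent object first;
    # only the transaction is needed here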
    my (undef, $tx) = @_;

    # Deactivate
    --$active;

    # Parse only OK HTML responses
    return
        if not $tx->res->is_status_class(200)
        or $tx->res->headers->content_type !~ m{^text/html\b}ix;

    # Request URL
    my $url = $tx->req->url;

    say $url;
    parse_html($url, $tx);

    return;
}

sub parse_html {
    my ($url, $tx) = @_;

    # Guard against pages without a <title> element,
    # which would otherwise crash the callback
    my $title = $tx->res->dom->at('html title');
    say $title ? $title->text : '';

    # Extract and enqueue URLs
    for my $e ($tx->res->dom('a[href]')->each) {

        # Validate href attribute
        my $link = Mojo::URL->new($e->{href});
        next if 'Mojo::URL' ne ref $link;

        # "Normalize" link: resolve relative URLs against
        # the page URL and strip any #fragment
        $link = $link->to_abs($tx->req->url)->fragment(undef);
        next unless grep { $link->protocol eq $_ } qw(http https);

        # Don't go deeper than /a/b/c
        next if @{$link->path->parts} > 3;

        # Access every link only once; the current (post-redirect)
        # page itself is also marked as visited
        state $uniq = {};
        ++$uniq->{$url->to_string};
        next if ++$uniq->{$link->to_string} > 1;

        # Don't visit other hosts
        next if $link->host ne $url->host;

        push @urls, $link;
        say " -> $link";
    }
    say '';

    return;
}
__DATA__
Featured at:
http://blogs.perl.org/users/stas/2013/01/web-scraping-with-modern-perl-part-1.html
As of December 23, 2016, is_status_class is deprecated. As such, this script no longer runs unmodified on current Mojolicious releases.
@gdog2u aww.
Maybe is_success can be used instead of is_status_class?
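That substitution should work on newer Mojolicious releases, where is_success is true for 2xx responses. A minimal, untested sketch of the check in get_callback:

# Parse only successful HTML responses
return
    if not $tx->res->is_success
    or $tx->res->headers->content_type !~ m{^text/html\b}ix;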
The line ++$uniq->{$url->to_string}; can be deleted, I think.