Event-source Mojolicious crawler
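A Mojolicious controller that crawls a site and streams every fetched page back to the client as server-sent events. It works through a FIFO queue seeded from the url parameter, keeps up to four requests in flight at once, stays on the seed URL's host, and stops after 30 URLs.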
package WebCrawl::Crawl;
use Mojo::Base 'Mojolicious::Controller';

use Mojo::IOLoop;
use Mojo::JSON qw(encode_json);
use Mojo::URL;
use Mojo::UserAgent;
use WWW::RobotRules;
sub crawl {
    my $self = shift;

    # Increase the inactivity timeout for the connection a bit
    Mojo::IOLoop->stream($self->tx->connection)->timeout(15);

    # Change content type to an event stream
    $self->res->headers->content_type('text/event-stream');

    my $seed_url = $self->param('url');

    # FIFO queue
    my @urls = (Mojo::URL->new($seed_url));

    # User agent following up to 5 redirects
    my $ua = Mojo::UserAgent->new(max_redirects => 5);

    # Robots.txt rules ($rules was used but never defined in the original;
    # WWW::RobotRules is assumed here, seeded with one blocking request)
    my $rules      = WWW::RobotRules->new('WebCrawl/1.0');
    my $robots_url = Mojo::URL->new($seed_url)->path('/robots.txt');
    $rules->parse("$robots_url", $ua->get($robots_url)->res->body);

    # Track accessed URLs; the seed URL counts as already visited
    my %uniq;
    $uniq{$seed_url} = 1;

    my $active    = 0;
    my $url_count = 0;
    # Named subs nested inside crawl() don't share its lexicals reliably
    # ("will not stay shared"), so both helpers are lexical code refs
    my $parse = sub {
        my ($tx) = @_;

        # Request URL and page title (guard against pages without a title)
        my $url   = $tx->req->url;
        my $title = $tx->res->dom->at('html title');
        $title = $title ? $title->text : '';

        my $data = encode_json({
            url   => $url->to_string,
            title => $title,
        });
        $self->write("event:url\ndata:$data\n\n");

        # Extract and enqueue URLs
        for my $e ($tx->res->dom('a[href]')->each) {

            # Validate href attribute
            my $link = Mojo::URL->new($e->{href});
            next if 'Mojo::URL' ne ref $link;

            # "Normalize" link
            $link = $link->to_abs($tx->req->url)->fragment(undef);
            next unless $link->protocol =~ /^https?$/x;

            # Access every link only once
            next if ++$uniq{$link->to_string} > 1;

            # Don't visit other hosts (including sub-domains)
            next if $link->host ne $url->host;

            push @urls, $link;
        }
        return;
    };
    my $get_callback = sub {
        my (undef, $tx) = @_;

        # Report failed requests (e.g. timeouts) and free the crawler slot
        # (the original returned without decrementing $active, leaking a slot)
        if (!$tx->res->code) {
            my $data = encode_json({
                url    => $tx->req->url->to_string,
                title  => 'Error: timed out',
                status => 500,
            });
            $self->write("event:url\ndata:$data\n\n");
            --$active;
            return;
        }

        # Parse only OK HTML responses
        if ($tx->res->code == 200
            && ($tx->res->headers->content_type // '') =~ m{^text/html}i)
        {
            $parse->($tx);
        }

        # Deactivate
        --$active;
        return;
    };
    Mojo::IOLoop->recurring(
        0 => sub {

            # Keep up to 4 parallel crawlers sharing the same user agent
            for ($active .. 4 - 1) {

                # Dequeue, or halt once no crawlers are active anymore
                return ($active or Mojo::IOLoop->stop)
                    unless my $url = shift @urls;

                # Limit the crawl to 30 URLs
                return ($active or Mojo::IOLoop->stop)
                    unless $url_count < 30;

                # Skip URLs disallowed by robots.txt (checked before marking
                # the crawler active; the original incremented $active first,
                # leaking a slot whenever a URL was disallowed)
                next unless $rules->allowed("$url");

                # Fetch non-blocking just by adding a callback
                # and marking the crawler as active
                ++$active;
                ++$url_count;
                $ua->get($url => $get_callback);
            }
        }
    );
    # Start event loop if necessary
    Mojo::IOLoop->start unless Mojo::IOLoop->is_running;
}

1;
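The gist ends with the controller; the application wiring is not shown. A minimal sketch of how it could be mounted, assuming an application class named WebCrawl (the /crawl route path is illustrative):

package WebCrawl;
use Mojo::Base 'Mojolicious';

sub startup {
    my $self = shift;

    # GET /crawl?url=... is dispatched to WebCrawl::Crawl::crawl above
    $self->routes->get('/crawl')->to('crawl#crawl');
}

1;

A browser would then consume the stream with an EventSource for /crawl?url=..., listening for the custom "url" events emitted by $self->write("event:url\ndata:$data\n\n").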