Skip to content

Instantly share code, notes, and snippets.

@robhammond
Created November 27, 2012 17:46
Show Gist options
  • Save robhammond/4155823 to your computer and use it in GitHub Desktop.
Save robhammond/4155823 to your computer and use it in GitHub Desktop.
Basic Mojo UserAgent parallel crawler
#!/usr/bin/env perl
use Modern::Perl;
use Mojo::UserAgent;
use Mojo::IOLoop;
use Mojo::URL;
# FIFO queue
my $seed_url = 'http://www.google.co.uk/';
my @urls = ($seed_url);
# User agent following up to 5 redirects
my $ua = Mojo::UserAgent->new(max_redirects => 5);
my %seen = ($seed_url => 1);
my $active = 0;
Mojo::IOLoop->recurring(0 => sub {
# Keep up to 4 parallel crawlers sharing the same user agent
for ($active .. 4 - 1) {
# Dequeue or halt if there are no active crawlers anymore
return ($active or Mojo::IOLoop->stop) unless my $url = shift @urls;
# Fetch non-blocking just by adding a callback and marking as active
++$active;
$ua->get($url => sub {
my ($ua, $tx) = @_;
# Extract and enqueue URLs
say $url;
for my $e ($tx->res->dom('a[href]')->each) {
my $link = Mojo::URL->new($e->{href})->to_abs($tx->req->url);
$link = clean_url($link);
next unless is_valid_url($link);
next if ($seen{$link});
push @urls, $link;
$seen{$link} = 1;
say " -> $urls[-1]";
}
my $title = $tx->res->dom->at('title')->text;
# Deactivate
--$active;
});
}
});
# Start event loop if necessary
Mojo::IOLoop->start unless Mojo::IOLoop->is_running;
sub is_valid_url {
my $url = shift;
if ($url =~ m{\.(?:css|js|png|pdf|jpe?g|docx?|xlsx?|pptx?|gif)$}i) {
return undef;
}
# don't crawl external sites
unless ($url =~ m{^$seed_url}i) {
return undef;
}
return 1;
}
sub clean_url {
my $link = shift;
# remove url fragments
$link =~ s!#.*$!!;
# remove session id params
$link =~ s![?&]j?session(?:id)?=[^&]+!!gi;
# remove google analytics tracking
$link =~ s![?&]utm_(?:medium|source|campaign|content)=[^&]+!!gi;
return $link;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment