Skip to content

Instantly share code, notes, and snippets.

@dinkypumpkin
Forked from jberger/URLQueue.pl
Created September 28, 2016 22:31
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save dinkypumpkin/0335e7d464b30762e28562e118adf2cc to your computer and use it in GitHub Desktop.
Save dinkypumpkin/0335e7d464b30762e28562e118adf2cc to your computer and use it in GitHub Desktop.
Modularization of my answer from SO on URL queuing for non-blocking ua
#!/usr/bin/env perl
package Mojo::URLQueue;
use Mojo::Base 'Mojo::EventEmitter';
use Mojo::UserAgent;
has queue => sub { [] };
has ua => sub { Mojo::UserAgent->new(max_redirects => 5) };
has concurrency => 4;
sub start {
my ($self, $cb) = @_;
return unless @{ $self->queue };
unless ( $self->{delay} ) {
$self->{concurrent} = 0;
$self->{delay} = Mojo::IOLoop->delay;
$self->{delay}->on(finish => sub{
warn("Loop ended before queue depleted\n") if @{ $self->queue };
undef $self->{delay};
$self->$cb() if $cb;
});
}
$self->_refresh;
# Start event loop if necessary
$self->{delay}->wait unless $self->{delay}->ioloop->is_running;
}
sub _refresh {
my $self = shift;
my $concurrency = $self->concurrency;
while ( $self->{concurrent} < $concurrency and my $url = shift @{ $self->queue } ) {
$self->{concurrent}++;
my $end = $self->{delay}->begin;
$self->ua->get($url => sub{
my ($ua, $tx) = @_;
$self->emit( process => $tx );
# refresh worker pool
$self->{concurrent}--;
$self->_refresh;
$end->();
});
}
}
package main;
use Mojo::Base -strict;
use Mojo::URL;
use utf8::all;
# FIFO queue
my @urls = qw(
http://sysd.org/page/1/
http://sysd.org/page/2/
http://sysd.org/page/3/
);
my $q = Mojo::URLQueue->new( queue => \@urls );
$q->on( process => \&process );
$q->start(sub { say 'Finished' });
sub process {
my ($q, $tx) = @_;
my $queue = $q->queue;
# Parse only OK HTML responses
return unless
$tx->res->is_status_class(200)
and $tx->res->headers->content_type =~ m{^text/html\b}ix;
# Request URL
my $url = $tx->req->url;
say "Processing $url";
push @$queue, parse_html($url, $tx);
}
sub parse_html {
my ($url, $tx) = @_;
state %visited;
my @links;
my $dom = $tx->res->dom;
say $dom->at('html title')->text;
# Extract and enqueue URLs
$dom->find('a[href]')->each(sub{
# Validate href attribute
my $link = Mojo::URL->new($_->{href});
return unless eval { $link->isa('Mojo::URL') };
# "normalize" link
$link = $link->to_abs($url)->fragment(undef);
return unless grep { $link->protocol eq $_ } qw(http https);
# Don't go deeper than /a/b/c
return if @{$link->path->parts} > 3;
# Access every link only once
return if $visited{$link->to_string}++;
# Don't visit other hosts
return if $link->host ne $url->host;
push @links, $link;
say " -> $link";
});
say '';
return @links;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment