Skip to content

Instantly share code, notes, and snippets.

@klopp
Last active July 13, 2016 15:26
Show Gist options
  • Save klopp/baef16317ddfc7e321745480b9ab1167 to your computer and use it in GitHub Desktop.
Save klopp/baef16317ddfc7e321745480b9ab1167 to your computer and use it in GitHub Desktop.
Тестовое задание
#!/usr/bin/env perl
# ------------------------------------------------------------------------------
# Написать скрипт, который принимает в аргументах url с сайтом и делает
# рекурсивный обход всех страниц этого сайта (как указанного в аргументе,
# так и его поддомены, если они будут встречаться) и выводит их в виде дерева
# на stdout.
#
# Необходимо использовать Mojo::IOLoop, Mojo::UserAgent и Mojo::DOM для решения.
#
# Всё должно обрабатываться в одном процессе, одновременных запросов к любому
# домену должно быть не более 6.
# ------------------------------------------------------------------------------
use Modern::Perl;
use Mojo::UserAgent;
use Mojo::IOLoop;
use Mojo::DOM;
use Mojo::URL;
use Const::Fast;
# ------------------------------------------------------------------------------
# Devel stuff:
# ------------------------------------------------------------------------------
use Carp qw/confess/;
use Data::Printer;
# ------------------------------------------------------------------------------
const my $DEPTH_LIMIT => 3;
const my $PAGES_PER_DOMAIN => 10;
const my $THREADS_LIMIT => 6;
const my $VALID_PROTO => qr/^http/;
const my $DEFAULT_PROTO => 'http'; # for //links
const my $DEBUG => 0;
const my %UA_OPTIONS => ( max_redirects => 5 );
# ------------------------------------------------------------------------------
my %url_tree;
my @queue;
my %urls;
my %domain_pages;
# ------------------------------------------------------------------------------
$ARGV[0] = 'http://google.com' if $DEBUG;
$ARGV[0] or _usage();
my $baseurl = Mojo::URL->new( $ARGV[0] )->to_abs();
my $basehost = $baseurl->host();
$basehost or _usage();
$basehost = qr/$basehost$/;
push @queue, $ARGV[0];
my $ua = Mojo::UserAgent->new(%UA_OPTIONS);
my $spider;
my $active_requests = 0;
$spider = sub {
my $url = shift @queue;
unless ($url) {
return $active_requests ? undef : Mojo::IOLoop->stop();
}
$ua->get(
$url => sub {
my ( $_ua, $tx ) = @_;
return unless $tx->success();
$active_requests++;
my $added = 0;
say $url if $DEBUG;
$tx->res->dom('a[href]')->each(
sub {
my ( $e, $_id ) = @_;
say '>> ' . $e->{href} if $DEBUG;
my $url
= MURL->new( $e->{href} )->to_abs( $tx->req->url );
my $host = $url->host();
my $proto = $url->protocol() || $DEFAULT_PROTO;
if ( $host && $host =~ /^www\./ ) {
$host =~ s/^www\.//;
$url->host($host);
}
if ( $host
&& $proto
&& $host =~ $basehost
&& $proto =~ $VALID_PROTO
&& $url->depth() <= $DEPTH_LIMIT )
{
my $geturl = $url->opaque();
$urls{$geturl} ||= 0;
$domain_pages{$host} ||= 0;
$active_requests--, return if $urls{$geturl};
$urls{$geturl} = 1;
$domain_pages{$host}++;
$active_requests--, return
if $domain_pages{$host} > $PAGES_PER_DOMAIN;
say '>> ' . $url->depth() . " -> $url" if $DEBUG;
$added++;
push @queue, $proto . ':' . $geturl;
_put_to_tree($url);
}
}
);
$active_requests--;
$spider->() if $added;
}
);
};
$spider->() for 1 .. $THREADS_LIMIT;
Mojo::IOLoop->start();
_print_tree( \%url_tree, 0 );
# ------------------------------------------------------------------------------
sub _print_tree {
my ( $branch, $level ) = @_;
foreach my $part ( sort { length $a <=> length $b } keys %{$branch} ) {
if ($level) {
print ' ' x $level;
say "/$part";
}
else {
say "\n$part";
}
_print_tree( $branch->{$part}, $level + 1 )
if defined $branch->{$part};
}
}
# ------------------------------------------------------------------------------
sub _put_to_tree {
my ($url) = @_;
my $path = $url->path();
$path =~ s/\/+$//;
my $host = $url->host();
$url_tree{$host} = undef unless exists $url_tree{$host};
if ($path) {
$url_tree{$host} = {} unless $url_tree{$host};
my @parts = split( /\/+/, $path );
my $branch = $url_tree{$host};
$parts[$#parts] .= $url->query() if $url->query();
foreach my $part (@parts) {
next unless $part;
$branch->{$part} ||= {};
$branch = $branch->{$part};
}
}
}
# ------------------------------------------------------------------------------
sub _usage {
die "Usage: $0 base_url\n";
}
# ------------------------------------------------------------------------------
package MURL;
use Modern::Perl;
use Mojo::URL;
use base qw/Mojo::URL/;
# ------------------------------------------------------------------------------
sub opaque {
my ($self) = @_;
my $url = Mojo::URL->new($self);
$url->fragment('') if $url->fragment();
$url->scheme('');
return $url;
}
# ------------------------------------------------------------------------------
sub depth {
my ($self) = @_;
my $path = $self->path()->to_abs_string();
$path =~ s/\/+$//;
my $depth = split( /[^\/]+/, $path );
return $depth;
}
# ------------------------------------------------------------------------------
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment