Skip to content

Instantly share code, notes, and snippets.

@niczero
Created June 10, 2016 13:20
Show Gist options
  • Save niczero/a220b03a95f7979a74d8ea155abadd15 to your computer and use it in GitHub Desktop.
Save niczero/a220b03a95f7979a74d8ea155abadd15 to your computer and use it in GitHub Desktop.
Check for dead links recursively on a page
#!/usr/bin/env perl
use strict;
use warnings;
use Mojo::UserAgent;
my $base = shift or die "Usage: $0 http://example.com/foo/bar\n";
my @queue = ([$base = Mojo::URL->new($base)]);
my $ua = Mojo::UserAgent->new;
my %info;
Mojo::IOLoop->delay(\&checker)->wait;
exit 0;
sub checker {
my ($delay, @tx) = @_;
my $n = 0;
for my $tx (@tx) {
my $doc_url = $tx->req->url;
unless ($info{reported}{$doc_url}++) {
printf "%s - %s - %s\n", $tx->res->code || '000', $info{referrer}{$doc_url}, $doc_url->path;
if ($tx->res->headers->content_type =~ /html/) {
$ua->get($doc_url => $delay->begin);
$n++;
next;
}
}
$tx->res->dom->find('a')->each(
sub {
my $next_url = Mojo::URL->new($_->{href} || '');
$next_url->scheme($base->scheme)->authority($base->authority) unless $next_url->authority;
return if $info{seen}{$next_url}++;
return warn "Skip mail: $_->{href}\n" if $_->{href} and $_->{href} =~ /^mailto:/;
return warn "Skip host: $next_url\n" if $next_url->host ne $base->host;
return warn "Skip len: $next_url\n" if length $next_url < length $base;
push @queue, [$next_url, $doc_url];
}
);
}
while (my $q = shift @queue) {
last if ++$n > 10;
$info{referrer}{$q->[0]} = $q->[1] ? $q->[1]->path : $q->[0];
$ua->head($q->[0] => $delay->begin);
}
unshift @{$delay->remaining}, \&checker if $n;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment