Skip to content

Instantly share code, notes, and snippets.

@robhammond
Last active October 1, 2015 21:29
Show Gist options
  • Save robhammond/c6f9f1677c6a3d166a5e to your computer and use it in GitHub Desktop.
Save robhammond/c6f9f1677c6a3d166a5e to your computer and use it in GitHub Desktop.
Non-blocking HTTP status checker script using Mojolicious. Accepts text file 'urls.txt' and writes tab-separated results to 'http-status.tsv'
#!/usr/bin/env perl
use strict;
use utf8;
use warnings qw(all);
use Modern::Perl;
use Mojo::Util qw(decode encode html_unescape xml_escape);
use Mojo::DOM;
use Mojo::Log;
use Mojo::Asset::File;
use File::Slurp;
use Mojo::UserAgent;
# Application-wide logger (Mojo::Log defaults to STDERR).
my $log = Mojo::Log->new;

# change user agent string and add contact email for webmasters
my $user_agent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1700.77 Safari/537.36';

# Input: one URL per line; lines are chomped as they are dequeued later.
my $input_urls = 'urls.txt';
my @urls = read_file($input_urls);

# In-memory asset accumulating the tab-separated report; it is moved to
# 'http-status.tsv' after the event loop finishes.
my $file = Mojo::Asset::File->new;
$file->add_chunk("Request URL\tResult URL\tStatus\tServer\tContent Type\tContent Length\tRedirects\n");

# Limit parallel connections to 5
my $max_conn = 5;

# User agent following up to 25 redirects
my $ua = Mojo::UserAgent->new(max_redirects => 25);
$ua->proxy->detect;

# Fix: $user_agent was previously defined but never used, so requests went
# out with Mojolicious' default User-Agent header. Apply it to the UA here.
$ua->transactor->name($user_agent);

# Keep track of active connections
my $active = 0;
# Recurring zero-second timer: on each tick, top the pool of in-flight
# requests back up to $max_conn by dequeuing URLs from @urls. Stops the
# event loop once the queue is empty and no requests remain active.
Mojo::IOLoop->recurring(
    0 => sub {
        for ($active + 1 .. $max_conn) {

            # Dequeue or halt if there are no active crawlers anymore
            return ($active or Mojo::IOLoop->stop)
                unless my $url = shift @urls;
            chomp($url);

            # Fetch non-blocking just by adding
            # a callback and marking as active
            ++$active;
            $ua->get($url => sub {
                my (undef, $tx) = @_;

                # Deactivate
                --$active;

                # Connection-level failure (no HTTP status at all): log it and
                # emit a row carrying the error message in the Status column.
                # Fix: the old code called the nonexistent $tx->res->status
                # (fatal) and interpolated the error hashref into the row.
                if (!$tx->res->code) {
                    my $err = $tx->res->error;
                    my $msg = ($err && $err->{message}) // 'unknown error';
                    $log->info("error $msg");

                    # Seven fields to match the seven-column header.
                    my $row = join "\t",
                        $tx->req->url, $tx->req->url, $msg, '', '', '', '';
                    $file->add_chunk("$row\n");
                    return;
                }

                # Collect the redirect chain (if any); the first hop carries
                # the URL that was originally requested.
                my @redirects;
                my $redirects_csv = '';
                my $orig_url      = $tx->req->url;
                if (my $redirs = $tx->redirects) {
                    $orig_url = $redirs->[0]->req->url if @$redirs;
                    for my $redir (@$redirs) {
                        push @redirects,
                            { url => $redir->req->url, status => $redir->res->code };
                        $redirects_csv
                            .= $redir->req->url . ' (' . $redir->res->code . ")\t";
                    }
                }

                # Fix: 4xx/5xx responses were previously written twice (once
                # without redirects, then again below); a single row now
                # covers every status code. Headers may be absent, so default
                # to '' to keep join() free of uninitialized-value warnings.
                my $headers = $tx->res->headers;
                my $row     = join "\t",
                    $orig_url,
                    $tx->req->url,
                    $tx->res->code,
                    $headers->server         // '',
                    $headers->content_type   // '',
                    $headers->content_length // '';
                say $row;
                $file->add_chunk("$row\t$redirects_csv\n");
                return;
            });
        }
    }
);
# Kick off the event loop (a no-op if something else already started it).
Mojo::IOLoop->start if !Mojo::IOLoop->is_running;

# Persist the in-memory report to disk, then echo its contents.
$file->move_to('http-status.tsv');
say $file->slurp;
__DATA__
Forked from:
http://blogs.perl.org/users/stas/2013/01/web-scraping-with-modern-perl-part-1.html
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment