Skip to content

Instantly share code, notes, and snippets.

@robhammond
Last active October 1, 2015 21:29
Show Gist options
  • Save robhammond/c6f9f1677c6a3d166a5e to your computer and use it in GitHub Desktop.
Save robhammond/c6f9f1677c6a3d166a5e to your computer and use it in GitHub Desktop.
Non-blocking HTTP status checker script using Mojolicious. Accepts text file 'urls.txt' and writes tab-separated results to 'http-status.tsv'
#!/usr/bin/env perl
use strict;
use utf8;
use warnings qw(all);
use Modern::Perl;
use Mojo::Util qw(decode encode html_unescape xml_escape);
use Mojo::DOM;
use Mojo::Log;
use Mojo::Asset::File;
use File::Slurp;
use Mojo::UserAgent;
# Application-wide logger (Mojo::Log defaults to STDERR).
my $log = Mojo::Log->new;

# change user agent string and add contact email for webmasters
my $user_agent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1700.77 Safari/537.36';

# Input: one URL per line; lines are chomped as they are dequeued later.
my $input_urls = 'urls.txt';
my @urls = read_file($input_urls);

# In-memory asset accumulating the tab-separated report; it is moved to
# 'http-status.tsv' after the event loop finishes.
my $file = Mojo::Asset::File->new;
$file->add_chunk("Request URL\tResult URL\tStatus\tServer\tContent Type\tContent Length\tRedirects\n");

# Limit parallel connections to 5
my $max_conn = 5;

# User agent following up to 25 redirects
my $ua = Mojo::UserAgent->new(max_redirects => 25);
$ua->proxy->detect;

# Fix: $user_agent was previously defined but never used, so requests went
# out with Mojolicious' default User-Agent header. Apply it to the UA here.
$ua->transactor->name($user_agent);

# Keep track of active connections
my $active = 0;
# Recurring zero-second timer: on each tick, top the pool of in-flight
# requests back up to $max_conn by dequeuing URLs from @urls. Stops the
# event loop once the queue is empty and no requests remain active.
Mojo::IOLoop->recurring(
    0 => sub {
        for ($active + 1 .. $max_conn) {

            # Dequeue or halt if there are no active crawlers anymore
            return ($active or Mojo::IOLoop->stop)
                unless my $url = shift @urls;
            chomp($url);

            # Fetch non-blocking just by adding
            # a callback and marking as active
            ++$active;
            $ua->get($url => sub {
                my (undef, $tx) = @_;

                # Deactivate
                --$active;

                # Connection-level failure (no HTTP status at all): log it and
                # emit a row carrying the error message in the Status column.
                # Fix: the old code called the nonexistent $tx->res->status
                # (fatal) and interpolated the error hashref into the row.
                if (!$tx->res->code) {
                    my $err = $tx->res->error;
                    my $msg = ($err && $err->{message}) // 'unknown error';
                    $log->info("error $msg");

                    # Seven fields to match the seven-column header.
                    my $row = join "\t",
                        $tx->req->url, $tx->req->url, $msg, '', '', '', '';
                    $file->add_chunk("$row\n");
                    return;
                }

                # Collect the redirect chain (if any); the first hop carries
                # the URL that was originally requested.
                my @redirects;
                my $redirects_csv = '';
                my $orig_url      = $tx->req->url;
                if (my $redirs = $tx->redirects) {
                    $orig_url = $redirs->[0]->req->url if @$redirs;
                    for my $redir (@$redirs) {
                        push @redirects,
                            { url => $redir->req->url, status => $redir->res->code };
                        $redirects_csv
                            .= $redir->req->url . ' (' . $redir->res->code . ")\t";
                    }
                }

                # Fix: 4xx/5xx responses were previously written twice (once
                # without redirects, then again below); a single row now
                # covers every status code. Headers may be absent, so default
                # to '' to keep join() free of uninitialized-value warnings.
                my $headers = $tx->res->headers;
                my $row     = join "\t",
                    $orig_url,
                    $tx->req->url,
                    $tx->res->code,
                    $headers->server         // '',
                    $headers->content_type   // '',
                    $headers->content_length // '';
                say $row;
                $file->add_chunk("$row\t$redirects_csv\n");
                return;
            });
        }
    }
);
# Kick off the event loop (a no-op if something else already started it).
Mojo::IOLoop->start if !Mojo::IOLoop->is_running;

# Persist the in-memory report to disk, then echo its contents.
$file->move_to('http-status.tsv');
say $file->slurp;
__DATA__
Forked from:
http://blogs.perl.org/users/stas/2013/01/web-scraping-with-modern-perl-part-1.html
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment