Simple Perl-based crawler

Simple concurrent (non-blocking) crawler built on Mojo::UserAgent and Mojo::IOLoop. It reads seed URLs from standard input, fetches up to four pages at a time, prints every HTML page it successfully retrieves, and enqueues same-host links it finds. An example input file is sketched after the sample run below.

  % cat test | perl ./test.pl
  http://50.205.50.130
  http://50.205.50.130/district/servlet/presentdistrictloginform.do;jsessionid=AD7B955AD398F6F2EFBECFD39586E585
  http://50.205.50.130/common/welcome.jsp;jsessionid=AD7B955AD398F6F2EFBECFD39586E585?site=100
  http://50.205.50.130/backoffice/servlet/presentaboutform.do;jsessionid=AD7B955AD398F6F2EFBECFD39586E585
  http://113.106.154.162:80/lcxweb/index.ext
  http://113.106.154.162:80/lcxweb/msg.ext?pmethod=getMsgById2&mdlid=49&mdlcode=zyzhize
  http://113.106.154.162:80/lcxweb/msg.ext?pmethod=getMsgById2&mdlid=49&mdlcode=zuzhijigou
  http://113.106.154.162:80/lcxweb/msg.ext?pmethod=getMsgById2&mdlid=49&mdlcode=lingdaobanzi
  http://113.106.154.162:80/lcxweb/msg.ext?pmethod=getCommonPage&leibie=%25CA%25E0%25C5%25A6%25D4%25CB%25D0%25D0&lanmu=%25B7%25C0%25BA%25E9%25B5%25F7%25B6%25C8&lanmuCode=fanghongdiaodu
  http://113.106.154.162:80/lcxweb/msg.ext?pmethod=getCommonPage&leibie=%25CA%25E0%25C5%25A6%25D4%25CB%25D0%25D0&lanmu=%25B7%25A2%25B5%25E7%25B9%25DC%25C0%25ED&lanmuCode=fadianxinxi
  http://113.106.154.162:80/lcxweb/msg.ext?pmethod=getCommonPage&leibie=%25CA%25E0%25C5%25A6%25D4%25CB%25D0%25D0&lanmu=%25B0%25B2%25C8%25AB%25C9%25FA%25B2%25FA&lanmuCode=aqshengchan
  http://131.108.103.199
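
The contents of the test file above are not included in the gist; the script simply expects one URL (or bare host) per line on standard input. A minimal, purely hypothetical input file could look like this:

  % cat test
  http://example.com
  https://example.org/index.html
  http://203.0.113.10:8080/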
#!/usr/bin/env perl
use 5.010;
use open qw(:locale);
use strict;
use utf8;
use warnings qw(all);
use URI::URL;
use File::Grep qw( fgrep fmap fdo );
use Data::Dumper;
use Mojo::UserAgent;
use Mojo::URL;
my $host;
my @hosts = ();
# my $filename = shift or die("need input!");
# if(-f $filename) {
# @hosts = fgrep { /(http.+)/ } glob $filename;
# # print SUMMARY $_ foreach @matches;
#
# }
#
# print Dumper(@hosts)."\n";
# die;
# foreach(split(/\n/,$host) ) {
# if ($url->eq("http://www.sn.no")) or die;
#
# next if !($_=~/((http|https):\/\/.+)/ig);
# print "$1\n";
# }
# print "@hosts\n";
# FIFO queue
# my @hosts = fgrep { /(http.+)/ } glob $filename;
# my $murl = Mojo::URL->new
# Read seed URLs from STDIN (or from files named on the command line) and
# keep only the host part; Mojo::UserAgent fetches bare hosts over HTTP
my @urls = map { chomp; Mojo::URL->new($_)->host } <>;
# }
# Limit parallel connections to 4
my $max_conn = 4;
# User agent following up to 5 redirects
my $ua = Mojo::UserAgent->new(max_redirects => 5);
$ua->proxy->detect;
# Keep track of active connections
my $active = 0;
Mojo::IOLoop->recurring(
  0 => sub {
    for ($active + 1 .. $max_conn) {

      # Dequeue or halt if there are no active crawlers anymore
      return ($active or Mojo::IOLoop->stop)
        unless my $url = shift @urls;

      # Fetch non-blocking just by adding
      # a callback and marking as active
      ++$active;
      $ua->get($url => \&get_callback);
    }
  }
);
# Start event loop if necessary
Mojo::IOLoop->start unless Mojo::IOLoop->is_running;
sub get_callback {
  my (undef, $tx) = @_;

  # Deactivate
  --$active;

  # Report transport errors (no HTTP status code at all) and give up on the URL
  if (!$tx->res->code) {
    print "ERROR:\t" . $tx->req->url . "\n";
    return;
  }

  # Parse only OK HTML responses
  return
    if $tx->res->code != 200
    or ($tx->res->headers->content_type // '') !~ m{^text/html\b}ix;

  # Request URL
  my $url = $tx->req->url;
  say $url;

  parse_html($url, $tx);
  return;
}
sub parse_html {
  my ($url, $tx) = @_;

  # print $url."\n";
  # say $tx->res->dom->at('html title')->text if ($tx->res->dom->at('html title')->text);
  my $counter = 0;

  # Extract and enqueue URLs
  for my $e ($tx->res->dom('a[href]')->each) {

    # Validate href attribute
    my $link = Mojo::URL->new($e->{href});
    next if 'Mojo::URL' ne ref $link;

    # Only consider the first 10 links on a page
    last if ++$counter > 10;

    # "Normalize" link
    $link = $link->to_abs($tx->req->url)->fragment(undef);
    next unless grep { $link->protocol eq $_ } qw(http https);

    # Don't go deeper than /a/b/c
    next if @{$link->path->parts} > 3;

    # Access every link only once
    state $uniq = {};
    ++$uniq->{$url->to_string};
    next if ++$uniq->{$link->to_string} > 1;

    # Don't visit other hosts
    next if $link->host ne $url->host;

    push @urls, $link;
    say "$link";
  }

  # say '';
  return;
}
__DATA__
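
Because the seeds are read with Perl's <> operator, the input file can also be passed as a command-line argument instead of being piped in:

  % perl ./test.pl test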