Simple Perl-based crawler

Simple concurrent (non-blocking) crawler built on Mojo::UserAgent and Mojo::IOLoop. It reads seed URLs from standard input, fetches up to four pages at a time, prints every HTML page it successfully retrieves, and enqueues same-host links it finds. An example input file is sketched after the sample run below.

  % cat test | perl ./test.pl
  http://50.205.50.130
  http://50.205.50.130/district/servlet/presentdistrictloginform.do;jsessionid=AD7B955AD398F6F2EFBECFD39586E585
  http://50.205.50.130/common/welcome.jsp;jsessionid=AD7B955AD398F6F2EFBECFD39586E585?site=100
  http://50.205.50.130/backoffice/servlet/presentaboutform.do;jsessionid=AD7B955AD398F6F2EFBECFD39586E585
  http://113.106.154.162:80/lcxweb/index.ext
  http://113.106.154.162:80/lcxweb/msg.ext?pmethod=getMsgById2&mdlid=49&mdlcode=zyzhize
  http://113.106.154.162:80/lcxweb/msg.ext?pmethod=getMsgById2&mdlid=49&mdlcode=zuzhijigou
  http://113.106.154.162:80/lcxweb/msg.ext?pmethod=getMsgById2&mdlid=49&mdlcode=lingdaobanzi
  http://113.106.154.162:80/lcxweb/msg.ext?pmethod=getCommonPage&leibie=%25CA%25E0%25C5%25A6%25D4%25CB%25D0%25D0&lanmu=%25B7%25C0%25BA%25E9%25B5%25F7%25B6%25C8&lanmuCode=fanghongdiaodu
  http://113.106.154.162:80/lcxweb/msg.ext?pmethod=getCommonPage&leibie=%25CA%25E0%25C5%25A6%25D4%25CB%25D0%25D0&lanmu=%25B7%25A2%25B5%25E7%25B9%25DC%25C0%25ED&lanmuCode=fadianxinxi
  http://113.106.154.162:80/lcxweb/msg.ext?pmethod=getCommonPage&leibie=%25CA%25E0%25C5%25A6%25D4%25CB%25D0%25D0&lanmu=%25B0%25B2%25C8%25AB%25C9%25FA%25B2%25FA&lanmuCode=aqshengchan
  http://131.108.103.199
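
The contents of the test file above are not included in the gist; the script simply expects one URL (or bare host) per line on standard input. A minimal, purely hypothetical input file could look like this:

  % cat test
  http://example.com
  https://example.org/index.html
  http://203.0.113.10:8080/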
#!/usr/bin/env perl
use 5.010;
use open qw(:locale);
use strict;
use utf8;
use warnings qw(all);
use URI::URL;
use File::Grep qw( fgrep fmap fdo );
use Data::Dumper;
use Mojo::UserAgent;
use Mojo::URL;
my $host;
my @hosts = ();
# my $filename = shift or die("need input!");
# if(-f $filename) {
# @hosts = fgrep { /(http.+)/ } glob $filename;
# # print SUMMARY $_ foreach @matches;
#
# }
#
# print Dumper(@hosts)."\n";
# die;
# foreach(split(/\n/,$host) ) {
# if ($url->eq("http://www.sn.no")) or die;
#
# next if !($_=~/((http|https):\/\/.+)/ig);
# print "$1\n";
# }
# print "@hosts\n";
# FIFO queue
# my @hosts = fgrep { /(http.+)/ } glob $filename;
# my $murl = Mojo::URL->new
# Read seed URLs from STDIN (or from files named on the command line) and
# keep only the host part; Mojo::UserAgent fetches bare hosts over HTTP
my @urls = map { chomp; Mojo::URL->new($_)->host } <>;
# }
# Limit parallel connections to 4
my $max_conn = 4;
# User agent following up to 5 redirects
my $ua = Mojo::UserAgent->new(max_redirects => 5);
$ua->proxy->detect;
# Keep track of active connections
my $active = 0;
Mojo::IOLoop->recurring(
  0 => sub {
    for ($active + 1 .. $max_conn) {

      # Dequeue or halt if there are no active crawlers anymore
      return ($active or Mojo::IOLoop->stop)
        unless my $url = shift @urls;

      # Fetch non-blocking just by adding
      # a callback and marking as active
      ++$active;
      $ua->get($url => \&get_callback);
    }
  }
);
# Start event loop if necessary
Mojo::IOLoop->start unless Mojo::IOLoop->is_running;
sub get_callback {
  my (undef, $tx) = @_;

  # Deactivate
  --$active;

  # Report transport errors (no HTTP status code at all) and give up on the URL
  if (!$tx->res->code) {
    print "ERROR:\t" . $tx->req->url . "\n";
    return;
  }

  # Parse only OK HTML responses
  return
    if $tx->res->code != 200
    or ($tx->res->headers->content_type // '') !~ m{^text/html\b}ix;

  # Request URL
  my $url = $tx->req->url;
  say $url;

  parse_html($url, $tx);
  return;
}
sub parse_html {
  my ($url, $tx) = @_;

  # print $url."\n";
  # say $tx->res->dom->at('html title')->text if ($tx->res->dom->at('html title')->text);
  my $counter = 0;

  # Extract and enqueue URLs
  for my $e ($tx->res->dom('a[href]')->each) {

    # Validate href attribute
    my $link = Mojo::URL->new($e->{href});
    next if 'Mojo::URL' ne ref $link;

    # Only consider the first 10 links on a page
    last if ++$counter > 10;

    # "Normalize" link
    $link = $link->to_abs($tx->req->url)->fragment(undef);
    next unless grep { $link->protocol eq $_ } qw(http https);

    # Don't go deeper than /a/b/c
    next if @{$link->path->parts} > 3;

    # Access every link only once
    state $uniq = {};
    ++$uniq->{$url->to_string};
    next if ++$uniq->{$link->to_string} > 1;

    # Don't visit other hosts
    next if $link->host ne $url->host;

    push @urls, $link;
    say "$link";
  }

  # say '';
  return;
}
__DATA__
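
Because the seeds are read with Perl's <> operator, the input file can also be passed as a command-line argument instead of being piped in:

  % perl ./test.pl test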