% cat test | perl ./test.pl
http://50.205.50.130
http://50.205.50.130/district/servlet/presentdistrictloginform.do;jsessionid=AD7B955AD398F6F2EFBECFD39586E585
http://50.205.50.130/common/welcome.jsp;jsessionid=AD7B955AD398F6F2EFBECFD39586E585?site=100
http://50.205.50.130/backoffice/servlet/presentaboutform.do;jsessionid=AD7B955AD398F6F2EFBECFD39586E585
http://113.106.154.162:80/lcxweb/index.ext
http://113.106.154.162:80/lcxweb/msg.ext?pmethod=getMsgById2&mdlid=49&mdlcode=zyzhize
http://113.106.154.162:80/lcxweb/msg.ext?pmethod=getMsgById2&mdlid=49&mdlcode=zuzhijigou
http://113.106.154.162:80/lcxweb/msg.ext?pmethod=getMsgById2&mdlid=49&mdlcode=lingdaobanzi
http://113.106.154.162:80/lcxweb/msg.ext?pmethod=getCommonPage&leibie=%25CA%25E0%25C5%25A6%25D4%25CB%25D0%25D0&lanmu=%25B7%25C0%25BA%25E9%25B5%25F7%25B6%25C8&lanmuCode=fanghongdiaodu
http://113.106.154.162:80/lcxweb/msg.ext?pmethod=getCommonPage&leibie=%25CA%25E0%25C5%25A6%25D4%25CB%25D0%25D0&lanmu=%25B7%25A2%25B5%25E7%25B9%25DC%25C0%25ED&lanmuCode=fadianxinxi
http://113.106.154.162:80/lcxweb/msg.ext?pmethod=getCommonPage&leibie=%25CA%25E0%25C5%25A6%25D4%25CB%25D0%25D0&lanmu=%25B0%25B2%25C8%25AB%25C9%25FA%25B2%25FA&lanmuCode=aqshengchan
http://131.108.103.199
Last active
January 1, 2018 02:56
-
-
Save dbiesecke/15001022969c542176fe84468e402bc4 to your computer and use it in GitHub Desktop.
Simple Perl-based crawler
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env perl

# Simple non-blocking web crawler built on Mojo::UserAgent.
# Reads seed URLs (one per line) from STDIN or the files named in @ARGV,
# fetches them with up to $max_conn parallel connections, and prints each
# successfully crawled page URL plus the same-host links it discovers.

use 5.010;
use open qw(:locale);
use strict;
use utf8;
use warnings qw(all);

use URI::URL;
use File::Grep qw( fgrep fmap fdo );
use Data::Dumper;
use Mojo::UserAgent;
use Mojo::URL;

# FIFO queue of Mojo::URL objects still to be fetched.
# Only lines that look like absolute http(s) URLs are enqueued; input is
# chomped so the trailing newline does not end up inside the URL, and the
# full URL (scheme + host + path) is kept so the user agent can fetch it.
my @urls;
while (my $line = <>) {
    chomp $line;
    next unless $line =~ m{\A https?:// }ix;
    push @urls, Mojo::URL->new($line);
}

# Limit parallel connections to 4
my $max_conn = 4;

# User agent following up to 5 redirects
my $ua = Mojo::UserAgent->new(max_redirects => 5);
$ua->proxy->detect;

# Keep track of active connections
my $active = 0;

Mojo::IOLoop->recurring(
    0 => sub {
        for ($active + 1 .. $max_conn) {

            # Dequeue or halt if there are no active crawlers anymore
            return ($active or Mojo::IOLoop->stop)
                unless my $url = shift @urls;

            # Fetch non-blocking just by adding
            # a callback and marking as active
            ++$active;
            $ua->get($url => \&get_callback);
        }
    }
);

# Start event loop if necessary
Mojo::IOLoop->start unless Mojo::IOLoop->is_running;
# Completion callback for every $ua->get: accounts for the finished
# connection, reports transport failures, and hands OK HTML pages to
# parse_html for link extraction.
sub get_callback {
    my (undef, $tx) = @_;

    # Deactivate
    --$active;

    # No HTTP status code at all means a connection-level failure.
    # NOTE: the original `( print ... && return ) if ...` parsed `return`
    # into print's argument list, so the sub returned before printing —
    # the error line was never emitted.
    if (!$tx->res->code) {
        print "ERROR:\t" . $tx->req->url . "\n";
        return;
    }

    # Parse only OK HTML responses; content_type may be undef, so default
    # it to '' to avoid an uninitialized-value warning on the match.
    return
        if $tx->res->code != 200
        or ($tx->res->headers->content_type // '') !~ m{^text/html\b}ix;

    # Request URL
    my $url = $tx->req->url;
    say $url;

    parse_html($url, $tx);
    return;
}
# Extract anchors from a fetched HTML page, print each new same-host link,
# and enqueue it on the shared @urls queue for the crawler loop.
sub parse_html {
    my ($url, $tx) = @_;

    my $seen_anchors = 0;

    # Extract and enqueue URLs
    for my $e ($tx->res->dom('a[href]')->each) {

        # Only consider the first 10 anchors per page. Using `last` instead
        # of `next` stops the scan early; output is identical since nothing
        # below ever ran for anchors past the cap.
        last if ++$seen_anchors > 10;

        # Validate href attribute
        my $link = Mojo::URL->new($e->{href});
        next if 'Mojo::URL' ne ref $link;

        # "normalize" link: absolutize against the fetched page, drop fragment
        $link = $link->to_abs($tx->req->url)->fragment(undef);
        next unless grep { $link->protocol eq $_ } qw(http https);

        # Don't go deeper than /a/b/c
        next if @{$link->path->parts} > 3;

        # Access every link only once ($uniq persists across calls via state);
        # the current page URL is also marked as visited.
        state $uniq = {};
        ++$uniq->{$url->to_string};
        next if ++$uniq->{$link->to_string} > 1;

        # Don't visit other hosts
        next if $link->host ne $url->host;

        push @urls, $link;
        say "$link";
    }
    return;
}

__DATA__
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment