Skip to content

Instantly share code, notes, and snippets.

@AlexDaniel
Created July 2, 2018 02:33
Show Gist options
  • Save AlexDaniel/521c27e47fecda42297f39d3a528718a to your computer and use it in GitHub Desktop.
Save AlexDaniel/521c27e47fecda42297f39d3a528718a to your computer and use it in GitHub Desktop.
#!/usr/bin/env perl6
use v6;
use DOM::Tiny;
use HTTP::UserAgent;
# Shared by MAIN and crawl: MAIN creates it with one permit per worker and
# crawl takes a permit for every page it processes.
my $semaphore;

# Entry point.
#
#   --seed     URL the crawl starts from
#   --file     file the discovered links are appended to
#   --depth    how many link levels to follow from the seed
#   --workers  number of permits in the shared semaphore (must be >= 1)
#
# Starts the crawl at the seed and returns once every background crawl task
# has finished, so the program exits on its own instead of sleeping forever.
sub MAIN(Str :$seed = "http://perl6.org",
         Str :$file = "test.links",
         Int :$depth = 2,
         Int :$workers = 4,
        ) {
    # A semaphore without permits would make every acquire block forever.
    die "--workers must be at least 1" if $workers < 1;

    $semaphore = Semaphore.new: $workers;

    # crawl hands back the Promises of the tasks it started, and each of
    # those Promises is kept with the Promises of its own children. Walk
    # that tree until nothing is pending. Anything that is neither a
    # Promise nor Iterable (e.g. Nil) is a leaf and needs no waiting.
    my sub settle($outcome) {
        if $outcome ~~ Promise {
            my $result = try await $outcome;
            # A broken Promise means one page failed; report it and keep
            # waiting for the remaining tasks.
            with $! { note "crawl task failed: {.message}" }
            settle $result;
        }
        elsif $outcome ~~ Iterable {
            settle $_ for $outcome.list;
        }
    }

    settle crawl($seed, $file, $depth);
}
# Turn the href of a link found on the page $base into an absolute http(s)
# URL. Returns Nil for links that cannot be crawled: empty hrefs, fragment-only
# hrefs and hrefs using another scheme (mailto:, javascript:, ...).
# "." and ".." path segments are passed through unchanged.
sub resolve-link($base, $href) {
    my $target = ($href // '').Str.trim;

    # Anchored test: "http" appearing later in a relative path must not
    # make the href count as absolute.
    return $target if $target ~~ /:i ^ 'http' 's'? '://' /;
    return Nil if $target eq '' or $target.starts-with('#');
    return Nil if $target ~~ /^ <[a..zA..Z]> <[a..zA..Z0..9+.\-]>* ':' /;

    # Split the base into "scheme://authority" and the rest.
    my $origin-match = $base.Str ~~ /^ (<[a..zA..Z]>+) '://' <-[\/?\#]>* /;
    return Nil unless $origin-match;
    my $origin = ~$origin-match;
    my $scheme = ~$origin-match[0];

    # Scheme-relative ("//host/path") and root-relative ("/path") hrefs.
    return $scheme ~ ':' ~ $target if $target.starts-with('//');
    return $origin ~ $target       if $target.starts-with('/');

    # Path of the base document without its query string and fragment.
    my $path = $base.Str.substr($origin.chars).subst(/ <[?\#]> .* $/, '');

    # Query-only hrefs keep the document path.
    return $origin ~ ($path || '/') ~ $target if $target.starts-with('?');

    # Everything else is relative to the directory of the base document.
    my $dir = $path.contains('/') ?? $path.subst(/ <-[\/]>* $/, '') !! '/';
    return $origin ~ $dir ~ $target;
}

# Fetch $url, append the distinct links found on it to $file (one per line)
# and start a background crawl of each link with a depth budget of $depth - 1.
#
# Returns the Promises of the background crawls that were started; the list
# is empty when the depth budget is used up, the page could not be fetched or
# parsed, or no deeper level remains to be crawled.
#
# Pages are not remembered between calls, so a URL reachable through several
# pages is fetched once per page that links to it.
sub crawl($url, $file, $depth) {
    return () if $depth ≤ 0;

    # Hold one permit while this page is processed. LEAVE fires on every
    # exit from the sub, including the early return above, so the flag
    # guards against releasing a permit that was never acquired.
    $semaphore.acquire;
    my $acquired = True;
    LEAVE $semaphore.release if $acquired;

    say $url;

    # A failed request must not take the whole task down silently.
    my $response = try HTTP::UserAgent.new.get($url);
    without $response {
        note "crawl: could not fetch $url ({$! // 'unknown error'})";
        return ();
    }
    unless $response.is-success {
        note "crawl: skipping $url ({$response.status-line})";
        return ();
    }

    # Parse the body only; stringifying the response object would feed the
    # status line and headers to the HTML parser as well.
    my $dom = try DOM::Tiny.parse($response.content);
    without $dom {
        note "crawl: could not parse $url ({$! // 'unknown error'})";
        return ();
    }

    my @links;
    for $dom.find('a[href]') -> $anchor {
        with resolve-link($url, $anchor<href>) -> $link {
            @links.push($link);
        }
    }
    # The same list is written out and crawled, so a link that appears
    # several times on the page is followed only once.
    @links .= unique;

    # One append per page; nothing is written for a page without links.
    spurt $file, @links.join("\n") ~ "\n", :append if @links;

    # Children with no depth budget left would return immediately, so do
    # not start them at all.
    return () unless $depth > 1;

    my @children = do for @links -> $link {
        start crawl($link, $file, $depth - 1);
    }
    return @children;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment