Skip to content

Instantly share code, notes, and snippets.

@Bowlslaw
Created July 2, 2018 16:18
Show Gist options
  • Save Bowlslaw/fb16404d5cff5b34b5e4d73a23ebde86 to your computer and use it in GitHub Desktop.
Save Bowlslaw/fb16404d5cff5b34b5e4d73a23ebde86 to your computer and use it in GitHub Desktop.
multi-threaded crawler
1 #!/usr/bin/env perl6
2
3 use v6;
4
5 use DOM::Tiny;
6 use HTTP::UserAgent;
7
# Shared work queue. Every item is a Pair of URL => remaining crawl depth.
my $channel = Channel.new;

# Entry point.
#
# Named arguments:
#   :$seed    - URL the crawl starts from.
#   :$depth   - depth budget handed to crawl() for the seed URL.
#   :$workers - number of concurrent workers pulling jobs from $channel.
#
# Returns once the queue is empty and no worker is still crawling. The
# original awaited $channel.closed, but nothing ever closed the channel, so
# the program could never terminate.
sub MAIN(Str :$seed = "http://perl6.org", Int :$depth = 4, Int :$workers = 4) {
    $channel.send: $seed => $depth;

    # $active counts workers that have taken a job and not yet finished it.
    # Only such workers can add new jobs, so "queue empty AND $active == 0"
    # (checked while holding the lock) means no further work can ever appear.
    my $state      = Lock.new;
    my Int $active = 0;

    my @pool = do for ^$workers {
        start {
            loop {
                my $job;
                my Bool $drained = False;

                # Taking a job and bumping $active must be one atomic step,
                # otherwise another worker could see an empty queue with
                # $active == 0 while this one is about to start crawling.
                $state.protect: {
                    $job = $channel.poll;
                    if $job.defined {
                        $active++;
                    }
                    elsif $active == 0 {
                        $drained = True;
                    }
                };
                last if $drained;

                # Queue is momentarily empty but someone is still crawling
                # and may enqueue more links: back off briefly and retry.
                without $job {
                    sleep 0.05;
                    next;
                }

                say $*THREAD;
                {
                    crawl($job.key, $job.value);
                    # One bad page must not take the whole worker down.
                    CATCH {
                        default {
                            note "crawl of {$job.key} failed: {.message}";
                        }
                    }
                }
                $state.protect: { $active-- };
            }
        }
    }

    await @pool;
    $channel.close;
}
21
# Turn an href found on the page at $base into an absolute http(s) URL.
# Returns Nil for links that cannot be crawled (empty, fragment-only,
# non-http schemes such as mailto: or javascript:, or an unparsable base).
sub resolve-href(Str() $base, Str() $href) {
    # Already absolute. Anchored, so "/http-docs" is not mistaken for one.
    return $href if $href ~~ /:i ^ https? ':' \/\/ /;

    # Same-page references carry nothing new to fetch.
    return Nil if $href eq '' or $href.starts-with('#');

    # Any other explicit scheme (mailto:, javascript:, ftp:, ...).
    return Nil if $href ~~ /^ <[a..z A..Z]> <[a..z A..Z 0..9 \+ \. \-]>* ':' /;

    # Split the base into scheme://authority and the path (query/fragment dropped).
    return Nil unless $base ~~ /:i ^
        $<origin>=[ $<scheme>=[ https? ] ':' \/\/ <-[ \/ \? \# ]>+ ]
        $<path>=[ <-[ \? \# ]>* ]
    /;
    my $origin = ~$<origin>;
    my $scheme = ~$<scheme>;
    my $path   = ~$<path>;

    # Protocol-relative: //host/path
    return "$scheme:$href" if $href.starts-with('//');

    # Root-relative: /path
    return "$origin$href" if $href.starts-with('/');

    # Document-relative: resolve against the directory part of the base path.
    my $dir = $path.contains('/') ?? $path.subst(/ <-[\/]>* $ /, '') !! '/';
    return "$origin$dir$href";
}

# Fetch $url, hand the parsed DOM to process(), and, while depth budget
# remains, enqueue every crawlable link on the page with $depth - 1.
#
# Parameters:
#   $url   - page to fetch.
#   $depth - remaining depth budget; must be truthy (a call with 0 is rejected
#            by the signature). Links are only enqueued when $depth > 1.
#
# Fetch errors and non-success responses are reported on STDERR and the page
# is skipped rather than letting the exception escape into the worker.
sub crawl($url, $depth where *.so) {
    say $depth, ' - ', $url;
    my $ua = HTTP::UserAgent.new;

    # HTTP::UserAgent.get is a blocking call that returns the response
    # directly, so no react/whenever wrapper is needed around it.
    my $response = try $ua.get($url);
    without $response {
        note "fetch failed for $url: ", ($! ?? $!.message !! 'no response');
        return;
    }
    unless $response.is-success {
        note "skipping $url: ", $response.status-line;
        return;
    }

    # Parse only the body. Stringifying the response object would also feed
    # the status line and headers to the HTML parser.
    my $html = $response.content;
    return unless $html ~~ Str;    # non-text (binary) body: nothing to parse

    my $dom = DOM::Tiny.parse($html);

    process($dom);

    return if $depth ≤ 1;

    for $dom.find('a[href]') -> $e {
        my $link = resolve-href($url, $e<href>);
        $channel.send($link => $depth - 1) with $link;
    }
}
43
# Per-page hook: receives the parsed DOM of a fetched page.
# Currently a placeholder that does nothing and returns Nil.
sub process($dom) {
    # process here
    return;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment