-
-
Save Bowlslaw/fb16404d5cff5b34b5e4d73a23ebde86 to your computer and use it in GitHub Desktop.
multi-threaded crawler
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
1 #!/usr/bin/env perl6 | |
2 | |
3 use v6; | |
4 | |
5 use DOM::Tiny; | |
6 use HTTP::UserAgent; | |
7 | |
# Shared work queue. Every item placed on it is a Pair of the form
# URL => remaining crawl depth.
my $channel = Channel.new();
9 | |
# Entry point: crawl outward from a seed URL using a pool of worker threads.
#
#   :$seed    - URL the crawl starts from.
#   :$depth   - depth value queued together with the seed URL.
#   :$workers - number of concurrent workers to start.
#
# Each worker repeatedly takes a (URL => depth) Pair from the shared channel
# and hands it to crawl(). The program ends once the queue is empty AND no
# worker is still crawling (a busy worker may still enqueue more links).
sub MAIN(Str :$seed = "http://perl6.org", Int :$depth = 4, Int :$workers = 4) {
    $channel.send: $seed => $depth;

    # $busy counts workers currently inside crawl(). It is only touched while
    # holding $lock, so "queue empty and nobody busy" is decided atomically.
    my $lock = Lock.new;
    my $busy = 0;

    my @pool = do for ^$workers {
        start {
            loop {
                my $job;
                my $finished = False;
                $lock.protect({
                    # Non-blocking take: an empty queue leaves $job undefined.
                    $job = $channel.poll;
                    if    $job.defined { $busy++ }
                    elsif $busy == 0   { $finished = True }
                });
                last if $finished;

                # Queue is empty but another worker may still add links.
                unless $job.defined {
                    sleep 0.05;
                    next;
                }

                say $*THREAD;
                {
                    crawl $job.key, $job.value;
                    # A failing page must not take the whole worker down.
                    CATCH {
                        default { note "crawl of {$job.key} failed: {.message}" }
                    }
                }
                $lock.protect({ $busy-- });
            }
        }
    }

    # Wait for every worker to drain out, then retire the queue.
    await @pool;
    $channel.close;
}
21 | |
# Turn a link found on the page at $base into an absolute http(s) URL.
# Returns Nil for links that cannot be crawled (empty, fragment-only, or a
# non-HTTP scheme such as mailto: or javascript:).
# Dot segments ("./", "../") are left as they are, not normalised.
sub resolve-link(Str() $base, Str() $link) {
    my $href = $link.trim;

    # Already absolute.
    return $href if $href ~~ /:i ^ https? '://'/;

    # Nothing to fetch for these.
    return Nil if $href eq ''
               || $href.starts-with('#')
               || $href ~~ /^ <[a..zA..Z]> <[a..zA..Z0..9+.\-]>* ':'/;

    # Split the base into scheme, //authority and path (query/fragment dropped).
    return Nil unless $base ~~ /:i ^ $<scheme>=[ https? ':' ]
                                     $<authority>=[ '//' <-[\/?\#]>* ]
                                     $<path>=[ <-[?\#]>* ]/;
    my ($scheme, $authority, $path) = ~$<scheme>, ~$<authority>, ~$<path>;

    # Protocol-relative: //host/path
    return $scheme ~ $href if $href.starts-with('//');

    # Host-relative: /path
    return $scheme ~ $authority ~ $href if $href.starts-with('/');

    # Document-relative: replace the last segment of the base path.
    my $dir = $path.contains('/') ?? $path.subst(/ <-[\/]>* $/, '') !! '/';
    return $scheme ~ $authority ~ $dir ~ $href;
}

# Fetch one page, hand its DOM to process(), and queue the links it contains.
#
#   $url   - address of the page to fetch.
#   $depth - remaining crawl depth; must be truthy (non-zero). Links are only
#            queued when $depth is greater than 1, each with $depth - 1.
#
# Prints "<depth> - <url>" before fetching.
sub crawl($url, $depth where *.so) {
    say $depth, ' - ', $url;
    my $ua = HTTP::UserAgent.new;
    react {
        whenever $ua.get($url) -> $response {
            # NOTE(review): ~$response stringifies the whole response object;
            # confirm whether that includes headers and, if so, parse only
            # the body instead.
            my $dom = DOM::Tiny.parse(~$response);

            process $dom;

            # Plain conditional rather than a `return` out of the react block.
            if $depth > 1 {
                for $dom.find('a[href]') -> $e {
                    with resolve-link($url, $e<href>) -> $target {
                        $channel.send($target => $depth - 1);
                    }
                }
            }
        }
    }
}
43 | |
# Hook called by the crawler with the parsed DOM of each fetched page.
# Placeholder: it currently does nothing with $dom.
sub process($dom) {
    # Intentionally empty; page-specific handling belongs here.
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment