Basic Perl spider using Selenium and headless Chrome
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env perl | |
use strict; | |
use warnings; | |
use HTTP::Tiny; | |
use Getopt::Long 'GetOptions'; | |
use Encode qw(encode decode); | |
use Parallel::ForkManager; | |
use PerlIO::gzip; | |
use Selenium::Remote::Driver; | |
use Time::HiRes 'sleep'; | |
# --- Command-line options -------------------------------------------------
GetOptions(
    'processes=i' => \(my $max_processes = 2),      # max concurrent children
    'output=s'    => \(my $output_dir    = 'data'), # dir for gzipped page text
    'depth=i'     => \(my $depth         = 1),      # link recursion depth
    'trace'       => \(my $trace         = 0),      # verbose progress on STDERR
) or die 'unrecognized options';

# Fail fast if no Selenium server is listening before we fork any workers.
ping_selenium_server();

# Create the output directory up front so a child's open() cannot fail
# merely because it is missing.
unless (-d $output_dir) {
    mkdir $output_dir or die "cannot create $output_dir: $!";
}

my $pm = Parallel::ForkManager->new($max_processes);

# Read domains one per line from files named on the command line or from
# STDIN (<<>> is the secure double-diamond, Perl 5.22+); spider each domain
# in its own child process.
DOMAIN:
while (my $d = <<>>) {
    my $pid = $pm->start and next DOMAIN;   # parent: move on to next domain
    chomp $d;
    my $path = $d =~ s{/}{__}gr;            # make the domain filename-safe
    print "$d -> $path\n";
    open my $FH, '>:raw:gzip', "$output_dir/$path.gz" or die $!;
    my $driver = Selenium::Remote::Driver->new(
        browser_name => 'chrome',
        # window-size: deterministic rendering; no-sandbox: allow running as
        # root; headless: no GUI; allow-running-insecure-content: load sites
        # with invalid SSL certs; disable-infobars: hide the "Chrome is being
        # controlled by automated software" notification
        extra_capabilities => { chromeOptions => { args => [qw(
            window-size=1920,1080
            no-sandbox
            headless
            allow-running-insecure-content
            disable-infobars
        )]}},
    );
    $driver->{visited} = {};                # per-child visited-URL cache
    my $url = "http://$d";
    spider_site($driver, $url, $depth, $FH);
    $driver->quit();
    # Close explicitly so the gzip layer flushes its footer before the child
    # exits; buffered write errors also surface here rather than being lost.
    close $FH or die $!;
    $pm->finish;
}
$pm->wait_all_children;
# Fetch $url with the Selenium $driver, write the page's rendered text to
# $FH (already gzip-layered by the caller), and recurse into same-domain
# links until $depth is exhausted. Visited URLs are recorded in
# $driver->{visited} so each page is fetched at most once per site.
sub spider_site {
    my ($driver, $url, $depth, $FH) = @_;
    warn "fetching $url\n";
    $driver->get($url);
    $driver->{visited}{$url}++;

    my $host = URI->new($url)->host;
    # some sites have links to the parent domain without www
    $host =~ s/^www\.//;

    my $text = $driver->get_body;
    print $FH encode('UTF-8', $text);

    if ($depth > 0) {
        my @links = $driver->find_elements('a', 'tag_name');
        warn sprintf "found %s links\n", scalar @links if $trace;
        my @urls = ();
        for my $l (@links) {
            # stale-element (and similar) driver errors just skip this link
            my $link_url = eval { $l->get_attribute('href') };
            next unless $link_url;
            my $link_uri = URI->new($link_url);
            next unless $link_uri->can('host'); # not all URIs have a domain
            my $link_host = $link_uri->host;
            # Only visit the starting host or its subdomains. Requiring a
            # leading dot (or start-of-string) fixes the old /\Q$host\E$/
            # check, which also matched unrelated hosts such as
            # "evil-example.com" for host "example.com".
            if ($link_host =~ /(?:\A|\.)\Q$host\E\z/) {
                push @urls, $link_url;
                warn sprintf "included %s\n", $link_url if $trace;
            }
            else {
                warn sprintf "skipped %s (%s) different subdomain to %s (%s)\n",
                    $link_url, $link_host, $url, $host if $trace;
            }
        }
        for my $u (@urls) {
            if ($driver->{visited}{$u}) {
                warn sprintf "already visited, ignoring %s\n", $u if $trace;
            }
            else {
                sleep rand 1;   # small jitter to be polite to the server
                spider_site($driver, $u, $depth - 1, $FH);
            }
        }
    }
    else {
        warn "have reached maximum depth\n" if $trace;
    }
    return;
}
# Poll the local Selenium server's status endpoint until it answers,
# retrying every 0.1s for up to 60 attempts (~6 seconds) before dying.
# Returns true on success; the original version slept one extra 0.1s
# after a successful response because the check only happened at the
# top of the next loop iteration.
sub ping_selenium_server {
    my $ua = HTTP::Tiny->new;
    for my $attempt (1 .. 60) {
        my $res = $ua->get('http://127.0.0.1:4444/wd/hub/status');
        return 1 if $res->{success};    # server is up; no trailing sleep
        sleep 0.1;                      # Time::HiRes sleep: fractional seconds
    }
    die 'unable to find selenium server';
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment