Skip to content

Instantly share code, notes, and snippets.

@dnmfarrell
Created January 14, 2019 01:15
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save dnmfarrell/5dde6d3957bf9ae037e170cdb44f75a5 to your computer and use it in GitHub Desktop.
Save dnmfarrell/5dde6d3957bf9ae037e170cdb44f75a5 to your computer and use it in GitHub Desktop.
Basic Perl spider using Selenium and headless Chrome
#!/usr/bin/env perl
use strict;
use warnings;

use Encode qw(encode decode);
use Getopt::Long 'GetOptions';
use HTTP::Tiny;
use Parallel::ForkManager;
use PerlIO::gzip;
use Selenium::Remote::Driver;
use Time::HiRes 'sleep';
use URI;
# Command-line options:
#   --processes N   maximum number of concurrent crawler processes (default 2)
#   --output DIR    directory receiving one gzip file per input line (default 'data')
#   --depth N       number of link levels to follow from each start page (default 1)
#   --trace         emit verbose diagnostics on STDERR
GetOptions(
    'processes=i' => \(my $max_processes = 2),
    'output=s'    => \(my $output_dir    = 'data'),
    'depth=i'     => \(my $depth         = 1),
    'trace'       => \(my $trace         = 0),
) or die 'unrecognized options';

# Make sure the output directory exists once, in the parent, rather than
# letting every forked child fail separately on open().
unless (-d $output_dir) {
    mkdir $output_dir
        or die "unable to create output directory '$output_dir': $!\n";
}

ping_selenium_server();

my $pm = Parallel::ForkManager->new($max_processes);

# One input line (from the files named on the command line, or STDIN) is one
# domain to crawl; each domain is handled in its own child process.
DOMAIN:
while (my $d = <<>>) {
    chomp $d;

    # A blank line would otherwise be crawled as "http://" and written to ".gz"
    next DOMAIN unless $d =~ /\S/;

    # The parent gets the child's pid back and moves on to the next line;
    # the child gets 0 and falls through to do the crawl.
    $pm->start and next DOMAIN;

    # Slashes cannot appear in a file name, so flatten them
    my $path = $d =~ s{/}{__}gr;
    print "$d -> $path\n";

    my $file = "$output_dir/$path.gz";
    open my $FH, '>:raw:gzip', $file or die "unable to open $file: $!\n";

    my $driver = Selenium::Remote::Driver->new(
        browser_name       => 'chrome',
        extra_capabilities => {
            chromeOptions => {
                # Plain strings rather than qw(): the comma in the window
                # size makes qw() emit a "separate words with commas" warning.
                args => [
                    'window-size=1920,1080',
                    'no-sandbox',                        # allow to be run as root
                    'headless',                          # no GUI
                    'allow-running-insecure-content',    # meant to load websites with invalid ssl certs
                                                         # NOTE(review): confirm this switch has that effect
                    'disable-infobars',                  # disable "chrome is being controlled by software" notification
                ],
            },
        },
    );

    # Per-crawl record of fetched URLs, read and updated by spider_site
    $driver->{visited} = {};

    # Trap a failed crawl so the browser session and the output file are
    # always released; the outcome is reported via the child's exit status.
    my $ok = eval { spider_site($driver, "http://$d", $depth, $FH); 1 };
    warn "crawl of $d failed: $@" unless $ok;

    eval { $driver->quit(); 1 }
        or warn "unable to quit browser session for $d: $@";

    # Buffered (compressed) output is only known to be written once close succeeds
    close $FH or do {
        warn "unable to close $file: $!\n";
        $ok = 0;
    };

    $pm->finish($ok ? 0 : 1);
}
$pm->wait_all_children;
# spider_site($driver, $url, $depth, $FH)
#
# Fetches $url with the Selenium $driver and appends the page's body text,
# encoded as UTF-8, to $FH. While $depth is greater than zero it then
# recurses (with $depth - 1) into every link on the page that has not been
# visited yet and whose host is the page's host (ignoring a leading "www.")
# or a subdomain of it.
#
#   $driver  driver object; $driver->{visited} is used as the hash of URLs
#            already fetched during this crawl
#   $url     absolute URL to fetch
#   $depth   remaining number of link levels to follow
#   $FH      output filehandle
#
# Reads the file-level $trace flag to decide whether to emit diagnostics.
sub spider_site {
    my ($driver, $url, $depth, $FH) = @_;

    warn "fetching $url\n";
    $driver->get($url);
    $driver->{visited}{$url}++;

    my $text = $driver->get_body;
    print {$FH} encode('UTF-8', $text);

    if ($depth <= 0) {
        warn "have reached maximum depth\n" if $trace;
        return;
    }

    # Hostnames are case-insensitive, so all comparisons are done in lower case
    my $host = lc(URI->new($url)->host // '');
    # some sites have links to the parent domain without www
    $host =~ s/\Awww\.//;

    # Without a host to compare against, no link can be judged in scope
    if ($host eq '') {
        warn sprintf "no host in %s, not following links\n", $url if $trace;
        return;
    }

    # A link is in scope when its host IS $host or ends in ".$host". A bare
    # suffix test is not enough: it would accept "notexample.com" when the
    # crawl started at "example.com".
    my $in_scope = qr/(?:\A|\.)\Q$host\E\z/;

    my @links = $driver->find_elements('a', 'tag_name');
    warn sprintf "found %s links\n", scalar @links if $trace;

    my @urls;
    for my $link (@links) {
        # A link whose href lookup throws is simply skipped
        my $link_url = eval { $link->get_attribute('href') };
        next unless $link_url;

        my $link_uri = URI->new($link_url);
        next unless $link_uri->can('host');    # not all URIs have a domain

        # host may be undefined even when the method exists
        my $link_host = lc($link_uri->host // '');

        # only visit links to subdomains of our starting URL
        if ($link_host =~ $in_scope) {
            push @urls, $link_url;
            warn sprintf "included %s\n", $link_url if $trace;
        }
        else {
            warn sprintf "skipped %s (%s) different subdomain to %s (%s)\n",
                $link_url, $link_host, $url, $host
                if $trace;
        }
    }

    for my $u (@urls) {
        # Checked here rather than when collecting, because the recursive
        # calls below keep adding to the visited set.
        if ($driver->{visited}{$u}) {
            warn sprintf "already visited, ignoring %s\n", $u if $trace;
        }
        else {
            sleep rand 1;    # random pause of up to one second between fetches
            spider_site($driver, $u, $depth - 1, $FH);
        }
    }

    return;
}
# ping_selenium_server()
#
# Polls the Selenium status URL on 127.0.0.1:4444 until a request succeeds,
# pausing 0.1s between attempts, and returns as soon as one does. Dies with
# 'unable to find selenium server' once 61 attempts have all failed.
#
# Returning straight from the successful attempt matters: counting the
# attempt before re-testing for success would report a failure even when the
# final probe was answered.
sub ping_selenium_server {
    my $max_attempts = 61;
    my $ua           = HTTP::Tiny->new;

    for my $attempt (1 .. $max_attempts) {
        my $res = $ua->get('http://127.0.0.1:4444/wd/hub/status');
        return if $res->{success};

        # Fractional sleep relies on the file importing Time::HiRes' sleep;
        # there is no point waiting after the last attempt.
        sleep 0.1 if $attempt < $max_attempts;
    }

    die 'unable to find selenium server';
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment