Skip to content

Instantly share code, notes, and snippets.

@halfer
Created April 5, 2017 19:45
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save halfer/522b9686138efb31d231fff67a6a0389 to your computer and use it in GitHub Desktop.
Save halfer/522b9686138efb31d231fff67a6a0389 to your computer and use it in GitHub Desktop.
Script to examine bugs and behaviours in spatie/crawler (for 2.1, shows the crawler does not do its own de-duping)
<?php
namespace Proximate;
use GuzzleHttp\Client;
use GuzzleHttp\RequestOptions;
use Spatie\Crawler\Crawler;
use Spatie\Crawler\Url;
use Spatie\Crawler\CrawlObserver;
use Spatie\Crawler\CrawlInternalUrls;
require 'vendor/autoload.php';
// The crawl starts at the site root, and the same address is handed to the
// crawl profile as its base URL, so both variables hold one value.
$baseUrl = 'http://ilovephp.jondh.me.uk/';
$url = $baseUrl;

// @todo We need to add a Guzzle plugin into the client, to make curl/header changes
$client = new Client([
    // Keep cookies between requests made through this client
    RequestOptions::COOKIES => true,
    // Give up if a connection cannot be established within 10 seconds
    RequestOptions::CONNECT_TIMEOUT => 10,
    // Give up if a whole request takes longer than 10 seconds
    RequestOptions::TIMEOUT => 10,
    // Follow HTTP redirects using Guzzle's default redirect behaviour
    RequestOptions::ALLOW_REDIRECTS => true,
]);
/**
 * Crawl observer that reports each crawled URL's path on standard output.
 *
 * Only hasBeenCrawled() does any work; the other two hooks are required by
 * the CrawlObserver interface and are deliberately left empty.
 */
class MyCrawlObserver implements CrawlObserver
{
    /**
     * Hook for a URL that is about to be crawled (presumably called by the
     * crawler before the request is made). Nothing to do here.
     *
     * @param Url $url
     */
    public function willCrawl(Url $url)
    {
    }

    /**
     * Prints the path of a URL once the crawler reports it as crawled.
     *
     * Duplicate lines in the output indicate that the same path was
     * reported more than once.
     *
     * @param Url      $url         The URL that was crawled
     * @param mixed    $response    Response for the URL (unused)
     * @param Url|null $foundOnUrl  Page the link was found on, if any (unused)
     */
    public function hasBeenCrawled(Url $url, $response, Url $foundOnUrl = null)
    {
        printf("Crawled URL: %s\n", $url->path());
    }

    /**
     * Hook for the end of the crawl. Nothing to do here.
     */
    public function finishedCrawling()
    {
    }
}
// @todo Add regex crawl logic here (in shouldCrawl())
/**
 * Crawl profile that narrows the parent's decision to the site root and the
 * "/en/tutorial" section.
 *
 * @todo Add regex crawl logic here (in shouldCrawl())
 */
class MyCrawlProfile extends CrawlInternalUrls
{
    /**
     * Decides whether the crawler should fetch the given URL.
     *
     * A URL is accepted only when the parent profile accepts it (presumably
     * meaning it is internal to the base URL) and its path is either exactly
     * "/" or starts with "/en/tutorial".
     *
     * @param Url $url Candidate URL
     * @return bool True when the URL should be crawled
     */
    public function shouldCrawl(Url $url) : bool
    {
        $acceptedByParent = parent::shouldCrawl($url);

        // @todo This needs to be generalised
        // strpos() === 0 means the path begins with the prefix
        $isTutorialPage = strpos($url->path(), '/en/tutorial') === 0;
        $isRootPage = $url->path() === '/';

        if (!$acceptedByParent) {
            return false;
        }

        return $isTutorialPage || $isRootPage;
    }
}
// Time the whole crawl using wall-clock seconds
$startedAt = microtime(true);

// Concurrency is set to 1 so that only one request is in flight at a time
$crawler = new Crawler($client);
$crawler
    ->setCrawlProfile(new MyCrawlProfile($baseUrl))
    ->setCrawlObserver(new MyCrawlObserver())
    ->setConcurrency(1)
    ->startCrawling($url);

// Report the elapsed time, rounded to one decimal place
$elapsedSeconds = microtime(true) - $startedAt;
printf("The crawl took %s sec\n", round($elapsedSeconds, 1));
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment