-
-
Save halfer/522b9686138efb31d231fff67a6a0389 to your computer and use it in GitHub Desktop.
Script to examine bugs and behaviours in spatie/crawler (for 2.1, shows the crawler does not do its own de-duping)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php

// Demonstration script for spatie/crawler (v2.1): shows that the crawler
// does not de-duplicate URLs on its own. Crawls a small site and prints
// every URL it fetches, then reports the elapsed time.
declare(strict_types=1);

namespace Proximate;

use GuzzleHttp\Client;
use GuzzleHttp\RequestOptions;
use Spatie\Crawler\Crawler;
use Spatie\Crawler\Url;
use Spatie\Crawler\CrawlObserver;
use Spatie\Crawler\CrawlInternalUrls;

require 'vendor/autoload.php';

// The start URL and the base URL are both the site root: the crawl begins
// there and MyCrawlProfile (below) limits it to the same host.
$url =
$baseUrl =
    'http://ilovephp.jondh.me.uk/';

// @todo We need to add a Guzzle plugin into the client, to make curl/header changes
// Shared HTTP client: cookies on, 10 s connect/read timeouts, follow redirects.
$client = new Client([
    RequestOptions::COOKIES => true,
    RequestOptions::CONNECT_TIMEOUT => 10,
    RequestOptions::TIMEOUT => 10,
    RequestOptions::ALLOW_REDIRECTS => true,
]);
class MyCrawlObserver implements CrawlObserver | |
{ | |
public function willCrawl(Url $url) | |
{ | |
} | |
public function hasBeenCrawled(Url $url, $response, Url $foundOnUrl = null) | |
{ | |
echo sprintf("Crawled URL: %s\n", $url->path()); | |
} | |
public function finishedCrawling() | |
{ | |
} | |
} | |
// @todo Add regex crawl logic here (in shouldCrawl()) | |
class MyCrawlProfile extends CrawlInternalUrls | |
{ | |
public function shouldCrawl(Url $url) : bool | |
{ | |
$isInternal = parent::shouldCrawl($url); | |
// @todo This needs to be generalised | |
$matchesRegex = strpos($url->path(), '/en/tutorial') === 0; | |
$matchesRoot = $url->path() === '/'; | |
$shouldCrawl = | |
$isInternal && | |
($matchesRegex || $matchesRoot); | |
if ($shouldCrawl) | |
{ | |
#echo sprintf("Should crawl %s\n", $url->path()); | |
} | |
return $shouldCrawl; | |
} | |
} | |
// Run the crawl single-threaded and report how long it took.
$startedAt = microtime(true);

$crawler = new Crawler($client);
$crawler
    ->setCrawlProfile(new MyCrawlProfile($baseUrl))
    ->setCrawlObserver(new MyCrawlObserver())
    ->setConcurrency(1)
    ->startCrawling($url);

$elapsed = microtime(true) - $startedAt;
printf("The crawl took %s sec\n", round($elapsed, 1));
Sign up for free to join this conversation on GitHub.
Already have an account? Sign in to comment.