-
-
Save halfer/522b9686138efb31d231fff67a6a0389 to your computer and use it in GitHub Desktop.
Script to examine bugs and behaviours in spatie/crawler (for 2.1, shows the crawler does not do its own de-duping)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php

// Demonstration script for spatie/crawler (v2.1): shows that the crawler
// does not de-duplicate URLs on its own. Crawls a small site and prints
// every URL it fetches, then reports the elapsed time.
declare(strict_types=1);

namespace Proximate;

use GuzzleHttp\Client;
use GuzzleHttp\RequestOptions;
use Spatie\Crawler\Crawler;
use Spatie\Crawler\Url;
use Spatie\Crawler\CrawlObserver;
use Spatie\Crawler\CrawlInternalUrls;

require 'vendor/autoload.php';

// The start URL and the base URL are both the site root: the crawl begins
// there and MyCrawlProfile (below) limits it to the same host.
$url =
$baseUrl =
    'http://ilovephp.jondh.me.uk/';

// @todo We need to add a Guzzle plugin into the client, to make curl/header changes
// Shared HTTP client: cookies on, 10 s connect/read timeouts, follow redirects.
$client = new Client([
    RequestOptions::COOKIES => true,
    RequestOptions::CONNECT_TIMEOUT => 10,
    RequestOptions::TIMEOUT => 10,
    RequestOptions::ALLOW_REDIRECTS => true,
]);
class MyCrawlObserver implements CrawlObserver | |
{ | |
public function willCrawl(Url $url) | |
{ | |
} | |
public function hasBeenCrawled(Url $url, $response, Url $foundOnUrl = null) | |
{ | |
echo sprintf("Crawled URL: %s\n", $url->path()); | |
} | |
public function finishedCrawling() | |
{ | |
} | |
} | |
// @todo Add regex crawl logic here (in shouldCrawl()) | |
class MyCrawlProfile extends CrawlInternalUrls | |
{ | |
public function shouldCrawl(Url $url) : bool | |
{ | |
$isInternal = parent::shouldCrawl($url); | |
// @todo This needs to be generalised | |
$matchesRegex = strpos($url->path(), '/en/tutorial') === 0; | |
$matchesRoot = $url->path() === '/'; | |
$shouldCrawl = | |
$isInternal && | |
($matchesRegex || $matchesRoot); | |
if ($shouldCrawl) | |
{ | |
#echo sprintf("Should crawl %s\n", $url->path()); | |
} | |
return $shouldCrawl; | |
} | |
} | |
// Run the crawl single-threaded and report how long it took.
$startedAt = microtime(true);

$crawler = new Crawler($client);
$crawler
    ->setCrawlProfile(new MyCrawlProfile($baseUrl))
    ->setCrawlObserver(new MyCrawlObserver())
    ->setConcurrency(1)
    ->startCrawling($url);

$elapsed = microtime(true) - $startedAt;
printf("The crawl took %s sec\n", round($elapsed, 1));
Sign up for free to join this conversation on GitHub.
Already have an account? Sign in to comment.