Skip to content

Instantly share code, notes, and snippets.

@kaianuar
Created August 11, 2019 07:23
Show Gist options
  • Save kaianuar/ca1c447dcbd891e57c53b7c0c61c3bbc to your computer and use it in GitHub Desktop.
Save kaianuar/ca1c447dcbd891e57c53b7c0c61c3bbc to your computer and use it in GitHub Desktop.
Use spatie/crawler to crawl a page and retrieve all links within that page.
<?php
namespace App\Http\Crawler;
use Spatie\Crawler\CrawlObserver;
use GuzzleHttp\Exception\RequestException;
use Psr\Http\Message\ResponseInterface;
use Psr\Http\Message\UriInterface;
use Symfony\Component\DomCrawler\Crawler as DomCrawler;
/**
 * Crawl observer that collects the links matched by a CSS selector on every
 * crawled page.
 *
 * Matched links are stored in $pages as absolute URLs (relative hrefs are
 * resolved against the URL of the page they were found on). Requests that
 * fail are recorded in $failures instead of being written to the output.
 */
class CategoryHelper extends CrawlObserver
{
    /** @var string[] Absolute URLs of the links collected so far. */
    public $pages = [];

    /** @var array<string, string> Failure message keyed by the URL that could not be crawled. */
    public $failures = [];

    /** @var string|null CSS selector matching the link elements to collect. */
    protected $selector;

    /** @var string|null URL of the most recently crawled page. */
    protected $url;

    /**
     * Called for every successfully crawled page; collects the matching links.
     *
     * @param UriInterface      $url        URL of the crawled page.
     * @param ResponseInterface $response   Response whose body is parsed as HTML.
     * @param UriInterface|null $foundOnUrl Page on which $url was discovered (unused).
     *
     * @throws \LogicException When no selector has been configured via setSelector().
     */
    public function crawled(UriInterface $url, ResponseInterface $response, ?UriInterface $foundOnUrl = null)
    {
        if ($this->selector === null || $this->selector === '') {
            throw new \LogicException('No CSS selector configured; call setSelector() before crawling.');
        }

        $this->url = (string) $url;

        // Passing the page URL as base URI lets DomCrawler resolve relative
        // hrefs, instead of blindly concatenating page URL and href (which
        // mangled absolute and root-relative links).
        $crawler = new DomCrawler((string) $response->getBody(), $this->url);

        $crawler->filter($this->selector)->each(function (DomCrawler $node) {
            $href = $node->attr('href');
            if ($href === null || trim($href) === '') {
                // Nothing to resolve; skip elements without a usable href.
                return;
            }

            // link() only accepts <a>, <area> and <link> elements and throws
            // for anything else, so other matches are skipped.
            if (!in_array(strtolower($node->nodeName()), ['a', 'area', 'link'], true)) {
                return;
            }

            // hrefs that already carry a scheme are returned as-is by getUri().
            $this->pages[] = $node->link()->getUri();
        });
    }

    /**
     * Called when a request fails; remembers the failure for later inspection.
     *
     * @param UriInterface      $url              URL that could not be crawled.
     * @param RequestException  $requestException Exception describing the failure.
     * @param UriInterface|null $foundOnUrl       Page on which $url was discovered (unused).
     */
    public function crawlFailed(UriInterface $url, RequestException $requestException, ?UriInterface $foundOnUrl = null)
    {
        $this->failures[(string) $url] = $requestException->getMessage();
    }

    /**
     * Called once the crawl has finished; removes duplicate URLs from $pages
     * while keeping the order in which they were first seen.
     */
    public function finishedCrawling()
    {
        $this->pages = array_values(array_unique($this->pages));
    }

    /**
     * Sets the CSS selector used to find link elements on each crawled page.
     *
     * @param string $selector CSS selector, e.g. 'a.a-link-normal'.
     */
    public function setSelector($selector)
    {
        $this->selector = $selector;
    }
}
<?php
use Spatie\Crawler\Crawler;
use App\Http\Crawler\CategoryHelper;
/**
 * Controller that crawls a single page and returns the links found on it.
 */
class CrawlerController extends Controller
{
    /** @var string|null URL the most recent crawl started from. */
    public $root_url;

    /**
     * Crawls one page and returns the links matched by the CSS selector.
     *
     * @param string|null $url Page to crawl; defaults to the Amazon home page
     *                         when omitted or empty.
     *
     * @return string[] URLs collected by the CategoryHelper observer.
     *
     * @throws \InvalidArgumentException When $url is not an http(s) URL.
     */
    public function crawl($url = null)
    {
        // Previously $url was ignored and the root was always hard-coded.
        $target = ($url === null || $url === '') ? 'https://www.amazon.com/' : $url;

        $scheme = is_string($target) ? parse_url($target, PHP_URL_SCHEME) : null;
        if (!is_string($scheme) || !in_array(strtolower($scheme), ['http', 'https'], true)) {
            throw new \InvalidArgumentException('The crawl target must be an absolute http(s) URL.');
        }

        // NOTE(review): if $url can come from an untrusted request, restrict
        // the allowed hosts before crawling to avoid server-side request forgery.
        $this->root_url = $target;

        $cat_helper = new CategoryHelper();
        $cat_helper->setSelector('a.a-link-normal'); // Pass the css selector that contains the links

        Crawler::create()
            ->setCrawlObserver($cat_helper)
            ->setMaximumCrawlCount(1) // only the start page is fetched
            ->ignoreRobots()
            ->startCrawling($this->root_url);

        // Return the result instead of discarding it in an unused local.
        return $cat_helper->pages;
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment