Skip to content

Instantly share code, notes, and snippets.

@anlisha-maharjan
Last active January 7, 2022 08:49
Show Gist options
  • Save anlisha-maharjan/b2a67fcb968e7c881d68308c82aedf19 to your computer and use it in GitHub Desktop.
<?php
namespace App\Http\Controllers;
use Illuminate\Http\Request;
use App\Observers\CustomCrawlerObserver;
use Spatie\Crawler\CrawlProfiles\CrawlInternalUrls;
use Spatie\Crawler\Crawler;
use App\Http\Controllers\Controller;
use GuzzleHttp\RequestOptions;
class CustomCrawlerController extends Controller {

    /** Site crawled when the caller does not supply a URL (original hard-coded target). */
    private const DEFAULT_URL = 'https://www.lipsum.com';

    public function __construct() {}

    /**
     * Crawl a website and hand every fetched page to CustomCrawlerObserver.
     *
     * The crawl is restricted to internal links of the starting host, ignores
     * robots.txt, follows nofollow links, and stops after 100 URLs.
     *
     * @param string $url Base URL to start crawling from. Defaults to the
     *                    previously hard-coded site, so existing callers are unaffected.
     * @return bool Always true once crawling has finished.
     */
    public function fetchContent(string $url = self::DEFAULT_URL): bool {
        // Follow redirects and cap each request at 30s so one slow host
        // cannot stall the crawl indefinitely.
        Crawler::create([
            RequestOptions::ALLOW_REDIRECTS => true,
            RequestOptions::TIMEOUT => 30,
        ])
            ->acceptNofollowLinks()
            ->ignoreRobots()
            ->setCrawlObserver(new CustomCrawlerObserver())
            // Only URLs on the starting host are queued; external links are skipped.
            ->setCrawlProfile(new CrawlInternalUrls($url))
            ->setMaximumResponseSize(1024 * 1024 * 2) // skip response bodies larger than 2 MB
            ->setTotalCrawlLimit(100)                  // hard cap on the number of URLs crawled
            ->setDelayBetweenRequests(100)             // pause (ms) between requests to be polite
            ->startCrawling($url);

        return true;
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment