<?php

namespace App\Http\Controllers;

use App\Observers\CustomCrawlerObserver;
use GuzzleHttp\RequestOptions;
use Spatie\Crawler\Crawler;
use Spatie\Crawler\CrawlProfiles\CrawlInternalUrls;

class CustomCrawlerController extends Controller
{
    /**
     * Crawl the website content.
     *
     * @return bool
     */
    public function fetchContent()
    {
        // Configure and start the crawler.
        Crawler::create([
                RequestOptions::ALLOW_REDIRECTS => true,
                RequestOptions::TIMEOUT => 30,
            ])
            ->acceptNofollowLinks()
            ->ignoreRobots()
            // ->setParseableMimeTypes(['text/html', 'text/plain'])
            ->setCrawlObserver(new CustomCrawlerObserver())
            ->setCrawlProfile(new CrawlInternalUrls('https://www.lipsum.com')) // Only follow links on the start domain.
            ->setMaximumResponseSize(1024 * 1024 * 2) // Skip responses larger than 2 MB.
            ->setTotalCrawlLimit(100) // Crawl at most 100 URLs in total.
            // ->setConcurrency(1) // Uncomment to crawl URLs one at a time.
            ->setDelayBetweenRequests(100) // Wait 100 ms between requests.
            ->startCrawling('https://www.lipsum.com');

        return true;
    }
}
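The controller depends on an App\Observers\CustomCrawlerObserver class that is not included in this gist. Below is a minimal sketch of such an observer, assuming spatie/crawler v6 (the abstract method signatures differ between major versions, e.g. newer releases add a nullable $linkText parameter, so adjust to the installed release). It only logs each crawl event; the crawled() method is where you would persist or process page content instead.

<?php

namespace App\Observers;

use GuzzleHttp\Exception\RequestException;
use Illuminate\Support\Facades\Log;
use Psr\Http\Message\ResponseInterface;
use Psr\Http\Message\UriInterface;
use Spatie\Crawler\CrawlObservers\CrawlObserver;

class CustomCrawlerObserver extends CrawlObserver
{
    // Called just before the crawler requests the given URL.
    public function willCrawl(UriInterface $url): void
    {
        Log::info('Crawling: ' . $url);
    }

    // Called when the given URL has been crawled successfully.
    public function crawled(
        UriInterface $url,
        ResponseInterface $response,
        ?UriInterface $foundOnUrl = null
    ): void {
        // Persist or parse $response->getBody() here.
        Log::info('Crawled: ' . $url);
    }

    // Called when the request for the given URL failed.
    public function crawlFailed(
        UriInterface $url,
        RequestException $requestException,
        ?UriInterface $foundOnUrl = null
    ): void {
        Log::error('Failed: ' . $url . ' (' . $requestException->getMessage() . ')');
    }

    // Called once the whole crawl has finished.
    public function finishedCrawling(): void
    {
        Log::info('Finished crawling.');
    }
}

For completeness, a hypothetical route to trigger the crawl; the URI is an assumption, not part of the gist:

// routes/web.php
use App\Http\Controllers\CustomCrawlerController;
use Illuminate\Support\Facades\Route;

Route::get('/crawl', [CustomCrawlerController::class, 'fetchContent']);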