Created
January 7, 2022 07:13
-
-
Save anlisha-maharjan/cc5eab50624907ed267c1ce5525607c4 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php | |
namespace App\Observers; | |
use DOMDocument; | |
use Spatie\Crawler\CrawlObservers\CrawlObserver; | |
use Psr\Http\Message\UriInterface; | |
use Psr\Http\Message\ResponseInterface; | |
use GuzzleHttp\Exception\RequestException; | |
use Illuminate\Support\Facades\Log; | |
/**
 * Crawl observer that accumulates the visible text of every crawled page.
 *
 * For each successfully crawled response the HTML body is reduced to plain
 * text (scripts, styles and tags removed, entities decoded, whitespace
 * collapsed) and appended to an internal buffer. Crawl start, failure and
 * completion are written to the application log.
 */
class CustomCrawlerObserver extends CrawlObserver
{
    /**
     * Plain text collected so far; stays null until a page has been appended.
     *
     * @var string|null
     */
    private $content;

    public function __construct()
    {
        $this->content = null;
    }

    /**
     * Called when the crawler will crawl the url.
     *
     * @param \Psr\Http\Message\UriInterface $url
     */
    public function willCrawl(UriInterface $url): void
    {
        // Log the string form of the URI rather than the object itself.
        Log::info('willCrawl', ['url' => (string) $url]);
    }

    /**
     * Called when the crawler has crawled the given url successfully.
     *
     * Appends the plain-text rendition of the response body to the buffer.
     * Responses with an empty (or whitespace-only) body are skipped.
     *
     * @param \Psr\Http\Message\UriInterface      $url
     * @param \Psr\Http\Message\ResponseInterface $response
     * @param \Psr\Http\Message\UriInterface|null $foundOnUrl
     */
    public function crawled(
        UriInterface $url,
        ResponseInterface $response,
        ?UriInterface $foundOnUrl = null
    ): void {
        // getBody() returns a stream object; DOMDocument needs a string.
        $body = (string) $response->getBody();

        // DOMDocument::loadHTML() rejects empty input (ValueError on PHP 8,
        // which the former "@" operator could not silence), so bail out early.
        if (trim($body) === '') {
            return;
        }

        $this->content .= $this->extractText($body);
    }

    /**
     * Called when the crawler had a problem crawling the given url.
     *
     * @param \Psr\Http\Message\UriInterface         $url
     * @param \GuzzleHttp\Exception\RequestException $requestException
     * @param \Psr\Http\Message\UriInterface|null    $foundOnUrl
     */
    public function crawlFailed(
        UriInterface $url,
        RequestException $requestException,
        ?UriInterface $foundOnUrl = null
    ): void {
        Log::error('crawlFailed', [
            'url' => (string) $url,
            'error' => $requestException->getMessage(),
        ]);
    }

    /**
     * Called when the crawl has ended.
     */
    public function finishedCrawling(): void
    {
        Log::info("finishedCrawling");
        //# store $this->content in DB
        //# Add logic here
    }

    /**
     * Reduce an HTML document to its visible text.
     *
     * @param string $html Non-empty HTML markup.
     *
     * @return string Text with tags removed, entities decoded and whitespace
     *                runs collapsed to single spaces; '' if the markup could
     *                not be parsed.
     */
    private function extractText(string $html): string
    {
        $doc = new DOMDocument();

        // Collect libxml parse warnings internally instead of suppressing
        // them with "@"; restore the caller's setting afterwards.
        $previous = libxml_use_internal_errors(true);
        try {
            $loaded = $doc->loadHTML($html);
        } finally {
            libxml_clear_errors();
            libxml_use_internal_errors($previous);
        }

        if ($loaded === false) {
            return '';
        }

        $markup = $doc->saveHTML();
        if ($markup === false) {
            return '';
        }

        // Convert to UTF-8, keeping the markup untouched if detection or
        // conversion fails.
        $encoding = mb_detect_encoding($markup, ['UTF-8', 'ISO-8859-1'], true);
        if ($encoding === false) {
            $encoding = 'UTF-8';
        }
        $converted = mb_convert_encoding($markup, 'UTF-8', $encoding);
        if ($converted !== false) {
            $markup = $converted;
        }

        // Strip all javascript, then all style blocks.
        $markup = $this->replacePattern('/<script\b[^>]*>(.*?)<\/script>/is', '', $markup);
        $markup = $this->replacePattern('/<style\b[^>]*>(.*?)<\/style>/is', '', $markup);

        // Put a space before every tag so that text of adjacent elements does
        // not run together once the tags are removed.
        $text = strip_tags(str_replace('<', ' <', $markup));

        // Decode entities before normalising whitespace, so that &nbsp; (which
        // decodes to U+00A0) is collapsed like any other whitespace. Flags and
        // charset are explicit because the defaults differ between PHP versions.
        $text = html_entity_decode($text, ENT_QUOTES | ENT_HTML5, 'UTF-8');

        // Collapse whitespace, line breaks and non-breaking spaces.
        $collapsed = preg_replace('/[\s\x{00A0}]+/u', ' ', $text);
        if ($collapsed === null) {
            // The /u pattern fails on invalid UTF-8; fall back to byte-wise matching.
            $collapsed = $this->replacePattern('/\s+/S', ' ', $text);
        }

        return $collapsed;
    }

    /**
     * preg_replace() wrapper that returns the unmodified subject when PCRE
     * reports an error (preg_replace() yields null in that case).
     *
     * @param string $pattern
     * @param string $replacement
     * @param string $subject
     *
     * @return string
     */
    private function replacePattern(string $pattern, string $replacement, string $subject): string
    {
        $result = preg_replace($pattern, $replacement, $subject);

        return $result === null ? $subject : $result;
    }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment