Skip to content

Instantly share code, notes, and snippets.

@anlisha-maharjan
Created January 7, 2022 07:13
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save anlisha-maharjan/cc5eab50624907ed267c1ce5525607c4 to your computer and use it in GitHub Desktop.
Save anlisha-maharjan/cc5eab50624907ed267c1ce5525607c4 to your computer and use it in GitHub Desktop.
<?php
namespace App\Observers;
use DOMDocument;
use Spatie\Crawler\CrawlObservers\CrawlObserver;
use Psr\Http\Message\UriInterface;
use Psr\Http\Message\ResponseInterface;
use GuzzleHttp\Exception\RequestException;
use Illuminate\Support\Facades\Log;
/**
 * Crawl observer that reduces every successfully crawled page to plain text
 * and accumulates the text of all pages in memory.
 *
 * Crawl progress and failures are reported through the Laravel log.
 *
 * NOTE(review): all hook methods are declared `: void`. That is compatible with
 * a parent that declares no return type and required if the parent declares
 * `void` — confirm against the installed spatie/crawler version.
 */
class CustomCrawlerObserver extends CrawlObserver
{
    /**
     * Plain text gathered so far; stays null until a page yields some text.
     *
     * @var string|null
     */
    private $content;

    public function __construct()
    {
        $this->content = null;
    }

    /**
     * Called when the crawler will crawl the url.
     *
     * @param \Psr\Http\Message\UriInterface $url
     */
    public function willCrawl(UriInterface $url): void
    {
        // Cast so the log line carries the URL text regardless of how the logger normalises objects.
        Log::info('willCrawl', ['url' => (string) $url]);
    }

    /**
     * Called when the crawler has crawled the given url successfully.
     *
     * Converts the response body to plain text and appends it to the
     * accumulated content. Pages that yield no text are ignored.
     *
     * @param \Psr\Http\Message\UriInterface $url
     * @param \Psr\Http\Message\ResponseInterface $response
     * @param \Psr\Http\Message\UriInterface|null $foundOnUrl
     */
    public function crawled(
        UriInterface $url,
        ResponseInterface $response,
        ?UriInterface $foundOnUrl = null
    ): void {
        $text = $this->extractText((string) $response->getBody());
        if ($text === '') {
            return;
        }

        // Join pages with an explicit space so the last word of one page
        // never runs into the first word of the next.
        $this->content = $this->content === null
            ? $text
            : $this->content . ' ' . $text;
    }

    /**
     * Called when the crawler had a problem crawling the given url.
     *
     * @param \Psr\Http\Message\UriInterface $url
     * @param \GuzzleHttp\Exception\RequestException $requestException
     * @param \Psr\Http\Message\UriInterface|null $foundOnUrl
     */
    public function crawlFailed(
        UriInterface $url,
        RequestException $requestException,
        ?UriInterface $foundOnUrl = null
    ): void {
        Log::error('crawlFailed', [
            'url' => (string) $url,
            'error' => $requestException->getMessage(),
        ]);
    }

    /**
     * Called when the crawl has ended.
     */
    public function finishedCrawling(): void
    {
        Log::info("finishedCrawling");
        //# store $this->content in DB
        //# Add logic here
    }

    /**
     * Reduce an HTML document to whitespace-normalised plain text.
     *
     * @param string $html Raw response body.
     *
     * @return string Visible text with single spaces between words, or an
     *                empty string when the body is blank or cannot be parsed.
     */
    private function extractText(string $html): string
    {
        // DOMDocument::loadHTML() throws a ValueError on an empty string in
        // PHP 8, which the "@" operator cannot silence.
        if (trim($html) === '') {
            return '';
        }

        // Collect libxml parse warnings internally instead of suppressing with "@",
        // and restore the caller's libxml error mode afterwards.
        $previousMode = libxml_use_internal_errors(true);
        try {
            $doc = new DOMDocument();
            $markup = $doc->loadHTML($html) ? $doc->saveHTML() : false;
        } finally {
            libxml_clear_errors();
            libxml_use_internal_errors($previousMode);
        }

        if ($markup === false) {
            return '';
        }

        // Normalise to UTF-8; detection can fail, in which case the markup is left untouched.
        $encoding = mb_detect_encoding($markup, 'UTF-8, ISO-8859-1', true);
        if ($encoding !== false && $encoding !== 'UTF-8') {
            $converted = mb_convert_encoding($markup, 'UTF-8', $encoding);
            if (is_string($converted)) {
                $markup = $converted;
            }
        }

        // Drop <script> and <style> elements together with their contents.
        // preg_replace() returns null on PCRE failure (e.g. backtrack limit on
        // a very large page); keep the input then rather than lose the page.
        $patterns = [
            '/<script\b[^>]*>(.*?)<\/script>/is',
            '/<style\b[^>]*>(.*?)<\/style>/is',
        ];
        foreach ($patterns as $pattern) {
            $markup = preg_replace($pattern, '', $markup) ?? $markup;
        }

        // Pad every tag with a space so adjacent elements ("<p>a</p><p>b</p>")
        // do not fuse into "ab" once the tags are removed.
        $text = strip_tags(str_replace('<', ' <', $markup));

        // Decode entities (ö is serialised as &ouml;) before collapsing
        // whitespace, so whitespace produced by entities is normalised as well.
        $text = html_entity_decode($text, ENT_QUOTES | ENT_HTML5, 'UTF-8');

        // Non-breaking spaces (from &nbsp;) are not matched by \s in byte mode.
        $text = str_replace("\u{00A0}", ' ', $text);

        // Collapse runs of white space and line breaks into single spaces.
        $text = preg_replace('/\s+/S', ' ', $text) ?? $text;

        return trim($text);
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment