Skip to content

Instantly share code, notes, and snippets.

@kicken
Created February 14, 2021 20:03
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save kicken/3c470ade3694b076ae46f00e18c1e310 to your computer and use it in GitHub Desktop.
Save kicken/3c470ade3694b076ae46f00e18c1e310 to your computer and use it in GitHub Desktop.
Simple Crawler Example
<?php
/**
 * Minimal breadth-first web crawler.
 *
 * Starting from a single URL, it downloads each page, parses it as HTML and
 * feeds every link reported by the LinkExtractor back into a LinkQueue until
 * the queue runs dry. Progress is reported on standard output.
 */
class Crawler
{
    /** @var LinkQueue|null Pending URLs; a fresh queue is created by every crawl() call. */
    private $linkQueue;

    /** @var LinkExtractor Turns a parsed document into a list of URLs to follow. */
    private $linkExtractor;

    public function __construct()
    {
        $this->linkExtractor = new LinkExtractor();
    }

    /**
     * Crawl everything reachable from the given URL.
     *
     * @param string $startingUrl Absolute URL of the first page to fetch.
     * @return void
     */
    public function crawl($startingUrl)
    {
        $this->linkQueue = new LinkQueue();
        $this->linkQueue->enqueue($startingUrl);
        $this->run();
    }

    /**
     * Drain the queue, fetching each URL and enqueueing the links it contains.
     *
     * @return void
     */
    private function run()
    {
        while (!$this->linkQueue->isEmpty()) {
            $url = $this->linkQueue->dequeue();
            echo 'Crawling URL ', $url, PHP_EOL;

            $dom = $this->loadUrl($url);
            if ($dom === null) {
                // Nothing usable came back; move on to the next URL.
                continue;
            }

            foreach ($this->linkExtractor->getLinks($dom, $url) as $newUrl) {
                $this->linkQueue->enqueue($newUrl);
            }
        }
    }

    /**
     * Download a URL and parse the response body as HTML.
     *
     * @param string $url Location to fetch.
     * @return DOMDocument|null Parsed document, or null when the download
     *                          failed, the body was empty, or parsing failed.
     */
    private function loadUrl($url)
    {
        $context = stream_context_create([
            'http' => [
                'method' => 'GET',
                'header' => "User-Agent: imagimediaBot/0.1\n",
            ],
        ]);

        $html = file_get_contents($url, false, $context);

        // An empty body must be rejected here: DOMDocument::loadHTML('')
        // throws a ValueError on PHP 8, which would abort the whole crawl.
        if ($html === false || $html === '') {
            return null;
        }

        // Real-world markup is rarely valid; collect parser complaints
        // internally instead of emitting a warning for each one, then put the
        // libxml setting back exactly as the caller had it.
        $previousSetting = libxml_use_internal_errors(true);
        $dom = new DOMDocument();
        $parsed = $dom->loadHTML($html);
        libxml_clear_errors();
        libxml_use_internal_errors($previousSetting);

        return $parsed ? $dom : null;
    }
}
<?php
/**
 * Collects the followable same-host links from a parsed HTML document.
 */
class LinkExtractor
{
    /**
     * Return an absolute URL for every acceptable <a href> in the document.
     *
     * A link is accepted when it is not marked rel="nofollow", is not empty or
     * a pure fragment, uses http/https (or no scheme), and does not point at a
     * different host than the base URL.
     *
     * @param DOMDocument $dom     Parsed page.
     * @param string      $baseUrl Absolute URL the page was loaded from; used
     *                             to resolve relative links and to decide
     *                             which host counts as "same site".
     * @return string[] Absolute URLs in document order (duplicates included).
     *                  Empty when $baseUrl has no scheme or host.
     */
    public function getLinks(DOMDocument $dom, $baseUrl)
    {
        $base = parse_url($baseUrl);
        if (!is_array($base) || !isset($base['scheme'], $base['host'])) {
            // Without a scheme and host there is nothing to resolve against.
            return [];
        }

        $baseScheme = $base['scheme'];
        $baseHost = $base['host'];
        // Keep a non-default port so resolved links stay on the same server.
        $baseAuthority = $baseHost . (isset($base['port']) ? ':' . $base['port'] : '');
        // "https://example.com" has no path component at all; treat it as "/".
        $basePath = (isset($base['path']) && $base['path'] !== '') ? $base['path'] : '/';
        if ($basePath[0] !== '/') {
            $basePath = '/' . $basePath;
        }

        $linkList = [];
        foreach ($dom->getElementsByTagName('a') as $anchor) {
            if (!$this->isValid($anchor, $baseHost)) {
                continue;
            }
            $href = trim($anchor->getAttribute('href'));
            $linkList[] = $this->resolveUrl($href, $baseAuthority, $baseScheme, $basePath);
        }

        return $linkList;
    }

    /**
     * Decide whether an anchor should be followed.
     *
     * @param DOMElement $anchor The <a> element under inspection.
     * @param string     $host   Host name of the page being crawled.
     * @return bool
     */
    private function isValid($anchor, $host)
    {
        if (!$anchor->hasAttribute('href')) {
            return false;
        }

        // Only "nofollow" disqualifies a link. Any other rel value (noopener,
        // external, ...) must still go through the scheme and host checks.
        if ($anchor->hasAttribute('rel') && stripos($anchor->getAttribute('rel'), 'nofollow') !== false) {
            return false;
        }

        $href = trim($anchor->getAttribute('href'));
        if ($href === '' || $href[0] === '#') {
            return false;
        }

        $hasAuthority = substr($href, 0, 2) === '//';
        if (preg_match('@^([a-z][a-z0-9+.-]*):(//)?@i', $href, $matches)) {
            // Anything with a scheme must be http(s)://...; this rejects
            // mailto:, javascript:, tel:, ftp:// and similar.
            if (!in_array(strtolower($matches[1]), ['http', 'https'], true) || !isset($matches[2])) {
                return false;
            }
            $hasAuthority = true;
        }

        if ($hasAuthority) {
            // Absolute and protocol-relative links carry their own host.
            $hrefHost = parse_url($href, PHP_URL_HOST);
            if (!is_string($hrefHost) || strcasecmp($hrefHost, $host) !== 0) {
                return false;
            }
        }

        return true;
    }

    /**
     * Turn an href into an absolute URL.
     *
     * @param string $target Href value (already validated by isValid()).
     * @param string $host   Authority of the base URL (host, plus ":port" if any).
     * @param string $scheme Scheme of the base URL.
     * @param string $path   Path of the base URL; must start with "/".
     * @return string Absolute URL.
     */
    private function resolveUrl($target, $host, $scheme, $path)
    {
        if (preg_match('@^https?://@i', $target)) {
            return $target;
        }
        if (substr($target, 0, 2) === '//') {
            return $scheme . ':' . $target;
        }

        $origin = $scheme . '://' . $host;
        if (substr($target, 0, 1) === '/') {
            return $origin . $target;
        }

        // Separate the path from any query/fragment so that dots inside the
        // query string are never mistaken for path segments.
        $splitAt = strcspn($target, '?#');
        $relativePath = (string) substr($target, 0, $splitAt);
        $suffix = (string) substr($target, $splitAt);

        if ($relativePath === '') {
            // A query-only reference ("?page=2") applies to the base document.
            return $origin . $path . $suffix;
        }

        // Relative paths are resolved against the directory of the base path.
        $directory = substr($path, 0, strrpos($path, '/') + 1);

        return $origin . $this->removeDotSegments($directory . $relativePath) . $suffix;
    }

    /**
     * Collapse "." and ".." segments in an absolute path.
     *
     * @param string $path Path starting with "/".
     * @return string Equivalent path without dot segments; never climbs above "/".
     */
    private function removeDotSegments($path)
    {
        $segments = explode('/', (string) substr($path, 1));
        $lastIndex = count($segments) - 1;

        $resolved = [];
        foreach ($segments as $index => $segment) {
            if ($segment !== '.' && $segment !== '..') {
                $resolved[] = $segment;
                continue;
            }
            if ($segment === '..') {
                array_pop($resolved);
            }
            if ($index === $lastIndex) {
                // A trailing "." or ".." names a directory: keep the final slash.
                $resolved[] = '';
            }
        }

        return '/' . implode('/', $resolved);
    }
}
<?php
/**
 * FIFO queue of URLs that silently ignores URLs it has already been given.
 *
 * Two URLs count as the same when they differ only by http/https scheme, a
 * leading "www." or a trailing #fragment. Only enqueue() de-duplicates; the
 * other SplQueue insertion methods are inherited unchanged.
 */
class LinkQueue extends \SplQueue
{
    /** @var array<string, bool> Normalised URLs already accepted, used as a set. */
    private $seen = [];

    /**
     * Add a URL to the queue unless an equivalent one was enqueued before.
     *
     * Prints a line to standard output for every URL that is accepted.
     *
     * @param string $url Absolute URL.
     * @return void
     */
    #[\ReturnTypeWillChange]
    public function enqueue($url)
    {
        $key = $this->normalizeUrl($url);
        if (isset($this->seen[$key])) {
            return;
        }

        parent::enqueue($url);
        $this->seen[$key] = true;
        echo 'Enqueued URL: ', $url, PHP_EOL;
    }

    /**
     * Reduce a URL to the key used for duplicate detection.
     *
     * @param string $url Absolute URL.
     * @return string URL without scheme, leading "www." and fragment.
     */
    private function normalizeUrl($url)
    {
        $url = preg_replace('@^https?://@i', '', $url);
        // The dot is escaped so only a literal "www." prefix is removed;
        // "wwwa.example.com" and "wwwb.example.com" stay distinct.
        $url = preg_replace('@^www\.@i', '', $url);

        // Fragments are never sent to the server, so they cannot select a
        // different page.
        $fragmentAt = strpos($url, '#');
        if ($fragmentAt !== false) {
            $url = substr($url, 0, $fragmentAt);
        }

        return $url;
    }
}
<?php
// Load the class files from the directory this script lives in. A bare
// relative path is looked up through include_path first, so a same-named file
// elsewhere could be picked up instead.
require __DIR__ . '/Crawler.php';
require __DIR__ . '/LinkExtractor.php';
require __DIR__ . '/LinkQueue.php';

// Crawl the site starting from its entry page; progress is echoed as it runs.
$crawler = new Crawler();
$crawler->crawl('https://aoeex.com/phile/');
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment