Created June 24, 2016 14:43
Crawler written in PHP
<?php

namespace Crawler\Extractors;

class LinkExtractor {
    private static $excludes = array(
        '.png', '.gif', '.jpg', '.jpeg', '.svg', '.mp3', '.mp4', '.avi', '.mpeg', '.ps', '.swf', '.webm', '.ogg', '.pdf',
        '.3gp', '.apk', '.bmp', '.flac', '.gz', '.gzip', '.jpe', '.kml', '.kmz', '.m4a', '.mov', '.mpg', '.odp', '.oga', '.ogv', '.pps', '.pptx', '.qt', '.tar', '.tif', '.wav', '.wmv', '.zip',
        // Removed: '.js', '.coffee', '.css', '.less', '.csv', '.xsl', '.xsd', '.xml', '.html', '.php', '.txt', '.atom', '.rss'
        // Implement later?
        '.doc', '.docx', '.ods', '.odt', '.xls', '.xlsx',
    );
    private static $excludedDomains = array(
        '.google.', '.facebook.', '.bing.'
    );

    private static function _getBaseUrl($parsed_url) {
        $scheme = isset($parsed_url['scheme']) ? $parsed_url['scheme'] . '://' : '//';
        $host = isset($parsed_url['host']) ? $parsed_url['host'] : '';
        $port = isset($parsed_url['port']) ? ':' . $parsed_url['port'] : '';
        return strtolower("$scheme$host$port");
    }

    public static function extract(\Crawler\Engine\Spider $spider) {
        $parsed = parse_url(strtolower($spider->getUrl()));
        if (!isset($parsed['scheme'])) {
            $parsed['scheme'] = 'http';
        }

        $base = self::_getBaseUrl($parsed);
        $host_length = strlen($parsed['host']);
        preg_match_all("/(href|src)=[\'\"]?([^\'\">]+)/i", $spider->getSource(), $out);
        $linkPattern = '/^(?:[;\/?:@&=+$,]|(?:[^\W_]|[-_.!~*\()\[\] ])|(?:%[\da-fA-F]{2}))*$/';

        $urls = array();
        if (is_array($out) && isset($out[2])) {
            foreach ($out[2] as $key => $url) {
                if (substr($url, 0, 2) === '#!') {
                    // see https://developers.google.com/webmasters/ajax-crawling/docs/getting-started
                    $path = isset($parsed['path']) ? $parsed['path'] : '/';
                    $url = $base.$path.'?_escaped_fragment_='.substr($url, 2);
                } else if (substr($url, 0, 2) === '//') { // protocol-relative URL: prepend the scheme only
                    $url = $parsed['scheme'].':'.$url;
                } else if (substr($url, 0, 1) === '/') { // absolute path: prepend the base URL
                    $url = $base.$url;
                } else if (substr($url, 0, 4) !== 'http') {
                    continue;
                }

                if (strlen($url) > 250) continue; // We ignore overly long URLs

                $urll = strtolower($url);
                $parsed_url = parse_url($url);
                if ($parsed_url === false) continue; // We ignore invalid URLs
                if (preg_match($linkPattern, $urll) !== 1) continue;
                $isExcluded = false;
                foreach (self::$excludes as $exclude) {
                    if (substr($urll, strlen($exclude) * -1) === $exclude) {
                        $isExcluded = true;
                        break;
                    }
                }

                foreach (self::$excludedDomains as $exclude) {
                    if (strpos($urll, $exclude) !== false) {
                        $isExcluded = true;
                        break;
                    }
                }
                if ($isExcluded) continue; // We skip excluded extensions and domains
                if (\Crawler\Models\LinkModel::isPresent($url)) continue; // We don't re-add a link that is already present
                if (\Crawler\RobotsTxtParser::disallowed($url)) continue; // We respect robots.txt

                $urls[$url] = true;
            }
        }

        return array_keys($urls);
    }
}
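For context, here is a minimal usage sketch (mine, not part of the gist) showing how the extractor fits together with the Spider defined further down, assuming the \Crawler\Database connection and the \Crawler\RobotsTxtParser that the gist references are configured:

<?php
// Hypothetical example: fetch a page, then pull out its crawlable links.
$spider = new \Crawler\Engine\Spider('http://example.com/');
$spider->exec();

$links = \Crawler\Extractors\LinkExtractor::extract($spider);
foreach ($links as $link) {
    echo $link, "\n"; // absolute, deduplicated, not excluded, allowed by robots.txt
}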
<?php

namespace Crawler\Models;

class LinkModel {
    public static function __callStatic($name, $arguments) {
        return call_user_func_array(array(self::get(), '_'.$name), $arguments);
    }

    private static $instance = null;

    public static function get() {
        if (is_null(self::$instance)) {
            self::$instance = new self();
        }
        return self::$instance;
    }
    private $presentStmt = null;
    private $detailsStmt = null;
    private $insertStmt = null;
    private $updateStmt = null;
    private $countQueuedStmt = null;
    private $countTotalStmt = null;

    private function __construct() {
        $this->presentStmt = \Crawler\Database::prepare('SELECT `id` FROM `urls` WHERE `url` = :url AND `executed` > (UTC_TIMESTAMP() - INTERVAL 1 MONTH) LIMIT 1;');
        $this->detailsStmt = \Crawler\Database::prepare('SELECT `job_id` AS `job` FROM `urls` WHERE `url` = :url AND `executed` > (UTC_TIMESTAMP() - INTERVAL 1 MONTH) LIMIT 1;');
        $this->insertStmt = \Crawler\Database::prepare('INSERT INTO `urls` (`url`, `is_crawled`, `executed`, `source`, `job_id`) VALUES (:url, :crawled, UTC_TIMESTAMP(), :source, :job)');
        $this->updateStmt = \Crawler\Database::prepare('UPDATE `urls` SET `is_crawled` = :crawled WHERE `url` = :url AND `executed` > (UTC_TIMESTAMP() - INTERVAL 1 MONTH) LIMIT 1;');
        $this->countQueuedStmt = \Crawler\Database::prepare('SELECT COUNT(id) AS `total` FROM `urls` WHERE (`url` LIKE :domaina OR url LIKE :domainb) AND `source` IS NULL AND `is_crawled` = 0 AND `executed` > (UTC_TIMESTAMP() - INTERVAL 1 MONTH);');
        $this->countTotalStmt = \Crawler\Database::prepare('SELECT COUNT(id) AS `total` FROM `urls` WHERE (`url` LIKE :domaina OR url LIKE :domainb) AND `source` IS NULL AND `executed` > (UTC_TIMESTAMP() - INTERVAL 1 MONTH);');
    }
    public function _isPresent($url) {
        $this->presentStmt->execute(array('url' => strtolower($url)));
        $result = $this->presentStmt->fetch(\PDO::FETCH_ASSOC);
        return is_array($result);
    }
    /**
     * crawled        : The engine extracted this URL
     * redirectedFrom : The URL it came from, if it was redirected
     *
     * In certain cases, crawled != fetched: the $url was a redirection from another URL.
     */
    public function _add($url, $crawled = false, $redirectedFrom = null, $jobId = null) {
        $url = strtolower($url);

        if (is_null($jobId)) {
            // We check whether the URL already exists
            $this->detailsStmt->execute(array('url' => $url));
            $result = $this->detailsStmt->fetch(\PDO::FETCH_ASSOC);
            if (is_array($result)) {
                $this->_update($url, $crawled);
                // And return the job id if present!
                return (empty($result['job']) ? null : $result['job']);
            }
        }

        // We insert
        $this->insertStmt->execute(array(
            'url' => $url,
            'crawled' => $crawled,
            'source' => $redirectedFrom,
            'job' => $jobId
        ));

        return null;
    }
    public function _update($url, $crawled = false) {
        $url = strtolower($url);
        $this->updateStmt->execute(array(
            'url' => $url,
            'crawled' => $crawled
        ));
    }

    public function _countQueued($domain) {
        $this->countQueuedStmt->execute(array(
            'domaina' => 'http://'.$domain.'%',
            'domainb' => 'https://'.$domain.'%',
        ));
        $result = $this->countQueuedStmt->fetch(\PDO::FETCH_ASSOC);
        if (!is_array($result)) return 0;
        return $result['total'];
    }

    public function _countTotal($domain) {
        $this->countTotalStmt->execute(array(
            'domaina' => 'http://'.$domain.'%',
            'domainb' => 'https://'.$domain.'%',
        ));
        $result = $this->countTotalStmt->fetch(\PDO::FETCH_ASSOC);
        if (!is_array($result)) return 0;
        return $result['total'];
    }
}
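A quick note on the __callStatic trick above: callers use the method names without the underscore prefix, and the static call is proxied to the singleton instance. A hedged sketch of typical calls (LinkModel and the urls table are as defined above; the URL and the job id 42 are made-up placeholders):

<?php
// Hypothetical example of the static proxy in action.
$url = 'http://example.com/page';

if (!\Crawler\Models\LinkModel::isPresent($url)) {          // dispatched to _isPresent()
    \Crawler\Models\LinkModel::add($url, false, null, 42);  // dispatched to _add()
}

echo \Crawler\Models\LinkModel::countQueued('example.com'); // URLs still waiting for this domain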
<?php

namespace Crawler\Engine;

class Spider {
    const MAX_DOWNLOAD_SIZE = 1024*1024*100; // in bytes (1024*1024*100 = 100 MB)

    const LOW_PRIORITY = 1024; // = Default
    const MEDIUM_PRIORITY = 512;
    const HIGH_PRIORITY = 256;
    private $options = array(
        CURLOPT_FOLLOWLOCATION => true,
        CURLOPT_FORBID_REUSE => true,
        CURLOPT_FRESH_CONNECT => true,
        CURLOPT_HEADER => false,
        CURLOPT_RETURNTRANSFER => true,
        CURLOPT_SSL_VERIFYPEER => false,
        CURLOPT_MAXREDIRS => 5,
        CURLOPT_TIMEOUT => 5,
        CURLOPT_ENCODING => ''
    );

    private $curl = null;
    private $url = null;
    private $urlParts = array();
    private $statusCode = null;
    private $source = null;
    public function __construct($url, $referer = null) {
        $this->options[CURLOPT_WRITEFUNCTION] = array($this, 'curl_handler_recv');
        if (!is_null($referer)) {
            $this->options[CURLOPT_REFERER] = $referer;
        }

        $this->curl = curl_init();
        curl_setopt($this->curl, CURLOPT_URL, $url);
        curl_setopt_array($this->curl, $this->options);

        $this->source = '';
    }

    public function curl_handler_recv($curl, $data) {
        $this->source .= $data;
        if (strlen($this->source) > self::MAX_DOWNLOAD_SIZE) return 0; // returning 0 makes cURL abort the transfer
        return strlen($data);
    }
    public function exec() {
        $start = round(microtime(true) * 1000);
        curl_exec($this->getCurl());
        // Cache the effective URL and status code before the handle is closed
        $this->getUrl();
        $this->getStatusCode();
        curl_close($this->getCurl());
        return round(microtime(true) * 1000) - $start; // duration in milliseconds
    }
    public function getCurl() {
        return $this->curl;
    }

    public function getSource() {
        return $this->source;
    }

    public function getUrl() {
        if (is_null($this->url)) {
            $this->url = curl_getinfo($this->getCurl(), CURLINFO_EFFECTIVE_URL);
            $this->urlParts = parse_url($this->url);
        }
        return $this->url;
    }

    public function getUrlParts($key = null) {
        if (!is_null($key) && isset($this->urlParts[$key])) {
            return $this->urlParts[$key];
        }
        return $this->urlParts;
    }

    public function getStatusCode() {
        if (is_null($this->statusCode)) {
            $this->statusCode = curl_getinfo($this->getCurl(), CURLINFO_HTTP_CODE);
        }
        return $this->statusCode;
    }
}
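To make the Spider's lifecycle concrete, a small hedged sketch (example.com and the referer are placeholders): exec() runs the request and returns its duration in milliseconds, after which the getters expose the cached results.

<?php
// Hypothetical example: fetch one page and inspect the outcome.
$spider = new \Crawler\Engine\Spider('http://example.com/', 'http://referrer.example/');
$durationMs = $spider->exec();

echo $spider->getStatusCode();     // e.g. 200
echo $spider->getUrl();            // effective URL after up to 5 redirects
echo $spider->getUrlParts('host'); // 'example.com'
echo strlen($spider->getSource()); // body size, capped at MAX_DOWNLOAD_SIZE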
<?php

if (php_sapi_name() !== 'cli') exit(1);

require_once(__DIR__.'/../init.php');

define('WORKER_LIMIT_INSTANCES', 200);
define('CRAWLER_MAX_DEPTH', 10000);
define('CRAWLER_MAX_HIGH_URLS', 100);

use \Pheanstalk\Pheanstalk;
use \Crawler\Models\LinkModel;

$pheanstalk = new Pheanstalk('127.0.0.1');
$reloadedInitialTime = filemtime(__DIR__.'/../reloaded');

fwrite(STDOUT, "Started new instance of script (".$reloadedInitialTime.").\n");

$loopCounter = 0;
while (true) {
    clearstatcache();

    // Script to stop the service
    if (intval(file_get_contents(__DIR__.'/../breakworker')) === 1) exit(1);

    // We check if we need to stop this worker (code update?)
    $autoReloadSystem = filemtime(__DIR__.'/../reloaded');
    if ($reloadedInitialTime !== $autoReloadSystem) {
        fwrite(STDOUT, "New update - Reloading script.\n");
        exit(0);
    }

    usleep(500000); // Give it some slack; 1/2 second
    $loopCounter++;
    if ($loopCounter > WORKER_LIMIT_INSTANCES) break; // We count on Supervisord to reload workers
    // Grab the next job off the queue and reserve it
    $job = $pheanstalk->watch(QUEUE_NAME)
                      ->ignore('default')
                      ->reserve();

    // Remove the job from the queue
    $pheanstalk->delete($job);

    $data = json_decode($job->getData(), true);
    if (is_null($data)) {
        fwrite(STDERR, "[FATAL] Invalid job data: ".$job->getData()."\n");
        continue; // Nothing we can do with an unparseable job
    }

    if (!isset($data['retries'])) $data['retries'] = 0;
    if (!isset($data['priority'])) $data['priority'] = \Crawler\Engine\Spider::MEDIUM_PRIORITY;

    if ($data['priority'] == \Crawler\Engine\Spider::LOW_PRIORITY) {
        // Normally, only new links are in low priority
        $data['priority'] = \Crawler\Engine\Spider::MEDIUM_PRIORITY;
    }
    /*
     * The "Spider" fetches the website with a basic cURL request.
     * On the first request it also pre-fetches robots.txt, to ensure we respect it.
     * It uses the following cURL options:
     *   CURLOPT_FOLLOWLOCATION => true,
     *   CURLOPT_FORBID_REUSE   => true,
     *   CURLOPT_FRESH_CONNECT  => true,
     *   CURLOPT_HEADER         => false,
     *   CURLOPT_RETURNTRANSFER => true,
     *   CURLOPT_SSL_VERIFYPEER => false,
     *   CURLOPT_MAXREDIRS      => 5,
     *   CURLOPT_TIMEOUT        => 5,
     *   CURLOPT_ENCODING       => ''
     */
    $spider = new \Crawler\Engine\Spider($data['url'], isset($data['referer']) ? $data['referer'] : null);
    $duration = $spider->exec();
    // First, we ensure that we are not blacklisted,
    // so we analyze the status code:
    // for 401, 403 and 404, we retry once;
    // for 408, 429 and 503, we retry 3 times, with an increasing wait between requests.
    if (in_array($spider->getStatusCode(), array(401, 403, 404, 408, 429, 503))) {
        $data['retries']++;

        if ((in_array($spider->getStatusCode(), array(401, 403, 404)) && $data['retries'] <= 1) // Only one retry
            ||
            (in_array($spider->getStatusCode(), array(408, 429, 503)) && $data['retries'] <= 3) // 3 retries
        ) {
            $pheanstalk->putInTube(QUEUE_NAME, json_encode($data), $data['priority'], $data['retries'] * 30);
            continue;
        }

        // We reach this point (instead of the "if" branch above) when the status code is in the array
        // but the retry limit has been reached. That means we give up on this URL:
        // the next step is simply to record it in the links database below.
    }
    // We update the URL in the database to indicate it has been crawled
    LinkModel::update($data['url'], true);

    if (strtolower($data['url']) !== strtolower($spider->getUrl())) {
        // We were redirected, so we add the final URL, also marked as crawled, with $data['url'] as the origin
        $jobId = LinkModel::add($spider->getUrl(), true, $data['url']);

        // We remove the queued job for the redirected URL because we already handled it here
        if (!is_null($jobId)) {
            // We catch the exception in case the job has already been processed
            try {
                $job = $pheanstalk->peek($jobId);
                $pheanstalk->delete($job);
            } catch (\Exception $e) {}
        }
    }

    $domainName = strtolower($spider->getUrlParts('host'));
    // Here is where I index the webpages.
    // I removed that code because it's not interesting in our case,
    // but if you are doing similar work, this is where to plug in your own logic :)

    // This code extracts all the links in the page so we can add them to the queue
    $links = \Crawler\Extractors\LinkExtractor::extract($spider);

    // And we add them now:
    $priority = $data['priority'];
    foreach ($links as $link) {
        $parsedDomain = strtolower(parse_url($link, PHP_URL_HOST));

        $jobsData = array(
            'url' => $link,
            'retries' => 0,
            'referer' => $spider->getUrl()
        );

        $jobsData['delay'] = ceil($duration * (rand(1, 10)/10000)); // Delay of 0.1x to 1x the request duration, in seconds
        if ($jobsData['delay'] > 5) $jobsData['delay'] = 5;

        // We increase the wait by the number of links already queued for this specific domain
        $jobsData['delay'] = $jobsData['delay'] + LinkModel::countQueued($parsedDomain);

        if ($priority === \Crawler\Engine\Spider::HIGH_PRIORITY) {
            // Allow roughly 5 simultaneous requests on high priority
            $jobsData['delay'] = floor($jobsData['delay'] / 10);
        }

        $iCountCrawledUrls = LinkModel::countTotal($parsedDomain);
        if ($iCountCrawledUrls > CRAWLER_MAX_DEPTH) break; // We stop crawling this domain
        if ($domainName === $parsedDomain) {
            if ($priority === \Crawler\Engine\Spider::HIGH_PRIORITY && $iCountCrawledUrls > CRAWLER_MAX_HIGH_URLS) {
                $priority = \Crawler\Engine\Spider::MEDIUM_PRIORITY;
            }
            $jobsData['priority'] = $priority;
        } else {
            $jobsData['priority'] = \Crawler\Engine\Spider::LOW_PRIORITY;
        }

        $jobId = $pheanstalk->putInTube(QUEUE_NAME, json_encode($jobsData), $jobsData['priority'], $jobsData['delay']);

        // The add method checks whether the URL is already present in the database,
        // to avoid adding the same URL multiple times (and looping forever when two sites link to each other!)
        LinkModel::add($link, false, null, $jobId);
    }
}
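The worker above only consumes jobs; the gist doesn't show how the first URL enters the tube. A minimal seeding sketch might look like this, under the assumption that init.php defines QUEUE_NAME and autoloads the classes, as the worker implies:

<?php
// Hypothetical producer: push a starting URL into the tube the worker watches.
require_once(__DIR__.'/../init.php');

use \Pheanstalk\Pheanstalk;

$pheanstalk = new Pheanstalk('127.0.0.1');

$job = array(
    'url' => 'http://example.com/',
    'retries' => 0,
    'priority' => \Crawler\Engine\Spider::HIGH_PRIORITY,
);

$pheanstalk->putInTube(QUEUE_NAME, json_encode($job), $job['priority'], 0);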