Crawler written in PHP
<?php
namespace Crawler\Extractors;

class LinkExtractor {
    private static $excludes = array(
        '.png', '.gif', '.jpg', '.jpeg', '.svg', '.mp3', '.mp4', '.avi', '.mpeg', '.ps', '.swf', '.webm', '.ogg', '.pdf',
        '.3gp', '.apk', '.bmp', '.flac', '.gz', '.gzip', '.jpe', '.kml', '.kmz', '.m4a', '.mov', '.mpg', '.odp', '.oga', '.ogv', '.pps', '.pptx', '.qt', '.tar', '.tif', '.wav', '.wmv', '.zip',
        // Removed '.js', '.coffee', '.css', '.less', '.csv', '.xsl', '.xsd', '.xml', '.html', '.php', '.txt', '.atom', '.rss'
        // Implement later?
        '.doc', '.docx', '.ods', '.odt', '.xls', '.xlsx',
    );

    private static $excludedDomains = array(
        '.google.', '.facebook.', '.bing.'
    );

    private static function _getBaseUrl($parsed_url) {
        $scheme = isset($parsed_url['scheme']) ? $parsed_url['scheme'] . '://' : '//';
        $host = isset($parsed_url['host']) ? $parsed_url['host'] : '';
        $port = isset($parsed_url['port']) ? ':' . $parsed_url['port'] : '';
        return strtolower("$scheme$host$port");
    }
    public static function extract(\Crawler\Engine\Spider $spider) {
        $parsed = parse_url(strtolower($spider->getUrl()));
        if (!isset($parsed['scheme'])) {
            $parsed['scheme'] = 'http';
        }
        $base = self::_getBaseUrl($parsed);
        $host_length = strlen($parsed['host']);
        preg_match_all("/(href|src)=[\'\"]?([^\'\">]+)/i", $spider->getSource(), $out);
        $linkPattern = '/^(?:[;\/?:@&=+$,]|(?:[^\W_]|[-_.!~*\()\[\] ])|(?:%[\da-fA-F]{2}))*$/';
        $urls = array();
        if (is_array($out) && isset($out[2])) {
            foreach ($out[2] as $key=>$url) {
                if (substr($url, 0, 2) === '#!') {
                    // see https://developers.google.com/webmasters/ajax-crawling/docs/getting-started
                    $url = $base.(isset($parsed['path']) ? $parsed['path'] : '/').'?_escaped_fragment_='.substr($url, 2);
                } else if (substr($url, 0, 2) === '//') { // protocol-relative url
                    $url = $parsed['scheme'].':'.$url;
                } else if (substr($url, 0, 1) === '/') { // root-relative url
                    $url = $base.$url;
                } else if (substr($url, 0, 4) !== 'http') {
                    continue;
                }
                if (strlen($url) > 250) continue; // We ignore overly long urls
                $urll = strtolower($url);
                $parsed_url = parse_url($url);
                if ($parsed_url === false) continue; // We ignore invalid urls
                if (preg_match($linkPattern, $urll) !== 1) continue;
                $isExcluded = false;
                foreach (self::$excludes as $exclude) {
                    if (substr($urll, strlen($exclude) * -1) === $exclude) {
                        $isExcluded = true;
                        break;
                    }
                }
                foreach (self::$excludedDomains as $exclude) {
                    if (strpos($urll, $exclude) !== false) {
                        $isExcluded = true;
                        break;
                    }
                }
                if ($isExcluded) continue; // We ignore some extensions and domains
                if (\Crawler\Models\LinkModel::isPresent($url)) continue; // We don't add a link that is already present
                if (\Crawler\RobotsTxtParser::disallowed($url)) continue; // We respect robots.txt
                $urls[$url] = true;
            }
        }
        return array_keys($urls);
    }
}
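
A minimal usage sketch, not part of the original gist: it assumes the Crawler classes are autoloaded by the project's init.php and shows how the extractor is driven from a page that a Spider has already fetched.

<?php
// Hypothetical standalone example; init.php and the sample URL are assumptions.
require_once(__DIR__.'/../init.php');

$spider = new \Crawler\Engine\Spider('http://example.com/');
$spider->exec();

// Returns absolute, de-duplicated urls that are not excluded, already known, or blocked by robots.txt
$links = \Crawler\Extractors\LinkExtractor::extract($spider);
foreach ($links as $link) {
    fwrite(STDOUT, $link."\n");
}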
<?php
namespace Crawler\Models;

class LinkModel {
    public static function __callStatic($name, $arguments) {
        return call_user_func_array(array(self::get(), '_'.$name), $arguments);
    }

    private static $instance = null;

    public static function get() {
        if (is_null(self::$instance)) {
            self::$instance = new self();
        }
        return self::$instance;
    }

    private $presentStmt = null;
    private $detailsStmt = null;
    private $insertStmt = null;
    private $updateStmt = null;
    private $countQueuedStmt = null;
    private $countTotalStmt = null;

    private function __construct() {
        $this->presentStmt = \Crawler\Database::prepare('SELECT `id` FROM `urls` WHERE `url` = :url AND `executed` > (UTC_TIMESTAMP() - INTERVAL 1 MONTH) LIMIT 1;');
        $this->detailsStmt = \Crawler\Database::prepare('SELECT `job_id` AS `job` FROM `urls` WHERE `url` = :url AND `executed` > (UTC_TIMESTAMP() - INTERVAL 1 MONTH) LIMIT 1;');
        $this->insertStmt = \Crawler\Database::prepare('INSERT INTO `urls` (`url`, `is_crawled`, `executed`, `source`, `job_id`) VALUES (:url, :crawled, UTC_TIMESTAMP(), :source, :job)');
        $this->updateStmt = \Crawler\Database::prepare('UPDATE `urls` SET `is_crawled` = :crawled WHERE `url` = :url AND `executed` > (UTC_TIMESTAMP() - INTERVAL 1 MONTH) LIMIT 1;');
        $this->countQueuedStmt = \Crawler\Database::prepare('SELECT COUNT(id) AS `total` FROM `urls` WHERE (`url` LIKE :domaina OR url LIKE :domainb) AND `source` IS NULL AND `is_crawled` = 0 AND `executed` > (UTC_TIMESTAMP() - INTERVAL 1 MONTH);');
        $this->countTotalStmt = \Crawler\Database::prepare('SELECT COUNT(id) AS `total` FROM `urls` WHERE (`url` LIKE :domaina OR url LIKE :domainb) AND `source` IS NULL AND `executed` > (UTC_TIMESTAMP() - INTERVAL 1 MONTH);');
    }

    public function _isPresent($url) {
        $this->presentStmt->execute(array('url' => strtolower($url)));
        $result = $this->presentStmt->fetch(\PDO::FETCH_ASSOC);
        return is_array($result);
    }
    /**
     * crawled        : the engine extracted this url
     * redirectedFrom : the url it came from, when it was reached through a redirect
     *
     * In some cases, crawled != fetched: it means $url was a redirection from another url.
     * (See the usage sketch after this class.)
     */
    public function _add($url, $crawled = false, $redirectedFrom = null, $jobId = null) {
        $url = strtolower($url);
        if (is_null($jobId)) {
            $this->detailsStmt->execute(array('url' => $url));
            $result = $this->detailsStmt->fetch(\PDO::FETCH_ASSOC);
            // We check if the url already exists
            if (is_array($result)) {
                $this->_update($url, $crawled);
                // And return the job id if present!
                return (empty($result['job']) ? null : $result['job']);
            }
        }
        // We insert
        $this->insertStmt->execute(array(
            'url' => $url,
            'crawled' => $crawled,
            'source' => $redirectedFrom,
            'job' => $jobId
        ));
        return null;
    }

    public function _update($url, $crawled = false) {
        $url = strtolower($url);
        $this->updateStmt->execute(array(
            'url' => $url,
            'crawled' => $crawled
        ));
    }

    public function _countQueued($domain) {
        $this->countQueuedStmt->execute(array(
            'domaina' => 'http://'.$domain.'%',
            'domainb' => 'https://'.$domain.'%',
        ));
        $result = $this->countQueuedStmt->fetch(\PDO::FETCH_ASSOC);
        if (!is_array($result)) return 0;
        return $result['total'];
    }

    public function _countTotal($domain) {
        $this->countTotalStmt->execute(array(
            'domaina' => 'http://'.$domain.'%',
            'domainb' => 'https://'.$domain.'%',
        ));
        $result = $this->countTotalStmt->fetch(\PDO::FETCH_ASSOC);
        if (!is_array($result)) return 0;
        return $result['total'];
    }
}
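
A minimal usage sketch, not part of the gist, assuming \Crawler\Database is configured by init.php. Every static call such as LinkModel::isPresent() goes through __callStatic, which forwards it to the matching _-prefixed method on the singleton.

<?php
// Hypothetical example; init.php and the sample values are assumptions.
require_once(__DIR__.'/../init.php');

use \Crawler\Models\LinkModel;

$url = 'http://example.com/page';
if (!LinkModel::isPresent($url)) {               // routed to _isPresent()
    LinkModel::add($url, false, null, 42);       // routed to _add(); 42 is a sample beanstalkd job id
}
LinkModel::update($url, true);                   // routed to _update(): mark as crawled
$queued = LinkModel::countQueued('example.com'); // pending urls for this domain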
<?php
namespace Crawler\Engine;

class Spider {
    const MAX_DOWNLOAD_SIZE = 1024*1024*100; // in bytes, = 100 MB
    const LOW_PRIORITY = 1024; // = Default
    const MEDIUM_PRIORITY = 512;
    const HIGH_PRIORITY = 256;

    private $options = array(
        CURLOPT_FOLLOWLOCATION => true,
        CURLOPT_FORBID_REUSE => true,
        CURLOPT_FRESH_CONNECT => true,
        CURLOPT_HEADER => false,
        CURLOPT_RETURNTRANSFER => true,
        CURLOPT_SSL_VERIFYPEER => false,
        CURLOPT_MAXREDIRS => 5,
        CURLOPT_TIMEOUT => 5,
        CURLOPT_ENCODING => ''
    );

    private $curl = null;
    private $url = null;
    private $urlParts = array();
    private $statusCode = null;
    private $source = null;

    public function __construct($url, $referer = null) {
        $this->options[CURLOPT_WRITEFUNCTION] = array($this, 'curl_handler_recv');
        if (!is_null($referer)) {
            $this->options[CURLOPT_REFERER] = $referer;
        }
        $this->curl = curl_init();
        curl_setopt($this->curl, CURLOPT_URL, $url);
        curl_setopt_array($this->curl, $this->options);
        $this->source = '';
    }

    public function curl_handler_recv($curl, $data) {
        $this->source .= $data;
        if (strlen($this->source) > self::MAX_DOWNLOAD_SIZE) return 0; // Abort the transfer if the page is too large
        return strlen($data);
    }
    public function exec() {
        $start = round(microtime(true) * 1000);
        curl_exec($this->getCurl());
        $this->getUrl();
        $this->getStatusCode();
        curl_close($this->getCurl());
        return round(microtime(true) * 1000) - $start;
    }

    public function getCurl() {
        return $this->curl;
    }

    public function getSource() {
        return $this->source;
    }

    public function getUrl() {
        if (is_null($this->url)) {
            $this->url = curl_getinfo($this->getCurl(), CURLINFO_EFFECTIVE_URL);
            $this->urlParts = parse_url($this->url);
        }
        return $this->url;
    }

    public function getUrlParts($key = null) {
        if (!is_null($key) && isset($this->urlParts[$key])) {
            return $this->urlParts[$key];
        }
        return $this->urlParts;
    }

    public function getStatusCode() {
        if (is_null($this->statusCode)) {
            $this->statusCode = curl_getinfo($this->getCurl(), CURLINFO_HTTP_CODE);
        }
        return $this->statusCode;
    }
}
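
A minimal sketch of how the Spider is driven; this example is not part of the gist, and the URL, referer and init.php bootstrap are assumptions. Construct it, call exec() to fetch the page (which also caches the effective URL and status code before closing the handle), then read the results.

<?php
// Hypothetical example; init.php is assumed to autoload the Crawler classes.
require_once(__DIR__.'/../init.php');

$spider = new \Crawler\Engine\Spider('http://example.com/', 'http://referrer.example/');
$duration = $spider->exec(); // elapsed time in milliseconds

fwrite(STDOUT, "Fetched ".$spider->getUrl()." in ".$duration."ms\n");
fwrite(STDOUT, "Status: ".$spider->getStatusCode()."\n");
fwrite(STDOUT, "Body length: ".strlen($spider->getSource())."\n");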
<?php
if (php_sapi_name() !== 'cli') exit(1);
require_once(__DIR__.'/../init.php');

define('WORKER_LIMIT_INSTANCES', 200);
define('CRAWLER_MAX_DEPTH', 10000);
define('CRAWLER_MAX_HIGH_URLS', 100);

use \Pheanstalk\Pheanstalk;
use \Crawler\Models\LinkModel;

$pheanstalk = new Pheanstalk('127.0.0.1');
$reloadedInitialTime = filemtime(__DIR__.'/../reloaded');
fwrite(STDOUT, "Started new instance of script (".$reloadedInitialTime.").\n");
$loopCounter = 0;
while (true) {
    clearstatcache();
    // Kill switch to stop the service
    if (intval(file_get_contents(__DIR__.'/../breakworker')) === 1) exit(1);
    // We check if we need to stop this worker (code update?)
    $autoReloadSystem = filemtime(__DIR__.'/../reloaded');
    if ($reloadedInitialTime !== $autoReloadSystem) {
        fwrite(STDOUT, "New update - Reloading script.\n");
        exit(0);
    }
    usleep(500000); // Give it some slack; half a second
    $loopCounter++;
    if ($loopCounter > WORKER_LIMIT_INSTANCES) break; // We count on Supervisord to reload workers
    // Grab the next job off the queue and reserve it
    $job = $pheanstalk->watch(QUEUE_NAME)
                      ->ignore('default')
                      ->reserve();
    // Remove the job from the queue
    $pheanstalk->delete($job);
    $data = json_decode($job->getData(), true);
    if (is_null($data)) {
        fwrite(STDERR, "[FATAL] Invalid job data: ".$job->getData()."\n");
        continue; // Nothing we can do with this job
    }
    if (!isset($data['retries'])) $data['retries'] = 0;
    if (!isset($data['priority'])) $data['priority'] = \Crawler\Engine\Spider::MEDIUM_PRIORITY;
    if ($data['priority'] == \Crawler\Engine\Spider::LOW_PRIORITY) {
        // Normally, only new links are in low priority
        $data['priority'] = \Crawler\Engine\Spider::MEDIUM_PRIORITY;
    }
    /*
     * The "Spider" fetches the website with a basic cURL request.
     * It also pre-fetches robots.txt on the first request to make sure we respect it.
     * It uses the following cURL options:
     *     CURLOPT_FOLLOWLOCATION => true,
     *     CURLOPT_FORBID_REUSE => true,
     *     CURLOPT_FRESH_CONNECT => true,
     *     CURLOPT_HEADER => false,
     *     CURLOPT_RETURNTRANSFER => true,
     *     CURLOPT_SSL_VERIFYPEER => false,
     *     CURLOPT_MAXREDIRS => 5,
     *     CURLOPT_TIMEOUT => 5,
     *     CURLOPT_ENCODING => ''
     */
    $spider = new \Crawler\Engine\Spider($data['url'], isset($data['referer']) ? $data['referer'] : null);
    $duration = $spider->exec();
    // First, we make sure we are not being black-listed,
    // so we analyze the status code:
    // for 401, 403 and 404, we retry once;
    // for 408, 429 and 503, we retry 3 times, with an increasing wait between requests.
    if (in_array($spider->getStatusCode(), array(401, 403, 404, 408, 429, 503))) {
        $data['retries']++;
        if ((in_array($spider->getStatusCode(), array(401, 403, 404)) && $data['retries'] <= 1) // Only one retry
            ||
            (in_array($spider->getStatusCode(), array(408, 429, 503)) && $data['retries'] <= 3) // 3 retries
        ) {
            $pheanstalk->putInTube(QUEUE_NAME, json_encode($data), $data['priority'], $data['retries'] * 30);
            continue;
        }
        // We get here (instead of the "if" branch) when the status code is in the array
        // but the retry limit has been reached; that means we stop for this url.
        // The next step simply records it in the link database and drops the job.
    }
    // We update the url in the database to indicate it has been crawled
    LinkModel::update($data['url'], true);
    if (strtolower($data['url']) !== strtolower($spider->getUrl())) {
        // We were redirected, so we add a new URL also marked as crawled, with $data['url'] as the origin
        $jobId = LinkModel::add($spider->getUrl(), true, $data['url']);
        // We remove the job of the redirect url because we already had it in the queue
        if (!is_null($jobId)) {
            // We catch the exception in case the url has already been processed
            try {
                $job = $pheanstalk->peek($jobId);
                $pheanstalk->delete($job);
            } catch (\Exception $e) {}
        }
    }
    $domainName = strtolower($spider->getUrlParts('host'));
    // Here is where I index the webpages.
    // I removed that code because it is not interesting in our case,
    // but if you are doing similar work, this is where you would plug in your own processing :)
    // (A hypothetical commented sketch of such a hook follows.)
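    // Sketch only, not the author's removed code: it would pull the <title> out of the
    // fetched HTML and hand it to a storage layer (PageModel is hypothetical).
    /*
    $doc = new \DOMDocument();
    @$doc->loadHTML($spider->getSource()); // suppress warnings on malformed HTML
    $titleNodes = $doc->getElementsByTagName('title');
    $title = ($titleNodes->length > 0) ? trim($titleNodes->item(0)->textContent) : '';
    \Crawler\Models\PageModel::index($data['url'], $title, $spider->getSource());
    */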
    // This code extracts all the links in the page so they can be added to the queue
    $links = \Crawler\Extractors\LinkExtractor::extract($spider);
    // And we add them now:
    $priority = $data['priority'];
    foreach ($links as $link) {
        $parsedDomain = strtolower(parse_url($link, PHP_URL_HOST));
        $jobsData = array(
            'url' => $link,
            'retries' => 0,
            'referer' => $spider->getUrl()
        );
        $jobsData['delay'] = ceil($duration * (rand(1, 10)/10000)); // Delay of 0.1 to 1 times the duration of the request, in seconds
        if ($jobsData['delay'] > 5) $jobsData['delay'] = 5;
        // We increase the wait time by the number of links already queued for this specific domain
        $jobsData['delay'] = $jobsData['delay'] + LinkModel::countQueued($parsedDomain);
        if ($data['priority'] === \Crawler\Engine\Spider::HIGH_PRIORITY) {
            // Allow roughly 5 simultaneous requests on high priority
            $jobsData['delay'] = floor($jobsData['delay'] / 10);
        }
        $iCountCrawledUrls = LinkModel::countTotal($parsedDomain);
        if ($iCountCrawledUrls > CRAWLER_MAX_DEPTH) break; // We stop crawling this domain
        if ($domainName === $parsedDomain) {
            if ($priority === \Crawler\Engine\Spider::HIGH_PRIORITY && $iCountCrawledUrls > CRAWLER_MAX_HIGH_URLS) {
                $priority = \Crawler\Engine\Spider::MEDIUM_PRIORITY;
            }
            $jobsData['priority'] = $priority;
        } else {
            $jobsData['priority'] = \Crawler\Engine\Spider::LOW_PRIORITY;
        }
        $jobId = $pheanstalk->putInTube(QUEUE_NAME, json_encode($jobsData), $jobsData['priority'], $jobsData['delay']);
        // The add method checks if the url is already present in the database,
        // to avoid adding the same url multiple times (and looping when two sites link to each other!)
        LinkModel::add($link, false, null, $jobId);
    }
}
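
Nothing in the gist shows how the very first URL enters the queue, so here is a minimal seeding sketch. It is an assumption rather than part of the original: it pushes one job into the same tube the worker reserves from (QUEUE_NAME, assumed to be defined in init.php), with the fields the worker expects.

<?php
// Hypothetical seed script; init.php and the sample URL are assumptions.
if (php_sapi_name() !== 'cli') exit(1);
require_once(__DIR__.'/../init.php');

use \Pheanstalk\Pheanstalk;

$pheanstalk = new Pheanstalk('127.0.0.1');
$job = array(
    'url' => 'http://example.com/',
    'retries' => 0,
    'priority' => \Crawler\Engine\Spider::HIGH_PRIORITY
);
$jobId = $pheanstalk->putInTube(QUEUE_NAME, json_encode($job), $job['priority'], 0);
// Record the seed url so the extractor does not queue it a second time
\Crawler\Models\LinkModel::add($job['url'], false, null, $jobId);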