Skip to content

Instantly share code, notes, and snippets.

@ManInTheBox
Created September 13, 2012 23:29
Show Gist options
  • Save ManInTheBox/3718576 to your computer and use it in GitHub Desktop.
Save ManInTheBox/3718576 to your computer and use it in GitHub Desktop.
Base Crawler
<?php
/**
* Crawler is the base class for performing the website `crawling` (parsing) process.
*
* @property DOMXPath $xPath
* @property string $currency
* @property string $availability
* @property string $delivery
* @property string $price
* @property string $shipping
* @property string $currentUrl
*
* @author Zarko Stankovic <stankovic.zarko@gmail.com>
*/
abstract class Crawler extends CComponent
{
    /**
     * @var mixed supplier model instance, as returned by
     * Supplier::model()->findByAttributes(); stays unset when model
     * auto-loading is disabled in the constructor.
     */
    public $supplier;
    /**
     * @var string target url to be crawled
     */
    public $url;
    /**
     * @var DOMDocument dom instance
     */
    public $domDocument;
    /**
     * @var string HTML content needed for {@link $domDocument}
     */
    public $domHTML;
    /**
     * @var DOMXPath xPath instance most recently created by {@link getXPath()}
     */
    private $_xPath;
    /**
     * @var Curl cURL instance. {@see Curl} class for details.
     * Never assigned by this base class; presumably set by the concrete
     * crawler (e.g. inside checkResponse()) - verify against subclasses.
     */
    public $curl;
    /**
     * @var boolean is remote website ready for parsing process.
     * Defaults to false so it is a real boolean even when the constructor
     * is told not to connect automatically.
     */
    public $ready = false;
    /**
     * @var string website address
     */
    public $host;
    /**
     * @var string Not Available message
     */
    public $notAvailable = 'N/A';

    /**
     * Constructor
     *
     * @param mixed $isbn value handed unchanged to {@link checkResponse()}
     * (presumably the ISBN of the item to look up - confirm with subclasses)
     * @param boolean $autoConnect whether to call {@link checkResponse()} right away
     * and store its result in {@link $ready}. Defaults to true.
     * @param boolean $modelAutoLoad whether to load the supplier model automatically.
     * Defaults to true.
     */
    public function __construct($isbn, $autoConnect = true, $modelAutoLoad = true)
    {
        $this->domDocument = new DOMDocument();

        if ($modelAutoLoad)
        {
            // The supplier record is looked up by the concrete crawler's class name.
            $this->supplier = Supplier::model()->findByAttributes(array(
                'crawler_name' => get_class($this),
            ));
        }

        if ($autoConnect)
        {
            $this->ready = $this->checkResponse($isbn);
        }
    }

    /**
     * Loads HTML into DOMDocument
     *
     * The fetched response is stored in {@link $domHTML}. An empty or
     * non-string response is not handed to DOMDocument::loadHTML(), because
     * that call raises a PHP warning (PHP 5/7) or a ValueError (PHP 8) for
     * an empty source, which libxml_use_internal_errors() does not suppress.
     *
     * @return boolean whether HTML was loaded into {@link $domDocument}
     */
    public function load()
    {
        $this->domHTML = $this->open();

        if (!is_string($this->domHTML) || $this->domHTML === '')
        {
            return false;
        }

        return (bool) $this->domDocument->loadHTML($this->domHTML);
    }

    /**
     * Switches libxml to internal error handling, reads the response held by
     * {@link $curl} and then calls {@link close()}.
     *
     * @return string|null response data, or null when no cURL instance is set
     */
    public function open()
    {
        // Collect libxml errors internally instead of emitting PHP warnings
        // while the HTML is parsed later on.
        libxml_use_internal_errors(true);

        $response = null;
        if ($this->curl !== null)
        {
            $response = $this->curl->response;
        }

        $this->close();

        return $response;
    }

    /**
     * Closes cURL session and clears libxml error buffer.
     *
     * The session is only closed here when the cURL instance reports that it
     * does not auto-close; a missing cURL instance is skipped.
     */
    public function close()
    {
        if ($this->curl !== null && !$this->curl->autoClose)
        {
            $this->curl->close();
        }

        libxml_clear_errors();
    }

    /**
     * Returns xPath instance.
     *
     * A new DOMXPath is built from the current {@link $domDocument} on every
     * call and remembered in the private $_xPath property.
     *
     * @return DOMXPath xPath instance
     */
    public function getXPath()
    {
        $this->_xPath = new DOMXPath($this->domDocument);

        return $this->_xPath;
    }

    /**
     * Just clean up any HTML errors left when GC is called.
     */
    public function __destruct()
    {
        libxml_clear_errors();
    }

    /**
     * @param mixed $isbn value received by the constructor
     * @return boolean is remote website available
     */
    abstract public function checkResponse($isbn);

    /**
     * @return string currency
     */
    abstract public function getCurrency();

    /**
     * @return string availability
     */
    abstract public function getAvailability();

    /**
     * @return string delivery
     */
    abstract public function getDelivery();

    /**
     * @return string price
     */
    abstract public function getPrice();

    /**
     * @return string shipping
     */
    abstract public function getShipping();

    /**
     * This method will return current URL processed by cURL.
     * Override this method if you want to post process current URL.
     *
     * @return string|null current URL, or null when no cURL instance is set
     */
    public function getCurrentUrl()
    {
        if ($this->curl === null)
        {
            return null;
        }

        return $this->curl->currentUrl;
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment