Created
September 13, 2012 23:29
-
-
Save ManInTheBox/3718576 to your computer and use it in GitHub Desktop.
Base Crawler
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php
/**
 * Crawler is a base class for performing the website `crawling` (parsing) process.
 *
 * @property DOMXPath $xPath
 * @property string $currency
 * @property string $availability
 * @property string $delivery
 * @property string $price
 * @property string $shipping
 * @property string $currentUrl
 *
 * @author Zarko Stankovic <stankovic.zarko@gmail.com>
 */
abstract class Crawler extends CComponent
{
    /**
     * @var Supplier|null result of Supplier::model()->findByAttributes() for this
     * crawler's class name; left unset when model auto-loading is disabled
     */
    public $supplier;
    /**
     * @var string target url to be crawled
     */
    public $url;
    /**
     * @var DOMDocument dom instance, created in the constructor
     */
    public $domDocument;
    /**
     * @var string HTML content needed for {@link $domDocument}
     */
    public $domHTML;
    /**
     * @var DOMXPath xPath instance most recently created by {@link getXPath()}
     */
    private $_xPath;
    /**
     * @var Curl cURL instance. {@see Curl} class for details.
     * This base class never assigns it; presumably the concrete crawler sets it
     * up inside {@link checkResponse()} - TODO confirm against subclasses.
     */
    public $curl;
    /**
     * @var boolean is remote website ready for parsing process.
     * Only assigned when the constructor is called with $autoConnect = true.
     */
    public $ready;
    /**
     * @var string website address
     */
    public $host;
    /**
     * @var string Not Available message
     */
    public $notAvailable = 'N/A';

    /**
     * Constructor
     *
     * @param string $isbn value handed to {@link checkResponse()} when auto-connecting
     * @param boolean $autoConnect whether to connect automatically. Defaults to true.
     * @param boolean $modelAutoLoad whether to load model automatically. Defaults to true.
     */
    public function __construct($isbn, $autoConnect = true, $modelAutoLoad = true)
    {
        $this->domDocument = new DOMDocument();

        if ($modelAutoLoad)
        {
            // The supplier row is matched on the concrete crawler's class name.
            $this->supplier = Supplier::model()->findByAttributes(array(
                'crawler_name' => get_class($this),
            ));
        }

        if ($autoConnect)
        {
            $this->ready = $this->checkResponse($isbn);
        }
    }

    /**
     * Loads HTML into DOMDocument
     *
     * The response returned by {@link open()} is stored in {@link $domHTML} and,
     * when it is not empty, parsed into {@link $domDocument}.
     *
     * @return boolean whether the document was parsed; false when there was no
     * response to parse
     */
    public function load()
    {
        $this->domHTML = $this->open();

        // DOMDocument::loadHTML() rejects an empty source, so there is nothing
        // to parse when the remote site returned no body.
        if ($this->domHTML === null || $this->domHTML === false || $this->domHTML === '')
        {
            return false;
        }

        $loaded = $this->domDocument->loadHTML($this->domHTML);

        // open() already cleared the buffer *before* parsing; drop the errors
        // collected by this parse so they do not pile up until destruction.
        libxml_clear_errors();

        return $loaded;
    }

    /**
     * Opens cURL session and returns data from remote website.
     *
     * Also switches libxml to internal error handling so malformed HTML does
     * not raise PHP warnings during parsing.
     *
     * @return string|null data, or null when no cURL instance is attached
     */
    public function open()
    {
        libxml_use_internal_errors(true);

        // Read the response before close(), which may shut the session down.
        $response = $this->curl === null ? null : $this->curl->response;
        $this->close();

        return $response;
    }

    /**
     * Closes cURL session and clears libxml error buffer.
     *
     * The session is only closed here when the cURL instance is not flagged
     * as auto-closing. Safe to call when no cURL instance is attached.
     */
    public function close()
    {
        if ($this->curl !== null && !$this->curl->autoClose)
        {
            $this->curl->close();
        }

        libxml_clear_errors();
    }

    /**
     * Returns xPath instance.
     *
     * A new DOMXPath is built from the current {@link $domDocument} on every
     * call, so the result always reflects the document as it is now.
     *
     * @return DOMXPath xPath instance
     */
    public function getXPath()
    {
        $this->_xPath = new DOMXPath($this->domDocument);
        return $this->_xPath;
    }

    /**
     * Just clean up any HTML errors left when GC is called.
     */
    public function __destruct()
    {
        libxml_clear_errors();
    }

    /**
     * @param string $isbn value passed through from the constructor
     * @return boolean is remote website available
     */
    abstract public function checkResponse($isbn);

    /**
     * @return string currency
     */
    abstract public function getCurrency();

    /**
     * @return string availability
     */
    abstract public function getAvailability();

    /**
     * @return string delivery
     */
    abstract public function getDelivery();

    /**
     * @return string price
     */
    abstract public function getPrice();

    /**
     * @return string shipping
     */
    abstract public function getShipping();

    /**
     * This method will return current URL processed by cURL.
     * Override this method if you want to post process current URL.
     *
     * @return string|null current URL, or null when no cURL instance is attached
     */
    public function getCurrentUrl()
    {
        return $this->curl === null ? null : $this->curl->currentUrl;
    }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment