Last active
May 11, 2018 15:10
-
-
Save iRynoh/21224502956f27b0aa9936c01ee37819 to your computer and use it in GitHub Desktop.
CSL_Scraper.php
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php | |
require 'Scraper.class.php'; | |
class CSL_Scraper extends Scraper | |
{ | |
public $domain; | |
public $referer; | |
public $car_url; | |
public $motorcycle_url; | |
public $phone_token_base; | |
public $key_recaptcha; | |
/**
 * Runs the parent constructor, then fills in the site-specific settings
 * (domain, referer, listing URL prefixes, phone endpoint and reCAPTCHA key).
 */
public function __construct()
{
    parent::__construct();

    // Property name => value. All targets are declared public properties of
    // this class, so assigning them in a loop is equivalent to one statement
    // per property.
    $settings = [
        'domain'           => 'carsales.com.au',
        'car_url'          => 'www.gumtree.com.au/s-ad/',
        'motorcycle_url'   => 'www.kijiji.ca/v-sport-bikes/',
        'referer'          => 'www.carsales.com.au',
        'phone_token_base' => 'www.gumtree.com.au/j-recaptcha-verify-phone-get.json?token=',
        'key_recaptcha'    => '6Lc4ATUUAAAAAGhFgdqfrvYQOXryqRbTK4k6H_wi',
    ];

    foreach ($settings as $property => $value) {
        $this->{$property} = $value;
    }
}
/**
 * Reads the total number of results from a search results page.
 *
 * The number is taken from the text between the parentheses of
 * `<span class="titlecount">(N)</span>`; thousands separators are stripped.
 *
 * @param string $html Raw HTML of the search results page.
 * @return string The total as a string containing digits only.
 * @throws Exception When the counter is not present in the page. The message
 *                   comes from get_exception(7), which is defined outside this
 *                   class (presumably on the parent Scraper).
 */
public function get_total($html)
{
    // The quantifier is open-ended on purpose: the previous {1,5} limit
    // rejected every total of 10,000 or more written with a separator
    // ("12,345" is six characters), which surfaced as a false "not found".
    $pattern_total = '/(?<=<span class="titlecount">\()[\d,]+(?=\)<\/span)/';

    if (! preg_match($pattern_total, $html, $arr_regex_match)) {
        throw new Exception($this->get_exception(7));
    }

    return preg_replace('/[^0-9]/', '', $arr_regex_match[0]);
}
/**
 * Expands a search string containing a year range into one string per year.
 *
 * The range is written as two four-digit years joined by a double underscore,
 * e.g. "2010__2012". Every occurrence of that range text in $search is
 * replaced by a single year, once for each year in the range (inclusive).
 *
 * @param string $search Search string, possibly containing "YYYY__YYYY".
 * @return array|false One search string per year in ascending order, or
 *                     false when $search contains no year range.
 */
function split_search($search)
{
    if (! preg_match('/\d{4}__\d{4}/', $search, $arr_regex_match)) {
        return false;
    }

    $range_text = $arr_regex_match[0];
    list($first_year, $second_year) = explode('__', $range_text);

    // Normalise the bounds so a reversed range ("2018__2010") still yields
    // results. Previously the loop body never ran in that case, leaving the
    // result variables undefined and the method returning null with warnings.
    $from = min((int) $first_year, (int) $second_year);
    $to = max((int) $first_year, (int) $second_year);

    $arr_search = [];
    foreach (range($from, $to) as $year) {
        $arr_search[] = str_replace($range_text, $year, $search);
    }

    return $arr_search;
}
/**
 * Collects the listing ids from a search results page and works out the
 * number of the next page.
 *
 * Ids are the values of the `data-csn-item-history` attributes, de-duplicated
 * and re-indexed from zero. The next page number is $nb_page incremented by
 * one when the page contains `title="Next"`, otherwise 0.
 *
 * @param string $html    Raw HTML of the search results page.
 * @param int    $nb_page Number of the page being parsed.
 * @return array Two elements: [list of ids, next page number or 0].
 * @throws Exception When no id is found. The message comes from
 *                   get_exception(10), defined outside this class.
 */
public function get_id($html, $nb_page)
{
    $found = [];
    preg_match_all('!data-csn-item-history="(\w{3,}-\w{2,}-\d{7,})"!', $html, $found);

    // Drop duplicates and close the gaps left in the keys.
    $ids = array_values(array_unique($found[1]));
    if (count($ids) === 0) {
        throw new Exception($this->get_exception(10));
    }

    // A "Next" link means there is another page to fetch; 0 signals the end.
    if (strpos($html, 'title="Next"') !== false) {
        $nb_page++;
    } else {
        $nb_page = 0;
    }

    $result = [$ids, $nb_page];
    // Dumps the result; print_r2 is defined outside this class.
    print_r2($result);

    return $result;
}
/**
 * Builds the request description used to fetch a seller's phone number.
 *
 * The URL has the shape
 *   <phone_token_base><token>&recaptchaResponse=<captcha>&origin=jsp
 * where every "|" in the token is written as "%7C", e.g.
 *   www.gumtree.com.au/j-recaptcha-verify-phone-get.json?token=1183167206%7C1525483833986%7C...&recaptchaResponse=03AJpayV...&origin=jsp
 *
 * No request headers are produced. An earlier version sketched a 'header'
 * entry carrying X-CSRF-TOKEN, X-CSRF-TIME and
 * "X-Requested-With: XMLHttpRequest", but it was commented out.
 *
 * @param array  $array   Data array; only its 'token' entry is read. A missing
 *                        token is treated as an empty string.
 * @param string $captcha Solved reCAPTCHA response, appended verbatim.
 * @return array Single-entry array: ['urlg' => URL string].
 */
public function set_url_phone($array, $captcha)
{
    // Read the token explicitly instead of extract()-ing the whole array:
    // extract() would let a 'captcha' key in $array silently overwrite the
    // $captcha argument, and left $token undefined when the key was absent.
    $token = isset($array['token']) ? $array['token'] : '';

    $arr_get = [
        'urlg' => $this->phone_token_base
            . str_replace('|', '%7C', $token)
            . '&recaptchaResponse=' . $captcha
            . '&origin=jsp',
    ];

    return $arr_get;
}
/**
 * Extracts the phone number from the JSON body returned by the phone endpoint.
 *
 * @param string $html Response body, expected to be a JSON object with a
 *                     top-level "phone" entry.
 * @return mixed The value stored under "phone".
 * @throws Exception When the body does not mention "phone", is not valid JSON,
 *                   or has no top-level "phone" value. The message comes from
 *                   get_exception(19), defined outside this class.
 */
public function get_phone($html)
{
    if (strpos($html, '"phone"') !== false) {
        $arr_phone = json_decode($html, true);

        // The substring test alone does not prove the body decodes to an
        // array holding the key; without this check an undecodable body was
        // indexed as null and null was returned instead of raising the error.
        if (is_array($arr_phone) && isset($arr_phone['phone'])) {
            return $arr_phone['phone'];
        }
    }

    throw new Exception($this->get_exception(19));
}
/**
 * Extracts the 17-character VIN from the element with id "actualVin".
 *
 * @param string $html Raw HTML of the page.
 * @return string The VIN.
 * @throws Exception When no VIN is found. The message comes from
 *                   get_exception(25), defined outside this class.
 */
public function get_vin($html)
{
    $hit = [];
    $matched = preg_match('/(?<=id="actualVin">)\w{17}(?=<\/)/', $html, $hit);

    // Guard clause: anything other than exactly one match means no VIN.
    if ($matched !== 1) {
        throw new Exception($this->get_exception(25));
    }

    return $hit[0];
}
/**
 * Parses a vehicle detail page into a flat associative array.
 *
 * Validation runs first, in this order, and the first failure throws:
 *   1. the configured domain does not appear in the page -> Exception(4)
 *   2. none of the logged-in markers is present          -> get_exception(11)
 *   3. an "expired ad" marker is present                  -> Exception(1)
 *   4. vehicle type 2 and the page mentions
 *      "Motorcycle Parts"                                 -> Exception(1)
 *   5. no private-seller marker is present                -> get_exception(5)
 *
 * After that each field is extracted in turn. A missing required source
 * (title, any meta tag, city, description, save-car attributes,
 * data-car-details, data-type spans, postcode, details-table rows) throws an
 * Exception with a descriptive message. The engine row is optional.
 *
 * @param string $html            Raw HTML of the detail page.
 * @param mixed  $get_phone       Not read by this method.
 * @param mixed  $id_vehicle_type Vehicle type id; the value 2 enables the
 *                                "Motorcycle Parts" check (presumably 2 means
 *                                motorcycle - confirm against the caller).
 * @return array Field name => value. Besides the predeclared keys it carries
 *               'token' (the phone token, '' when absent), 'type', and the
 *               "*_alt" backup values read from alternative sources.
 * @throws \Exception
 */
public function get_data($html, $get_phone, $id_vehicle_type)
{
    // A genuine page always mentions the site's own domain; when it does not,
    // the response is treated as a ban page.
    if (stripos($html, $this->domain) === false) {
        throw new Exception(4);
    }

    $doc = new DOMDocument;
    // Collect libxml's complaints about HTML5 markup internally instead of
    // silencing every error with "@", then restore the previous setting.
    $previousLibxmlSetting = libxml_use_internal_errors(true);
    $doc->loadHTML($html);
    libxml_clear_errors();
    libxml_use_internal_errors($previousLibxmlSetting);
    $xpath = new DOMXPath($doc);

    $isLoggedIn = false;
    foreach (['dashboard', 'Sign out', 'logout'] as $loginMarker) {
        if (strpos($html, $loginMarker) !== false) {
            $isLoggedIn = true;
            break;
        }
    }
    if (! $isLoggedIn) {
        throw new Exception($this->get_exception(11));
    }

    // Phone token, e.g.
    // 'phoneToken': '81183167206|1525474164691|22662836d4cf...|d9d6ee75e6ed...'
    // A missing token is not an error here; it is reported as ''.
    $phoneToken = '';
    $tokenMatch = [];
    if (preg_match('/(?<=\'phoneToken\': \')[a-zA-Z0-9|]{100,200}(?=\')/', $html, $tokenMatch)) {
        $phoneToken = $tokenMatch[0];
    }

    $arrExpiredStrings = [
        'expired-ad',
        'No Longer Available',
        'no longer on the site',
        'PageExpired',
        '<title></title>',
        'there is no shortage of vehicles',
        'Page Not Found',
        'already gone',
        'too late',
        'trop tard',
        '<title>New & Used',
    ];
    foreach ($arrExpiredStrings as $expiredMarker) {
        if (stripos($html, $expiredMarker) !== false) {
            throw new Exception(1);
        }
    }

    $isPrivateSeller = false;
    $arrPrivateSellerStrings = ['isPostedByOwner', '"Owner"', 'Private Seller Car', '<i class="private"></i>'];
    foreach ($arrPrivateSellerStrings as $privateMarker) {
        if (stripos($html, $privateMarker) !== false) {
            $isPrivateSeller = true;
            break;
        }
    }

    // Motorcycle parts ads are rejected before the private-seller verdict.
    if (($id_vehicle_type == 2) && stripos($html, 'Motorcycle Parts') !== false) {
        throw new Exception(1);
    }
    if (! $isPrivateSeller) {
        throw new Exception($this->get_exception(5));
    }

    $keys = [
        'date',
        'title',
        'vin',
        'reg',
        'make_model_title',
        'make_model',
        'year_title',
        'year',
        'make',
        'model',
        'trim',
        'body',
        'type_regex',
        'mileage',
        'price_title',
        'price',
        'color',
        'icolor',
        'engine',
        'trans',
        'drive',
        'fuel',
        'options',
        'desc',
        'name',
        'has_phone',
        'phone',
        'city',
        'state',
        'zip',
        'img_json',
        'img',
    ];
    $arrMainData = array_fill_keys($keys, '');

    // Store the token only now: it used to be written before the array was
    // (re)built by array_fill_keys above, so it never reached the caller.
    $arrMainData['token'] = $phoneToken;

    // Title, stripped of tags and surrounding whitespace.
    $titleNodes = $doc->getElementsByTagName('title');
    if ($titleNodes->length === 0) {
        throw new Exception('Title tag missing');
    }
    $arrMainData['title'] = $this->cleanHTML($titleNodes->item(0)->textContent);

    // Result key => name attribute of the <meta> tag holding the value.
    $arrMetaFields = [
        'make'  => 'WT.z_make',
        'model' => 'WT.z_model',
        'type'  => 'WT.z_adtype',
        'trim'  => 'WT.z_badge',
        'color' => 'WT.z_color',
        'year'  => 'WT.z_year',
        'price' => 'WT.z_price',
        'fuel'  => 'WT.z_fuel',
        'state' => 'twitter:data2',
    ];
    foreach ($arrMetaFields as $key => $metaName) {
        $metaContent = $xpath->query("//meta[@name=\"$metaName\"]/@content");
        if ($metaContent->length === 0) {
            throw new Exception("{$key} data not registered, Meta tag not found for name {$metaName}");
        }
        $arrMainData[$key] = $metaContent->item(0)->textContent;
    }

    // The contact-unlock block is present only when a phone number exists.
    $phoneQuery = $xpath->query("//div[@class=\"contact-unlock-content\"]");
    $arrMainData['has_phone'] = $phoneQuery->length ? 1 : 0;

    // City: first comma-separated part of the location, e.g. the
    // "Sam Panania" of "Sam Panania, NSW 2213".
    $cityNodes = $xpath->query("//div[@class=\"location-content\"]");
    if ($cityNodes->length === 0) {
        throw new Exception('City not registered, No xpath found for div with class location-content');
    }
    $cityRaw = explode(',', $cityNodes->item(0)->textContent)[0];
    // The raw text contains irregular whitespace: split on it and re-join
    // with single spaces.
    $arrMainData['city'] = implode(' ', $this->divideString($this->cleanHTML($cityRaw)));

    // Description.
    $descNodes = $xpath->query("//div[@class=\"view-more-target\"]");
    if ($descNodes->length === 0) {
        throw new Exception('Description not registered, No xpath found for div with class view-more-target');
    }
    $arrMainData['desc'] = $this->cleanHTML($descNodes->item(0)->textContent);

    // Body type plus backup values, read from the "save car" link, e.g.
    // data-webm-make="Toyota" data-webm-model="Corolla" data-webm-badge="Levin ZR"
    // data-webm-bodytype="Hatchback" data-webm-state="NSW" data-webm-price="20000"
    $arrBackupFields = [
        'make_alt'  => 'data-webm-make',
        'model_alt' => 'data-webm-model',
        'trim_alt'  => 'data-webm-badge',
        'body'      => 'data-webm-bodytype',
        'state_alt' => 'data-webm-state',
        'price_alt' => 'data-webm-price',
    ];
    foreach ($arrBackupFields as $key => $attributeName) {
        $attributeNodes = $xpath->query("//a[contains(@class,\"btn save-car\")]/@{$attributeName}");
        if ($attributeNodes->length === 0) {
            throw new Exception("Body not registered, 'a' tag not found for class btn save-car or {$attributeName} propertie not found inside 'a'");
        }
        $arrMainData[$key] = $attributeNodes->item(0)->textContent;
    }

    // JSON embedded in the data-car-details attribute, e.g.
    // {"Colour":"Crystal Pearl","Make":"Toyota","Model":"Corolla","Price":"20000","Year":2014,"Odometer":36300}
    $jsonSource = strstr($html, 'data-car-details="');
    if (! $jsonSource) {
        throw new Exception("Mileage not registered, data-car-details property not found in source");
    }
    // Keep only the text up to the closing of the tag, then drop the
    // attribute name so the raw attribute value remains.
    $jsonSource = strstr($jsonSource, '"></div>', true);
    $jsonSource = str_replace('data-car-details="', '', (string) $jsonSource);
    $arrJson = json_decode($jsonSource, true);
    if (! is_array($arrJson)) {
        // Attribute values may be entity-encoded (&quot; etc.); retry on the
        // decoded text before giving up.
        $arrJson = json_decode(html_entity_decode($jsonSource, ENT_QUOTES), true);
    }
    // NOTE(review): 'mode_alt_2' looks like a typo for 'model_alt_2'. It is
    // kept because renaming it would change the returned array's keys.
    $arrFields = [
        'color_alt'   => 'Colour',
        'make_alt_2'  => 'Make',
        'mode_alt_2'  => 'Model',
        'price_alt_2' => 'Price',
        'year_alt'    => 'Year',
        'mileage'     => 'Odometer',
    ];
    foreach ($arrFields as $key => $jsonField) {
        // Missing entries (or undecodable JSON) yield null, as before, but
        // without indexing into a non-array.
        $arrMainData[$key] = (is_array($arrJson) && isset($arrJson[$jsonField])) ? $arrJson[$jsonField] : null;
    }

    // Engine: <th>Engine</th> row of the details table. Optional, so a
    // missing row is not an error.
    $engine = $this->getTextFromDetailsTable($html, 'Engine');
    if ($engine !== false) {
        $arrMainData['engine'] = $engine;
    }

    // Transmission and backup values from the labelled "data-type" spans.
    $dataTypeNodes = $xpath->query("//span[@class=\"data-type\"]");
    if ($dataTypeNodes->length === 0) {
        throw new Exception("Transmission not registered, no span found with class 'data-type'");
    }
    // Span label => result key.
    $labelToKey = [
        'ODOMETER'     => 'mileage_alt',
        'BODY TYPE'    => 'body_alt',
        'TRANSMISSION' => 'trans',
        'ENGINE'       => 'engine_alt',
    ];
    foreach ($dataTypeNodes as $labelNode) {
        $label = $labelNode->textContent;
        if (! isset($labelToKey[$label])) {
            continue;
        }
        // The value is the parent's child node at index 1; when that node
        // does not exist the field is set to null rather than dereferencing
        // a missing node.
        $valueNode = $labelNode->parentNode->childNodes->item(1);
        $arrMainData[$labelToKey[$label]] = ($valueNode !== null) ? $valueNode->textContent : null;
    }

    // Zip code: the text between "/pcode=" and the following "/kw",
    // e.g. the 2213 of "/pcode=2213/kw".
    $zipHtml = strstr($html, "/pcode=");
    if (! $zipHtml) {
        throw new Exception("Zip not registered, no pcode string found");
    }
    $zipHtml = strstr($zipHtml, "/kw", true);
    $arrMainData['zip'] = ($zipHtml === false) ? '' : str_replace('/pcode=', '', $zipHtml);

    // Required rows of the details table: result key => [column, error].
    $requiredRows = [
        'date'   => ['Last Modified', 'No date registered, last modified table row not found'],
        'reg'    => ['Registration Plate', 'No registration plate registered, Registration Plate table row not found'],
        'drive'  => ['Drive Type', 'No drive type registered, Drive Type table row not found'],
        'icolor' => ['Interior Colour', 'No interior color registered, Interior Colour table row not found'],
    ];
    foreach ($requiredRows as $key => $row) {
        $rowText = $this->getTextFromDetailsTable($html, $row[0]);
        $this->registerDataFromDetailsTable($key, $rowText, $arrMainData, $row[1]);
    }

    // Image URLs, e.g.
    // https://carsales.pxcrush.net/carsales/car/private/rzov8m7s5vdhrdsty9sy1.jpg
    $img_pattern = '#https://carsales.pxcrush.net/\w{1,8}/\w{1,3}/\w{1,7}/[a-zA-Z0-9\-~=]{5,25}.\w+#';
    $arr_match = [];
    preg_match_all($img_pattern, $html, $arr_match);
    // On a successful match run this is always an array (empty when the page
    // has no images); array_unique keeps the original keys, so gaps may occur.
    $arrMainData['img'] = isset($arr_match[0]) ? array_unique(array_map('html_entity_decode', $arr_match[0])) : '';

    // Dumps the result; print_r2 is defined outside this class.
    print_r2($arrMainData);

    return $arrMainData;
}
/**
 * Turns an HTML fragment into plain text.
 *
 * Entities are decoded first, so markup that was entity-encoded in the input
 * is removed by the tag-stripping step as well. Leading and trailing
 * whitespace is trimmed last.
 *
 * @param string $string HTML fragment.
 * @return string Plain text.
 */
function cleanHTML($string)
{
    $decoded = html_entity_decode($string);
    $withoutTags = strip_tags($decoded);

    return trim($withoutTags);
}
/**
 * Splits a string on runs of whitespace.
 *
 * Leading or trailing whitespace produces empty strings at the corresponding
 * end of the result.
 *
 * @param string $string Text to split.
 * @return array[]|false|string[] The pieces, as returned by preg_split().
 */
function divideString($string)
{
    $whitespaceRun = '/\s+/';
    $pieces = preg_split($whitespaceRun, $string);

    return $pieces;
}
/**
 * Returns the text of the table row that starts with the given <th> header.
 *
 * The row is taken from "<th>$column</th>" up to the next "</tr>". Every
 * occurrence of the column name is removed from that span before it is
 * reduced to plain text with cleanHTML().
 *
 * @param string $html   Raw HTML of the page.
 * @param string $column Exact text of the <th> header.
 * @return boolean|string The cleaned row text, or false when the header is
 *                        not present in $html.
 */
function getTextFromDetailsTable($html, $column)
{
    $fromHeader = strstr($html, "<th>$column</th>");
    if (! $fromHeader) {
        return false;
    }

    // Cut at the end of the row, then drop the header's own label.
    $row = strstr($fromHeader, '</tr>', 1);
    $rowWithoutLabel = str_replace($column, '', $row);

    return $this->cleanHTML($rowWithoutLabel);
}
/**
 * Stores a details-table value in the data array, or fails when it is missing.
 *
 * Any falsy $text (false, '', '0', ...) counts as missing.
 *
 * @param string         $key          Key to write in the data array.
 * @param string|boolean $text         Text found in the row, or false.
 * @param array          $data         Data array, modified in place.
 * @param string         $errorMessage Message of the exception thrown when
 *                                     $text is falsy.
 * @throws \Exception
 */
function registerDataFromDetailsTable($key, $text, &$data, $errorMessage)
{
    // Guard clause: nothing usable was found for this row.
    if (! $text) {
        throw new Exception($errorMessage);
    }

    $data[$key] = $text;
}
} | |
// Manual smoke run: parse a saved search results page and dump the ids found.
// get_id() calls print_r2(), which is expected to come from the file below.
require('../req/print_r2.php');
// Alternative fixture (detail page without a phone number):
//$html = file_get_contents('req/csl-get-no_phone.html');
$fixture = '../req/csl-srp.html';
$html = file_get_contents($fixture);
// Fail with the real cause; otherwise an unreadable fixture would surface
// later as a misleading "no ids found" exception from get_id().
if ($html === false) {
    throw new RuntimeException("Unable to read fixture file: {$fixture}");
}
$scraper = new CSL_Scraper();
$scraper->get_id($html, 0);
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment