Skip to content

Instantly share code, notes, and snippets.

@iRynoh
Last active May 11, 2018 15:10
Show Gist options
  • Save iRynoh/21224502956f27b0aa9936c01ee37819 to your computer and use it in GitHub Desktop.
Save iRynoh/21224502956f27b0aa9936c01ee37819 to your computer and use it in GitHub Desktop.
CSL_Scraper.php
<?php
require 'Scraper.class.php';
class CSL_Scraper extends Scraper
{
/** @var string Domain string that get_data() expects to find in a fetched page; a page without it is treated as banned. */
public $domain;
/** @var string Referer host; assigned in the constructor and not read elsewhere in this class (presumably consumed by the parent Scraper - TODO confirm). */
public $referer;
/** @var string Base URL for car ads; assigned in the constructor and not read elsewhere in this class. */
public $car_url;
/** @var string Base URL for motorcycle ads; assigned in the constructor and not read elsewhere in this class. */
public $motorcycle_url;
/** @var string URL prefix that set_url_phone() completes with the phone token and the captcha response. */
public $phone_token_base;
/** @var string reCAPTCHA site key; assigned in the constructor and not read elsewhere in this class. */
public $key_recaptcha;
/**
 * Run the parent constructor, then load the site-specific settings.
 *
 * NOTE(review): the ad/phone URLs point at gumtree.com.au and kijiji.ca while
 * domain/referer point at carsales.com.au - confirm these are intentional and
 * not leftovers from another scraper.
 */
public function __construct()
{
    parent::__construct();

    $settings = [
        'domain'           => 'carsales.com.au',
        'car_url'          => 'www.gumtree.com.au/s-ad/',
        'motorcycle_url'   => 'www.kijiji.ca/v-sport-bikes/',
        'referer'          => 'www.carsales.com.au',
        'phone_token_base' => 'www.gumtree.com.au/j-recaptcha-verify-phone-get.json?token=',
        'key_recaptcha'    => '6Lc4ATUUAAAAAGhFgdqfrvYQOXryqRbTK4k6H_wi',
    ];

    foreach ($settings as $property => $value) {
        $this->{$property} = $value;
    }
}
/**
 * Read the result count shown as "(N)" inside the "titlecount" span.
 *
 * The pattern accepts between one and five digit/comma characters.
 *
 * @param string $html Search results page source
 * @return string The count with every non-digit character removed
 * @throws \Exception When the count cannot be located (exception code 7)
 */
public function get_total($html)
{
    $found = [];
    preg_match('/(?<=<span class="titlecount">\()[\d,]{1,5}(?=\)<\/span)/', $html, $found);

    if (count($found) === 0) {
        throw new Exception($this->get_exception(7));
    }

    // Strip thousands separators so only the digits remain.
    $digitsOnly = preg_replace('/[^0-9]/', '', $found[0]);

    return $digitsOnly;
}
/**
 * Expand a search string holding a "YYYY__YYYY" year range into one search
 * string per year.
 *
 * Only the first range found is expanded. The bounds are normalised, so a
 * reversed range ("2005__2004") yields the same ascending list instead of
 * falling through with undefined variables.
 *
 * @param string $search Search string, e.g. "toyota/2001__2003/corolla"
 * @return array|false One search string per year (ascending), or false when
 *                     $search contains no year range
 */
function split_search($search)
{
    $range = [];
    if (preg_match('/\d{4}__\d{4}/', $search, $range) !== 1) {
        return false;
    }

    list($first, $last) = array_map('intval', explode('__', $range[0]));

    $searches = [];
    foreach (range(min($first, $last), max($first, $last)) as $year) {
        $searches[] = str_replace($range[0], (string) $year, $search);
    }

    return $searches;
}
/**
 * Collect the listing ids on a search results page and work out the next
 * page number.
 *
 * @param string $html    Search results page source
 * @param int    $nb_page Current page number
 * @return array [list of unique ids, next page number or 0 when the page has no 'title="Next"' marker]
 * @throws \Exception When no id is found (exception code 10)
 */
public function get_id($html, $nb_page)
{
    $found = [];
    preg_match_all('!data-csn-item-history="(\w{3,}-\w{2,}-\d{7,})"!', $html, $found);

    // De-duplicate, then reindex so the keys are sequential again.
    $ids = array_values(array_unique($found[1]));
    if (count($ids) === 0) {
        throw new Exception($this->get_exception(10));
    }

    // Advance to the following page only when a "Next" link is present.
    $nb_page++;
    if (strpos($html, 'title="Next"') === false) {
        $nb_page = 0;
    }

    $result = [$ids, $nb_page];
    print_r2($result);

    return $result;
}
/**
 * Build the request description used to fetch a seller's phone number.
 *
 * The token is read explicitly from $array['token']. The previous extract()
 * call let any key of $array (for instance 'captcha') overwrite local
 * variables, and left $token undefined when the key was absent.
 *
 * Resulting URL shape:
 *   <phone_token_base><token with "|" encoded as %7C>&recaptchaResponse=<captcha>&origin=jsp
 *
 * The CSRF headers below are not sent at the moment; they are kept here for reference:
 *   X-CSRF-TOKEN: Bixisfacdi4kUwotZX8SSw
 *   X-CSRF-TIME: d06a6aada34e1aa39f3ba1d7723a042b
 *   X-Requested-With: XMLHttpRequest
 *
 * @param array  $array   Data array holding the 'token' entry
 * @param string $captcha Solved reCAPTCHA response
 * @return array Array with a single 'urlg' entry containing the URL
 */
public function set_url_phone($array, $captcha)
{
    $token = isset($array['token']) ? (string) $array['token'] : '';

    return [
        'urlg' => $this->phone_token_base
            . str_replace('|', '%7C', $token)
            . '&recaptchaResponse=' . $captcha
            . '&origin=jsp',
    ];
}
/**
 * Extract the phone number from the JSON response of the phone endpoint.
 *
 * The body is decoded first and the top-level "phone" entry is validated,
 * so malformed JSON or a body that merely contains the text "phone" raises
 * the documented exception instead of returning null with a warning.
 *
 * @param string $html Raw response body (JSON)
 * @return mixed Value of the "phone" entry
 * @throws \Exception When the response has no usable "phone" entry (exception code 19)
 */
public function get_phone($html)
{
    $payload = json_decode($html, true);

    if (! is_array($payload) || ! isset($payload['phone'])) {
        throw new Exception($this->get_exception(19));
    }

    return $payload['phone'];
}
/**
 * Pull the 17-character VIN out of the element with id "actualVin".
 *
 * @param string $html Page source
 * @return string The VIN
 * @throws \Exception When no VIN is present (exception code 25)
 */
public function get_vin($html)
{
    $hit = [];
    preg_match('/(?<=id="actualVin">)\w{17}(?=<\/)/', $html, $hit);

    if (! empty($hit[0])) {
        return $hit[0];
    }

    throw new Exception($this->get_exception(25));
}
/**
 * Parse a vehicle detail page into a flat associative array.
 *
 * Checks run in this order and each one aborts with an exception:
 * banned page (domain string missing), not logged in, expired ad,
 * motorcycle-parts ad (vehicle type 2 only), non-private seller. After that
 * the individual fields are extracted; most of them are mandatory and throw
 * with a descriptive message when their source element is missing.
 *
 * Besides the predefined keys, the result also carries the extra keys written
 * by the extraction steps ('type', the '*_alt' variants, ...) and 'token',
 * the phone token scraped from the page ('' when absent) that
 * set_url_phone() needs.
 *
 * The result is dumped with print_r2() before it is returned.
 *
 * @param string $html            Detail page source
 * @param mixed  $get_phone       Not used by this method
 * @param int    $id_vehicle_type Vehicle type id; 2 enables the motorcycle-parts check
 * @return array
 * @throws \Exception
 */
public function get_data($html, $get_phone, $id_vehicle_type)
{
    // A page that never mentions the domain is treated as a ban page.
    if (! stristr($html, $this->domain)) {
        throw new Exception(4);
    }

    // Collect libxml parse warnings (HTML5 tags etc.) internally instead of
    // silencing them with "@", then restore the previous setting.
    $doc = new DOMDocument;
    $previousLibxmlSetting = libxml_use_internal_errors(true);
    $doc->loadHTML($html);
    libxml_clear_errors();
    libxml_use_internal_errors($previousLibxmlSetting);
    $xpath = new DOMXPath($doc);

    // The page must show at least one marker of a logged-in session.
    $isLoggedIn = false;
    foreach (['dashboard', 'Sign out', 'logout'] as $marker) {
        if (strpos($html, $marker) !== false) {
            $isLoggedIn = true;
            break;
        }
    }
    if (! $isLoggedIn) {
        throw new Exception($this->get_exception(11));
    }

    // Phone token, e.g. 'phoneToken': '81183167206|1525474164691|2266...|3d35...|78d4...|d9d6...'
    // It is kept in a local variable and stored once the result array has been
    // initialised; previously it was written first and then wiped.
    $tokenMatch = [];
    preg_match('/(?<=\'phoneToken\': \')[a-zA-Z0-9|]{100,200}(?=\')/', $html, $tokenMatch);
    $token = isset($tokenMatch[0]) ? $tokenMatch[0] : '';

    // Expired or removed ads.
    $arrExpiredStrings = [
        'expired-ad',
        'No Longer Available',
        'no longer on the site',
        'PageExpired',
        '<title></title>',
        'there is no shortage of vehicles',
        'Page Not Found',
        'already gone',
        'too late',
        'trop tard',
        '<title>New & Used',
    ];
    foreach ($arrExpiredStrings as $expiredString) {
        if (stripos($html, $expiredString) !== false) {
            throw new Exception(1);
        }
    }

    // Private seller detection.
    $isPrivateSeller = false;
    $arrPrivateSellerStrings = ['isPostedByOwner', '"Owner"', 'Private Seller Car', '<i class="private"></i>'];
    foreach ($arrPrivateSellerStrings as $privateString) {
        if (stripos($html, $privateString) !== false) {
            $isPrivateSeller = true;
            break;
        }
    }

    // Motorcycle parts are rejected before the private-seller check.
    if (($id_vehicle_type == 2) && stristr($html, 'Motorcycle Parts')) {
        throw new Exception(1);
    }
    if (! $isPrivateSeller) {
        throw new Exception($this->get_exception(5));
    }

    $keys = [
        'date',
        'title',
        'vin',
        'reg',
        'make_model_title',
        'make_model',
        'year_title',
        'year',
        'make',
        'model',
        'trim',
        'body',
        'type_regex',
        'mileage',
        'price_title',
        'price',
        'color',
        'icolor',
        'engine',
        'trans',
        'drive',
        'fuel',
        'options',
        'desc',
        'name',
        'has_phone',
        'phone',
        'city',
        'state',
        'zip',
        'img_json',
        'img',
    ];
    $arrMainData = array_fill_keys($keys, '');
    $arrMainData['token'] = $token;

    // Title.
    $titleNodes = $doc->getElementsByTagName('title');
    if ($titleNodes->length === 0) {
        throw new Exception('Title tag missing');
    }
    $arrMainData['title'] = $this->cleanHTML($titleNodes->item(0)->textContent);

    // Result key => name attribute of the <meta> tag that holds the value.
    $arrMetaFields = [
        'make'  => 'WT.z_make',
        'model' => 'WT.z_model',
        'type'  => 'WT.z_adtype',
        'trim'  => 'WT.z_badge',
        'color' => 'WT.z_color',
        'year'  => 'WT.z_year',
        'price' => 'WT.z_price',
        'fuel'  => 'WT.z_fuel',
        'state' => 'twitter:data2',
    ];
    foreach ($arrMetaFields as $key => $metaName) {
        $metaResult = $xpath->query('//meta[@name="' . $metaName . '"]/@content');
        if ($metaResult->length === 0) {
            throw new Exception("{$key} data not registered, Meta tag not found for name {$metaName}");
        }
        $arrMainData[$key] = $metaResult->item(0)->textContent;
    }

    // has_phone is 1 when the "unlock contact" block exists, 0 otherwise.
    $phoneQuery = $xpath->query('//div[@class="contact-unlock-content"]');
    $arrMainData['has_phone'] = $phoneQuery->length ? 1 : 0;

    // City: the text before the first comma, e.g. "Sam Panania, NSW 2213".
    $cityNodes = $xpath->query('//div[@class="location-content"]');
    if ($cityNodes->length === 0) {
        throw new Exception('City not registered, No xpath found for div with class location-content');
    }
    $cityRaw = explode(',', $cityNodes->item(0)->textContent)[0];
    // Collapse runs of odd whitespace by splitting and re-joining with single spaces.
    $arrMainData['city'] = implode(' ', $this->divideString($this->cleanHTML($cityRaw)));

    // Description.
    $descNodes = $xpath->query('//div[@class="view-more-target"]');
    if ($descNodes->length === 0) {
        throw new Exception('Description not registered, No xpath found for div with class view-more-target');
    }
    $arrMainData['desc'] = $this->cleanHTML($descNodes->item(0)->textContent);

    // Backup values taken from the data-webm-* attributes of the "save car" link:
    //   data-webm-make="Toyota" data-webm-model="Corolla" data-webm-badge="Levin ZR"
    //   data-webm-bodytype="Hatchback" data-webm-state="NSW" data-webm-price="20000"
    $arrBackupFields = [
        'make_alt'  => 'data-webm-make',
        'model_alt' => 'data-webm-model',
        'trim_alt'  => 'data-webm-badge',
        'body'      => 'data-webm-bodytype',
        'state_alt' => 'data-webm-state',
        'price_alt' => 'data-webm-price',
    ];
    foreach ($arrBackupFields as $key => $attribute) {
        $attributeResult = $xpath->query('//a[contains(@class,"btn save-car")]/@' . $attribute);
        if ($attributeResult->length === 0) {
            throw new Exception("Body not registered, 'a' tag not found for class btn save-car or {$attribute} propertie not found inside 'a'");
        }
        $arrMainData[$key] = $attributeResult->item(0)->textContent;
    }

    // JSON stored in the data-car-details attribute, e.g.
    // {"Colour":"Crystal Pearl","Make":"Toyota","Model":"Corolla","Price":"20000","Year":2014,"Odometer":36300}
    $jsonStart = strstr($html, 'data-car-details="');
    if ($jsonStart === false) {
        throw new Exception("Mileage not registered, data-car-details property not found in source");
    }
    // Keep what precedes the end of the tag, then drop the attribute prefix.
    $jsonRaw = str_replace('data-car-details="', '', (string) strstr($jsonStart, '"></div>', true));
    $arrJson = json_decode($jsonRaw, true);
    if (! is_array($arrJson)) {
        // Attribute values are normally entity-encoded (&quot;); decode and retry.
        $arrJson = json_decode(html_entity_decode($jsonRaw, ENT_QUOTES), true);
    }
    if (! is_array($arrJson)) {
        $arrJson = [];
    }
    // NOTE(review): 'mode_alt_2' looks like a typo for 'model_alt_2'; it is kept
    // because consumers of the returned array may already read that key.
    $arrJsonFields = [
        'color_alt'   => 'Colour',
        'make_alt_2'  => 'Make',
        'mode_alt_2'  => 'Model',
        'price_alt_2' => 'Price',
        'year_alt'    => 'Year',
        'mileage'     => 'Odometer',
    ];
    foreach ($arrJsonFields as $key => $jsonField) {
        $arrMainData[$key] = isset($arrJson[$jsonField]) ? $arrJson[$jsonField] : '';
    }

    // Engine (optional): <th>Engine</th> row of the details table.
    $engine = $this->getTextFromDetailsTable($html, 'Engine');
    if ($engine !== false) {
        $arrMainData['engine'] = $engine;
    }

    // Values displayed next to the "data-type" labels (transmission etc.).
    $labelNodes = $xpath->query('//span[@class="data-type"]');
    if ($labelNodes->length === 0) {
        throw new Exception("Transmission not registered, no span found with class 'data-type'");
    }
    $nodeRelations = [
        'mileage_alt' => 'ODOMETER',
        'body_alt'    => 'BODY TYPE',
        'trans'       => 'TRANSMISSION',
        'engine_alt'  => 'ENGINE',
    ];
    foreach ($labelNodes as $labelNode) {
        $key = array_search($labelNode->textContent, $nodeRelations, true);
        if ($key === false) {
            continue;
        }
        // The value is the second child of the label's parent element.
        $valueNode = $labelNode->parentNode->childNodes->item(1);
        if ($valueNode !== null) {
            $arrMainData[$key] = $valueNode->textContent;
        }
    }

    // Zip code: the text between "/pcode=" and "/kw", e.g. pcode=2213.
    $zipStart = strstr($html, '/pcode=');
    if ($zipStart === false) {
        throw new Exception("Zip not registered, no pcode string found");
    }
    $arrMainData['zip'] = str_replace('/pcode=', '', (string) strstr($zipStart, '/kw', true));

    // Mandatory rows of the details table.
    $date = $this->getTextFromDetailsTable($html, 'Last Modified');
    $this->registerDataFromDetailsTable('date', $date, $arrMainData, 'No date registered, last modified table row not found');
    $regPlate = $this->getTextFromDetailsTable($html, 'Registration Plate');
    $this->registerDataFromDetailsTable('reg', $regPlate, $arrMainData, 'No registration plate registered, Registration Plate table row not found');
    $driveType = $this->getTextFromDetailsTable($html, 'Drive Type');
    $this->registerDataFromDetailsTable('drive', $driveType, $arrMainData, 'No drive type registered, Drive Type table row not found');
    $interiorColor = $this->getTextFromDetailsTable($html, 'Interior Colour');
    $this->registerDataFromDetailsTable('icolor', $interiorColor, $arrMainData, 'No interior color registered, Interior Colour table row not found');

    // Images, e.g. https://carsales.pxcrush.net/carsales/car/private/rzov8m7s5vdhrdsty9sy1.jpg
    // The dots are escaped so they only match literal dots.
    $imageMatches = [];
    preg_match_all(
        '#https://carsales\.pxcrush\.net/\w{1,8}/\w{1,3}/\w{1,7}/[a-zA-Z0-9\-~=]{5,25}\.\w+#',
        $html,
        $imageMatches
    );
    $arrMainData['img'] = array_unique(array_map('html_entity_decode', $imageMatches[0]));

    print_r2($arrMainData);

    return $arrMainData;
}
/**
 * Decode HTML entities, strip the tags and trim surrounding whitespace.
 *
 * Entities are decoded before the tags are stripped, so entity-encoded tags
 * are removed as well.
 *
 * @param $string
 * @return string
 */
function cleanHTML($string)
{
    $decoded = html_entity_decode($string);
    $withoutTags = strip_tags($decoded);

    return trim($withoutTags);
}
/**
 * Split a string on runs of whitespace.
 *
 * @param $string
 * @return array[]|false|string[]
 */
function divideString($string)
{
    $whitespaceRun = '/\s+/';
    $parts = preg_split($whitespaceRun, $string);

    return $parts;
}
/**
 * Returns the cleaned text that follows a <th> header cell, up to the end of
 * its table row.
 *
 * Only the leading "<th>$column</th>" cell is removed, so a value that
 * happens to contain the column label keeps it. A header whose row is never
 * closed with </tr> is reported as not found.
 *
 * @param $html
 * @param string $column Name of the <th> column
 * @return boolean|string Cleaned row text, or false when the header (or the
 *                        end of its row) is not present
 */
function getTextFromDetailsTable($html, $column)
{
    $header = "<th>$column</th>";

    $row = strstr($html, $header);
    if ($row === false) {
        return false;
    }

    // Cut the row at its closing tag.
    $rowEnd = strpos($row, '</tr>');
    if ($rowEnd === false) {
        return false;
    }

    // Skip the header cell itself; the remainder holds the value cell(s).
    $value = (string) substr($row, strlen($header), $rowEnd - strlen($header));

    return $this->cleanHTML($value);
}
/**
 * Store a details-table value in the data array, or fail when the value is
 * falsy.
 *
 * @param string $key Key from data array
 * @param string|boolean $text Text found in row or false
 * @param array $data Data array (modified in place)
 * @param string $errorMessage Message of the exception thrown when $text is falsy
 * @throws \Exception
 */
function registerDataFromDetailsTable($key, $text, &$data, $errorMessage)
{
    if (! $text) {
        throw new Exception($errorMessage);
    }

    $data[$key] = $text;
}
}
// Manual smoke test: run the id extraction against a saved search results page.
// print_r2() is used inside the scraper to dump its results.
require('../req/print_r2.php');
// Alternative fixture (detail page without phone): 'req/csl-get-no_phone.html'
$html = file_get_contents('../req/csl-srp.html');
if ($html === false) {
    // Fail with the real cause instead of letting get_id() report "no ids found".
    throw new RuntimeException('Unable to read fixture ../req/csl-srp.html');
}
$scraper = new CSL_Scraper();
$scraper->get_id($html, 0);
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment