Skip to content

Instantly share code, notes, and snippets.

@iRynoh
Last active May 9, 2018 20:52
Show Gist options
  • Save iRynoh/c5d623fad0d3c3f4e358952b78e9d44d to your computer and use it in GitHub Desktop.
Save iRynoh/c5d623fad0d3c3f4e358952b78e9d44d to your computer and use it in GitHub Desktop.
csl.get
<?php
// Project-local include; presumably provides print_r2(), which is called once
// the data array has been filled in.
require('req/print_r2.php');

// Saved listing page to parse. The commented line is an alternative fixture.
$html = file_get_contents('req/csl-get-no_phone.html');
//$html = file_get_contents('req/csl-get.html');
// file_get_contents() returns false on failure; stop here with a clear message
// instead of feeding a non-document to the DOM parser further down.
if ($html === false || $html === '') {
    throw new Exception('Unable to read the listing HTML source file');
}

// Every field the scraper is expected to deliver.
$keys = [
    'date',
    'title',
    'vin',
    'reg',
    'make_model_title',
    'make_model',
    'year_title',
    'year',
    'make',
    'model',
    'trim',
    'body',
    'type_regex',
    'mileage',
    'price_title',
    'price',
    'color',
    'icolor',
    'engine',
    'trans',
    'drive',
    'fuel',
    'options',
    'desc',
    'name',
    'has_phone',
    'phone',
    'city',
    'state',
    'zip',
    'img_json',
    'img',
];

// Initialize fields: each expected key starts out as an empty string.
$arr_data = array_fill_keys($keys, '');
// Parse the page into a DOM tree. HTML5 markup makes libxml report parse
// errors, so collect them internally (and discard them) rather than hiding
// every possible error behind the @ operator.
$previousLibxmlSetting = libxml_use_internal_errors(true);
$doc = new DOMDocument;
$doc->loadHTML($html);
libxml_clear_errors();
libxml_use_internal_errors($previousLibxmlSetting);

// Get title. Tags, entities and surrounding whitespace are removed by cleanHTML().
$titleNodes = $doc->getElementsByTagName('title');
if ($titleNodes->length === 0) {
    throw new Exception('Title element not found');
}
// Set Title.
$arr_data['title'] = cleanHTML($titleNodes->item(0)->textContent);
// XPath evaluator over the parsed document; the lookups below go through it.
$xpath = new DOMXPath($doc);

// Result key => value of the name attribute of the <meta> tag holding its content.
$arrMetaFields = [
    'make' => 'WT.z_make',
    'model' => 'WT.z_model',
    'type' => 'WT.z_adtype',
    'trim' => 'WT.z_badge',
    'color' => 'WT.z_color',
    'year' => 'WT.z_year',
    'price' => 'WT.z_price',
    'fuel' => 'WT.z_fuel',
    'state' => 'twitter:data2',
];

// Every listed meta tag is mandatory: the first one that is missing aborts the run.
foreach ($arrMetaFields as $field => $metaName) {
    $contentAttributes = $xpath->query("//meta[@name=\"$metaName\"]/@content");
    if ($contentAttributes->length === 0) {
        throw new Exception("Main data not registered, Meta tag not found for name {$metaName}");
    }
    // Only the first matching tag is used.
    $arr_data[$field] = $contentAttributes->item(0)->textContent;
}
// Phone availability flag: 1 when at least one div with the exact class
// "contact-unlock-content" exists in the page, 0 otherwise.
$unlockBlocks = $xpath->query('//div[@class="contact-unlock-content"]');
$arr_data['has_phone'] = $unlockBlocks->length ? 1 : 0;
// City: taken from the first div whose class is exactly "location-content".
$locationNodes = $xpath->query('//div[@class="location-content"]');
if ($locationNodes->length === 0) {
    throw new Exception('City not registered, No xpath found for div with class location-content');
}
$locationText = $locationNodes->item(0)->textContent;
// Keep only what precedes the first comma. Example content: "Sam Panania, NSW 2213".
$beforeComma = explode(',', $locationText, 2)[0];
// The text contains stray whitespace: split it on whitespace runs and
// re-join the pieces with single spaces.
$cityWords = divideString(cleanHTML($beforeComma));
$arr_data['city'] = implode(' ', $cityWords);
// Description: cleaned text of the first div whose class is exactly "view-more-target".
$descriptionNodes = $xpath->query('//div[@class="view-more-target"]');
if ($descriptionNodes->length === 0) {
    throw new Exception('Description not registered, No xpath found for div with class view-more-target');
}
$arr_data['desc'] = cleanHTML($descriptionNodes->item(0)->textContent);
// Body type plus alternative values, read from data-webm-* attributes of the
// <a> element whose class contains "btn save-car". Example attributes:
//   data-webm-make="Toyota"
//   data-webm-model="Corolla"
//   data-webm-badge="Levin ZR"
//   data-webm-bodytype="Hatchback"
//   data-webm-state="NSW"
//   data-webm-price="20000">
// Result key => attribute name.
$arrBackupFields = [
    'make_alt' => 'data-webm-make',
    'model_alt' => 'data-webm-model',
    'trim_alt' => 'data-webm-badge',
    'body' => 'data-webm-bodytype',
    'state_alt' => 'data-webm-state',
    'price_alt' => 'data-webm-price',
];

// All attributes are required: the first missing one aborts the run.
foreach ($arrBackupFields as $field => $attributeName) {
    $attributeNodes = $xpath->query("//a[contains(@class,\"btn save-car\")]/@{$attributeName}");
    if ($attributeNodes->length === 0) {
        throw new Exception("Body not registered, 'a' tag not found for class btn save-car or {$attributeName} propertie not found inside 'a'");
    }
    $arr_data[$field] = $attributeNodes->item(0)->textContent;
}
// Getting the json from the source: the value of the data-car-details attribute, e.g.
// {"Colour":"Crystal Pearl","Make":"Toyota","Model":"Corolla","Price":"20000","Year":2014,"Odometer":36300}
$detailsMarker = 'data-car-details="';
$jsonSource = strstr($html, $detailsMarker);
if ($jsonSource === false) {
    throw new Exception("Mileage not registered, data-car-details property not found in source");
}

// Drop the marker, then keep everything before the closing `"></div>`.
$jsonSource = strstr((string) substr($jsonSource, strlen($detailsMarker)), '"></div>', true);
if ($jsonSource === false) {
    throw new Exception("Mileage not registered, end of data-car-details attribute not found in source");
}

// An attribute value may be entity-encoded (&quot; for the JSON quotes);
// decoding leaves a value without entities untouched.
$arrJson = json_decode(html_entity_decode($jsonSource, ENT_QUOTES), true);
if (!is_array($arrJson)) {
    throw new Exception("Mileage not registered, data-car-details does not contain valid JSON");
}

// Result key => JSON property name.
$arrFields = [
    'color_alt' => 'Colour',
    'make_alt_2' => 'Make',
    'model_alt_2' => 'Model', // was misspelled "mode_alt_2"
    'price_alt_2' => 'Price',
    'year_alt' => 'Year',
    'mileage' => 'Odometer',
];

// Insert the fields into data. A property absent from the JSON becomes an
// empty string instead of raising an undefined-index notice.
foreach ($arrFields as $key => $arrField) {
    $arr_data[$key] = isset($arrJson[$arrField]) ? $arrJson[$arrField] : '';
}
// Getting the engine. <th>Engine</th>
// Read through the same details-table helper as the other table rows.
// Unlike those rows, a missing or empty Engine row is not treated as an error:
// the 'engine' field is simply left as it was initialised.
$engine = getTextFromDetailsTable($html, 'Engine');
if ($engine !== false && $engine !== '') {
    $arr_data['engine'] = $engine;
}
// Getting the transmission (and a few alternative values) from the
// <span class="data-type"> label elements.
$specLabels = $xpath->query('//span[@class="data-type"]');
if ($specLabels->length === 0) {
    throw new Exception("Transmission not registered, no span found with class 'data-type'");
}

// Exact label text => result key.
$labelToKey = [
    'ODOMETER' => 'mileage_alt',
    'BODY TYPE' => 'body_alt',
    'TRANSMISSION' => 'trans',
    'ENGINE' => 'engine_alt',
];

foreach ($specLabels as $labelNode) {
    $label = $labelNode->textContent;
    if (!isset($labelToKey[$label])) {
        continue;
    }
    // The value is read from the second child node of the label's parent.
    // item() returns null when there is no such node, in which case the
    // field is skipped instead of dereferencing null.
    $valueNode = $labelNode->parentNode->childNodes->item(1);
    if ($valueNode !== null) {
        $arr_data[$labelToKey[$label]] = $valueNode->textContent;
    }
}
// Getting zip code: the text between the first "/pcode=" and the next "/kw".
// Example: pcode=2213
$pcodeMarker = '/pcode=';
$zipHtml = strstr($html, $pcodeMarker);
if ($zipHtml === false) {
    throw new Exception("Zip not registered, no pcode string found");
}

// Without the terminating "/kw" the value cannot be delimited; fail loudly
// rather than silently storing an empty zip.
$zipHtml = strstr($zipHtml, '/kw', true);
if ($zipHtml === false) {
    throw new Exception("Zip not registered, no /kw string found after pcode");
}

// $zipHtml starts with the marker; cut exactly that prefix.
$arr_data['zip'] = (string) substr($zipHtml, strlen($pcodeMarker));
// Required rows of the details table.
// Result key => [text of the <th> cell, error message used when the row yields no text].
$detailRows = [
    // <th>Last Modified</th>
    'date' => ['Last Modified', 'No date registered, last modified table row not found'],
    // <th>Registration Plate</th>
    'reg' => ['Registration Plate', 'No registration plate registered, Registration Plate table row not found'],
    // <th>Drive Type</th>
    'drive' => ['Drive Type', 'No drive type registered, Drive Type table row not found'],
    // <th>Interior Colour</th>
    'icolor' => ['Interior Colour', 'No interior color registered, Interior Colour table row not found'],
];

// Rows are processed in the order listed; the first one without text throws.
foreach ($detailRows as $field => $row) {
    list($columnLabel, $missingMessage) = $row;
    $cellText = getTextFromDetailsTable($html, $columnLabel);
    registerDataFromDetailsTable($field, $cellText, $arr_data, $missingMessage);
}
//img https://i.ebayimg.com/00/s/MTYwMFgxMjAw/z/UPEAAOSwcIJa6mGA/$_35.JPG
//$img_pattern = '#https://i.ebayimg.com/\d{1,3}/\w{1,3}/[a-zA-Z0-9\-~=]{5,20}/\w{1,3}/[a-zA-Z0-9\-~=]{5,30}/\$_20.\w+#';
//img https://carsales.pxcrush.net/carsales/car/private/rzov8m7s5vdhrdsty9sy1.jpg
// Dots are escaped so they only match a literal "." (host name and file extension).
$img_pattern = '#https://carsales\.pxcrush\.net/\w{1,8}/\w{1,3}/\w{1,7}/[a-zA-Z0-9\-~=]{5,25}\.\w+#';
preg_match_all($img_pattern, $html, $arr_match);

// preg_match_all() always fills index 0 of the result, so count($arr_match)
// can never be 0. Test the list of full matches itself to detect "no images".
$imageUrls = empty($arr_match[0]) ? [] : $arr_match[0];
$arr_data['img'] = $imageUrls
    // array_values() re-indexes the list after duplicates have been removed.
    ? array_values(array_unique(array_map('html_entity_decode', $imageUrls)))
    : '';

// Dump the collected data.
print_r2($arr_data);
/**
 * Turn an HTML fragment into plain text.
 *
 * Entities are decoded first, then tags are stripped, then leading and
 * trailing whitespace is trimmed.
 *
 * @param string $string HTML fragment
 * @return string
 */
function cleanHTML($string)
{
    $decoded = html_entity_decode($string);
    $withoutTags = strip_tags($decoded);

    return trim($withoutTags);
}
/**
 * Split a string on runs of whitespace.
 *
 * @param string $string Text to split
 * @return array[]|false|string[] The pieces, as returned by preg_split()
 */
function divideString($string)
{
    $pieces = preg_split('/\s+/', $string);

    return $pieces;
}
/**
 * Returns the cleaned text of the table row that starts at a given <th> cell.
 *
 * The row is the text from the first "<th>$column</th>" up to (not including)
 * the next "</tr>". Only the header cell itself is removed before cleaning, so
 * a value that happens to contain the column name is returned intact.
 *
 * @param string $html Page source
 * @param string $column Name of the <th> column
 * @return boolean|string Cleaned row text, or false when the header cell or
 *                        the closing </tr> cannot be found
 */
function getTextFromDetailsTable($html, $column)
{
    $headerCell = "<th>$column</th>";

    $row = strstr($html, $headerCell);
    if ($row === false) {
        return false;
    }

    // Remove everything from the closure tag on.
    $row = strstr($row, '</tr>', true);
    if ($row === false) {
        return false;
    }

    // $row starts with the header cell: cut exactly that prefix instead of
    // deleting every occurrence of the column name.
    $valueHtml = (string) substr($row, strlen($headerCell));

    // Clean it.
    return cleanHTML($valueHtml);
}
/**
 * Stores a value found in the details table, or fails when nothing was found.
 *
 * A value counts as missing only when it is false, null or an empty string,
 * so a legitimate "0" is stored rather than rejected.
 *
 * @param string $key Key from data array
 * @param string|boolean $text Text found in row or false
 * @param array $data Data array, modified in place
 * @param string $errorMessage Error to display
 * @throws \Exception When $text is missing
 */
function registerDataFromDetailsTable($key, $text, &$data, $errorMessage)
{
    if ($text === false || $text === null || $text === '') {
        throw new Exception($errorMessage);
    }

    $data[$key] = $text;
}
//$arr_tag = ['\'{"a":{"id":', 'window.dfpTargetingModel = {'];
//$arr_closure = ["}',", '};'];
//
//foreach ($arr_tag as $key => $value) {
// $tag_data = stristr($html, $value);
// if (! $tag_data) {
// echo 'err 1';
// //throw new Exception($this->get_exception(2));
// }
//
// echo $arr_closure[$key];
//
// $json_srch = str_replace(['[', ']'], '', strstr(strstr($tag_data, $arr_closure[$key], 1), '{').'}');
// echo $json_srch;
// if (! $json_srch) {
// echo 'err 2';
// //throw new Exception($this->get_exception(2));
// } else {
// $arr_json[] = json_decode($json_srch, 1);
// }
//
// if (! is_array($arr_json[$key])) {
// echo 'err 3';
// //throw new Exception($this->get_exception(13));
// }
//}
//
///*//vehicle infos
//$json_srch = strstr($html, '\'{"a":{"id":');
//$json_srch_b = strstr($json_srch, "}',", 1);
//$arr_json = json_decode(strstr($json_srch_b, '{') . '}', 1);*/
//
//print_r2($arr_json);
//
////exit;
//
//$arr_wk = $arr_json[0]['a']['attr'];
//$arr_wk = array_merge($arr_wk, $arr_json[1]);
//print_r2($arr_wk);
//
//$arr_json_vars = [
// 'vin' => 'vin',
// 'is_reg' => 'registered',
// 'reg' => 'vreg',
// 'date_reg_exp' => 'registrationexpiry_tdt',
// 'year' => 'caryear',
// 'year_alt' => 'year',
// 'make' => 'carmake',
// 'make_alt' => 'make',
// 'model' => 'carmodel',
// 'model_alt' => 'model',
// 'trim' => 'variant',
// 'body' => 'carbodytype',
// 'body_alt' => 'bodyType',
// 'mileage' => 'carmileageinkms',
// 'price' => 'highest_price',
// 'price_alt' => 'lp',
// 'color' => 'colour',
// 'icolor' => 'vehicleInteriorColor',
// 'engine' => 'cylinder_configuration',
// 'trans' => 'cartransmission',
// 'trans_alt' => 'transmission',
// 'drive' => 'drivetrain',
// 'drive_alt' => 'driveTrain',
// 'fuel' => 'fueltype',
// 'fuel_alt' => 'fuelType',
// 'city' => 'suburb',
// 'state' => 'state',
// 'zip' => 'postcode',
//];
//
////img file type pattern
//$pattern_img_file_type = '#(?<=.)\w+(?=&size)#';
//
//foreach ($arr_json_vars as $key => $value) {
//
// if (! empty($arr_wk[$value]) && ! is_array($arr_wk[$value])) {
// $arr_data[$key] = trim(strip_tags(html_entity_decode($arr_wk[$value])));
// }
//}
//
////post date "releaseDate">2018-05-03</span>
//$pattern_date = '/(?<="releaseDate">)\d{4}-\d{2}-\d{2}(?=<\/span>)/';
//preg_match($pattern_date, $html, $match_arr);
//$arr_data['date'] = isset($match_arr[0]) ? $match_arr[0] : '';
//
////seller name /s-seller/Harish/
//$pattern_name = '/(?<=\/s-seller\/)[a-zA-Z0-9\-\s]{1,20}(?=\/)/';
//preg_match($pattern_name, $html, $match_arr);
//$arr_data['name'] = isset($match_arr[0]) ? $match_arr[0] : '';
//
//$arr_tag_item = [
// 'state' => '/(?<="addressRegion">)[a-zA-Z]{1,4}(?=<\/span>)/',
// 'latitude' => '/(?<="latitude">)[0-9\-.]{5,40}(?="<\/span>)/',
// 'longitude' => '/(?<="longitude">)[0-9\-.]{5,40}(?="<\/span>)/',
//];
//
//foreach ($arr_tag_item as $key => $value) {
// preg_match($value, $html, $match_arr);
// $arr_data[$key.'_regex'] = isset($match_arr[0]) ? $match_arr[0] : '';
//}
///*
////region <span itemprop="addressRegion">NSW</span>
//$pattern_state = '/(?<="addressRegion">)[a-zA-Z]{1,4}(?=<\/span>)/';
//preg_match($pattern_state, $html, $match_arr);
//$arr_data['state_regex'] = isset($match_arr[0]) ? $match_arr[0] : '';
//
////geo localisation <span itemprop="latitude">-33.86462"</span> <span itemprop="longitude">151.04562999999996"</span>
//$pattern_latitude = '/(?<="addressRegion">)[a-zA-Z]{1,4}(?=<\/span>)/';
//preg_match($pattern_state, $html, $match_arr);
//$arr_data['state_regex'] = isset($match_arr[0]) ? $match_arr[0] : '';
//*/
//
//$arr_tag_phone = ['Show number', '__phone'];
//
//foreach ($arr_tag_phone as $key => $value) {
// if (strstr($html, $value)) {
// $arr_data['has_phone'] = 1;
// }
//}
//
////desc <div id="ad_description_details_content">
//$desc_srch = strstr($html, '<div id="ad_description_details_content">');
//if ($desc_srch) {
// $desc_srchb = strstr($desc_srch, '</div>', 1);
// $arr_data['desc'] = $desc_srchb;
//}
//
////img https://i.ebayimg.com/00/s/MTYwMFgxMjAw/z/UPEAAOSwcIJa6mGA/$_35.JPG
//$img_pattern = '#https://i.ebayimg.com/\d{1,3}/\w{1,3}/[a-zA-Z0-9\-~=]{5,20}/\w{1,3}/[a-zA-Z0-9\-~=]{5,30}/\$_20.\w+#';
//preg_match_all($img_pattern, $html, $arr_match);
//$arr_data['img'] = count($arr_match) ? array_unique(array_map('html_entity_decode', $arr_match[0])) : '';
//
///*
//$arr_data['img'] = count($match_arr) ? array_unique(array_map('html_entity_decode', $match_arr[0])) : '';
//$first_image = array_pop($arr_data['img']);
//array_unshift($arr_data['img'], $first_image);*/
//
//print_r2($arr_data);
/*
//options
$opt_srch = strstr($html, '<ul class="features-list">');
$arr_options = explode('</li>', strstr($opt_srch, '</ul>', 1));
array_pop($arr_options);
$arr_options = array_map('strip_tags', $arr_options);
$arr_data['options'] = '<li>' . implode('</li><li>', $arr_options) . '</li>';
//print_r( $arr_options);
*/
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment