Last active
May 9, 2018 20:52
-
-
Save iRynoh/c5d623fad0d3c3f4e358952b78e9d44d to your computer and use it in GitHub Desktop.
csl.get
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php
require('req/print_r2.php');

// Load the saved listing page that will be scraped.
$html = file_get_contents('req/csl-get-no_phone.html');
//$html = file_get_contents('req/csl-get.html');

// file_get_contents() returns false on failure; stop here with a clear message
// instead of handing an unusable value to the DOM parser.
if ($html === false || $html === '') {
    throw new Exception('HTML source could not be read or is empty');
}

// Every field the scraper is expected to produce.
$keys = [
    'date',
    'title',
    'vin',
    'reg',
    'make_model_title',
    'make_model',
    'year_title',
    'year',
    'make',
    'model',
    'trim',
    'body',
    'type_regex',
    'mileage',
    'price_title',
    'price',
    'color',
    'icolor',
    'engine',
    'trans',
    'drive',
    'fuel',
    'options',
    'desc',
    'name',
    'has_phone',
    'phone',
    'city',
    'state',
    'zip',
    'img_json',
    'img',
];

// Initialize fields.
$arr_data = array_fill_keys($keys, '');

$doc = new DOMDocument();

// HTML5 markup makes libxml report parse errors. Collect them internally rather
// than silencing everything with "@", then restore the previous setting.
$previousLibxmlErrorMode = libxml_use_internal_errors(true);
$doc->loadHTML($html);
libxml_clear_errors();
libxml_use_internal_errors($previousLibxmlErrorMode);

// Get title. Remove spaces, etc.
$query = $doc->getElementsByTagName("title");
if ($query->length === 0) {
    throw new Exception('Title element not found');
}

// Reuse the node list fetched above instead of querying the document twice.
$title = cleanHTML($query->item(0)->textContent);

// Set Title.
$arr_data['title'] = $title;
// XPath helper used for the meta tags (and by the lookups further down).
$xpath = new DOMXPath($doc);

// Maps each arr_data key to the name attribute of the <meta> tag that holds its value.
$arrMetaFields = [
    'make' => 'WT.z_make',
    'model' => 'WT.z_model',
    'type' => 'WT.z_adtype',
    'trim' => 'WT.z_badge',
    'color' => 'WT.z_color',
    'year' => 'WT.z_year',
    'price' => 'WT.z_price',
    'fuel' => 'WT.z_fuel',
    'state' => 'twitter:data2',
];

// Every listed meta tag is mandatory: the first one that is missing aborts the run.
foreach ($arrMetaFields as $dataKey => $metaName) {
    $contentNodes = $xpath->query("//meta[@name=\"$metaName\"]/@content");

    if ($contentNodes->length === 0) {
        throw new Exception("Main data not registered, Meta tag not found for name {$metaName}");
    }

    // Store the content attribute of the first matching meta tag.
    $arr_data[$dataKey] = $contentNodes->item(0)->textContent;
}
// Phone: flag the listing as having a phone when the contact-unlock block is present.
$phoneQuery = $xpath->query("//div[@class=\"contact-unlock-content\"]");
$arr_data['has_phone'] = $phoneQuery->length ? 1 : 0;

// City: taken from the location block, which is mandatory.
$cityXpath = $xpath->query("//div[@class=\"location-content\"]");
if ($cityXpath->length === 0) {
    throw new Exception('City not registered, No xpath found for div with class location-content');
}

// The block reads like "Sam Panania, NSW 2213"; keep only the part before the first comma.
$locationText = $cityXpath->item(0)->textContent;
$locationParts = explode(',', $locationText);

// Split on whitespace runs and re-join with single spaces to normalise the spacing.
$cityWords = divideString(cleanHTML($locationParts[0]));
$arr_data['city'] = implode(' ', $cityWords);

// Description: text of the mandatory div with class "view-more-target".
$descXpath = $xpath->query("//div[@class=\"view-more-target\"]");
if ($descXpath->length === 0) {
    throw new Exception('Description not registered, No xpath found for div with class view-more-target');
}

$arr_data['desc'] = cleanHTML($descXpath->item(0)->textContent);
// Body type and alternate copies of the main fields, read from the data-webm-*
// attributes of the "save car" link, e.g.
//   data-webm-make="Toyota"
//   data-webm-model="Corolla"
//   data-webm-badge="Levin ZR"
//   data-webm-bodytype="Hatchback"
//   data-webm-state="NSW"
//   data-webm-price="20000">
$arrBackupFields = [
    'make_alt' => 'data-webm-make',
    'model_alt' => 'data-webm-model',
    'trim_alt' => 'data-webm-badge',
    'body' => 'data-webm-bodytype',
    'state_alt' => 'data-webm-state',
    'price_alt' => 'data-webm-price',
];

// Each attribute is mandatory: the first one that cannot be found aborts the run.
foreach ($arrBackupFields as $dataKey => $attributeName) {
    $attributeNodes = $xpath->query("//a[contains(@class,\"btn save-car\")]/@{$attributeName}");

    if ($attributeNodes->length === 0) {
        throw new Exception("Body not registered, 'a' tag not found for class btn save-car or {$attributeName} propertie not found inside 'a'");
    }

    // Store the value of the first matching attribute.
    $arr_data[$dataKey] = $attributeNodes->item(0)->textContent;
}
// Getting the json from the data-car-details attribute in the source.
// Once decoded it looks like:
// {"Colour":"Crystal Pearl","Make":"Toyota","Model":"Corolla","Price":"20000","Year":2014,"Odometer":36300}
$jsonSource = strstr($html, 'data-car-details="');
if ($jsonSource === false) {
    throw new Exception("Mileage not registered, data-car-details property not found in source");
}

// Keep only the text before the tag closes. A missing terminator means the
// attribute cannot be isolated, so fail loudly instead of decoding garbage.
$jsonSource = strstr($jsonSource, '"></div>', true);
if ($jsonSource === false) {
    throw new Exception("Mileage not registered, data-car-details property is not terminated in source");
}

// Drop the attribute name and opening quote to leave the raw attribute value.
$jsonSource = substr($jsonSource, strlen('data-car-details="'));

// Attribute values are HTML-escaped in markup (quotes arrive as &quot;), so the
// entities must be decoded before the value can be parsed as JSON.
$jsonSource = html_entity_decode($jsonSource, ENT_QUOTES, 'UTF-8');

// Decode the json and make sure it really produced an array.
$arrJson = json_decode($jsonSource, true);
if (!is_array($arrJson)) {
    throw new Exception("Mileage not registered, data-car-details property does not contain valid JSON");
}

// arr_data key => JSON property name.
$arrFields = [
    'color_alt' => 'Colour',
    'make_alt_2' => 'Make',
    // Was misspelled 'mode_alt_2'; named like its make/price siblings.
    'model_alt_2' => 'Model',
    'price_alt_2' => 'Price',
    'year_alt' => 'Year',
    'mileage' => 'Odometer',
];

// Insert the fields into data; a property missing from the JSON yields ''.
foreach ($arrFields as $key => $arrField) {
    $arr_data[$key] = isset($arrJson[$arrField]) ? $arrJson[$arrField] : '';
}
// Getting the engine. <th>Engine</th>
// Uses the same details-table helper as the other rows. Unlike those rows the
// engine is optional: when the row is missing the field keeps its initial value
// and no exception is thrown.
$engineHtml = getTextFromDetailsTable($html, 'Engine');
if ($engineHtml !== false) {
    $arr_data['engine'] = $engineHtml;
}
// Getting the transmission (plus alternate odometer, body type and engine values)
// from the labelled <span class="data-type"> elements.
$transNodes = $xpath->query("//span[@class=\"data-type\"]");
if ($transNodes->length === 0) {
    throw new Exception("Transmission not registered, no span found with class 'data-type'");
}

// arr_data key => label text expected inside the span.
$nodeRelations = [
    'mileage_alt' => 'ODOMETER',
    'body_alt' => 'BODY TYPE',
    'trans' => 'TRANSMISSION',
    'engine_alt' => 'ENGINE',
];

// Flip once so each span's label is resolved with a direct lookup
// instead of scanning the whole relation list per span.
$keyByLabel = array_flip($nodeRelations);

foreach ($transNodes as $transNode) {
    $label = $transNode->textContent;
    if (!isset($keyByLabel[$label])) {
        continue;
    }

    // The value is read from the second child node of the label's parent.
    // Skip the label when that node does not exist rather than reading a property of null.
    $valueNode = $transNode->parentNode->childNodes->item(1);
    if ($valueNode === null) {
        continue;
    }

    $arr_data[$keyByLabel[$label]] = $valueNode->textContent;
}
// Zip code: taken from the first "/pcode=" occurrence in the raw source, e.g. pcode=2213
$zipHtml = strstr($html, "/pcode=");
if (!$zipHtml) {
    throw new Exception("Zip not registered, no pcode string found");
}

// Keep what precedes the following "/kw" segment, then drop the "/pcode=" prefix.
$pcodeSegment = strstr($zipHtml, "/kw", 1);
$arr_data['zip'] = str_replace('/pcode=', '', $pcodeSegment);
// Mandatory rows of the details table, processed in this order:
// [arr_data key, <th> label, exception message used when the row is missing or empty].
$detailRows = [
    // <th>Last Modified</th>
    ['date', 'Last Modified', 'No date registered, last modified table row not found'],
    // <th>Registration Plate</th>
    ['reg', 'Registration Plate', 'No registration plate registered, Registration Plate table row not found'],
    // <th>Drive Type</th>
    ['drive', 'Drive Type', 'No drive type registered, Drive Type table row not found'],
    // <th>Interior Colour</th>
    ['icolor', 'Interior Colour', 'No interior color registered, Interior Colour table row not found'],
];

foreach ($detailRows as list($dataKey, $columnLabel, $missingMessage)) {
    $cellText = getTextFromDetailsTable($html, $columnLabel);
    registerDataFromDetailsTable($dataKey, $cellText, $arr_data, $missingMessage);
}
//img https://i.ebayimg.com/00/s/MTYwMFgxMjAw/z/UPEAAOSwcIJa6mGA/$_35.JPG
//$img_pattern = '#https://i.ebayimg.com/\d{1,3}/\w{1,3}/[a-zA-Z0-9\-~=]{5,20}/\w{1,3}/[a-zA-Z0-9\-~=]{5,30}/\$_20.\w+#';
//img https://carsales.pxcrush.net/carsales/car/private/rzov8m7s5vdhrdsty9sy1.jpg
// Dots are escaped so they match a literal "." in the host name and before the
// file extension, rather than any character.
$img_pattern = '#https://carsales\.pxcrush\.net/\w{1,8}/\w{1,3}/\w{1,7}/[a-zA-Z0-9\-~=]{5,25}\.\w+#';
preg_match_all($img_pattern, $html, $arr_match);

// preg_match_all() always fills index 0, so count($arr_match) is never 0.
// Test the list of full matches itself to decide whether any image was found.
// array_values() re-indexes the list after array_unique() removed duplicates.
$arr_data['img'] = !empty($arr_match[0])
    ? array_values(array_unique(array_map('html_entity_decode', $arr_match[0])))
    : '';

print_r2($arr_data);
/**
 * Strip HTML tags, decode HTML entities and trim surrounding whitespace.
 *
 * Tags are removed before entities are decoded. Decoding first would turn
 * escaped text such as "&lt;5000&gt;" into something that looks like a tag,
 * and strip_tags() would then delete it.
 *
 * @param string $string Raw HTML fragment or text
 * @return string Plain text without tags, with entities decoded
 */
function cleanHTML($string)
{
    $withoutTags = strip_tags($string);

    // Explicit flags and charset keep the result the same on every PHP version
    // (the default flags changed in PHP 8.1).
    $decoded = html_entity_decode($withoutTags, ENT_QUOTES | ENT_HTML401, 'UTF-8');

    return trim($decoded);
}
/**
 * Split a string into pieces on every run of whitespace.
 *
 * @param $string
 * @return array[]|false|string[] The pieces, exactly as returned by preg_split()
 */
function divideString($string)
{
    $whitespaceRun = '/\s+/';
    $pieces = preg_split($whitespaceRun, $string);

    return $pieces;
}
/**
 * Returns the cleaned text of the table row that starts with the given <th> column.
 *
 * The row is located by its exact "<th>$column</th>" header and ends at the next
 * "</tr>". Only the header cell itself is dropped, so a value that happens to
 * contain the column label is returned intact.
 *
 * @param $html
 * @param string $column Name of the <th> column
 * @return boolean|string Cleaned row text (may be empty), or false when the
 *                        header or the closing "</tr>" cannot be found
 */
function getTextFromDetailsTable($html, $column)
{
    $header = "<th>$column</th>";

    // Everything from the header cell onwards.
    $row = strstr($html, $header);
    if ($row === false) {
        return false;
    }

    // Remove everything after the closure tag; without one the row is unusable.
    $row = strstr($row, '</tr>', true);
    if ($row === false) {
        return false;
    }

    // Drop only the leading header cell, not every occurrence of the label.
    $cells = (string) substr($row, strlen($header));

    // Clean it.
    return cleanHTML($cells);
}
/**
 * Stores a value scraped from the details table, or fails when nothing was found.
 *
 * Only false, null and the empty string count as "not found". A plain truthiness
 * check would also reject the legitimate value "0".
 *
 * @param string $key Key from data array
 * @param string|boolean $text Text found in row or false
 * @param array $data Data array, modified in place
 * @param string $errorMessage Error to display
 * @throws \Exception When $text is false, null or an empty string
 */
function registerDataFromDetailsTable($key, $text, &$data, $errorMessage)
{
    if ($text === false || $text === null || $text === '') {
        throw new Exception($errorMessage);
    }

    $data[$key] = $text;
}
//$arr_tag = ['\'{"a":{"id":', 'window.dfpTargetingModel = {']; | |
//$arr_closure = ["}',", '};']; | |
// | |
//foreach ($arr_tag as $key => $value) { | |
// $tag_data = stristr($html, $value); | |
// if (! $tag_data) { | |
// echo 'err 1'; | |
// //throw new Exception($this->get_exception(2)); | |
// } | |
// | |
// echo $arr_closure[$key]; | |
// | |
// $json_srch = str_replace(['[', ']'], '', strstr(strstr($tag_data, $arr_closure[$key], 1), '{').'}'); | |
// echo $json_srch; | |
// if (! $json_srch) { | |
// echo 'err 2'; | |
// //throw new Exception($this->get_exception(2)); | |
// } else { | |
// $arr_json[] = json_decode($json_srch, 1); | |
// } | |
// | |
// if (! is_array($arr_json[$key])) { | |
// echo 'err 3'; | |
// //throw new Exception($this->get_exception(13)); | |
// } | |
//} | |
// | |
///*//vehicle infos | |
//$json_srch = strstr($html, '\'{"a":{"id":'); | |
//$json_srch_b = strstr($json_srch, "}',", 1); | |
//$arr_json = json_decode(strstr($json_srch_b, '{') . '}', 1);*/ | |
// | |
//print_r2($arr_json); | |
// | |
////exit; | |
// | |
//$arr_wk = $arr_json[0]['a']['attr']; | |
//$arr_wk = array_merge($arr_wk, $arr_json[1]); | |
//print_r2($arr_wk); | |
// | |
//$arr_json_vars = [ | |
// 'vin' => 'vin', | |
// 'is_reg' => 'registered', | |
// 'reg' => 'vreg', | |
// 'date_reg_exp' => 'registrationexpiry_tdt', | |
// 'year' => 'caryear', | |
// 'year_alt' => 'year', | |
// 'make' => 'carmake', | |
// 'make_alt' => 'make', | |
// 'model' => 'carmodel', | |
// 'model_alt' => 'model', | |
// 'trim' => 'variant', | |
// 'body' => 'carbodytype', | |
// 'body_alt' => 'bodyType', | |
// 'mileage' => 'carmileageinkms', | |
// 'price' => 'highest_price', | |
// 'price_alt' => 'lp', | |
// 'color' => 'colour', | |
// 'icolor' => 'vehicleInteriorColor', | |
// 'engine' => 'cylinder_configuration', | |
// 'trans' => 'cartransmission', | |
// 'trans_alt' => 'transmission', | |
// 'drive' => 'drivetrain', | |
// 'drive_alt' => 'driveTrain', | |
// 'fuel' => 'fueltype', | |
// 'fuel_alt' => 'fuelType', | |
// 'city' => 'suburb', | |
// 'state' => 'state', | |
// 'zip' => 'postcode', | |
//]; | |
// | |
////img file type pattern | |
//$pattern_img_file_type = '#(?<=.)\w+(?=&size)#'; | |
// | |
//foreach ($arr_json_vars as $key => $value) { | |
// | |
// if (! empty($arr_wk[$value]) && ! is_array($arr_wk[$value])) { | |
// $arr_data[$key] = trim(strip_tags(html_entity_decode($arr_wk[$value]))); | |
// } | |
//} | |
// | |
////post date "releaseDate">2018-05-03</span> | |
//$pattern_date = '/(?<="releaseDate">)\d{4}-\d{2}-\d{2}(?=<\/span>)/'; | |
//preg_match($pattern_date, $html, $match_arr); | |
//$arr_data['date'] = isset($match_arr[0]) ? $match_arr[0] : ''; | |
// | |
////seller name /s-seller/Harish/ | |
//$pattern_name = '/(?<=\/s-seller\/)[a-zA-Z0-9\-\s]{1,20}(?=\/)/'; | |
//preg_match($pattern_name, $html, $match_arr); | |
//$arr_data['name'] = isset($match_arr[0]) ? $match_arr[0] : ''; | |
// | |
//$arr_tag_item = [ | |
// 'state' => '/(?<="addressRegion">)[a-zA-Z]{1,4}(?=<\/span>)/', | |
// 'latitude' => '/(?<="latitude">)[0-9\-.]{5,40}(?="<\/span>)/', | |
// 'longitude' => '/(?<="longitude">)[0-9\-.]{5,40}(?="<\/span>)/', | |
//]; | |
// | |
//foreach ($arr_tag_item as $key => $value) { | |
// preg_match($value, $html, $match_arr); | |
// $arr_data[$key.'_regex'] = isset($match_arr[0]) ? $match_arr[0] : ''; | |
//} | |
///* | |
////region <span itemprop="addressRegion">NSW</span> | |
//$pattern_state = '/(?<="addressRegion">)[a-zA-Z]{1,4}(?=<\/span>)/'; | |
//preg_match($pattern_state, $html, $match_arr); | |
//$arr_data['state_regex'] = isset($match_arr[0]) ? $match_arr[0] : ''; | |
// | |
////geo localisation <span itemprop="latitude">-33.86462"</span> <span itemprop="longitude">151.04562999999996"</span> | |
//$pattern_latitude = '/(?<="addressRegion">)[a-zA-Z]{1,4}(?=<\/span>)/'; | |
//preg_match($pattern_state, $html, $match_arr); | |
//$arr_data['state_regex'] = isset($match_arr[0]) ? $match_arr[0] : ''; | |
//*/ | |
// | |
//$arr_tag_phone = ['Show number', '__phone']; | |
// | |
//foreach ($arr_tag_phone as $key => $value) { | |
// if (strstr($html, $value)) { | |
// $arr_data['has_phone'] = 1; | |
// } | |
//} | |
// | |
////desc <div id="ad_description_details_content"> | |
//$desc_srch = strstr($html, '<div id="ad_description_details_content">'); | |
//if ($desc_srch) { | |
// $desc_srchb = strstr($desc_srch, '</div>', 1); | |
// $arr_data['desc'] = $desc_srchb; | |
//} | |
// | |
////img https://i.ebayimg.com/00/s/MTYwMFgxMjAw/z/UPEAAOSwcIJa6mGA/$_35.JPG | |
//$img_pattern = '#https://i.ebayimg.com/\d{1,3}/\w{1,3}/[a-zA-Z0-9\-~=]{5,20}/\w{1,3}/[a-zA-Z0-9\-~=]{5,30}/\$_20.\w+#'; | |
//preg_match_all($img_pattern, $html, $arr_match); | |
//$arr_data['img'] = count($arr_match) ? array_unique(array_map('html_entity_decode', $arr_match[0])) : ''; | |
// | |
///* | |
//$arr_data['img'] = count($match_arr) ? array_unique(array_map('html_entity_decode', $match_arr[0])) : ''; | |
//$first_image = array_pop($arr_data['img']); | |
//array_unshift($arr_data['img'], $first_image);*/ | |
// | |
//print_r2($arr_data); | |
/* | |
//options | |
$opt_srch = strstr($html, '<ul class="features-list">'); | |
$arr_options = explode('</li>', strstr($opt_srch, '</ul>', 1)); | |
array_pop($arr_options); | |
$arr_options = array_map('strip_tags', $arr_options); | |
$arr_data['options'] = '<li>' . implode('</li><li>', $arr_options) . '</li>'; | |
//print_r( $arr_options); | |
*/ |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment