Created
October 7, 2015 19:28
-
-
Save pablophg/bf3fd96bb145dd5a7433 to your computer and use it in GitHub Desktop.
PHP spider that visits the websites of hotels found in a TripAdvisor search and looks for a contact email address on each website
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php
// Debug-friendly runtime setup for a long crawl: report every error level,
// show errors (including start-up errors) in the output, and raise the
// execution time limit to 1800 seconds.
error_reporting(-1);
ini_set('max_execution_time', '1800');
ini_set('display_startup_errors', '1');
ini_set('display_errors', '1');
/**
 * Spider that reads a TripAdvisor hotel listing, resolves the website of every
 * hotel it finds, and searches each website for a contact e-mail address.
 *
 * The listing that is crawled (geo=187490) is hard-coded in makeRequest().
 */
class TripAdvisorSpider
{
    /** @var string|null Unused; the listing endpoint is hard-coded in makeRequest(). */
    private $url;

    /**
     * Results keyed by the hotel id read from the listing page.
     *
     * While crawling, each value is the hotel's website URL (or false when it
     * could not be resolved). After iterateResultPages() has run, each value
     * is the e-mail address found on that website.
     *
     * @var array
     */
    private $result_ids = array();

    /**
     * @param mixed $search Accepted for backward compatibility; currently ignored.
     */
    public function __construct($search = false)
    {
    }

    /**
     * Returns the text found between the first occurrence of $start and the
     * next occurrence of $end that follows it.
     *
     * @param string $string Text to search in.
     * @param string $start  Opening delimiter (not included in the result).
     * @param string $end    Closing delimiter (not included in the result).
     *
     * @return string The enclosed text, or '' when either delimiter is empty
     *                or cannot be found.
     */
    public function getStringBetween($string, $start, $end)
    {
        $string = (string) $string;
        $start = (string) $start;
        $end = (string) $end;

        // An empty delimiter cannot delimit anything (and strpos() rejects an
        // empty needle on older PHP versions).
        if ($start === '' || $end === '') {
            return '';
        }

        $from = strpos($string, $start);
        if ($from === false) {
            return '';
        }
        $from += strlen($start);

        // strpos() returns false when $end is missing; using that value in
        // arithmetic would yield a negative length and a bogus substring.
        $to = strpos($string, $end, $from);
        if ($to === false) {
            return '';
        }

        return substr($string, $from, $to - $from);
    }

    /**
     * Requests one page of the hotel listing and records every hotel found.
     *
     * The response is expected to embed a JSON object between the text
     * "hasOyster: " and "};". Its keys are used as hotel ids, and each id is
     * resolved to a website through getRealWebsite().
     *
     * @param int $index Result offset sent in the "o=a<index>" form field.
     *
     * @return void
     */
    public function makeRequest($index)
    {
        $ch = curl_init();
        curl_setopt($ch, CURLOPT_URL, "http://www.tripadvisor.es/Hotels");
        // CURLOPT_POST is a boolean switch, not a field count.
        curl_setopt($ch, CURLOPT_POST, true);
        curl_setopt($ch, CURLOPT_POSTFIELDS, "seen=0&sequence=1&geo=187490&requestingServlet=Hotels&refineForm=true&hs=&o=a".$index."&pageSize=1000&rad=0&dateBumped=NONE&displayedSortOrder=");
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
        curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true);
        $result = curl_exec($ch);
        curl_close($ch);

        // Transfer failed: nothing to parse for this page.
        if ($result === false) {
            return;
        }

        $json = json_decode($this->getStringBetween($result, "hasOyster: ", "};"), true);

        // json_decode() yields null when the marker is missing or the payload
        // is not valid JSON; skip the page instead of iterating over null.
        if (!is_array($json)) {
            return;
        }

        foreach (array_keys($json) as $hotelid) {
            $this->result_ids[$hotelid] = $this->getRealWebsite($hotelid);
        }
    }

    /**
     * Crawls the listing pages, looks for an e-mail address on every hotel
     * website that could be resolved, and prints each address found followed
     * by a line break tag.
     *
     * @param int $num_pages Number of listing pages to request (25 results
     *                       per page, starting at offset 0).
     *
     * @return void
     */
    public function iterateResultPages($num_pages = 10)
    {
        // Pages are numbered 0 .. $num_pages - 1; a "<=" bound would request
        // one page more than asked for.
        for ($page = 0; $page < $num_pages; $page++) {
            $this->makeRequest($page * 25);
        }

        // Keep only the hotels for which an e-mail address was found.
        $emails = array();
        foreach ($this->result_ids as $hotelId => $website) {
            $email = $website ? $this->findContactEmail($website) : false;
            if ($email !== false) {
                $emails[$hotelId] = $email;
            }
        }
        $this->result_ids = $emails;

        foreach ($this->result_ids as $hotelEmail) {
            echo $hotelEmail;
            echo '</br>';
        }
    }

    /**
     * Resolves the website of a hotel by following TripAdvisor's redirect URL.
     *
     * @param mixed $id Hotel id as found in the listing page.
     *
     * @return string|false The URL the redirect ends at, or false when no id
     *                      was given or the request was not redirected.
     */
    public function getRealWebsite($id = false)
    {
        if (!$id) {
            return false;
        }

        $ta_url = "http://www.tripadvisor.es/ShowUrl?&excludeFromVS=false&odc=BusinessListingsUrl&d=" . rawurlencode($id) . "&url=0";

        $ch = curl_init();
        curl_setopt($ch, CURLOPT_URL, $ta_url);
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
        curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true);
        // Only the final URL matters here; the response body is discarded.
        curl_exec($ch);
        $last_url = curl_getinfo($ch, CURLINFO_EFFECTIVE_URL);
        curl_close($ch);

        // Still on the TripAdvisor URL means there was no redirect to follow.
        if ($last_url == $ta_url) {
            return false;
        }

        return $last_url;
    }

    /**
     * Downloads a web page and returns the first e-mail address found in it.
     *
     * @param string $website URL of the page to download.
     *
     * @return string|false The first address found, or false when the URL is
     *                      not HTTP(S), the download fails, or nothing matches.
     */
    private function findContactEmail($website)
    {
        // The URL comes from a remote redirect: only fetch real web pages so
        // that file_get_contents() is never pointed at another stream wrapper.
        if (!preg_match('#^https?://#i', $website)) {
            return false;
        }

        $html = file_get_contents($website);
        if ($html === false) {
            return false;
        }

        // Dots and plus signs are allowed in the local part and several
        // domain labels are accepted, so "first.last@hotel.co.uk" is matched
        // in full rather than truncated to "last@hotel.co".
        $pattern = '/[A-Za-z0-9._+-]+@(?:[A-Za-z0-9_-]+\.)+[A-Za-z0-9_-][A-Za-z0-9_]+/';
        if (preg_match($pattern, $html, $matches) === 1) {
            return $matches[0];
        }

        return false;
    }
}
// Entry point: crawl the hotel listing and print the e-mail addresses found.
$ta = new TripAdvisorSpider();
// iterateResultPages() prints its results itself and returns nothing, so
// there is no return value to echo.
$ta->iterateResultPages(19); // 19 pages for Castile and Leon
?>
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment