Skip to content

Instantly share code, notes, and snippets.

@pablophg
Created October 7, 2015 19:28
Show Gist options
  • Save pablophg/bf3fd96bb145dd5a7433 to your computer and use it in GitHub Desktop.
Save pablophg/bf3fd96bb145dd5a7433 to your computer and use it in GitHub Desktop.
PHP spider that collects the websites of hotels found in a TripAdvisor search and looks for a contact email address on each of those websites
<?php
// Report every error and display it in the output, including errors raised during PHP startup.
error_reporting(-1);
ini_set('display_startup_errors', '1');
ini_set('display_errors', '1');
// Allow the script to run for up to 1800 seconds.
ini_set('max_execution_time', '1800');
/**
 * Spider that asks TripAdvisor for hotel listings, resolves each hotel's own
 * website through TripAdvisor's redirect URL, and scans that website for the
 * first thing that looks like an email address.
 */
class TripAdvisorSpider
{
    private $url; // Unused

    /**
     * Crawl state keyed by hotel id.
     *
     * makeRequest() stores the resolved website URL (or false) per hotel;
     * iterateResultPages() then replaces each value with the email address
     * found on that website and drops the hotels for which none was found.
     *
     * @var array
     */
    private $result_ids = array();

    /**
     * @param mixed $search Currently ignored; kept so existing callers keep working.
     */
    public function __construct($search = false)
    {
        // Sample listing URL left by the original author (not used by the code):
        // http://www.tripadvisor.es/Hotels-g187490-Castile_and_Leon-Hotels.html
    }

    /**
     * Return the text found between the first occurrence of $start and the
     * next occurrence of $end after it.
     *
     * @param string $string Text to search in.
     * @param string $start  Opening delimiter (not included in the result).
     * @param string $end    Closing delimiter (not included in the result).
     *
     * @return string The enclosed text, or '' when either delimiter is empty
     *                or cannot be found.
     */
    public function getStringBetween($string, $start, $end)
    {
        $string = (string) $string;
        if ($start === '' || $end === '') {
            return '';
        }

        $from = strpos($string, $start);
        if ($from === false) {
            return '';
        }
        $from += strlen($start);

        // Without this check a missing end delimiter would turn into a negative
        // substr() length and yield an arbitrary slice of the input.
        $to = strpos($string, $end, $from);
        if ($to === false) {
            return '';
        }

        return substr($string, $from, $to - $from);
    }

    /**
     * Request one slice of the hotel listing and record every hotel found.
     *
     * The response is searched for the text between "hasOyster: " and "};",
     * which is decoded as JSON; each key of the decoded object is treated as
     * a hotel id and resolved through getRealWebsite(). Nothing is recorded
     * when the transfer fails or that text is not a JSON object/array.
     *
     * @param int $index Result offset appended to the "o=a" POST field.
     *
     * @return void
     */
    public function makeRequest($index)
    {
        $ch = curl_init();
        curl_setopt($ch, CURLOPT_URL, "http://www.tripadvisor.es/Hotels");
        curl_setopt($ch, CURLOPT_POST, true);
        curl_setopt($ch, CURLOPT_POSTFIELDS, "seen=0&sequence=1&geo=187490&requestingServlet=Hotels&refineForm=true&hs=&o=a" . $index . "&pageSize=1000&rad=0&dateBumped=NONE&displayedSortOrder=");
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
        curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true);
        $result = curl_exec($ch);
        curl_close($ch);

        // curl_exec() returns false when the transfer fails.
        if (!is_string($result)) {
            return;
        }

        $json = json_decode($this->getStringBetween($result, "hasOyster: ", "};"), true);
        // json_decode() yields null for missing/invalid JSON; only arrays can be iterated.
        if (!is_array($json)) {
            return;
        }

        foreach (array_keys($json) as $hotelid) {
            $this->result_ids[$hotelid] = $this->getRealWebsite($hotelid);
        }
    }

    /**
     * Crawl $num_pages listing pages, look up an email address on every hotel
     * website that was resolved, and print the addresses found.
     *
     * Each email is echoed followed by "</br>". Hotels without a website, with
     * an unreachable website, or without a matching address are dropped.
     *
     * @param int $num_pages Number of listing pages to request; page N is
     *                       requested with offset N * 25 (presumably 25
     *                       results per page -- confirm against the site).
     *
     * @return void
     */
    public function iterateResultPages($num_pages = 10)
    {
        // Pages are numbered from 0, so the last offset is ($num_pages - 1) * 25.
        for ($page = 0; $page < $num_pages; $page++) {
            $this->makeRequest($page * 25);
        }

        foreach ($this->result_ids as $hotelId => $website) {
            $email = $website ? $this->findEmail($website) : false;
            if ($email === false) {
                unset($this->result_ids[$hotelId]);
            } else {
                $this->result_ids[$hotelId] = $email;
            }
        }

        foreach ($this->result_ids as $hotelEmail) {
            echo $hotelEmail;
            echo '</br>';
        }
    }

    /**
     * Resolve a hotel's own website by following TripAdvisor's ShowUrl redirect.
     *
     * @param int|string|false $id Hotel id, sent as the "d" query parameter.
     *
     * @return string|false The final URL after redirects, or false when no id
     *                      was given or the request ended on the TripAdvisor
     *                      URL itself (no redirect happened).
     */
    public function getRealWebsite($id = false)
    {
        if (!$id) {
            return false;
        }

        $ta_url = "http://www.tripadvisor.es/ShowUrl?&excludeFromVS=false&odc=BusinessListingsUrl&d=$id&url=0";
        $ch = curl_init();
        curl_setopt($ch, CURLOPT_URL, $ta_url);
        // Keep the response body out of the script's output; only the final URL matters.
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
        curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true);
        curl_exec($ch);
        $last_url = curl_getinfo($ch, CURLINFO_EFFECTIVE_URL);
        curl_close($ch);

        if ($last_url === $ta_url) {
            return false;
        }

        return $last_url;
    }

    /**
     * Download a page and return the first email-like string in it.
     *
     * @param string $website URL to download.
     *
     * @return string|false The first match, or false when the download fails
     *                      or nothing matches.
     */
    private function findEmail($website)
    {
        $html = file_get_contents($website);
        if ($html === false) {
            return false;
        }

        $pattern = '/[A-Za-z0-9_-]+@[A-Za-z0-9_-]+\.([A-Za-z0-9_-][A-Za-z0-9_]+)/';
        $matches = array();
        if (preg_match($pattern, $html, $matches) === 1) {
            return $matches[0];
        }

        return false;
    }
}
// Crawl the 19 listing pages for Castile and Leon; the method prints the
// emails itself and returns nothing, so there is no value to echo here.
$spider = new TripAdvisorSpider();
$spider->iterateResultPages(19);
?>
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment