#!/usr/bin/env python | |
from datetime import datetime | |
from time import time | |
from lxml import html,etree | |
import requests,re | |
import os,sys | |
import unicodecsv as csv | |
import argparse | |
def parse(locality, checkin_date, checkout_date, sort):
    """Scrape the first page of TripAdvisor hotel results for a locality.

    The locality is first resolved to a search-results URL through the
    TripAdvisor type-ahead (autocomplete) API; that URL is then requested
    with the stay dates and sort order, and every hotel listing found in
    the returned HTML is converted to a dict.

    Args:
        locality: Free-text place name to search for.
        checkin_date: ``datetime`` of the check-in day.
        checkout_date: ``datetime`` of the check-out day.
        sort: Value sent as the ``sortOrder`` form field.

    Returns:
        A list of dicts, one per hotel listing, with the keys
        ``hotel_name``, ``url``, ``locality``, ``reviews``,
        ``tripadvisor_rating``, ``checkOut``, ``checkIn``,
        ``hotel_features``, ``price_per_night``, ``no_of_deals`` and
        ``booking_provider``. The list is empty when no listing matches
        either XPath layout.

    Raises:
        IndexError: If the autocomplete API returns no result for
            ``locality``.

    Errors raised by ``requests`` or by JSON decoding are not caught here.
    The ``print`` calls take a single argument so the function behaves the
    same on Python 2 and Python 3.
    """
    checkIn = checkin_date.strftime("%Y/%m/%d")
    checkOut = checkout_date.strftime("%Y/%m/%d")
    print("Scraper Inititated for Locality:%s" % locality)

    # TripAdvisor renders its autocomplete list using this API.
    print("Finding search result page URL")
    geo_url = 'https://www.tripadvisor.com/TypeAheadJson'
    # Passing the query through ``params`` lets requests URL-encode the
    # locality, so names with spaces, '&' or non-ASCII characters are safe.
    geo_params = {
        'action': 'API',
        'startTime': str(int(time())),
        'uiOrigin': 'GEOSCOPE',
        'source': 'GEOSCOPE',
        'interleaved': 'true',
        'types': 'geo,theme_park',
        'neighborhood_geos': 'true',
        'link_type': 'hotel',
        'details': 'true',
        'max': '12',
        'injectNeighborhoods': 'true',
        'query': locality,
    }
    # NOTE(review): verify=False disables TLS certificate verification;
    # kept to preserve existing behaviour, but it should be reconsidered.
    api_response = requests.get(geo_url, params=geo_params, verify=False).json()

    # Get the TripAdvisor URL for the query from the autocomplete response.
    results = api_response['results']
    if not results:
        raise IndexError(
            "TripAdvisor autocomplete returned no results for locality %r" % locality
        )
    url_from_autocomplete = "http://www.tripadvisor.com" + results[0]['url']
    print('URL found %s' % url_from_autocomplete)

    # Stay dates in the form YYYY_MM_DD_YYYY_MM_DD, as sent in the form data.
    date = checkin_date.strftime("%Y_%m_%d") + "_" + checkout_date.strftime("%Y_%m_%d")
    # Form data to get the hotels list from TripAdvisor for the selected dates.
    form_data = {
        'changeSet': 'TRAVEL_INFO',
        'showSnippets': 'false',
        'staydates': date,
        'uguests': '2',
        'sortOrder': sort,
    }
    # The Referer header is necessary to get the correct response; without
    # it TripAdvisor redirects to the home page.
    headers = {
        'Accept': 'text/javascript, text/html, application/xml, text/xml, */*',
        'Accept-Encoding': 'gzip,deflate',
        'Accept-Language': 'en-US,en;q=0.5',
        'Cache-Control': 'no-cache',
        'Connection': 'keep-alive',
        'Content-Type': 'application/x-www-form-urlencoded; charset=utf-8',
        'Host': 'www.tripadvisor.com',
        'Pragma': 'no-cache',
        'Referer': url_from_autocomplete,
        'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux i686; rv:28.0) Gecko/20100101 Firefox/28.0',
        'X-Requested-With': 'XMLHttpRequest',
    }
    cookies = {"SetCurrency": "USD"}

    print("Downloading search results page")
    page_response = requests.post(
        url=url_from_autocomplete,
        data=form_data,
        headers=headers,
        cookies=cookies,
        verify=False,
    )

    print("Parsing results ")
    parser = html.fromstring(page_response.text)
    hotel_lists = parser.xpath('//div[contains(@class,"listItem")]//div[contains(@class,"listing collapsed")]')
    if not hotel_lists:
        # Fall back to the alternative listing markup.
        hotel_lists = parser.xpath('//div[contains(@class,"listItem")]//div[@class="listing "]')

    # XPath expressions relative to one listing node; defined once rather
    # than on every loop iteration.
    XPATH_HOTEL_LINK = './/a[contains(@class,"property_title")]/@href'
    XPATH_REVIEWS = './/a[@class="review_count"]//text()'
    XPATH_RATING = './/span[contains(@class,"rating")]/@alt'
    XPATH_HOTEL_NAME = './/a[contains(@class,"property_title")]//text()'
    XPATH_HOTEL_FEATURES = './/div[contains(@class,"common_hotel_icons_list")]//li//text()'
    XPATH_HOTEL_PRICE = './/div[contains(@data-sizegroup,"mini-meta-price")]/text()'
    XPATH_VIEW_DEALS = './/div[contains(@data-ajax-preserve,"viewDeals")]//text()'
    XPATH_BOOKING_PROVIDER = './/div[contains(@data-sizegroup,"mini-meta-provider")]//text()'
    # Raw string so the \s and \d escapes reach the regex engine untouched.
    deals_pattern = re.compile(r"all\s+?(\d+)\s+?")

    hotel_data = []
    for hotel in hotel_lists:
        raw_booking_provider = hotel.xpath(XPATH_BOOKING_PROVIDER)
        raw_no_of_deals = hotel.xpath(XPATH_VIEW_DEALS)
        raw_hotel_link = hotel.xpath(XPATH_HOTEL_LINK)
        raw_no_of_reviews = hotel.xpath(XPATH_REVIEWS)
        raw_rating = hotel.xpath(XPATH_RATING)
        raw_hotel_name = hotel.xpath(XPATH_HOTEL_NAME)
        raw_hotel_features = hotel.xpath(XPATH_HOTEL_FEATURES)
        raw_hotel_price_per_night = hotel.xpath(XPATH_HOTEL_PRICE)

        url = 'http://www.tripadvisor.com' + raw_hotel_link[0] if raw_hotel_link else None
        if raw_no_of_reviews:
            reviews = ''.join(raw_no_of_reviews).replace("reviews", "").replace(",", "").strip()
        else:
            reviews = 0
        rating = ''.join(raw_rating).replace('of 5 bubbles', '').strip() if raw_rating else None
        name = ''.join(raw_hotel_name).strip() if raw_hotel_name else None
        hotel_features = ','.join(raw_hotel_features)
        # Kept as text: encoding to bytes before calling replace() with a
        # str argument raises TypeError on Python 3.
        if raw_hotel_price_per_night:
            price_per_night = ''.join(raw_hotel_price_per_night).replace('\n', '')
        else:
            price_per_night = None
        deal_matches = deals_pattern.findall(''.join(raw_no_of_deals))
        no_of_deals = deal_matches[0] if deal_matches else 0
        booking_provider = ''.join(raw_booking_provider).strip() if raw_booking_provider else None

        hotel_data.append({
            'hotel_name': name,
            'url': url,
            'locality': locality,
            'reviews': reviews,
            'tripadvisor_rating': rating,
            'checkOut': checkOut,
            'checkIn': checkIn,
            'hotel_features': hotel_features,
            'price_per_night': price_per_night,
            'no_of_deals': no_of_deals,
            'booking_provider': booking_provider,
        })
    return hotel_data
# Command-line entry point: validate the dates, scrape, and write the CSV.
if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('checkin_date', help='Hotel Check In Date (Format: YYYY/MM/DD)')
    parser.add_argument('checkout_date', help='Hotel Chek Out Date (Format: YYYY/MM/DD)')
    sortorder_help = """
    available sort orders are :\n
    priceLow - hotels with lowest price,
    distLow : Hotels located near to the search center,
    recommended: highest rated hotels based on traveler reviews,
    popularity :Most popular hotels as chosen by Tipadvisor users
    """
    # nargs='?' is what makes the default apply: a plain positional is
    # always required, so its default would never be used. Passing all
    # four arguments works exactly as before.
    parser.add_argument('sort', nargs='?', help=sortorder_help, default='popularity')
    parser.add_argument('locality', help='Search Locality')
    args = parser.parse_args()

    locality = args.locality
    checkin_date = datetime.strptime(args.checkin_date, "%Y/%m/%d")
    checkout_date = datetime.strptime(args.checkout_date, "%Y/%m/%d")
    sort = args.sort
    today = datetime.now()

    # Compare the parsed datetimes directly; formatting them back to
    # strings and re-parsing yields the same values.
    if today >= checkin_date or today >= checkout_date:
        # The entered date has already passed.
        print("Invalid Checkin date: Please enter a valid checkin and checkout dates,entered date is already passed")
    elif checkin_date >= checkout_date:
        # Also covers check-in == check-out, which previously matched no
        # branch and made the script exit without any message.
        print("Invalid Checkin date: CheckIn date must be less than checkOut date")
    else:
        data = parse(locality, checkin_date, checkout_date, sort)
        print("Writing to output file tripadvisor_data.csv")
        # unicodecsv writes encoded bytes, so the file is opened in
        # binary mode.
        with open('tripadvisor_data.csv', 'wb') as csvfile:
            fieldnames = [
                'hotel_name', 'url', 'locality', 'reviews', 'tripadvisor_rating',
                'checkIn', 'checkOut', 'price_per_night', 'booking_provider',
                'no_of_deals', 'hotel_features',
            ]
            writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
            writer.writeheader()
            writer.writerows(data)
This comment has been minimized.
This comment has been minimized.
Unfortunately, it is not working anymore. Does anyone have a fix?
This comment has been minimized.
This comment has been minimized.
The site has changed its front end slightly. We have now modified the code.
This comment has been minimized.
This comment has been minimized.
The CSV is empty for me. Any idea how to fix that?
This comment has been minimized.
This comment has been minimized.
Fixed now. |
This comment has been minimized.
This comment has been minimized.
It is not working; the CSV is empty.
This comment has been minimized.
This comment has been minimized.
I get the following error: TypeError: a bytes-like object is required, not 'str'
This comment has been minimized.
This comment has been minimized.
This script can be fixed by modifying line 83 as follows:
I have this running on Python 3 by only modifying this and adding brackets to the print statements. |
This comment has been minimized.
This comment has been minimized.
The code seems to skip hotels that show up in the search, every sixth hotel is saved in the csv - any suggestions? |
This comment has been minimized.
This comment has been minimized.
It also appears that on first call, many of the prices are not returning. Although, if you run the same command a second time (exact same arguments), then all prices return. Is there a way to get all prices on first call? |
This comment has been minimized.
This comment has been minimized.
It works fine after a few of the modifications suggested above, but the rating column is blank and, in addition, it only provides the search results from the first page. How can I go to the next pages and get their results?
This comment has been minimized.
This comment has been minimized.
Hello.
Any idea how to fix it? |
This comment has been minimized.
This comment has been minimized.
For the piece of code under the comment "#getting the TA url for th equery from the autocomplete response", I am getting "list index out of range".
This comment has been minimized.
Can't use this anymore.
The output CSV has nothing inside.