Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
Python script to scrape Tripadvisor.com for hotels in a locality. Code written as part of https://www.scrapehero.com/how-to-scrape-tripadvisor/
#!/usr/bin/env python
from datetime import datetime
from time import time
from lxml import html,etree
import requests,re
import os,sys
import unicodecsv as csv
import argparse
def parse(locality,checkin_date,checkout_date,sort):
    """Scrape the TripAdvisor hotel listing for a locality and stay dates.

    The locality is first resolved to a search-results URL through the
    TypeAheadJson autocomplete endpoint. That URL is then POSTed with the
    stay dates and sort order, and each hotel entry in the returned HTML
    is turned into a dict.

    Args:
        locality: Search text sent as the autocomplete ``query`` parameter.
        checkin_date: Check-in date; must support ``strftime``.
        checkout_date: Check-out date; must support ``strftime``.
        sort: Value sent to TripAdvisor as the ``sortOrder`` form field.

    Returns:
        A list with one dict per hotel found, with the keys ``hotel_name``,
        ``url``, ``locality``, ``reviews``, ``tripadvisor_rating``,
        ``checkIn``, ``checkOut``, ``hotel_features``, ``price_per_night``,
        ``no_of_deals`` and ``booking_provider``. The list is empty when no
        listing matches the XPath selectors.

    Raises:
        IndexError: If the autocomplete endpoint returns no result for
            ``locality``.
    """
    # Per-hotel selectors; they never change, so build them once.
    XPATH_HOTEL_LINK = './/a[contains(@class,"property_title")]/@href'
    XPATH_REVIEWS = './/a[@class="review_count"]//text()'
    XPATH_RATING = './/span[contains(@class,"rating")]/@alt'
    XPATH_HOTEL_NAME = './/a[contains(@class,"property_title")]//text()'
    XPATH_HOTEL_FEATURES = './/div[contains(@class,"common_hotel_icons_list")]//li//text()'
    XPATH_HOTEL_PRICE = './/div[contains(@data-sizegroup,"mini-meta-price")]/text()'
    XPATH_VIEW_DEALS = './/div[contains(@data-ajax-preserve,"viewDeals")]//text()'
    XPATH_BOOKING_PROVIDER = './/div[contains(@data-sizegroup,"mini-meta-provider")]//text()'

    checkIn = checkin_date.strftime("%Y/%m/%d")
    checkOut = checkout_date.strftime("%Y/%m/%d")
    print("Scraper Inititated for Locality:%s"%locality)
    # TA renders its autocomplete list using this API
    print("Finding search result page URL")
    geo_url = 'https://www.tripadvisor.com/TypeAheadJson?action=API&startTime='+str(int(time()))+'&uiOrigin=GEOSCOPE&source=GEOSCOPE&interleaved=true&types=geo,theme_park&neighborhood_geos=true&link_type=hotel&details=true&max=12&injectNeighborhoods=true'
    # Pass the locality through ``params`` so requests URL-encodes it;
    # concatenating it raw breaks on spaces, '&', '#', etc.
    # NOTE(review): verify=False disables TLS certificate verification.
    api_response = requests.get(geo_url, params={'query': locality}, verify=False).json()
    results = api_response['results']
    if not results:
        # Same exception type as the unguarded [0] lookup, but with a
        # message that says what actually went wrong.
        raise IndexError("TripAdvisor autocomplete returned no results for locality: %s" % locality)
    # Getting the TA url for the query from the autocomplete response
    url_from_autocomplete = "http://www.tripadvisor.com"+results[0]['url']
    print('URL found %s'%url_from_autocomplete)
    # Formatting date for the 'staydates' form field
    date = checkin_date.strftime("%Y_%m_%d")+"_"+checkout_date.strftime("%Y_%m_%d")
    # Form data to get the hotels list from TA for the selected date
    form_data = {
        'changeSet': 'TRAVEL_INFO',
        'showSnippets': 'false',
        'staydates': date,
        'uguests': '2',
        'sortOrder': sort,
    }
    # Referer is necessary to get the correct response from TA; if not
    # provided they will redirect to the home page
    headers = {
        'Accept': 'text/javascript, text/html, application/xml, text/xml, */*',
        'Accept-Encoding': 'gzip,deflate',
        'Accept-Language': 'en-US,en;q=0.5',
        'Cache-Control': 'no-cache',
        'Connection': 'keep-alive',
        'Content-Type': 'application/x-www-form-urlencoded; charset=utf-8',
        'Host': 'www.tripadvisor.com',
        'Pragma': 'no-cache',
        'Referer': url_from_autocomplete,
        'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux i686; rv:28.0) Gecko/20100101 Firefox/28.0',
        'X-Requested-With': 'XMLHttpRequest',
    }
    cookies = {"SetCurrency": "USD"}
    print("Downloading search results page")
    page_response = requests.post(url=url_from_autocomplete, data=form_data, headers=headers, cookies=cookies, verify=False)
    print("Parsing results ")
    parser = html.fromstring(page_response.text)
    hotel_lists = parser.xpath('//div[contains(@class,"listItem")]//div[contains(@class,"listing collapsed")]')
    if not hotel_lists:
        # Fallback selector for the alternative listing markup
        hotel_lists = parser.xpath('//div[contains(@class,"listItem")]//div[@class="listing "]')

    hotel_data = []
    for hotel in hotel_lists:
        raw_booking_provider = hotel.xpath(XPATH_BOOKING_PROVIDER)
        raw_no_of_deals = hotel.xpath(XPATH_VIEW_DEALS)
        raw_hotel_link = hotel.xpath(XPATH_HOTEL_LINK)
        raw_no_of_reviews = hotel.xpath(XPATH_REVIEWS)
        raw_rating = hotel.xpath(XPATH_RATING)
        raw_hotel_name = hotel.xpath(XPATH_HOTEL_NAME)
        raw_hotel_features = hotel.xpath(XPATH_HOTEL_FEATURES)
        raw_hotel_price_per_night = hotel.xpath(XPATH_HOTEL_PRICE)

        url = 'http://www.tripadvisor.com'+raw_hotel_link[0] if raw_hotel_link else None
        reviews = ''.join(raw_no_of_reviews).replace("reviews","").replace(",","").strip() if raw_no_of_reviews else 0
        rating = ''.join(raw_rating).replace('of 5 bubbles','').strip() if raw_rating else None
        name = ''.join(raw_hotel_name).strip() if raw_hotel_name else None
        hotel_features = ','.join(raw_hotel_features)
        # Keep the price as text: encoding to bytes here breaks the
        # following str.replace on Python 3, and the CSV writer does the
        # encoding at output time anyway.
        price_per_night = ''.join(raw_hotel_price_per_night).replace('\n','') if raw_hotel_price_per_night else None
        booking_provider = ''.join(raw_booking_provider).strip() if raw_booking_provider else None
        # The number following the word "all" in the deals text
        deals_found = re.findall(r"all\s+?(\d+)\s+?", ''.join(raw_no_of_deals))
        no_of_deals = deals_found[0] if deals_found else 0

        hotel_data.append({
            'hotel_name': name,
            'url': url,
            'locality': locality,
            'reviews': reviews,
            'tripadvisor_rating': rating,
            'checkOut': checkOut,
            'checkIn': checkIn,
            'hotel_features': hotel_features,
            'price_per_night': price_per_night,
            'no_of_deals': no_of_deals,
            'booking_provider': booking_provider,
        })
    return hotel_data
def main():
    """Command-line entry point: scrape hotels and write them to a CSV file.

    Reads the check-in date, check-out date, optional sort order and
    locality from the command line, validates the dates, calls ``parse``
    and writes the rows to ``tripadvisor_data.csv`` in the current
    directory. Invalid date ranges print a message and write nothing.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('checkin_date',help = 'Hotel Check In Date (Format: YYYY/MM/DD)')
    parser.add_argument('checkout_date',help = 'Hotel Check Out Date (Format: YYYY/MM/DD)')
    sortorder_help = """
available sort orders are :\n
priceLow - hotels with lowest price,
distLow : Hotels located near to the search center,
recommended: highest rated hotels based on traveler reviews,
popularity :Most popular hotels as chosen by Tripadvisor users
"""
    # nargs='?' is what makes the default take effect for a positional;
    # without it argparse always requires the argument and ignores default.
    parser.add_argument('sort',nargs = '?',help = sortorder_help,default = 'popularity')
    parser.add_argument('locality',help = 'Search Locality')
    args = parser.parse_args()

    try:
        checkin_date = datetime.strptime(args.checkin_date,"%Y/%m/%d")
        checkout_date = datetime.strptime(args.checkout_date,"%Y/%m/%d")
    except ValueError:
        # Report a usage error instead of a raw traceback
        parser.error("dates must be given in the format YYYY/MM/DD")

    # Compare calendar dates: comparing datetime.now() against a midnight
    # datetime would reject a check-in on the current day as already passed.
    today = datetime.now().date()
    # Checking whether the entered date is already passed
    if checkin_date.date() < today or checkout_date.date() < today:
        print("Invalid Checkin date: Please enter a valid checkin and checkout dates,entered date is already passed")
        return
    # '>=' so that identical check-in and check-out dates are reported
    # rather than silently doing nothing.
    if checkin_date >= checkout_date:
        print("Invalid Checkin date: CheckIn date must be less than checkOut date")
        return

    data = parse(args.locality,checkin_date,checkout_date,args.sort)
    print("Writing to output file tripadvisor_data.csv")
    fieldnames = ['hotel_name','url','locality','reviews','tripadvisor_rating','checkIn','checkOut','price_per_night','booking_provider','no_of_deals','hotel_features']
    # unicodecsv encodes every row to bytes, so the file must be binary
    with open('tripadvisor_data.csv','wb') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        for row in data:
            writer.writerow(row)


if __name__ == '__main__':
    main()
@JDKao

This comment has been minimized.

Copy link

@JDKao JDKao commented Sep 19, 2017

Can't use this anymore.
The output CSV has nothing inside.

@aukrohero

This comment has been minimized.

Copy link

@aukrohero aukrohero commented Jan 10, 2018

Unfortunately, it is not working anymore. Does anyone have the fix?

@ghost

This comment has been minimized.

Copy link
Owner Author

@ghost ghost commented Jan 12, 2018

The site had changed their front-end slightly. We have now modified the code.

@botsbuzz

This comment has been minimized.

Copy link

@botsbuzz botsbuzz commented May 6, 2018

CSV is empty for me. Any idea to fix that?

@ghost

This comment has been minimized.

Copy link
Owner Author

@ghost ghost commented Jun 5, 2018

Fixed now.

@haasasri

This comment has been minimized.

Copy link

@haasasri haasasri commented Aug 23, 2018

It is not working; the CSV is empty.

@JujuD28

This comment has been minimized.

Copy link

@JujuD28 JujuD28 commented Nov 24, 2018

I have the following error : TypeError: a bytes-like object is required, not 'str'

@s4lmon

This comment has been minimized.

Copy link

@s4lmon s4lmon commented Mar 17, 2019

I have the following error : TypeError: a bytes-like object is required, not 'str'

This script can be fixed by modifying line 83 as follows:

price_per_night = ''.join(raw_hotel_price_per_night).replace('\n','') if raw_hotel_price_per_night else None

I have this running on Python 3 by only modifying this and adding brackets to the print statements.

@hamzafarooq

This comment has been minimized.

Copy link

@hamzafarooq hamzafarooq commented Jun 13, 2019

The code seems to skip hotels that show up in the search, every sixth hotel is saved in the csv - any suggestions?

@zmfink

This comment has been minimized.

Copy link

@zmfink zmfink commented Jul 25, 2019

It also appears that on first call, many of the prices are not returning. Although, if you run the same command a second time (exact same arguments), then all prices return. Is there a way to get all prices on first call?

@hgayanj

This comment has been minimized.

Copy link

@hgayanj hgayanj commented Feb 24, 2020

It works fine after a few of the modifications suggested above, but the rating column is blank, and in addition it only provides the search results of the first page. How can I go to the next pages and get their results?

@AscoldOleg

This comment has been minimized.

Copy link

@AscoldOleg AscoldOleg commented Jun 3, 2020

Hello.
I've got an error :

Parsing results
Writing to output file tripadvisor_data.csv
Traceback (most recent call last):
  File "tripadvisor_scraper.py", line 144, in <module>
    writer.writeheader()
  File "c:\anaconda3\lib\site-packages\unicodecsv\py3.py", line 85, in writeheader
    self.writerow(header)
  File "C:\Anaconda3\lib\csv.py", line 155, in writerow
    return self.writer.writerow(self._dict_to_list(rowdict))
  File "c:\anaconda3\lib\site-packages\unicodecsv\py3.py", line 28, in writerow
    return self.writer.writerow(row)
  File "c:\anaconda3\lib\site-packages\unicodecsv\py3.py", line 15, in write
    return self.binary.write(string.encode(self.encoding, self.errors))
TypeError: write() argument must be str, not bytes

Any idea how to fix it?

@mahapara2411

This comment has been minimized.

Copy link

@mahapara2411 mahapara2411 commented Jul 31, 2020

#getting the TA url for th equery from the autocomplete response
url_from_autocomplete = "http://www.tripadvisor.com"+api_response['results'][0]['url']
print 'URL found %s'%url_from_autocomplete
geo = api_response['results'][0]['value']

For this piece of code I am getting "list index out of range".
Can anyone help, please?

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment