Skip to content

Instantly share code, notes, and snippets.

@vishwanath79
Last active June 5, 2017 04:36
Show Gist options
  • Save vishwanath79/eb9e4e4e821b3ae02e2ecadd8326a70a to your computer and use it in GitHub Desktop.
Save vishwanath79/eb9e4e4e821b3ae02e2ecadd8326a70a to your computer and use it in GitHub Desktop.
Extract hotel data from TripAdvisor
#!/usr/bin/env python
from datetime import datetime
from time import time
from lxml import html, etree
import requests, re
import os, sys
import unicodecsv as csv
import argparse
def parse(locality, checkin_date, checkout_date, sort):
    """Scrape TripAdvisor's hotel search results for a locality and stay period.

    The locality is first resolved through TripAdvisor's type-ahead endpoint.
    The first suggestion supplies the geo id and the Referer URL for a POST to
    the hotel search page, whose HTML is then parsed with XPath.

    Args:
        locality: Free-text place name to search for.
        checkin_date: Check-in date; must provide ``strftime`` (e.g. ``datetime``).
        checkout_date: Check-out date; must provide ``strftime``.
        sort: Value sent as ``displayedSortOrder`` in the search form.

    Returns:
        A list with one dict per hotel block found in the page. Each dict has
        the keys ``hotel_name``, ``url``, ``locality``, ``reviews``,
        ``tripadvisor_rating``, ``checkOut``, ``checkIn``, ``hotel_features``,
        ``price_per_night``, ``no_of_deals`` and ``booking_provider``. A field
        whose markup is missing is ``None`` (``hotel_features`` is an empty
        string). An empty list is returned when the type-ahead lookup yields
        no suggestion for the locality.
    """
    checkIn = checkin_date.strftime("%Y/%m/%d")
    checkOut = checkout_date.strftime("%Y/%m/%d")
    print("Scraper Inititated for Locality:%s" % locality)

    # TripAdvisor renders its autocomplete list from this API.
    print("Finding search result page URL")
    geo_url = ('https://www.tripadvisor.com/TypeAheadJson?action=API&startTime=' + str(int(time()))
               + '&uiOrigin=GEOSCOPE&source=GEOSCOPE&interleaved=true&types=geo,theme_park'
                 '&neighborhood_geos=true&link_type=hotel&details=true&max=12&injectNeighborhoods=true')
    # The locality is passed through ``params`` so requests URL-encodes it;
    # spliced in raw, a value containing '&' or '#' would corrupt the query.
    api_response = requests.get(geo_url, params={'query': locality}).json()

    # Without this guard an unknown locality surfaces as a bare IndexError.
    results = api_response.get('results') or []
    if not results:
        print("No search results found for locality:%s" % locality)
        return []
    best_match = results[0]

    # The TripAdvisor URL for the query, taken from the first autocomplete suggestion.
    url_from_autocomplete = "http://www.tripadvisor.com" + best_match['url']
    print('URL found %s' % url_from_autocomplete)
    geo = best_match['value']

    # Stay period in the underscore-separated form sent as 'staydates'.
    date = checkin_date.strftime("%Y_%m_%d") + "_" + checkout_date.strftime("%Y_%m_%d")

    # Form data to get the hotels list from TripAdvisor for the selected dates.
    form_data = {
        'adults': '2',
        'dateBumped': 'NONE',
        'displayedSortOrder': sort,
        'geo': geo,
        'hs': '',
        'isFirstPageLoad': 'false',
        'rad': '0',
        'refineForm': 'true',
        'requestingServlet': 'Hotels',
        'rooms': '1',
        'scid': 'null_coupon',
        'searchAll': 'false',
        'seen': '0',
        'sequence': '7',
        'o': "0",
        'staydates': date
    }
    # Referer is necessary to get the correct response from TripAdvisor;
    # if it is not provided they redirect to the home page.
    headers = {
        'Accept': 'text/javascript, text/html, application/xml, text/xml, */*',
        'Accept-Encoding': 'gzip,deflate',
        'Accept-Language': 'en-US,en;q=0.5',
        'Cache-Control': 'no-cache',
        'Connection': 'keep-alive',
        'Content-Type': 'application/x-www-form-urlencoded; charset=utf-8',
        'Host': 'www.tripadvisor.com',
        'Pragma': 'no-cache',
        'Referer': url_from_autocomplete,
        'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux i686; rv:28.0) Gecko/20100101 Firefox/28.0',
        'X-Requested-With': 'XMLHttpRequest'
    }

    print("Downloading search results page")
    page_response = requests.post(url="https://www.tripadvisor.com/Hotels", data=form_data, headers=headers).text

    print("Parsing results ")
    parser = html.fromstring(page_response)
    hotel_lists = parser.xpath('//div[contains(@class,"hotel_content easyClear sem")]')

    # Loop-invariant selectors and patterns, defined once rather than per hotel.
    XPATH_HOTEL_LINK = './/div[@class="listing_title"]/a/@href'
    XPATH_REVIEWS = './/span[@class="more review_count"]//text()'
    XPATH_RATING = './/div[@class="rating"]//span[contains(@class,"bubble_rating")]/@alt'
    XPATH_HOTEL_NAME = './/a[contains(@class,"property_title")]//text()'
    XPATH_HOTEL_FEATURES = './/a[contains(@class,"tag")]/text()'
    XPATH_HOTEL_PRICE = './/div[contains(@class,"price")]/text()'
    XPATH_VIEW_DEALS = './/div[contains(@id,"VIEW_ALL_DEALS")]//span[@class="taLnk"]/text()'
    XPATH_BOOKING_PROVIDER = './/div[contains(@class,"providerLogo")]/img/@alt'
    # A digit followed by any digits/commas: matches single-digit counts ("5")
    # and counts with several thousands separators ("1,234,567") alike.
    review_count_re = re.compile(r'\d[\d,]*')
    non_digits_re = re.compile(r'\D+')

    hotel_data = []
    for hotel in hotel_lists:
        raw_booking_provider = hotel.xpath(XPATH_BOOKING_PROVIDER)
        raw_no_of_deals = hotel.xpath(XPATH_VIEW_DEALS)
        raw_hotel_link = hotel.xpath(XPATH_HOTEL_LINK)
        raw_no_of_reviews = hotel.xpath(XPATH_REVIEWS)
        raw_rating = hotel.xpath(XPATH_RATING)
        raw_hotel_name = hotel.xpath(XPATH_HOTEL_NAME)
        raw_hotel_features = hotel.xpath(XPATH_HOTEL_FEATURES)
        raw_hotel_price_per_night = hotel.xpath(XPATH_HOTEL_PRICE)

        url = 'http://www.tripadvisor.com' + raw_hotel_link[0] if raw_hotel_link else None
        # Only the first text node is inspected; text without any digit yields None.
        review_match = review_count_re.search(raw_no_of_reviews[0]) if raw_no_of_reviews else None
        reviews = review_match.group().replace(',', '') if review_match else None
        rating = ''.join(raw_rating).replace(' of 5 bubbles', '') if raw_rating else None
        name = ''.join(raw_hotel_name).strip() if raw_hotel_name else None
        hotel_features = ','.join(raw_hotel_features)
        print("name is ", name)
        price_per_night = ''.join(raw_hotel_price_per_night).replace('\n', '') if raw_hotel_price_per_night else None
        no_of_deals = non_digits_re.sub('', ''.join(raw_no_of_deals)) if raw_no_of_deals else None
        booking_provider = ''.join(raw_booking_provider).strip() if raw_booking_provider else None

        data = {
            'hotel_name': name,
            'url': url,
            'locality': locality,
            'reviews': reviews,
            'tripadvisor_rating': rating,
            'checkOut': checkOut,
            'checkIn': checkIn,
            'hotel_features': hotel_features,
            'price_per_night': price_per_night,
            'no_of_deals': no_of_deals,
            'booking_provider': booking_provider
        }
        hotel_data.append(data)
    return hotel_data
if __name__ == '__main__':
    # Command-line entry point: validate the stay dates, scrape the hotel
    # listing for the locality and write the rows to tripadvisor_data.csv.
    parser = argparse.ArgumentParser()
    parser.add_argument('checkin_date', help='Hotel Check In Date (Format: YYYY/MM/DD')
    parser.add_argument('checkout_date', help='Hotel Chek Out Date (Format: YYYY/MM/DD)')
    sortorder_help = """
    available sort orders are :\n
    priceLow - hotels with lowest price,
    distLow : Hotels located near to the search center,
    recommended: highest rated hotels based on traveler reviews,
    popularity :Most popular hotels as chosen by Tipadvisor users
    """
    # nargs='?' is what makes the default effective for a positional argument;
    # the default itself must not carry trailing whitespace.
    parser.add_argument('sort', nargs='?', help=sortorder_help, default='popularity')
    parser.add_argument('locality', help='Search Locality')
    args = parser.parse_args()

    checkin_date = datetime.strptime(args.checkin_date, "%Y/%m/%d")
    checkout_date = datetime.strptime(args.checkout_date, "%Y/%m/%d")
    today = datetime.now()

    # Checking whether an entered date is already passed.
    if checkin_date < today or checkout_date < today:
        print("Invalid Checkin date: Please enter a valid checkin and checkout dates,entered date is already passed")
    # '>=' also rejects a zero-night stay, which previously matched no branch
    # and made the script exit without any output.
    elif checkin_date >= checkout_date:
        print("Invalid Checkin date: CheckIn date must be less than checkOut date")
    else:
        data = parse(args.locality, checkin_date, checkout_date, args.sort)
        print("Writing to output file tripadvisor_data.csv")
        # Binary mode: the file is handed to the unicodecsv writer imported as ``csv``.
        with open('tripadvisor_data.csv', 'wb') as csvfile:
            fieldnames = ['hotel_name', 'url', 'locality', 'reviews', 'tripadvisor_rating', 'checkIn', 'checkOut',
                          'price_per_night', 'booking_provider', 'no_of_deals', 'hotel_features']
            writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
            writer.writeheader()
            for row in data:
                writer.writerow(row)
from lxml import html
import requests
from collections import OrderedDict
import pprint
import json
import argparse
def parse(url):
    """Fetch a TripAdvisor hotel page and extract its details.

    Args:
        url: Address of the hotel page to download.

    Returns:
        A dict with the keys ``address`` (a dict of ``street_address``,
        ``locality``, ``zipcode`` and ``country``), ``ratings`` and
        ``amenities`` (``OrderedDict`` objects filled in page order),
        ``official_description``, ``additional_info``, ``rating``,
        ``review_count``, ``name``, ``rank`` and ``highlights``. Scalar fields
        whose markup is missing are ``None``; ``highlights`` is then an empty
        string.
    """
    # Function-call form of print works on both Python 2 and Python 3.
    print("Fetching "+url)
    response = requests.get(url).text
    parser = html.fromstring(response)

    XPATH_RATING = '//div[@id="ratingFilter"]//ul//li'
    XPATH_NAME = '//h1[@id="HEADING"]//text()'
    XPATH_HOTEL_RATING = '//span[@property="ratingValue"]//@content'
    XPATH_REVIEWS = '//a[@property="reviewCount"]/@content'
    XPATH_RANK = '//div[contains(@class,"popRanking")]//text()'
    XPATH_STREET_ADDRESS = "//span[@class='street-address']//text()"
    XPATH_LOCALITY = '//span[@class="format_address"]//span[@class="locality"]//span[@property="v:locality"]//text()'
    XPATH_ZIP = '//span[@property="v:postal-code"]//text()'
    XPATH_COUNTRY = '//span[@class="country-name"]/@content'
    XPATH_AMENITIES = '//div[@id="AMENITIES_TAB"]//div[contains(@class,"amenity_row")]'
    XPATH_HIGHLIGHTS = '//div[@class="property_tags_wrap"]//li//text()'
    XPATH_OFFICIAL_DESCRIPTION = '//div[contains(@class,"additional_info")]//span[contains(@class,"tabs_descriptive_text")]//text()'
    XPATH_ADDITIONAL_INFO = '//div[@class="additional_info_amenities"]//div[@class="content"]//text()'
    # Selectors evaluated relative to each rating / amenity row; they do not
    # change between iterations, so they are defined once here.
    XPATH_RATING_KEY = './/div[@class="row_label"]//text()'
    XPATH_RATING_VALUE = './/span[@class="row_bar"]/following-sibling::span//text()'
    XPATH_AMENITY_KEY = './/div[@class="amenity_hdr"]//text()'
    XPATH_AMENITY_VALUE = './/div[@class="amenity_lst"]//li/text()'

    ratings = parser.xpath(XPATH_RATING)
    raw_name = parser.xpath(XPATH_NAME)
    raw_rank = parser.xpath(XPATH_RANK)
    raw_street_address = parser.xpath(XPATH_STREET_ADDRESS)
    raw_locality = parser.xpath(XPATH_LOCALITY)
    raw_zipcode = parser.xpath(XPATH_ZIP)
    raw_country = parser.xpath(XPATH_COUNTRY)
    raw_review_count = parser.xpath(XPATH_REVIEWS)
    raw_rating = parser.xpath(XPATH_HOTEL_RATING)
    amenities = parser.xpath(XPATH_AMENITIES)
    raw_highlights = parser.xpath(XPATH_HIGHLIGHTS)
    raw_official_description = parser.xpath(XPATH_OFFICIAL_DESCRIPTION)
    raw_additional_info = parser.xpath(XPATH_ADDITIONAL_INFO)

    name = ''.join(raw_name).strip() if raw_name else None
    rank = ''.join(raw_rank).strip() if raw_rank else None
    street_address = raw_street_address[0].strip() if raw_street_address else None
    locality = raw_locality[0].strip() if raw_locality else None
    zipcode = ''.join(raw_zipcode).strip() if raw_zipcode else None
    country = raw_country[0].strip() if raw_country else None
    review_count = ''.join(raw_review_count).strip() if raw_review_count else None
    hotel_rating = ''.join(raw_rating).strip() if raw_rating else None
    # split()/join collapses every run of whitespace into a single space.
    official_description = ' '.join(' '.join(raw_official_description).split()) if raw_official_description else None
    additional_info = ' '.join(''.join(raw_additional_info).split()) if raw_additional_info else None
    # Drop text nodes that are only a newline, then strip newlines left inside the rest.
    highlights = ','.join(text for text in raw_highlights if text != '\n').replace('\n', '')

    # OrderedDict is used to preserve the site order.
    ratings_dict = OrderedDict()
    for rating in ratings:
        rating_key = ''.join(rating.xpath(XPATH_RATING_KEY)).replace('\n', '')
        rating_value = ''.join(rating.xpath(XPATH_RATING_VALUE)).replace('\n', '')
        ratings_dict[rating_key] = rating_value

    amenity_dict = OrderedDict()
    for amenity in amenities:
        raw_amenity_value = amenity.xpath(XPATH_AMENITY_VALUE)
        amenity_key = ''.join(amenity.xpath(XPATH_AMENITY_KEY)).replace('\n', '')
        # Entries that are a single space are discarded before joining.
        amenity_value = ' ,'.join(item for item in raw_amenity_value if item != ' ').replace('\n', '')
        amenity_dict[amenity_key] = amenity_value

    address = {
        'street_address': street_address,
        'locality': locality,
        'zipcode': zipcode,
        'country': country
    }
    data = {
        'address': address,
        'ratings': ratings_dict,
        'amenities': amenity_dict,
        'official_description': official_description,
        'additional_info': additional_info,
        'rating': hotel_rating,
        'review_count': review_count,
        'name': name,
        'rank': rank,
        'highlights': highlights
    }
    return data
if __name__ == '__main__':
    # Command-line entry point: scrape the hotel page given as the single
    # positional argument and dump the result as indented JSON.
    cli = argparse.ArgumentParser()
    cli.add_argument('url', help='Tripadvisor hotel url')
    hotel_details = parse(cli.parse_args().url)
    with open('tripadvisor_hotel_scraped_data.json', 'w') as out_file:
        json.dump(hotel_details, out_file, indent=4)
@vishwanath79
Copy link
Author

Updated tripadvisor_scraper.py to work on Python 3.5.2.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment