Skip to content

Instantly share code, notes, and snippets.

What would you like to do?
A scraper for restaurant reviews from Tripadvisor
A script to scrape restaurant reviews from tripadvisor.com or tripadvisor.de using Selenium.
Author: Ferdinand Mütsch <>
License: MIT
Updated: January, 09 2018
- Install `selenium` and `pandas` using pip
- Install PhantomJS or get Chrome- or Firefox webdriver binaries and add them to your PATH (see the Selenium documentation on installing browser drivers)
Usage as library:
from tripadvisor_scraper import TripadvisorScraper
scraper = TripadvisorScraper(engine='chrome')
reviews = scraper.fetch_reviews('<some_url_here>', 10)
Command-line usage:
$ python3 tripadvisor_scraper.py <some_url_here> -e chrome -n 10 -o my_reviews.csv
import argparse
import datetime
import locale
import logging
import re
import sys
import time
import pandas as pd
from selenium import webdriver
# Regex a URL must match to be accepted as a Tripadvisor (.com or .de)
# restaurant review page. Written as a raw string so that `\/` and `\.` reach
# the regex engine unchanged instead of being invalid string escape sequences.
URL_PATTERN = r'http(s)?:\/\/.?(www\.)?tripadvisor\.(com|de)\/Restaurant_Review.*'
class Review():
    """Plain record holding the fields scraped for a single review.

    The attribute names double as column names: the scraper turns instances
    into DataFrame rows via ``__dict__`` and indexes the frame by ``id``.
    """

    def __init__(self, id, date, title, user, text):
        """Store the given values unchanged on the instance.

        Args:
            id: Identifier of the review (the DOM ``id`` of its element).
            date: Date of the review.
            title: Headline of the review.
            user: Identifier of the reviewing user, or ``None`` if unknown.
            text: Body text of the review.
        """
        self.id = id
        self.date = date
        self.title = title
        self.user = user
        self.text = text
class TripadvisorScraper():
    """Selenium-driven scraper for Tripadvisor restaurant review pages.

    NOTE(review): the listing this class was recovered from had lost its
    indentation and a number of tokens and lines (``else:`` / ``try:`` /
    ``except:`` markers, the closing braces of ``i18n``, the domain literals
    in ``_set_language``, the page-load and "next page" calls in
    ``fetch_reviews`` and the body of ``close``). Those parts are
    reconstructed from the surrounding code; confirm them against the
    upstream script.
    """

    def __init__(self, engine='phantomjs'):
        """Start a webdriver session.

        Args:
            engine: One of ``'chrome'``, ``'firefox'`` or ``'phantomjs'``.
                Any other value logs a warning and falls back to PhantomJS.
        """
        self.language = 'en'
        # Calling setlocale() without a locale argument only *reads* the
        # current LC_TIME setting, which is exactly the string needed to
        # restore it after scraping.
        self.locale_backup = locale.setlocale(locale.LC_TIME)
        # Ids of the reviews already collected during the current fetch.
        self.lookup = {}

        if engine == 'chrome':
            self.driver = webdriver.Chrome()
        elif engine == 'firefox':
            self.driver = webdriver.Firefox()
        elif engine == 'phantomjs':
            self.driver = webdriver.PhantomJS()
        else:
            logging.warning('Engine {} not supported. Defaulting to PhantomJS.'.format(engine))
            self.driver = webdriver.PhantomJS()

        # Per-language page strings: label of the "expand review" link and
        # the strptime format of the review date's title attribute.
        self.i18n = {
            'en': {
                'more_btn': 'More',
                'date_format': '%B %d, %Y',
            },
            'de': {
                'more_btn': 'Mehr',
                'date_format': '%d. %B %Y',
            },
        }

    def _parse_page(self):
        """Collect the not-yet-seen reviews on the currently loaded page.

        Returns:
            A list of ``Review`` objects. Reviews whose id is already in
            ``self.lookup`` and reviews that fail to parse are left out
            (each with a warning).
        """
        reviews = []

        # Click the "More" link so the review elements carry their full text.
        # NOTE(review): best effort - presumably the link is absent when no
        # review on the page is truncated; confirm against the live markup.
        try:
            self.driver.find_element_by_xpath(
                '//span[contains(., "{}") and @class="taLnk ulBlueLinks"]'.format(
                    self.i18n[self.language]['more_btn'])
            ).click()
        except Exception:
            logging.debug('No "more" link could be clicked on this page.')

        time.sleep(2)  # TODO: replace the fixed delay with an explicit wait

        for element in self.driver.find_elements_by_class_name('reviewSelector'):
            try:
                review = self._parse_review(element)
            except Exception:
                # One malformed review must not abort the whole page.
                logging.warning('Couldn\'t fetch review.')
                continue

            if review.id in self.lookup:
                logging.warning('Fetched review {} twice.'.format(review.id))
            else:
                self.lookup[review.id] = True
                reviews.append(review)

        return reviews

    def _parse_review(self, element):
        """Build a ``Review`` from one ``reviewSelector`` element."""
        review_id = element.get_attribute('id')
        raw_date = element.find_element_by_class_name('ratingDate').get_attribute('title')
        date = datetime.datetime.strptime(raw_date, self.i18n[self.language]['date_format'])
        title = (element.find_element_by_class_name('quote')
                 .find_element_by_tag_name('a')
                 .find_element_by_class_name('noQuotes').text)
        try:
            user = element.find_element_by_class_name('memberOverlayLink').get_attribute('id')
            # Keep what lies between the first four characters and the
            # first '-' of the element id.
            user = user[4:user.index('-')]
        except Exception:
            # No usable member link: record the review without a user.
            user = None
        text = element.find_element_by_class_name('partial_entry').text.replace('\n', '')
        return Review(review_id, date, title, user, text)

    def _set_language(self, url=''):
        """Select page language and LC_TIME locale from the URL's domain.

        Unsupported domains log a warning and select English, as the
        warning text states.
        """
        if 'tripadvisor.de' in url:
            self.language = 'de'
            locale.setlocale(locale.LC_TIME, 'de_DE')
        elif 'tripadvisor.com' in url:
            self.language = 'en'
            locale.setlocale(locale.LC_TIME, 'en_US')
        else:
            logging.warning('Tripadvisor domain location not supported. Defaulting to English (.com)')
            self.language = 'en'

    def fetch_reviews(self, url, max_reviews=None, as_dataframe=True):
        """Scrape up to ``max_reviews`` reviews starting at ``url``.

        Args:
            url: Tripadvisor restaurant page URL.
            max_reviews: Upper bound on the number of reviews; a falsy value
                means "no limit".
            as_dataframe: If true, return a pandas DataFrame indexed by
                review id; otherwise return the list of ``Review`` objects.

        Returns:
            The reviews in the requested form, or ``None`` (after logging a
            warning) when ``url`` is not a valid Tripadvisor URL.
        """
        self.lookup = {}
        reviews = []
        if not max_reviews:
            max_reviews = sys.maxsize

        # Validate before touching the process locale or the browser.
        if not is_valid_url(url):
            logging.warning('Tripadvisor URL not valid.')
            return None

        try:
            self._set_language(url)
            self.driver.get(url)
            time.sleep(2)  # TODO: replace the fixed delay with an explicit wait

            while len(reviews) < max_reviews:
                reviews += self._parse_page()
                logging.info('Fetched a total of {} reviews by now.'.format(len(reviews)))
                next_button_container = self.driver.find_element_by_class_name('next')
                if 'disabled' in next_button_container.get_attribute('class'):
                    break
                # Advance to the next page; without this the loop would
                # re-parse the same page forever.
                next_button_container.click()
        finally:
            # Restore the caller's LC_TIME even if scraping failed midway.
            locale.setlocale(locale.LC_TIME, self.locale_backup)

        reviews = reviews[:max_reviews]
        if as_dataframe:
            return pd.DataFrame.from_records([r.__dict__ for r in reviews]).set_index('id', drop=True)
        return reviews

    def close(self):
        """Shut down the webdriver session."""
        self.driver.quit()
def is_valid_url(url):
    """Match ``url`` against ``URL_PATTERN`` from its first character.

    Returns the resulting match object (truthy) or ``None`` when the URL
    does not match.
    """
    return re.match(URL_PATTERN, url)
def get_language_by_url(url):
    """Return the language code implied by the Tripadvisor domain in ``url``.

    NOTE(review): the domain literals were empty strings in the recovered
    listing (so every URL mapped to 'de'); they are restored here from the
    domains the script says it supports (.com and .de).

    Returns:
        ``'de'`` for tripadvisor.de, ``'en'`` for tripadvisor.com, otherwise
        ``None``.
    """
    if 'tripadvisor.de' in url:
        return 'de'
    if 'tripadvisor.com' in url:
        return 'en'
    return None
def get_id_by_url(url):
    """Extract the ``d<digits>`` token from a Tripadvisor restaurant URL.

    Returns:
        The captured token (e.g. ``'d1234'`` for
        ``.../Restaurant_Review-g187323-d1234-...``), or ``None`` when the
        URL is not valid or carries no such token.
    """
    if not is_valid_url(url):
        return None
    match = re.match(r'.*Restaurant_Review-g\d+-(d\d+).*', url)
    if match is None:
        return None
    # NOTE(review): the recovered listing ended without returning anything on
    # success; returning the single capture group is the evident intent.
    return match.group(1)
if __name__ == '__main__':
    # Command-line entry point: scrape one restaurant page and write the
    # reviews to a CSV file.
    parser = argparse.ArgumentParser(description='Scrape restaurant reviews from Tripadvisor (.com or .de).')
    parser.add_argument('url', help='URL to a Tripadvisor restaurant page')
    parser.add_argument('-o', '--out', dest='outfile', help='Path for output CSV file', default='reviews.csv')
    parser.add_argument('-n', dest='max', help='Maximum number of reviews to fetch', default=sys.maxsize, type=int)
    parser.add_argument('-e', '--engine', dest='engine', help='Driver to use', choices=['phantomjs', 'chrome', 'firefox'], default='phantomjs')
    args = parser.parse_args()

    scraper = TripadvisorScraper(engine=args.engine)
    try:
        df = scraper.fetch_reviews(args.url, args.max)
    finally:
        # Always release the browser, even when scraping fails.
        scraper.close()

    # fetch_reviews logs a warning and returns None for an invalid URL.
    if df is None:
        sys.exit(1)

    # Honor the --out option, which was parsed but never used.
    df.to_csv(args.outfile)
    print('Successfully fetched {} reviews.'.format(len(df.index)))

This comment has been minimized.

Copy link

commented Jul 10, 2018

Can you explain to a noob how to launch this script? I have installed Selenium into my Chrome browser.
Amazing job

Edit i launch the script and i got the error :

File "C:\Users\thoma\Desktop\", line 35 URL_PATTERN = ^ SyntaxError: invalid syntax


File "C:\Users\thoma\Desktop\", line 35 URL_PATTERN = ' ^ SyntaxError: EOL while scanning string literal


This comment has been minimized.

Copy link

commented Mar 1, 2019


You don't replace the URL pattern with the URL you want. Instead, undo that change and leave it as it was.

What you want to do is launch it through the command terminal Or through another python script.

you need to open up terminal using your windows and launch the script through the terminal then make sure to add the commands for example to launch yours, you would need to launch terminal and make sure the terminal is pointing towards your directory with this file in it.

then you would copy paste and hit enter

python3 tripadvisor_scraper.py <some_url_here> -e chrome -n 10 -o cool_reviews_Thowmas.csv

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.