Last active
July 27, 2019 00:22
-
-
Save muety/be52a86b82bbb1aa9c357125028eb7ca to your computer and use it in GitHub Desktop.
A scraper for restaurant reviews from Tripadvisor
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
'''
A script to scrape restaurant reviews from tripadvisor.com or tripadvisor.de using Selenium.

Author: Ferdinand Mütsch <mail@ferdinand-muetsch.de>
License: MIT
Updated: January 09, 2018

Installation:
    - Install `selenium` and `pandas` using pip
    - Install PhantomJS or get Chrome or Firefox webdriver binaries and add them to your PATH (see http://selenium-python.readthedocs.io/installation.html#drivers)

Usage as library:
    ```
    from tripadvisor_scraper import TripadvisorScraper
    scraper = TripadvisorScraper(engine='chrome')
    reviews = scraper.fetch_reviews('<some_url_here>', 10)
    scraper.close()
    ```

Command-line usage:
    $ python3 tripadvisor_scraper.py -e chrome -n 10 -o my_reviews.csv https://www.tripadvisor.de/Restaurant_Review-g187289-d7595215-Reviews-Dom_Grill_Kitchen_Bar-Karlsruhe_Baden_Wurttemberg.html
'''
import argparse | |
import datetime | |
import locale | |
import logging | |
import re | |
import sys | |
import time | |
import pandas as pd | |
from selenium import webdriver | |
# Regular expression a URL must match (from its start) to be accepted as a
# Tripadvisor restaurant page on the .com or .de domain.
# Written as a raw string so the regex backslashes are not treated as (invalid)
# Python string escapes; the resulting pattern text is unchanged.
URL_PATTERN = r'http(s)?:\/\/.?(www\.)?tripadvisor\.(com|de)\/Restaurant_Review.*'
class Review():
    '''A single scraped review.

    Plain attribute container; the instance ``__dict__`` holds exactly the
    keys ``id``, ``date``, ``title``, ``user`` and ``text`` in that order.
    '''

    def __init__(self, id, date, title, user, text):
        # Assigned left to right, so the attribute order matches the signature.
        self.id, self.date, self.title, self.user, self.text = (
            id, date, title, user, text,
        )
class TripadvisorScraper():
    '''Scrape restaurant reviews from tripadvisor.com or tripadvisor.de.

    A Selenium webdriver is started on construction and must be released with
    `close()` (or by using the instance as a context manager).

    Attributes:
        language: 'en' or 'de'; selects the entry of `i18n` used for parsing.
        locale_backup: LC_TIME setting captured at construction time, restored
            after every `fetch_reviews` call.
        lookup: ids of the reviews already collected during the current
            `fetch_reviews` call (used to skip duplicates).
        driver: the Selenium webdriver instance.
        i18n: per-language "more" button label and `strptime` date format.
    '''

    # Engine name -> attribute of `selenium.webdriver`. Resolved lazily with
    # getattr so that only the requested driver class is ever looked up.
    _ENGINES = {'chrome': 'Chrome', 'firefox': 'Firefox', 'phantomjs': 'PhantomJS'}

    # Locale names tried in order for each language. The first entry is the
    # historical name; the others cover systems that only ship UTF-8 variants.
    _LOCALES = {
        'en': ('en_US', 'en_US.UTF-8', 'en_US.utf8'),
        'de': ('de_DE', 'de_DE.UTF-8', 'de_DE.utf8'),
    }

    # Column order of the DataFrame returned by `fetch_reviews`.
    _COLUMNS = ['id', 'date', 'title', 'user', 'text']

    def __init__(self, engine='phantomjs'):
        '''Start a webdriver.

        Args:
            engine: 'chrome', 'firefox' or 'phantomjs'. Any other value logs a
                warning and falls back to PhantomJS.
        '''
        self.language = 'en'
        # Calling setlocale without a value only queries the current LC_TIME
        # setting; the returned string can be passed back to setlocale later.
        self.locale_backup = locale.setlocale(locale.LC_TIME)
        self.lookup = {}
        self.i18n = {
            'en': {
                'more_btn': 'More',
                'date_format': '%B %d, %Y'
            },
            'de': {
                'more_btn': 'Mehr',
                'date_format': '%d. %B %Y'
            }
        }

        if engine not in self._ENGINES:
            logging.warning('Engine {} not supported. Defaulting to PhantomJS.'.format(engine))
            engine = 'phantomjs'
        self.driver = getattr(webdriver, self._ENGINES[engine])()

    def __enter__(self):
        '''Allow `with TripadvisorScraper(...) as scraper:` usage.'''
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        '''Quit the webdriver when the `with` block ends; never suppresses errors.'''
        self.close()
        return False

    @staticmethod
    def _extract_user_id(element_id):
        '''Return the part of `element_id` between its 4-character prefix and the first '-'.

        Returns None when `element_id` is empty/None or contains no '-'.
        '''
        if not element_id or '-' not in element_id:
            return None
        return element_id[4:element_id.index('-')]

    @staticmethod
    def _apply_time_locale(candidates):
        '''Set LC_TIME to the first installed locale in `candidates`.

        Returns True on success. If none is installed the current locale is
        kept, a warning is logged and False is returned.
        '''
        for name in candidates:
            try:
                locale.setlocale(locale.LC_TIME, name)
            except locale.Error:
                continue
            return True
        logging.warning('None of the locales %s is available. Review dates may fail to parse.', ', '.join(candidates))
        return False

    def _restore_locale(self):
        '''Put LC_TIME back to the value captured in `locale_backup`.'''
        try:
            locale.setlocale(locale.LC_TIME, self.locale_backup)
        except locale.Error:
            logging.warning('Could not restore locale %s.', self.locale_backup)

    @classmethod
    def _to_dataframe(cls, reviews):
        '''Build a DataFrame indexed by review id from objects exposing `__dict__`.

        The columns are fixed up front so that an empty `reviews` list yields
        an empty, correctly shaped frame instead of failing on the missing
        'id' column.
        '''
        frame = pd.DataFrame([r.__dict__ for r in reviews], columns=cls._COLUMNS)
        return frame.set_index('id', drop=True)

    def _parse_review(self, element, date_format):
        '''Turn one review element into a `Review`.

        Raises whatever the webdriver or `strptime` raises when a mandatory
        field is missing or malformed; only the user id is optional.
        '''
        review_id = element.get_attribute('id')
        date = element.find_element_by_class_name('ratingDate').get_attribute('title')
        date = datetime.datetime.strptime(date, date_format)
        title = element.find_element_by_class_name('quote').find_element_by_tag_name('a').find_element_by_class_name('noQuotes').text
        try:
            raw_user = element.find_element_by_class_name('memberOverlayLink').get_attribute('id')
        except Exception:
            # The member link is optional: a review without it gets user=None.
            raw_user = None
        user = self._extract_user_id(raw_user)
        text = element.find_element_by_class_name('partial_entry').text.replace('\n', '')
        return Review(review_id, date, title, user, text)

    def _parse_page(self):
        '''Collect the not-yet-seen reviews of the page currently loaded.

        Returns:
            A list of `Review` objects. Reviews that cannot be parsed and
            reviews whose id is already in `lookup` are skipped with a warning.
        '''
        reviews = []
        texts = self.i18n[self.language]
        try:
            # Best effort: click the "more" link if the page shows one.
            self.driver.find_element_by_xpath('//span[contains(., "{}") and @class="taLnk ulBlueLinks"]'.format(texts['more_btn'])).click()
        except Exception:
            logging.debug('No "%s" link could be clicked on this page.', texts['more_btn'])
        time.sleep(2)  # TODO

        for element in self.driver.find_elements_by_class_name('reviewSelector'):
            try:
                review = self._parse_review(element, texts['date_format'])
            except Exception:
                logging.warning('Couldn\'t fetch review.')
                logging.debug('Review parsing failed.', exc_info=True)
                continue
            if review.id in self.lookup:
                logging.warning('Fetched review {} twice.'.format(review.id))
                continue
            self.lookup[review.id] = True
            reviews.append(review)
        return reviews

    def _set_language(self, url=''):
        '''Select parsing language and LC_TIME locale from the URL's domain.

        '.de' selects German, '.com' selects English. Any other domain logs a
        warning and falls back to English, as the warning message states.
        '''
        if 'tripadvisor.de' in url:
            self.language = 'de'
        elif 'tripadvisor.com' in url:
            self.language = 'en'
        else:
            logging.warning('Tripadvisor domain location not supported. Defaulting to English (.com)')
            self.language = 'en'
        self._apply_time_locale(self._LOCALES[self.language])

    def fetch_reviews(self, url, max_reviews=None, as_dataframe=True):
        '''Scrape reviews starting at `url`, following the "next" button.

        Args:
            url: Tripadvisor restaurant page; must satisfy `is_valid_url`.
            max_reviews: upper bound on the number of reviews; None or 0 means
                no limit.
            as_dataframe: return a DataFrame indexed by review id (default)
                instead of a list of `Review` objects.

        Returns:
            The reviews, or None (after logging a warning) if `url` is invalid.
        '''
        self.lookup = {}
        reviews = []
        if not max_reviews:
            max_reviews = sys.maxsize

        # Validate before touching the locale so an invalid URL has no side effects.
        if not is_valid_url(url):
            logging.warning('Tripadvisor URL not valid.')
            return None

        self._set_language(url)
        try:
            self.driver.get(url)
            time.sleep(2)  # TODO

            while len(reviews) < max_reviews:
                reviews += self._parse_page()
                logging.info('Fetched a total of {} reviews by now.'.format(len(reviews)))
                # The plural lookup returns an empty list instead of raising
                # when the page has no "next" element at all.
                next_buttons = self.driver.find_elements_by_class_name('next')
                if not next_buttons:
                    break
                next_button_container = next_buttons[0]
                if 'disabled' in next_button_container.get_attribute('class'):
                    break
                next_button_container.find_element_by_tag_name('div').click()
        finally:
            # Restore the caller's locale even if scraping raised.
            self._restore_locale()

        reviews = reviews[:max_reviews]
        if as_dataframe:
            return self._to_dataframe(reviews)
        return reviews

    def close(self):
        '''Quit the webdriver and release the browser.'''
        self.driver.quit()
def is_valid_url(url):
    '''Match `url` against URL_PATTERN from the start of the string.

    Returns the resulting match object (truthy) or None when `url` does not
    match.
    '''
    matched = re.match(URL_PATTERN, url)
    return matched
def get_language_by_url(url):
    '''Return 'de' or 'en' depending on which Tripadvisor domain `url` contains.

    The .de domain is checked first; None is returned if neither domain
    appears in `url`.
    '''
    domain_languages = (
        ('tripadvisor.de', 'de'),
        ('tripadvisor.com', 'en'),
    )
    for domain, language in domain_languages:
        if domain in url:
            return language
    return None
def get_id_by_url(url):
    '''Extract the 'd<digits>' identifier from a Tripadvisor restaurant URL.

    For a URL containing 'Restaurant_Review-g187289-d7595215-...' this returns
    'd7595215'. Returns None if `url` fails `is_valid_url` or carries no such
    identifier.
    '''
    if not is_valid_url(url):
        return None
    # Raw string: '\d' in a plain string literal is an invalid escape sequence.
    match = re.match(r'.*Restaurant_Review-g\d+-(d\d+).*', url)
    if match is None:
        return None
    return match.group(1)
def main(argv=None):
    '''Command-line entry point: scrape one restaurant page into a CSV file.

    Args:
        argv: argument list to parse; None means `sys.argv[1:]`.

    Returns:
        Process exit code: 0 on success, 1 if no reviews could be fetched
        because the URL was rejected.
    '''
    parser = argparse.ArgumentParser(description='Scrape restaurant reviews from Tripadvisor (.com or .de).')
    parser.add_argument('url', help='URL to a Tripadvisor restaurant page')
    parser.add_argument('-o', '--out', dest='outfile', help='Path for output CSV file', default='reviews.csv')
    parser.add_argument('-n', dest='max', help='Maximum number of reviews to fetch', default=sys.maxsize, type=int)
    parser.add_argument('-e', '--engine', dest='engine', help='Driver to use', choices=['phantomjs', 'chrome', 'firefox'], default='phantomjs')
    args = parser.parse_args(argv)

    scraper = TripadvisorScraper(engine=args.engine)
    try:
        df = scraper.fetch_reviews(args.url, args.max)
    finally:
        # Always shut the browser down, even when scraping raised.
        scraper.close()

    # fetch_reviews returns None when it rejects the URL.
    if df is None:
        print('No reviews fetched: not a valid Tripadvisor restaurant URL.', file=sys.stderr)
        return 1

    print('Successfully fetched {} reviews.'.format(len(df.index)))
    df.to_csv(args.outfile)
    return 0


if __name__ == '__main__':
    sys.exit(main())
You don't replace URL_PATTERN with the URL you want. Instead, undo that change and leave the line as it was.
What you want to do is launch the script from the command line or from another Python script.
Open a terminal on Windows and make sure it is pointing at the directory that contains this file, then launch the script from there and add the command-line arguments.
For example, to launch yours, copy and paste the following and hit Enter:
python3 tripadvisor_scraper.py -e chrome -n 10 -o cool_reviews_Thowmas.csv
https://www.tripadvisor.com/Restaurant_Review-g187175-d806529-Reviews-Michel_Sarran-Toulouse_Haute_Garonne_Occitanie.html
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Can you explain to a noob how to launch this script? I have installed Selenium into my Chrome browser.
Amazing job
Edit: I launched the script and got this error:
File "C:\Users\thoma\Desktop\Scraper.py", line 35 URL_PATTERN = https://www.tripadvisor.com/Restaurant_Review-g187175-d806529-Reviews-Michel_Sarran-Toulouse_Haute_Garonne_Occitanie.html ^ SyntaxError: invalid syntax
or
File "C:\Users\thoma\Desktop\Scraper.py", line 35 URL_PATTERN = 'https://www.tripadvisor.com/Restaurant_Review-g187175-d806529-Reviews-Michel_Sarran-Toulouse_Haute_Garonne_Occitanie.html ^ SyntaxError: EOL while scanning string literal