Last active
November 23, 2023 13:21
-
-
Save lobstrio/0729d2fe50e9ea4d38946a79a7db445a to your computer and use it in GitHub Desktop.
Dynamically extract e-mail addresses from Tripadvisor.com, using Python 3, Requests, and lxml
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python3 | |
# coding: utf-8 | |
import requests | |
from lxml import html | |
import datetime | |
import re | |
import argparse | |
def extract_mails(city, nb):
    """
    Collect the e-mail addresses of the first `nb` restaurants of a city
    listed on the TripAdvisor website.

    Arguments:
        city (str):
            name of the city (Paris, Marseille, Lyon, Bordeaux, Amboise)
        nb (int):
            number of restaurants to be scraped

    Return:
        restaurants_dict (dict):
            {'url': [...], 'mail': [...]} — one entry per restaurant,
            'mail' holds '' when no address was found on the page

    Raises:
        KeyError: if the city is not one of the supported cities
        ConnectionError: if any HTTP request does not return status 200

    https://lobstr.io/index.php/2018/11/14/comment-recolter-les-mails-des-restaurants-sur-tripadvisor-avec-python-3-et-request/
    """
    # INITIALISATION
    print('-- INITIALISATION --')
    print('VILLE: {}'.format(city.upper()))
    print('RESTAURANTS: {}'.format(nb))
    print('SITE: TRIPADVISOR.COM')
    s = requests.session()
    start = datetime.datetime.now()
    base_url = 'https://www.tripadvisor.com'
    cities_url = {
        'paris': 'https://www.tripadvisor.com/Restaurants-g187147-Paris_Ile_de_France.html',
        # BUG FIX: this entry was a relative path, unlike every other city,
        # so requests.get() would have failed for Marseille.
        'marseille': 'https://www.tripadvisor.com/Restaurants-g187253-Marseille_Bouches_du_Rhone_Provence_Alpes_Cote_d_Azur.html',
        'lyon': 'https://www.tripadvisor.com/Restaurants-g187265-Lyon_Rhone_Auvergne_Rhone_Alpes.html',
        'bordeaux': 'https://www.tripadvisor.com/Restaurants-g187079-Bordeaux_Gironde_Nouvelle_Aquitaine.html',
        'amboise': 'https://www.tripadvisor.com/Restaurants-g187116-Amboise_Indre_et_Loire_Centre_Val_de_Loire.html'
    }
    try:
        url = cities_url[city.lower()]
    # BUG FIX: a missing dict key raises KeyError, not IndexError, so the
    # friendly message below was never printed.
    except KeyError:
        print('NOT AVAILABLE CITY\nAVAILABLE CITIES: Paris, Marseille, Lyon, Bordeaux, Amboise\nsorry for others :\'\\')
        raise
    restaurants_dict = {'url': [], 'mail': []}
    # Fetch the first listing page.
    response = s.get(url=url)
    if response.status_code != 200:
        raise ConnectionError
    tree = html.fromstring(response.text)
    # Crawl listing pages until `nb` restaurant URLs are collected, or the
    # last page is reached (no "Next" link).
    while len(restaurants_dict['url']) < nb:
        for restaurant_url in tree.xpath("//div[@class='title']/a/@href"):
            if len(restaurants_dict['url']) >= nb:
                break
            restaurants_dict['url'].append(restaurant_url)
        if len(restaurants_dict['url']) >= nb:
            # Done: avoids the original's one wasted request for the page
            # after the quota was already reached.
            break
        next_hrefs = tree.xpath("//a[@data-page-number and contains(text(), 'Next')]/@href")
        if not next_hrefs:
            # ROBUSTNESS: the original indexed [0] unconditionally and
            # crashed with IndexError when the listing ran out of pages.
            break
        response = s.get(url=base_url + next_hrefs[0])
        if response.status_code != 200:
            raise ConnectionError
        tree = html.fromstring(response.text)
    # Harvest mails from each restaurant page with a regex.
    for url in restaurants_dict['url']:
        response = s.get(base_url + url)
        if response.status_code != 200:
            raise ConnectionError
        found = re.findall(r'\w+\@\w+\.\w+', response.text)
        # Keep one slot per restaurant so 'url' and 'mail' stay aligned.
        restaurants_dict['mail'].append(found[0] if found else '')
    # Elapsed time.
    end = datetime.datetime.now()
    time_elapsed = str(end - start)
    print('-- TIME ELAPSED --')
    print(time_elapsed)
    return restaurants_dict
def restaurant_name_from_url(url, city):
    """Derive a readable restaurant name from its TripAdvisor URL path.

    The path looks like '...-Reviews-Restaurant_Name-City_Region.html'.
    BUG FIX: the original split on the literal 'Paris' for every city,
    producing wrong names for Lyon, Marseille, Bordeaux and Amboise.
    """
    name = url.split('Reviews')[-1].split(city.capitalize())[0]
    return name.replace('_', ' ').replace('-', '')


if __name__ == "__main__":
    argparser = argparse.ArgumentParser()
    argparser.add_argument('city', help='Search City\'s restaurants on Tripadvisor.com')
    # type=int: argparse rejects a non-numeric value with a clean usage
    # message instead of the raw ValueError the manual int() cast produced.
    argparser.add_argument('nb', type=int, help='Total restaurants to be collected')
    args = argparser.parse_args()
    city = args.city
    nb = args.nb
    restaurant_dict = extract_mails(city, nb)
    print('\n')
    if restaurant_dict:
        # zip keeps the url/mail pairs aligned without index bookkeeping.
        for url, mail in zip(restaurant_dict['url'], restaurant_dict['mail']):
            print('{}: {}'.format(restaurant_name_from_url(url, city), mail))
        nb_restaurants = len(restaurant_dict['url'])
        nb_mails = len([x for x in restaurant_dict['mail'] if x])
        print("\nPour la ville de {}, nous avons récolté:\n{} RESTAURANTS\n{} MAILS\n".format(city.upper(), nb_restaurants, nb_mails))
    else:
        print('Données indisponibles, veuillez réessayer')
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Hi, thank you for this code. I am a beginner in the Python game — is it normal to wait more than 1 hour to get 10 restaurant addresses in Lille, for example? I am French, and since I saw French cities maybe you are too, so you can reply in FR or ENG as you prefer!