Dynamically extract restaurant emails from Tripadvisor.com, using Python 3, Requests, and lxml
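The script needs Python 3 with the third-party requests and lxml packages installed (for example via pip install requests lxml); datetime, re and argparse come from the standard library.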
#!/usr/bin/python3
# coding: utf-8

import requests
from lxml import html
import datetime
import re
import argparse

def extract_mails(city, nb):
    """
    Export the email addresses of the first `nb` restaurants of a city,
    through the TripAdvisor website.

    Arguments:
        city (str): name of the city
        nb (int): number of restaurants to be scraped

    Return:
        restaurants_dict (dict): dict with all the emails and restaurant URLs

    https://lobstr.io/index.php/2018/11/14/comment-recolter-les-mails-des-restaurants-sur-tripadvisor-avec-python-3-et-request/
    """
    # INITIALISATION
    print('-- INITIALISATION --')
    print('CITY: {}'.format(city.upper()))
    print('RESTAURANTS: {}'.format(nb))
    print('SITE: TRIPADVISOR.COM')
    s = requests.Session()
    start = datetime.datetime.now()
    base_url = 'https://www.tripadvisor.com'
    # the Marseille entry was missing the domain, which would break s.get()
    cities_url = {
        'paris': 'https://www.tripadvisor.com/Restaurants-g187147-Paris_Ile_de_France.html',
        'marseille': 'https://www.tripadvisor.com/Restaurants-g187253-Marseille_Bouches_du_Rhone_Provence_Alpes_Cote_d_Azur.html',
        'lyon': 'https://www.tripadvisor.com/Restaurants-g187265-Lyon_Rhone_Auvergne_Rhone_Alpes.html',
        'bordeaux': 'https://www.tripadvisor.com/Restaurants-g187079-Bordeaux_Gironde_Nouvelle_Aquitaine.html',
        'amboise': 'https://www.tripadvisor.com/Restaurants-g187116-Amboise_Indre_et_Loire_Centre_Val_de_Loire.html'
    }
    try:
        url = cities_url[city.lower()]
    except KeyError:
        # a missing dict key raises KeyError, not IndexError
        print('CITY NOT AVAILABLE\nAVAILABLE CITIES: Paris, Marseille, Lyon, Bordeaux, Amboise\nsorry for the others :\'\\')
        raise
    restaurants_dict = {'url': [], 'mail': []}
    # FETCH THE SOURCE CODE OF THE LISTING PAGE
    response = s.get(url=url)
    if response.status_code == 200:
        tree = html.fromstring(response.text)
    else:
        raise ConnectionError
    # PARSE AND CRAWL THE RESTAURANT URLS
    while True:
        if len(restaurants_dict['url']) < nb:
            restaurant_urls = tree.xpath("//div[@class='title']/a/@href")
            for restaurant_url in restaurant_urls:
                if len(restaurants_dict['url']) < nb:
                    restaurants_dict['url'].append(restaurant_url)
                else:
                    break
            # assumes a 'Next' link exists until nb restaurants have been collected
            next_page = base_url + tree.xpath("//a[@data-page-number and contains(text(), 'Next')]/@href")[0]
            response = s.get(url=next_page)
            if response.status_code == 200:
                tree = html.fromstring(response.text)
            else:
                raise ConnectionError
        else:
            break
    # HARVEST THE EMAILS WITH A REGEX
    assert len(restaurants_dict['url']) == nb
    for url in restaurants_dict['url']:
        response = s.get(base_url + url)
        if response.status_code == 200:
            # keep the first string that looks like an email address, '' otherwise
            el = re.findall(r'\w+\@\w+\.\w+', response.text)
            if el:
                restaurants_dict['mail'].append(el[0])
            else:
                restaurants_dict['mail'].append('')
        else:
            raise ConnectionError
    # TIME ELAPSED
    end = datetime.datetime.now()
    time_elapsed = str(end - start)
    print('-- TIME ELAPSED --')
    print(time_elapsed)
    return restaurants_dict

if __name__ == "__main__":
    argparser = argparse.ArgumentParser()
    argparser.add_argument('city', help='Search City\'s restaurants on Tripadvisor.com')
    argparser.add_argument('nb', help='Total restaurants to be collected')
    args = argparser.parse_args()
    city = args.city
    nb = int(args.nb)
    restaurant_dict = extract_mails(city, nb)
    print('\n')
    if restaurant_dict:
        for i in range(len(restaurant_dict['url'])):
            # the restaurant name sits between 'Reviews' and the city name in the URL
            name = restaurant_dict['url'][i].split('Reviews')[-1].split(city.title())[0].replace('_', ' ').replace('-', '')
            mail = restaurant_dict['mail'][i]
            print('{}: {}'.format(name, mail))
        nb_restaurants = len(restaurant_dict['url'])
        nb_mails = len([x for x in restaurant_dict['mail'] if x])
        print("\nFor the city of {}, we collected:\n{} RESTAURANTS\n{} MAILS\n".format(city.upper(), nb_restaurants, nb_mails))
    if not restaurant_dict:
        print('Data unavailable, please try again')
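A minimal way to run the script from a terminal, assuming it has been saved as tripadvisor_mails.py (the gist does not name the file):

python3 tripadvisor_mails.py paris 10

The two positional arguments map to city and nb. The script prints one 'restaurant name: email' line per collected URL (an empty email when no address was found on the page), followed by a summary with the number of restaurants and emails collected.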