Instantly share code, notes, and snippets.

Embed
What would you like to do?
Dynamically extract e-mail addresses from Tripadvisor.com, using Python 3, Requests, and lxml
#!/usr/bin/python3
# coding: utf-8
import requests
from lxml import html
import datetime
import re
import argparse
def extract_mails(city, nb):
    """
    Collect the first e-mail address found on the page of each of the
    first ``nb`` restaurants listed for a city on the TripAdvisor website.

    Arguments:
        city (str):
            name of the city (case-insensitive); must be one of the keys
            of ``cities_url`` below
        nb (int):
            number of restaurants to be scraped; if ``nb`` is zero or
            negative no restaurant is collected

    Return:
        restaurants_dict (dict):
            ``{'url': [...], 'mail': [...]}`` where both lists are
            parallel: ``mail[i]`` is the first address matched on the page
            ``url[i]``, or ``''`` when none was matched. Fewer than ``nb``
            entries are returned when the listing runs out of pages.

    Raises:
        KeyError:
            ``city`` is not one of the supported cities
        ConnectionError:
            a page answered with an HTTP status other than 200

    https://lobstr.io/index.php/2018/11/14/comment-recolter-les-mails-des-restaurants-sur-tripadvisor-avec-python-3-et-request/
    """
    # INITIALISATION
    print('-- INITIALISATION --')
    print('VILLE: {}'.format(city.upper()))
    print('RESTAURANTS: {}'.format(nb))
    print('SITE: TRIPADVISOR.COM')
    start = datetime.datetime.now()
    base_url = 'https://www.tripadvisor.com'
    cities_url = {
        'paris': 'https://www.tripadvisor.com/Restaurants-g187147-Paris_Ile_de_France.html',
        'marseille': 'https://www.tripadvisor.com/Restaurants-g187253-Marseille_Bouches_du_Rhone_Provence_Alpes_Cote_d_Azur.html',
        'lyon': 'https://www.tripadvisor.com/Restaurants-g187265-Lyon_Rhone_Auvergne_Rhone_Alpes.html',
        'bordeaux': 'https://www.tripadvisor.com/Restaurants-g187079-Bordeaux_Gironde_Nouvelle_Aquitaine.html',
        'amboise': 'https://www.tripadvisor.com/Restaurants-g187116-Amboise_Indre_et_Loire_Centre_Val_de_Loire.html',
    }

    # Resolve the city before opening any connection. A missing dict key
    # raises KeyError (not IndexError), so that is what must be caught for
    # the help message to be shown.
    try:
        url = cities_url[city.lower()]
    except KeyError:
        print('NOT AVAILABLE CITY\nAVAILABLE CITIES: Paris, Marseille, Lyon, Bordeaux, Amboise\nsorry for others :\'\\')
        raise

    restaurants_dict = {'url': [], 'mail': []}
    # NOTE: \w does not match '.', '-' or '+', so an address such as
    # "john.doe@mail.com" is captured as "doe@mail.com".
    mail_pattern = re.compile(r'\w+\@\w+\.\w+')

    # The session is used as a context manager so that its connections are
    # released even when a request fails.
    with requests.session() as s:
        # FETCH THE FIRST LISTING PAGE
        tree = html.fromstring(_get_page(s, url).text)

        # PARSE AND CRAWL THE RESTAURANT URLS
        while len(restaurants_dict['url']) < nb:
            restaurant_urls = tree.xpath("//div[@class='title']/a/@href")
            missing = nb - len(restaurants_dict['url'])
            restaurants_dict['url'].extend(restaurant_urls[:missing])
            if len(restaurants_dict['url']) >= nb:
                # Enough URLs: do not download a listing page we would
                # never read.
                break
            next_links = tree.xpath("//a[@data-page-number and contains(text(), 'Next')]/@href")
            if not next_links:
                # Last listing page reached: keep what has been collected
                # instead of failing on a missing "Next" link.
                break
            tree = html.fromstring(_get_page(s, base_url + next_links[0]).text)

        # COLLECT THE MAILS WITH THE REGEX
        for restaurant_url in restaurants_dict['url']:
            response = _get_page(s, base_url + restaurant_url)
            found = mail_pattern.search(response.text)
            restaurants_dict['mail'].append(found.group(0) if found else '')

    # ELAPSED TIME
    end = datetime.datetime.now()
    time_elapsed = str(end - start)
    print('-- TIME ELAPSED --')
    print(time_elapsed)
    return restaurants_dict


def _get_page(session, url):
    """GET ``url`` with ``session``; raise ConnectionError unless the status is 200."""
    response = session.get(url=url)
    if response.status_code != 200:
        raise ConnectionError(
            'GET {} returned HTTP {}'.format(url, response.status_code))
    return response
def restaurant_name_from_url(url):
    """
    Derive a readable restaurant name from a TripAdvisor restaurant URL.

    Keeps what follows the last 'Reviews' marker, drops the final
    hyphen-separated segment, then turns underscores into spaces and removes
    the remaining hyphens.

    Assumes the URL ends with '-<Location>.html', as the listing URLs used
    by this script do; dropping that last segment works for every city,
    unlike splitting on the literal 'Paris'.
    """
    tail = url.split('Reviews')[-1]
    name_part = tail.rsplit('-', 1)[0]
    return name_part.replace('_', ' ').replace('-', '')


if __name__ == "__main__":
    argparser = argparse.ArgumentParser()
    argparser.add_argument('city', help='Search City\'s restaurants on Tripadvisor.com')
    # type=int lets argparse report a usage error instead of a traceback
    # when the count is not a number.
    argparser.add_argument('nb', type=int, help='Total restaurants to be collected')
    args = argparser.parse_args()
    city = args.city
    nb = args.nb
    restaurant_dict = extract_mails(city, nb)
    print('\n')
    # The returned dict always holds its two keys and is therefore always
    # truthy; test the collected URLs so the fallback message is reachable.
    if restaurant_dict and restaurant_dict['url']:
        for restaurant_url, mail in zip(restaurant_dict['url'], restaurant_dict['mail']):
            print('{}: {}'.format(restaurant_name_from_url(restaurant_url), mail))
        nb_restaurants = len(restaurant_dict['url'])
        nb_mails = len([x for x in restaurant_dict['mail'] if x])
        print("\nPour la ville de {}, nous avons récolté:\n{} RESTAURANTS\n{} MAILS\n".format(city.upper(), nb_restaurants, nb_mails))
    else:
        print('Données indisponibles, veuillez réessayer')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment