lobstrio/tripadvisor_mail.py

## tripadvisor_mail.py
#!/usr/bin/python3
# coding: utf-8

import requests
from lxml import html
import datetime
import re
import argparse


def _fetch_page(session, url):
    """
    Download ``url`` through ``session`` and return the response body as text.

    Raise:
        ConnectionError: the server answered with a status other than 200
    """
    response = session.get(url=url)
    if response.status_code != 200:
        raise ConnectionError('HTTP {} while fetching {}'.format(response.status_code, url))
    return response.text


def extract_mails(city, nb):

    """
    Collect restaurant URLs and e-mail addresses for a city
    through TripAdvisor website

    The paginated restaurant listing of the city is walked until ``nb``
    restaurant links have been gathered or the listing has no "Next" page
    left; each restaurant page is then downloaded and the first
    e-mail-looking string found in its source is kept.

    Arguments:
        city (str):
            name of the city, case-insensitive; one of
            Paris, Marseille, Lyon, Bordeaux, Amboise
        nb (int):
            number of restaurants to be scraped

    Return:
        restaurants_dict (dict):
            {'url': [...], 'mail': [...]}, two lists of equal length:
            'url' holds the restaurant hrefs exactly as found in the listing,
            'mail' the first address found on each page ('' when none).
            Holds fewer than nb entries when the listing runs out first.

    Raise:
        KeyError: the city is not one of the supported cities
        ConnectionError: a page was answered with a status other than 200

    https://lobstr.io/index.php/2018/11/14/comment-recolter-les-mails-des-restaurants-sur-tripadvisor-avec-python-3-et-request/
    """

    # INITIALISATION
    print('-- INITIALISATION --')
    print('VILLE: {}'.format(city.upper()))
    print('RESTAURANTS: {}'.format(nb))
    print('SITE: TRIPADVISOR.COM')

    base_url = 'https://www.tripadvisor.com'
    cities_url = {
        'paris': 'https://www.tripadvisor.com/Restaurants-g187147-Paris_Ile_de_France.html',
        'marseille': 'https://www.tripadvisor.com/Restaurants-g187253-Marseille_Bouches_du_Rhone_Provence_Alpes_Cote_d_Azur.html',
        'lyon': 'https://www.tripadvisor.com/Restaurants-g187265-Lyon_Rhone_Auvergne_Rhone_Alpes.html',
        'bordeaux': 'https://www.tripadvisor.com/Restaurants-g187079-Bordeaux_Gironde_Nouvelle_Aquitaine.html',
        'amboise': 'https://www.tripadvisor.com/Restaurants-g187116-Amboise_Indre_et_Loire_Centre_Val_de_Loire.html'
    }

    # A missing dict key raises KeyError (not IndexError). The city is resolved
    # before the session is opened so an unsupported name costs no network work.
    try:
        url = cities_url[city.lower()]
    except KeyError:
        print('NOT AVAILABLE CITY\nAVAILABLE CITIES: Paris, Marseille, Lyon, Bordeaux, Amboise\nsorry for others :\'\\')
        raise

    restaurants_dict = {'url': [], 'mail': []}

    # Dots, plus signs and hyphens are legal in addresses; matching only \w
    # truncated "first.last@..." and missed hyphenated domains altogether.
    mail_pattern = re.compile(r'[\w.+-]+@[\w-]+(?:\.[\w-]+)+')

    start = datetime.datetime.now()

    # The context manager releases the pooled connections even on error.
    with requests.session() as s:

        # FETCH THE FIRST LISTING PAGE
        tree = html.fromstring(_fetch_page(s, url))

        # PARSE AND CRAWL THE RESTAURANT URLS
        while len(restaurants_dict['url']) < nb:
            missing = nb - len(restaurants_dict['url'])
            hrefs = tree.xpath("//div[@class='title']/a/@href")
            restaurants_dict['url'].extend(hrefs[:missing])

            # Enough links: do not download a listing page nobody will parse.
            if len(restaurants_dict['url']) >= nb:
                break

            # The last listing page has no "Next" link; stop instead of
            # failing on an empty xpath result.
            next_links = tree.xpath("//a[@data-page-number and contains(text(), 'Next')]/@href")
            if not next_links:
                break
            tree = html.fromstring(_fetch_page(s, base_url + next_links[0]))

        # HARVEST THE MAILS WITH THE REGEX
        for restaurant_url in restaurants_dict['url']:
            page = _fetch_page(s, base_url + restaurant_url)
            found = mail_pattern.search(page)
            restaurants_dict['mail'].append(found.group(0) if found else '')

    # TIME SPENT
    end = datetime.datetime.now()
    time_elapsed = str(end-start)
    print('-- TIME ELAPSED --')
    print(time_elapsed)

    return restaurants_dict


if __name__ == "__main__":

    # Command line: <city> <nb>. type=int lets argparse reject a non-numeric
    # count with a usage message instead of an int() traceback.
    argparser = argparse.ArgumentParser()
    argparser.add_argument('city', help='Search City\'s restaurants on Tripadvisor.com')
    argparser.add_argument('nb', type=int, help='Total restaurants to be collected')

    args = argparser.parse_args()
    city = args.city
    nb = args.nb
    restaurant_dict = extract_mails(city, nb)

    print('\n')
    # The returned dict always carries its two keys, so the dict itself is
    # always truthy; "no data" has to be judged on the URL list.
    urls = restaurant_dict['url'] if restaurant_dict else []
    if urls:
        mails = restaurant_dict['mail']
        for url, mail in zip(urls, mails):
            # Keep what sits between 'Reviews' and the trailing location part.
            # Assumes the location is the last '-'-separated segment of the
            # href (as in the listing URLs) -- TODO confirm. This works for
            # every city, where cutting on the literal 'Paris' did not.
            slug = url.split('Reviews')[-1].rsplit('-', 1)[0]
            name = slug.replace('_', ' ').replace('-', '')
            print('{}: {}'.format(name, mail))

        nb_restaurants = len(urls)
        nb_mails = sum(1 for mail in mails if mail)
        print("\nPour la ville de {}, nous avons récolté:\n{} RESTAURANTS\n{} MAILS\n".format(city.upper(), nb_restaurants, nb_mails))
    else:
        print('Données indisponibles, veuillez réessayer')
	#!/usr/bin/python3
	# coding: utf-8

	import requests
	from lxml import html
	import datetime
	import re
	import argparse


	def _fetch_page(session, url):
	    """
	    Download ``url`` through ``session`` and return the response body as text.

	    Raise:
	        ConnectionError: the server answered with a status other than 200
	    """
	    response = session.get(url=url)
	    if response.status_code != 200:
	        raise ConnectionError('HTTP {} while fetching {}'.format(response.status_code, url))
	    return response.text


	def extract_mails(city, nb):

	    """
	    Collect restaurant URLs and e-mail addresses for a city
	    through TripAdvisor website

	    The paginated restaurant listing of the city is walked until ``nb``
	    restaurant links have been gathered or the listing has no "Next" page
	    left; each restaurant page is then downloaded and the first
	    e-mail-looking string found in its source is kept.

	    Arguments:
	        city (str):
	            name of the city, case-insensitive; one of
	            Paris, Marseille, Lyon, Bordeaux, Amboise
	        nb (int):
	            number of restaurants to be scraped

	    Return:
	        restaurants_dict (dict):
	            {'url': [...], 'mail': [...]}, two lists of equal length:
	            'url' holds the restaurant hrefs exactly as found in the listing,
	            'mail' the first address found on each page ('' when none).
	            Holds fewer than nb entries when the listing runs out first.

	    Raise:
	        KeyError: the city is not one of the supported cities
	        ConnectionError: a page was answered with a status other than 200

	    https://lobstr.io/index.php/2018/11/14/comment-recolter-les-mails-des-restaurants-sur-tripadvisor-avec-python-3-et-request/
	    """

	    # INITIALISATION
	    print('-- INITIALISATION --')
	    print('VILLE: {}'.format(city.upper()))
	    print('RESTAURANTS: {}'.format(nb))
	    print('SITE: TRIPADVISOR.COM')

	    base_url = 'https://www.tripadvisor.com'
	    cities_url = {
	        'paris': 'https://www.tripadvisor.com/Restaurants-g187147-Paris_Ile_de_France.html',
	        'marseille': 'https://www.tripadvisor.com/Restaurants-g187253-Marseille_Bouches_du_Rhone_Provence_Alpes_Cote_d_Azur.html',
	        'lyon': 'https://www.tripadvisor.com/Restaurants-g187265-Lyon_Rhone_Auvergne_Rhone_Alpes.html',
	        'bordeaux': 'https://www.tripadvisor.com/Restaurants-g187079-Bordeaux_Gironde_Nouvelle_Aquitaine.html',
	        'amboise': 'https://www.tripadvisor.com/Restaurants-g187116-Amboise_Indre_et_Loire_Centre_Val_de_Loire.html'
	    }

	    # A missing dict key raises KeyError (not IndexError). The city is resolved
	    # before the session is opened so an unsupported name costs no network work.
	    try:
	        url = cities_url[city.lower()]
	    except KeyError:
	        print('NOT AVAILABLE CITY\nAVAILABLE CITIES: Paris, Marseille, Lyon, Bordeaux, Amboise\nsorry for others :\'\\')
	        raise

	    restaurants_dict = {'url': [], 'mail': []}

	    # Dots, plus signs and hyphens are legal in addresses; matching only \w
	    # truncated "first.last@..." and missed hyphenated domains altogether.
	    mail_pattern = re.compile(r'[\w.+-]+@[\w-]+(?:\.[\w-]+)+')

	    start = datetime.datetime.now()

	    # The context manager releases the pooled connections even on error.
	    with requests.session() as s:

	        # FETCH THE FIRST LISTING PAGE
	        tree = html.fromstring(_fetch_page(s, url))

	        # PARSE AND CRAWL THE RESTAURANT URLS
	        while len(restaurants_dict['url']) < nb:
	            missing = nb - len(restaurants_dict['url'])
	            hrefs = tree.xpath("//div[@class='title']/a/@href")
	            restaurants_dict['url'].extend(hrefs[:missing])

	            # Enough links: do not download a listing page nobody will parse.
	            if len(restaurants_dict['url']) >= nb:
	                break

	            # The last listing page has no "Next" link; stop instead of
	            # failing on an empty xpath result.
	            next_links = tree.xpath("//a[@data-page-number and contains(text(), 'Next')]/@href")
	            if not next_links:
	                break
	            tree = html.fromstring(_fetch_page(s, base_url + next_links[0]))

	        # HARVEST THE MAILS WITH THE REGEX
	        for restaurant_url in restaurants_dict['url']:
	            page = _fetch_page(s, base_url + restaurant_url)
	            found = mail_pattern.search(page)
	            restaurants_dict['mail'].append(found.group(0) if found else '')

	    # TIME SPENT
	    end = datetime.datetime.now()
	    time_elapsed = str(end-start)
	    print('-- TIME ELAPSED --')
	    print(time_elapsed)

	    return restaurants_dict


	if __name__ == "__main__":

	    # Command line: <city> <nb>. type=int lets argparse reject a non-numeric
	    # count with a usage message instead of an int() traceback.
	    argparser = argparse.ArgumentParser()
	    argparser.add_argument('city', help='Search City\'s restaurants on Tripadvisor.com')
	    argparser.add_argument('nb', type=int, help='Total restaurants to be collected')

	    args = argparser.parse_args()
	    city = args.city
	    nb = args.nb
	    restaurant_dict = extract_mails(city, nb)

	    print('\n')
	    # The returned dict always carries its two keys, so the dict itself is
	    # always truthy; "no data" has to be judged on the URL list.
	    urls = restaurant_dict['url'] if restaurant_dict else []
	    if urls:
	        mails = restaurant_dict['mail']
	        for url, mail in zip(urls, mails):
	            # Keep what sits between 'Reviews' and the trailing location part.
	            # Assumes the location is the last '-'-separated segment of the
	            # href (as in the listing URLs) -- TODO confirm. This works for
	            # every city, where cutting on the literal 'Paris' did not.
	            slug = url.split('Reviews')[-1].rsplit('-', 1)[0]
	            name = slug.replace('_', ' ').replace('-', '')
	            print('{}: {}'.format(name, mail))

	        nb_restaurants = len(urls)
	        nb_mails = sum(1 for mail in mails if mail)
	        print("\nPour la ville de {}, nous avons récolté:\n{} RESTAURANTS\n{} MAILS\n".format(city.upper(), nb_restaurants, nb_mails))
	    else:
	        print('Données indisponibles, veuillez réessayer')