Extract name and phone from PagesJaunes.fr with Python 3, Requests and lxml
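A minimal usage sketch (the script filename and the exact endpoint are my assumptions, not part of the gist): save the file as pagesjaunes_extract.py and run

python3 pagesjaunes_extract.py "https://www.pagesjaunes.fr/annuaire/chercherlespros" .

The form data is hard-coded to a sample search (gardeners in Marseille), so the URL argument should be the PagesJaunes search endpoint that accepts that form; the script then writes extract.csv into the given directory.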
#!/usr/bin/python3
# coding: utf-8
import argparse
import csv
import datetime

import requests
from lxml import html


def extract(url, path):
    """
    Export every Name/Phone pair from a (French) PagesJaunes results page.

    Arguments:
        url (str):
            URL of the targeted PagesJaunes results page
        path (str):
            path to the directory where the .csv is saved

    Return:
        .csv file
    """
    # INITIALIZATION
    r = requests.Session()
    start = datetime.datetime.now()

    # FETCH THE PAGE SOURCE
    # Browser-like headers copied from Firefox; Content-Length is left out
    # because Requests computes it from the body automatically.
    headers = {'Host': 'www.pagesjaunes.fr',
               'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.14; rv:63.0) Gecko/20100101 Firefox/63.0',
               'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
               'Accept-Language': 'fr,fr-FR;q=0.8,en-US;q=0.5,en;q=0.3',
               'Accept-Encoding': 'gzip, deflate, br',
               'Referer': 'https://www.pagesjaunes.fr/',
               'Content-Type': 'application/x-www-form-urlencoded',
               'Connection': 'keep-alive',
               'Upgrade-Insecure-Requests': '1',
               'Cache-Control': 'max-age=0'
               }
    # Form data captured from the browser (a search for "jardinier" in Marseille)
    data = "quoiqui=jardinier&ou=Marseille+%2813%29&idOu=L01305500&quiQuoiSaisi=jard&quiQuoiNbCar=4&ouSaisi=Marseille&ouNbCar=9&acOuSollicitee=1&rangOu=1&sourceOu=TOP&typeOu=Localite&nbPropositionOuTop=5&nbPropositionOuHisto=0&acQuiQuoiSollicitee=1&rangQuiQuoi=4&sourceQuiQuoi=TOP&typeQuiQuoi=1&idQuiQuoi=deb2d94cbf1ecfeae965ba02d84e18a7&nbPropositionQuiQuoiTop=5&nbPropositionQuiQuoiHisto=0"
    # Send the request
    response = r.post(url=url, headers=headers, data=data)
    print('-- URL --')
    print(url)
    print('-- STATUS CODE --')
    print(response.status_code)

    # WRITE THE CSV
    # newline='' avoids blank lines on Windows; utf-8 handles accented names
    with open(path + '/extract.csv', 'w', newline='', encoding='utf-8') as f:
        fieldnames = ['Name', 'Phone']
        writer = csv.DictWriter(f, fieldnames=fieldnames, delimiter='\t')
        writer.writeheader()

        # PARSE THE PAGE
        tree = html.fromstring(response.text)
        articles = tree.xpath("//article[contains(@id, 'bi-bloc-')]")
        for article in articles:
            name = article.xpath(".//a[@class='denomination-links pj-lb pj-link']/text()")
            phone = article.xpath(".//strong[@class='num']/@title")
            if name and phone:
                print(name[0].strip(), phone[0].strip())
                values = [name[0].strip(), phone[0].strip()]
                dict_row = dict(zip(fieldnames, values))
                writer.writerow(dict_row)

    # TIME ELAPSED
    end = datetime.datetime.now()
    time_elapsed = str(end - start)
    print('\n')
    print('-- TIME ELAPSED --')
    print(time_elapsed)


if __name__ == '__main__':
    argparser = argparse.ArgumentParser()
    argparser.add_argument('url', help='PagesJaunes URL to POST the search to')
    argparser.add_argument('path', help='directory where extract.csv is written')
    args = argparser.parse_args()
    # URL OF THE RESULTS PAGE
    url = args.url
    # DIRECTORY WHERE THE CSV IS SAVED
    path = args.path
    # RUN THE EXTRACTION
    extract(url, path)
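The fragile part of the script is the pair of XPath selectors, which depend on the PagesJaunes markup as it looked when the gist was written. Below is a minimal offline sketch for sanity-checking them, assuming a hand-made HTML fragment that merely mimics the class names the script targets (the fragment is invented, not a real PagesJaunes page):

#!/usr/bin/python3
# Offline sanity check for the two XPath selectors used in extract() above.
# SAMPLE is an invented fragment mirroring the targeted class names; it is
# not real PagesJaunes markup.
from lxml import html

SAMPLE = """
<html><body>
  <article id="bi-bloc-1">
    <a class="denomination-links pj-lb pj-link"> Jardins du Sud </a>
    <strong class="num" title="04 91 00 00 01">04 91 00 00 01</strong>
  </article>
  <article id="bi-bloc-2">
    <a class="denomination-links pj-lb pj-link"> Vert Marseille </a>
    <strong class="num" title="04 91 00 00 02">04 91 00 00 02</strong>
  </article>
</body></html>
"""

tree = html.fromstring(SAMPLE)
for article in tree.xpath("//article[contains(@id, 'bi-bloc-')]"):
    name = article.xpath(".//a[@class='denomination-links pj-lb pj-link']/text()")
    phone = article.xpath(".//strong[@class='num']/@title")
    if name and phone:
        print(name[0].strip(), phone[0].strip())
# Expected output:
# Jardins du Sud 04 91 00 00 01
# Vert Marseille 04 91 00 00 02

If the live page yields fewer rows than the site displays, printing len(articles) right after the first xpath call shows whether the selector or the fetched HTML is the bottleneck.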
JujuD28 commented Nov 25, 2018

Hello,
Why do we only collect 11 listings when 20 are displayed on the Pages Jaunes site?
Thanks
