Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
Extract name and phone on PagesJaunes.fr through Python 3, Requests and lxml
#!/usr/bin/python3
# coding: utf-8
import requests
import csv
from lxml import html
import datetime
import argparse
def extract(url, path):
    """Export all Name/Phone pairs from a (French) PagesJaunes result page.

    Posts a pre-captured search form (gardeners in Marseille) to the given
    PagesJaunes URL, parses the result listing, and writes a tab-separated
    CSV file named ``extract.csv`` in *path*.

    Arguments:
        url (str):
            URL of the targeted PagesJaunes search endpoint.
        path (str):
            directory in which to save the .csv file.

    Return:
        None. Side effect: writes ``<path>/extract.csv``.
    """
    # INITIALISATION
    r = requests.session()
    start = datetime.datetime.now()

    # COLLECT THE PAGE SOURCE
    # Browser-like headers so the request looks like a normal Firefox visit.
    # FIX(review): the original hard-coded 'Content-Length': '379', which is
    # wrong for this payload and can corrupt the request — requests computes
    # the correct Content-Length automatically, so it must not be set by hand.
    headers = {'Host': 'www.pagesjaunes.fr',
               'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.14; rv:63.0) Gecko/20100101 Firefox/63.0',
               'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
               'Accept-Language': 'fr,fr-FR;q=0.8,en-US;q=0.5,en;q=0.3',
               'Accept-Encoding': 'gzip, deflate, br',
               'Referer': 'https://www.pagesjaunes.fr/',
               'Content-Type': 'application/x-www-form-urlencoded',
               'Connection': 'keep-alive',
               'Upgrade-Insecure-Requests': '1',
               'Cache-Control': 'max-age=0'
               }
    # Form payload captured from the browser (a concrete search query).
    data = "quoiqui=jardinier&ou=Marseille+%2813%29&idOu=L01305500&quiQuoiSaisi=jard&quiQuoiNbCar=4&ouSaisi=Marseille&ouNbCar=9&acOuSollicitee=1&rangOu=1&sourceOu=TOP&typeOu=Localite&nbPropositionOuTop=5&nbPropositionOuHisto=0&acQuiQuoiSollicitee=1&rangQuiQuoi=4&sourceQuiQuoi=TOP&typeQuiQuoi=1&idQuiQuoi=deb2d94cbf1ecfeae965ba02d84e18a7&nbPropositionQuiQuoiTop=5&nbPropositionQuiQuoiHisto=0"
    # Send the request.
    # FIX(review): a timeout prevents the script from hanging forever if the
    # server stops responding.
    response = r.post(url=url, headers=headers, data=data, timeout=30)
    print('-- URL --')
    print(url)
    print("-- STATUS CODE --")
    print(response.status_code)

    # CSV CREATION
    # FIX(review): newline='' is required by the csv module (otherwise blank
    # rows appear on Windows) and encoding='utf-8' avoids crashes on accented
    # French names.
    with open(path + '/extract.csv', "w", newline='', encoding='utf-8') as f:
        fieldnames = ['Name', 'Phone']
        writer = csv.DictWriter(f, fieldnames=fieldnames, delimiter='\t')
        writer.writeheader()

        # PAGE PARSING
        tree = html.fromstring(response.text)
        # Each listing is an <article> whose id starts with 'bi-bloc-'.
        articles = tree.xpath("//article[contains(@id, 'bi-bloc-')]")
        for article in articles:
            name = article.xpath(".//a[@class='denomination-links pj-lb pj-link']/text()")
            phone = article.xpath(".//strong[@class='num']/@title")
            # Skip listings missing either field (e.g. unlisted numbers).
            if name and phone:
                print(name[0].strip(), phone[0].strip())
                values = [name[0].strip(), phone[0].strip()]
                dict_row = dict(zip(fieldnames, values))
                writer.writerow(dict_row)

    # TIME ELAPSED
    end = datetime.datetime.now()
    time_elapsed = str(end - start)
    print('\n')
    print('-- TIME ELAPSED --')
    print(time_elapsed)
if __name__ == '__main__':
    # Command-line interface: one positional URL and one output directory.
    parser = argparse.ArgumentParser()
    parser.add_argument('url', help='PagesJaunes URLs')
    parser.add_argument('path', help='Path to csv')
    cli = parser.parse_args()
    # Launch the scraper with the parsed arguments.
    extract(cli.url, cli.path)
@JujuD28

This comment has been minimized.

Copy link

commented Nov 25, 2018

Bonjour,
Pourquoi est-ce que nous collectons uniquement 11 références alors que 20 sont affichées sur le site des Pages Jaunes?
Merci

@Fbo06

This comment has been minimized.

Copy link

commented Mar 26, 2019

Bonjour, super travail, très pédagogique. J'aurais une question SVP : chez moi votre script, bien que renvoyant le code 200, ne renvoie aucune ligne 7 fois sur 10, que peut-il se passer ? Par avance merci pour votre aide

@mohamedNCIR

This comment has been minimized.

Copy link

commented May 7, 2019

Bonjour, de même pour moi, le code ne renvoie aucune ligne (fréquence 10/10)

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.