Extract name and phone from PagesJaunes.fr with Python 3, Requests and lxml
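A minimal usage sketch (the script filename and the exact endpoint are my assumptions, not part of the gist): save the file as pagesjaunes_extract.py and run

python3 pagesjaunes_extract.py "https://www.pagesjaunes.fr/annuaire/chercherlespros" .

The form data is hard-coded to a sample search (gardeners in Marseille), so the URL argument should be the PagesJaunes search endpoint that accepts that form; the script then writes extract.csv into the given directory.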
#!/usr/bin/python3
# coding: utf-8
import argparse
import csv
import datetime

import requests
from lxml import html


def extract(url, path):
    """
    Export every Name/Phone pair from a (French) PagesJaunes results page.

    Arguments:
        url (str):
            URL of the targeted PagesJaunes results page
        path (str):
            path to the directory where the .csv is saved

    Return:
        .csv file
    """
    # INITIALIZATION
    r = requests.Session()
    start = datetime.datetime.now()

    # FETCH THE PAGE SOURCE
    # Browser-like headers copied from Firefox; Content-Length is left out
    # because Requests computes it from the body automatically.
    headers = {'Host': 'www.pagesjaunes.fr',
               'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.14; rv:63.0) Gecko/20100101 Firefox/63.0',
               'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
               'Accept-Language': 'fr,fr-FR;q=0.8,en-US;q=0.5,en;q=0.3',
               'Accept-Encoding': 'gzip, deflate, br',
               'Referer': 'https://www.pagesjaunes.fr/',
               'Content-Type': 'application/x-www-form-urlencoded',
               'Connection': 'keep-alive',
               'Upgrade-Insecure-Requests': '1',
               'Cache-Control': 'max-age=0'
               }
    # Form data captured from the browser (a search for "jardinier" in Marseille)
    data = "quoiqui=jardinier&ou=Marseille+%2813%29&idOu=L01305500&quiQuoiSaisi=jard&quiQuoiNbCar=4&ouSaisi=Marseille&ouNbCar=9&acOuSollicitee=1&rangOu=1&sourceOu=TOP&typeOu=Localite&nbPropositionOuTop=5&nbPropositionOuHisto=0&acQuiQuoiSollicitee=1&rangQuiQuoi=4&sourceQuiQuoi=TOP&typeQuiQuoi=1&idQuiQuoi=deb2d94cbf1ecfeae965ba02d84e18a7&nbPropositionQuiQuoiTop=5&nbPropositionQuiQuoiHisto=0"
    # Send the request
    response = r.post(url=url, headers=headers, data=data)
    print('-- URL --')
    print(url)
    print('-- STATUS CODE --')
    print(response.status_code)

    # WRITE THE CSV
    # newline='' avoids blank lines on Windows; utf-8 handles accented names
    with open(path + '/extract.csv', 'w', newline='', encoding='utf-8') as f:
        fieldnames = ['Name', 'Phone']
        writer = csv.DictWriter(f, fieldnames=fieldnames, delimiter='\t')
        writer.writeheader()

        # PARSE THE PAGE
        tree = html.fromstring(response.text)
        articles = tree.xpath("//article[contains(@id, 'bi-bloc-')]")
        for article in articles:
            name = article.xpath(".//a[@class='denomination-links pj-lb pj-link']/text()")
            phone = article.xpath(".//strong[@class='num']/@title")
            if name and phone:
                print(name[0].strip(), phone[0].strip())
                values = [name[0].strip(), phone[0].strip()]
                dict_row = dict(zip(fieldnames, values))
                writer.writerow(dict_row)

    # TIME ELAPSED
    end = datetime.datetime.now()
    time_elapsed = str(end - start)
    print('\n')
    print('-- TIME ELAPSED --')
    print(time_elapsed)


if __name__ == '__main__':
    argparser = argparse.ArgumentParser()
    argparser.add_argument('url', help='PagesJaunes URL to POST the search to')
    argparser.add_argument('path', help='directory where extract.csv is written')
    args = argparser.parse_args()
    # URL OF THE RESULTS PAGE
    url = args.url
    # DIRECTORY WHERE THE CSV IS SAVED
    path = args.path
    # RUN THE EXTRACTION
    extract(url, path)
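The fragile part of the script is the pair of XPath selectors, which depend on the PagesJaunes markup as it looked when the gist was written. Below is a minimal offline sketch for sanity-checking them, assuming a hand-made HTML fragment that merely mimics the class names the script targets (the fragment is invented, not a real PagesJaunes page):

#!/usr/bin/python3
# Offline sanity check for the two XPath selectors used in extract() above.
# SAMPLE is an invented fragment mirroring the targeted class names; it is
# not real PagesJaunes markup.
from lxml import html

SAMPLE = """
<html><body>
  <article id="bi-bloc-1">
    <a class="denomination-links pj-lb pj-link"> Jardins du Sud </a>
    <strong class="num" title="04 91 00 00 01">04 91 00 00 01</strong>
  </article>
  <article id="bi-bloc-2">
    <a class="denomination-links pj-lb pj-link"> Vert Marseille </a>
    <strong class="num" title="04 91 00 00 02">04 91 00 00 02</strong>
  </article>
</body></html>
"""

tree = html.fromstring(SAMPLE)
for article in tree.xpath("//article[contains(@id, 'bi-bloc-')]"):
    name = article.xpath(".//a[@class='denomination-links pj-lb pj-link']/text()")
    phone = article.xpath(".//strong[@class='num']/@title")
    if name and phone:
        print(name[0].strip(), phone[0].strip())
# Expected output:
# Jardins du Sud 04 91 00 00 01
# Vert Marseille 04 91 00 00 02

If the live page yields fewer rows than the site displays, printing len(articles) right after the first xpath call shows whether the selector or the fetched HTML is the bottleneck.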
JujuD28 commented Nov 25, 2018

Hello,
Why do we only collect 11 listings when 20 are displayed on the Pages Jaunes site?
Thanks
