Skip to content

Instantly share code, notes, and snippets.

@lobstrio
Created November 21, 2018 19:05
Show Gist options
  • Save lobstrio/8ba48ad2b7e2d535008a5eb83ff33334 to your computer and use it in GitHub Desktop.
Save lobstrio/8ba48ad2b7e2d535008a5eb83ff33334 to your computer and use it in GitHub Desktop.
Extract name and phone on PageJaunes.fr through Python 3, Request and lxml
#!/usr/bin/python3
# coding: utf-8
import requests
import csv
from lxml import html
import datetime
import argparse
def extract(url, path):
"""
Export all Name/Phone from a (french) PagesJaunes Web Page
Arguments:
url (str):
url of the aimed PagesJaunes Web Page
path (str):
path to the repository to save the .csv
Return:
.csv file
"""
# INITIALISATION
r = requests.session()
start = datetime.datetime.now()
# COLLECTE DU CODE SOURCE
# on modifie les headers
headers = {'Host': 'www.pagesjaunes.fr',
'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.14; rv:63.0) Gecko/20100101 Firefox/63.0',
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
'Accept-Language': 'fr,fr-FR;q=0.8,en-US;q=0.5,en;q=0.3',
'Accept-Encoding': 'gzip, deflate, br',
'Referer': 'https://www.pagesjaunes.fr/',
'Content-Type': 'application/x-www-form-urlencoded',
'Content-Length': '379',
'Connection': 'keep-alive',
'Upgrade-Insecure-Requests': '1',
'Cache-Control': 'max-age=0'
}
# on récupere la data depuis le navigateur
data = "quoiqui=jardinier&ou=Marseille+%2813%29&idOu=L01305500&quiQuoiSaisi=jard&quiQuoiNbCar=4&ouSaisi=Marseille&ouNbCar=9&acOuSollicitee=1&rangOu=1&sourceOu=TOP&typeOu=Localite&nbPropositionOuTop=5&nbPropositionOuHisto=0&acQuiQuoiSollicitee=1&rangQuiQuoi=4&sourceQuiQuoi=TOP&typeQuiQuoi=1&idQuiQuoi=deb2d94cbf1ecfeae965ba02d84e18a7&nbPropositionQuiQuoiTop=5&nbPropositionQuiQuoiHisto=0"
# on envoie la requête
response = r.post(url=url, headers=headers, data=data)
print('-- URL --')
print(url)
print("-- STATUS CODE --")
print(response.status_code)
# CREATION DU CSV
with open(path + '/extract.csv', "w") as f:
fieldnames = ['Name', 'Phone']
writer = csv.DictWriter(f, fieldnames=fieldnames, delimiter='\t')
writer.writeheader()
# PARSING DE LA PAGE
tree = html.fromstring(response.text)
articles = tree.xpath("//article[contains(@id, 'bi-bloc-')]")
for article in articles:
name = article.xpath(".//a[@class='denomination-links pj-lb pj-link']/text()")
phone = article.xpath(".//strong[@class='num']/@title")
if name and phone:
print(name[0].strip(), phone[0].strip())
values = [name[0].strip(), phone[0].strip()]
dict_row = dict(zip(fieldnames, values))
writer.writerow(dict_row)
# TEMPS PASSE
end = datetime.datetime.now()
time_elapsed = str(end-start)
print('\n')
print('-- TIME ELAPSED --')
print(time_elapsed)
if __name__ == '__main__':
argparser = argparse.ArgumentParser()
argparser.add_argument('url', help='PagesJaunes URLs')
argparser.add_argument('path', help='Path to csv')
args = argparser.parse_args()
# URL
url = args.url
# CHEMIN DE SAUVEGARDE DU CSV P
path = args.path
# ON LANCE LA FONCTION
extract(url, path)
@SarraISMAIL
Copy link

moi aussi le code ne renvoie aucun ligne

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment