doctolib_scraper_072023.py
from curl_cffi import requests
from lxml import html
import json
import csv
import time
import argparse
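
# Browser-like request headers, copied from a Chrome 114 session on macOS, so
# the scraper's requests look like a regular browser visit to Doctolib.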
HEADERS = {
    'authority': 'www.doctolib.fr',
    'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
    'accept-language': 'fr-FR,fr;q=0.9',
    'sec-ch-ua': '"Not.A/Brand";v="8", "Chromium";v="114", "Google Chrome";v="114"',
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': '"macOS"',
    'sec-fetch-dest': 'document',
    'sec-fetch-mode': 'navigate',
    'sec-fetch-site': 'none',
    'sec-fetch-user': '?1',
    'upgrade-insecure-requests': '1',
    'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36',
}
FIELDNAMES = [
    'type',
    'name',
    'specialty',
    'url',
    'address_name',
    'address_street',
    'address_postal_code',
    'address_locality',
    'payment_accepted',
]
DATA = []
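
# The scraper relies on curl_cffi's `impersonate` option: beyond the headers
# above, it also mimics Chrome's TLS fingerprint, which a plain requests
# session cannot do.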
class DoctolibScraper:

    def __init__(self):
        self.s = requests.Session(impersonate="chrome101")
        self.s.headers = HEADERS

    def iter_doctors(self, url, max_page):
        assert all([url, max_page])
        for i in range(1, max_page + 1):
            u = url + '?page=%s' % i
            print('going page %s' % i)
            time.sleep(1)
            response = self.s.get(u)
            assert response.status_code == 200
            doc = html.fromstring(response.text)
            # each results page embeds the doctor list as JSON-LD in a single
            # <script type="application/ld+json"> tag
            raw_json_data = doc.xpath("//script[@type='application/ld+json' and contains(text(), 'medicalSpecialty')]")
            if not raw_json_data:
                print('no more data')
                break
            assert len(raw_json_data) == 1
            doctors = json.loads(raw_json_data[0].text)
            assert isinstance(doctors, list)
            for doctor in doctors:
                d = {}
                d['type'] = doctor['@type']
                d['name'] = doctor['name']
                d['specialty'] = doctor['medicalSpecialty']
                d['url'] = 'https://www.doctolib.fr/' + doctor['url']
                _address = doctor['address']
                d['address_name'] = _address['name']
                d['address_street'] = _address['streetAddress']
                d['address_postal_code'] = _address['postalCode']
                d['address_locality'] = _address['addressLocality']
                d['payment_accepted'] = doctor['paymentAccepted']
                print('scraped: %s' % d['name'])
                DATA.append(d)
        return DATA
    def write_csv(self, data):
        filename = 'data_scraping_doctolib_lobstr_io.csv'
        assert data and isinstance(data, list)
        # newline='' prevents blank rows on Windows when writing CSV
        with open(filename, 'w', newline='') as f:
            writer = csv.DictWriter(f, fieldnames=FIELDNAMES)
            writer.writeheader()
            for d in data:
                writer.writerow(d)
        print('write csv: complete')
if __name__ == '__main__':
    s = time.perf_counter()
    argparser = argparse.ArgumentParser()
    argparser.add_argument('--search-url', '-u', type=str, required=False, help='doctolib search url', default='https://www.doctolib.fr/radiologue/lyon')
    argparser.add_argument('--max-page', '-p', type=int, required=False, help='max page to visit', default=2)
    args = argparser.parse_args()
    search_url = args.search_url
    max_page = args.max_page
    assert all([search_url, max_page])
    d = DoctolibScraper()
    data = d.iter_doctors(search_url, max_page)
    d.write_csv(data)
    elapsed = time.perf_counter() - s
    elapsed_formatted = "{:.2f}".format(elapsed)
    print("elapsed:", elapsed_formatted, "s")
    print('''~~ success
 _       _         _
| |     | |       | |
| | ___ | |__  ___| |_ _ __
| |/ _ \| '_ \/ __| __| '__|
| | (_) | |_) \__ \ |_| |
|_|\___/|_.__/|___/\__|_|
''')
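
Usage note: a minimal way to drive the scraper, either from the shell or from another script; the import below assumes the file is saved as doctolib_scraper_072023.py (the gist's filename):

# shell invocation (both flags are optional thanks to the defaults):
#   python doctolib_scraper_072023.py -u https://www.doctolib.fr/radiologue/lyon -p 2

# programmatic use, assuming the file is importable as doctolib_scraper_072023:
from doctolib_scraper_072023 import DoctolibScraper

scraper = DoctolibScraper()
rows = scraper.iter_doctors('https://www.doctolib.fr/radiologue/lyon', max_page=2)
scraper.write_csv(rows)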