@lobstrio
Last active August 11, 2023 21:57
Scrape all products from a cDiscount hot barbecue category URL 🍖
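The script needs requests, lxml and retry installed. It fetches the barbecue search listing, works around the Baleen JavaScript challenge when one is served, and writes every product card to cdiscount_results.csv. A minimal reuse sketch, assuming the gist is saved locally as cdiscount_scraper.py (that file name is an assumption, not part of the gist):

# hypothetical driver, assuming the gist was saved as cdiscount_scraper.py
from cdiscount_scraper import cDiscountScraper

scraper = cDiscountScraper()
rows = scraper.get_cdiscount_data()  # list of dicts keyed by FIELDNAMES
scraper.write_csv(rows)              # writes cdiscount_results.csv

The full script follows.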
import requests
import re
import json
from lxml import html
import time
from retry import retry
import csv
URL = 'https://www.cdiscount.com/search/10/barbecue.html'
HEADERS = {
    'authority': 'www.cdiscount.com',
    'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
    'accept-language': 'fr-FR,fr;q=0.9',
    'sec-ch-ua': '"Not.A/Brand";v="8", "Chromium";v="114", "Google Chrome";v="114"',
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': '"macOS"',
    'sec-fetch-dest': 'document',
    'sec-fetch-mode': 'navigate',
    'sec-fetch-site': 'none',
    'sec-fetch-user': '?1',
    'upgrade-insecure-requests': '1',
    'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36',
}
FIELDNAMES = [
    "sku",
    "url",
    "image_url",
    "title",
    "score",
    "reviews_count",
    "price",
    "is_cdav"
]
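# Fields captured for each product card: `score` is the raw data-score divided
# by 20 (presumably a mark out of 5), `is_cdav` flags cards that show the
# "cdavZone" block (presumably the Cdiscount à Volonté badge), and `price` is
# the displayed price with the euro sign stripped.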
DATA = []
class JavascriptChallengeError(Exception):
    pass


class cDiscountScraper:

    def __init__(self):
        self.s = requests.Session()
        self.s.headers = HEADERS
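    # When Cdiscount's anti-bot layer answers instead of the listing page, the
    # response embeds a __blnChallengeStore JSON object. The method below pulls
    # that object out with a regex, sets the challenge cookie it describes on
    # the session, then posts the checkChallengeParams back to the
    # /.well-known/baleen/challengejs/check endpoint so that the next request
    # goes through.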
    def solve_javascript_challenge(self, response):
        print('solving javascript challenge')
        raw_data = re.findall(r'(?<=__blnChallengeStore=)\{[^\;]+', response.text)
        assert raw_data and len(raw_data) == 1
        raw_data = ''.join(raw_data)
        raw_data = json.loads(raw_data)
        challenge_cookie = raw_data['cookie']
        challenge_cookie.pop('maxAge')
        challenge_cookie_value = challenge_cookie['value']
        challenge_cookie_name = challenge_cookie['name']
        self.s.cookies.set_cookie(requests.cookies.create_cookie(**challenge_cookie))
        check_params = raw_data['checkChallengeParams']
        url = 'https://www.cdiscount.com/.well-known/baleen/challengejs/check?%s=%s' % (
            challenge_cookie_name,
            challenge_cookie_value
        )
        data = '&'.join(['%s=%s' % (k, v) for k, v in check_params.items()])
        response = self.s.post(url, data)
        assert response.status_code == 200
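    # Fetch the listing page (dumped to first_req.html for debugging). If the
    # "Le JavaScript n'est pas..." interstitial shows up, solve the challenge
    # and raise JavascriptChallengeError so that @retry re-runs the whole
    # method with the now-authorised session (up to 5 tries, 5 seconds apart).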
    @retry(JavascriptChallengeError, tries=5, delay=5, backoff=1)
    def get_cdiscount_data(self):
        print('start')
        response = self.s.get(URL, headers=HEADERS)
        with open('first_req.html', 'w') as f:
            f.write(response.text)
        if 'Le JavaScript n\'est pas' in response.text:
            self.solve_javascript_challenge(response)
            raise JavascriptChallengeError
        assert response.status_code == 200
        assert 'GEORGES' in response.text
        doc = html.fromstring(response.text)
        # each product card is an <li> carrying a data-sku attribute inside the #lpBloc list
        products = doc.xpath("//ul[@id='lpBloc']/li[@data-sku]")
        assert products
        for product in products:
            sku = product.get('data-sku')
            url = "".join(product.xpath('./a/@href'))
            assert url.startswith('https://www.cdiscount.com')
            # lazy-loaded thumbnails expose the image in data-src, the others in src
            image_url = "".join(product.xpath('.//li/img[@class="prdtBImg"]/@data-src')) or "".join(product.xpath(".//li/img[@class='prdtBImg']/@src"))
            title = "".join(product.xpath('.//h2[@class="prdtTit"]/text()'))
            score = "".join(product.xpath('.//span[@class="c-stars-result c-stars-result--small"]/@data-score'))
            if score:
                # data-score appears to be out of 100; dividing by 20 yields a mark out of 5
                score = float(score) / 20
            reviews_count = "".join(product.xpath('.//span[@class="c-stars-result c-stars-result--small"]/following-sibling::span/text()'))
            if reviews_count:
                reviews_count = int(reviews_count.strip('()'))
            price = "".join(product.xpath('.//span[contains(@class, "price priceColor hideFromPro") and not(contains(@class, "price--xs"))]/text()')).strip('€')
            is_cdav = len(product.xpath('.//div[@class="cdavZone"]')) > 0
            values = [sku, url, image_url, title, score, reviews_count, price, is_cdav]
            d = dict(zip(FIELDNAMES, values))
            print(*d.values())
            DATA.append(d)
        return DATA
    def write_csv(self, DATA):
        print('starting writing csv')
        # newline='' and an explicit encoding keep the csv output clean across platforms
        with open('cdiscount_results.csv', 'w', newline='', encoding='utf-8') as f:
            writer = csv.DictWriter(f, fieldnames=FIELDNAMES)
            writer.writeheader()
            for d in DATA:
                writer.writerow(d)
if __name__ == '__main__':
    cdiscount_scraper = cDiscountScraper()
    DATA = cdiscount_scraper.get_cdiscount_data()
    cdiscount_scraper.write_csv(DATA)
    print('''~~ success
_ _ _
| | | | | |
| | ___ | |__ ___| |_ __ __
| |/ _ \| '_ \/ __| __/| '__|
| | (_) | |_) \__ \ |_ | |
|_|\___/|_.__/|___/\__||_|
''')