- add hCaptcha solver
- add dynamic URL (see the sketch below)
- publish no-code scraper on https://lobstr.io/ (?)
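The "dynamic URL" to-do could look roughly like this: instead of the hard-coded URL constant in the script below, read the category URL from the command line. The --url flag, its default, and the helper name are illustrative assumptions, not part of the original gist.

# hypothetical sketch for the "dynamic URL" to-do: take the category URL as a
# command-line argument instead of the hard-coded URL constant (names assumed)
import argparse

def parse_category_url():
    parser = argparse.ArgumentParser(description='Scrape a cDiscount category listing')
    parser.add_argument(
        '--url',
        default='https://www.cdiscount.com/search/10/barbecue.html',
        help='cDiscount category or search URL to scrape',
    )
    return parser.parse_args().url

# e.g. URL = parse_category_url() before instantiating cDiscountScraper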
Scrape all products from a cDiscount hot barbecue category URL
import requests
import re
import json
from lxml import html
import time
from retry import retry
import csv

URL = 'https://www.cdiscount.com/search/10/barbecue.html'

HEADERS = {
    'authority': 'www.cdiscount.com',
    'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
    'accept-language': 'fr-FR,fr;q=0.9',
    'sec-ch-ua': '"Not.A/Brand";v="8", "Chromium";v="114", "Google Chrome";v="114"',
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': '"macOS"',
    'sec-fetch-dest': 'document',
    'sec-fetch-mode': 'navigate',
    'sec-fetch-site': 'none',
    'sec-fetch-user': '?1',
    'upgrade-insecure-requests': '1',
    'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36',
}

FIELDNAMES = [
    "sku",
    "url",
    "image_url",
    "title",
    "score",
    "reviews_count",
    "price",
    "is_cdav"
]

DATA = []


class JavascriptChallengeError(Exception):
    pass


class cDiscountScraper:

    def __init__(self):
        self.s = requests.Session()
        self.s.headers = HEADERS

    def solve_javascript_challenge(self, response):
        # the blocked page embeds a __blnChallengeStore JSON payload (Baleen bot
        # protection): set the cookie it describes, then confirm the challenge
        print('solving javascript challenge')
        raw_data = re.findall(r'(?<=__blnChallengeStore=)\{[^\;]+', response.text)
        assert raw_data and len(raw_data) == 1
        raw_data = ''.join(raw_data)
        raw_data = json.loads(raw_data)
        challenge_cookie = raw_data['cookie']
        challenge_cookie.pop('maxAge')
        challenge_cookie_value = challenge_cookie['value']
        challenge_cookie_name = challenge_cookie['name']
        self.s.cookies.set_cookie(requests.cookies.create_cookie(**challenge_cookie))
        check_params = raw_data['checkChallengeParams']
        url = 'https://www.cdiscount.com/.well-known/baleen/challengejs/check?%s=%s' % (
            challenge_cookie_name,
            challenge_cookie_value
        )
        data = '&'.join(['%s=%s' % (k, v) for k, v in check_params.items()])
        response = self.s.post(url, data)
        assert response.status_code == 200

    @retry(JavascriptChallengeError, tries=5, delay=5, backoff=1)
    def get_cdiscount_data(self):
        print('start')
        response = self.s.get(URL, headers=HEADERS)
        # keep a copy of the raw response for debugging
        with open('first_req.html', 'w') as f:
            f.write(response.text)
        # challenge page detected: solve it, then raise so @retry fetches the URL again
        if 'Le JavaScript n\'est pas' in response.text:
            self.solve_javascript_challenge(response)
            raise JavascriptChallengeError
        assert response.status_code == 200
        # marker expected on the real listing page (not the block page)
        assert 'GEORGES' in response.text
        doc = html.fromstring(response.text)
        products = doc.xpath("//ul[@id='lpBloc']/li[@data-sku]")
        assert products
        for product in products:
            sku = product.get('data-sku')
            url = "".join(product.xpath('./a/@href'))
            assert url.startswith('https://www.cdiscount.com')
            image_url = "".join(product.xpath('.//li/img[@class="prdtBImg"]/@data-src')) or "".join(product.xpath(".//li/img[@class='prdtBImg']/@src"))
            title = "".join(product.xpath('.//h2[@class="prdtTit"]/text()'))
            score = "".join(product.xpath('.//span[@class="c-stars-result c-stars-result--small"]/@data-score'))
            if score:
                # data-score is given out of 100: convert to a rating out of 5
                score = float(score) / 20
            reviews_count = "".join(product.xpath('.//span[@class="c-stars-result c-stars-result--small"]/following-sibling::span/text()'))
            if reviews_count:
                reviews_count = int(reviews_count.strip('()'))
            price = "".join(product.xpath('.//span[contains(@class, "price priceColor hideFromPro") and not(contains(@class, "price--xs"))]/text()')).strip('€')
            is_cdav = len(product.xpath('.//div[@class="cdavZone"]')) > 0
            values = [sku, url, image_url, title, score, reviews_count, price, is_cdav]
            d = dict(zip(FIELDNAMES, values))
            print(*d.values())
            DATA.append(d)
        return DATA

    def write_csv(self, DATA):
        print('starting writing csv')
        with open('cdiscount_results.csv', 'w', newline='') as f:
            writer = csv.DictWriter(f, fieldnames=FIELDNAMES)
            writer.writeheader()
            for d in DATA:
                writer.writerow(d)


if __name__ == '__main__':
    cdiscount_scraper = cDiscountScraper()
    DATA = cdiscount_scraper.get_cdiscount_data()
    cdiscount_scraper.write_csv(DATA)
    print('''~~ success
 _       _         _
| |     | |       | |
| | ___ | |__  ___| |_ _ __
| |/ _ \| '_ \/ __| __/| '__|
| | (_) | |_) \__ \ |_ | |
|_|\___/|_.__/|___/\__||_|
    ''')
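Once the run completes, the results are written to cdiscount_results.csv with the columns listed in FIELDNAMES. A minimal, standalone sketch for reading that file back (standard library only; printing the title, price and CDAV flag is just an illustration, not part of the original gist):

# read back the CSV produced by write_csv() and print a few columns per row
# (the file name and column names come from the script above)
import csv

with open('cdiscount_results.csv', newline='') as f:
    for row in csv.DictReader(f):
        print(row['title'], row['price'], 'CDAV' if row['is_cdav'] == 'True' else '-')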