Created
December 27, 2023 07:33
-
-
Save cpouldev/b644e3892a46bb53f266c3c8de553803 to your computer and use it in GitHub Desktop.
Kritikos
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*-
import json
import re
import urllib.parse

from scrapy import Request

from src.scraper.shops.spiders.base import SupermarketSpider
# Follow-up URLs requested by KritikosSpider.parse once the category tree
# has been loaded.
URLS = [
    # [0] offers page; handled by KritikosSpider.parse_offers.
    'https://kritikos-sm.gr/offers/',
    # [1] product listing API (JSON); handled by KritikosSpider.parse_products.
    'https://kritikos-cxm-production.herokuapp.com/api/v2/products?collection_eq=900&eligible=true'
]
class KritikosSpider(SupermarketSpider):
    """Spider for the Kritikos offers page and product API.

    ``parse`` receives the category tree requested via ``start_urls`` and
    then schedules the offers page (``URLS[0]``) and the product listing
    (``URLS[1]``). Items are stored through ``insert_item`` and images are
    resolved through ``get_image_item``; both are expected to come from
    ``SupermarketSpider``.
    """

    name = 'kritikos'
    allowed_domains = ['kritikos-sm.gr', 'kritikos-cxm-production.herokuapp.com']
    start_urls = [
        'https://kritikos-cxm-production.herokuapp.com/api/v2/categories/tree?collectionType=900&countProduct=true']
    # Category ``_id`` -> category dict. Rebound per instance in ``parse``;
    # nothing else in this class reads it.
    categories = {}

    # Products whose computed price exceeds this value are skipped.
    # NOTE(review): presumably a sanity filter against bad data - confirm.
    _MAX_PRICE = 100

    # Captures the JSON payload embedded in the offers page. DOTALL lets the
    # payload span several lines.
    _NEXT_DATA_RE = re.compile(
        r'<script id="__NEXT_DATA__" type="application/json">(.*?)</script>',
        re.DOTALL,
    )

    @staticmethod
    def _round2(value):
        """Return ``value`` rounded to two decimals as a float."""
        return float(format(value, '.2f'))

    def format_price(self, p):
        """Return ``p`` unchanged, or ``None`` when ``p`` is falsy."""
        if not p:
            return None
        return p

    def parse(self, response):
        """Index the category tree by ``_id`` and schedule the follow-up requests.

        Yields one request for the offers page and one for the product
        listing.
        """
        payload = json.loads(response.body)['payload']
        self.categories = {cat['_id']: cat for cat in payload['categories']}

        yield Request(url=URLS[0], callback=self.parse_offers)
        yield Request(url=URLS[1], callback=self.parse_products)

    def parse_products(self, response):
        """Store every priced product found in the product-listing JSON.

        Raw ``beginPrice`` / ``finalPrice`` values are divided by 100 and
        then multiplied by ``stepToUom`` to get the stored prices. Products
        with no price at all, or with a price above ``_MAX_PRICE``, are
        skipped. Yields the image item of every stored product.
        """
        data = json.loads(response.body)

        for item in data['payload']['products']:
            vendor_id = item['sku']
            brand = item['brand']
            _type = item['type']
            kind = item['categoryKind']
            qty = item['quantity']

            image = None
            if 'image' in item:
                image = item['image']['mobileLarge']
            elif 'images' in item and 'primary' in item['images']:
                image = f'https://s3.eu-central-1.amazonaws.com/w4ve/kritikos/products/{item["images"]["primary"]}'

            full_price = self.format_price(item['beginPrice'])
            full_sale_price = self.format_price(item['finalPrice'])
            if not full_price and not full_sale_price:
                continue

            # Exactly one of the two may still be None here. Let 0 stand in
            # for the missing side so None is never ordered against a number
            # (a TypeError on Python 3). A lone begin price then becomes the
            # sale price, and a lone final price is kept as is.
            if (full_price or 0) >= (full_sale_price or 0):
                full_sale_price = full_price
                full_price = None

            if not full_sale_price:
                continue

            step = item['stepToUom']
            uom = item['uom']

            full_sale_price = self._round2(full_sale_price / 100)
            sale_price = self._round2(full_sale_price * step)

            price = None
            if full_price:
                full_price = self._round2(full_price / 100)
                # Re-check: rounding may have produced 0.0.
                if full_price:
                    price = self._round2(full_price * step)

            if (price and price > self._MAX_PRICE) or (sale_price and sale_price > self._MAX_PRICE):
                continue

            # Cost for one purchasable step and for one whole unit of measure;
            # collapsed to a single entry when both read the same.
            cost_per_unit_1 = f"{sale_price}€ / {step} {uom}"
            cost_per_unit_2 = f"{full_sale_price}€ / 1 {uom}"
            if cost_per_unit_1 == cost_per_unit_2:
                cost_per_unit = cost_per_unit_1
            else:
                cost_per_unit = f'{cost_per_unit_1}, {cost_per_unit_2}'

            # Only products that are both enabled and available get a link.
            if item['enabled'] and item['available']:
                url = f'https://kritikos-sm.gr/{item["slug"]}'
            else:
                url = None

            # Display name: "<brand> <kind> <type> <quantity>", skipping the
            # optional parts that are empty.
            name = kind
            if brand:
                brand = brand.replace('^', "'")
                name = f'{brand} {name}'
            if _type:
                name = f'{name} {_type}'
            if qty:
                name = f'{name} {qty}'

            image_item, image_hash = self.get_image_item(image)
            yield image_item

            self.insert_item(
                item=name,
                key=vendor_id,
                url=url,
                price=price,
                sale_price=sale_price,
                image=image_hash,
                cost_per_unit=cost_per_unit
            )

    def parse_offers(self, response):
        """Store the offers embedded in the offers page's ``__NEXT_DATA__`` JSON.

        Only offers whose ``webSticker`` contains 'Προσφορά' and that carry a
        price are stored. Yields the image item of every stored offer,
        followed by a request for the product listing.

        Raises:
            ValueError: if the page has no ``__NEXT_DATA__`` script tag.
        """
        body = response.body.decode()

        match = self._NEXT_DATA_RE.search(body)
        if match is None:
            raise ValueError(f'__NEXT_DATA__ script not found in {response.url}')
        data = json.loads(match.group(1).strip())

        for item in data['props']['pageProps']['offers']:
            title = item['name']

            # A missing or null sticker is treated as "not an offer".
            sticker = item.get('webSticker') or ''
            if 'Προσφορά' not in sticker:
                continue
            offer = sticker.replace('Προσφορά ', '')

            vendor_id = item['_id']
            image_url = item['image']['mobileLarge']
            url = f'https://kritikos-sm.gr/offers/{urllib.parse.quote(title)}-{item["externalId"]}'

            # Skip offers without a price, as parse_products does for products.
            raw_price = self.format_price(item['price'])
            if raw_price is None:
                continue
            sale_price = self._round2(raw_price / 100)
            price = None

            image_item, image_hash = self.get_image_item(image_url)
            yield image_item

            self.insert_item(
                item=title,
                key=vendor_id,
                url=url,
                price=price,
                sale_price=sale_price,
                image=image_hash,
                offer=offer
            )

        # NOTE(review): parse() already schedules this URL. Presumably Scrapy's
        # default duplicate filter drops the repeat - confirm it is still needed.
        yield Request(url=URLS[1], callback=self.parse_products)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment