# Source: GitHub gist cpouldev/b644e3892a46bb53f266c3c8de553803 ("Kritikos"),
# created by @cpouldev on December 27, 2023 07:33.
# -*- coding: utf-8 -*-
import json
import re
import urllib.parse
from scrapy import Request
from src.scraper.shops.spiders.base import SupermarketSpider
# Follow-up endpoints requested once the category tree has been parsed.
# Order matters: index 0 is handled by ``parse_offers`` and index 1 by
# ``parse_products``.
URLS = [
    # HTML offers page.
    'https://kritikos-sm.gr/offers/',
    # JSON product listing.
    'https://kritikos-cxm-production.herokuapp.com/api/v2/products?collection_eq=900&eligible=true',
]
class KritikosSpider(SupermarketSpider):
    """Spider for the Kritikos supermarket (kritikos-sm.gr).

    Crawl flow:

    1. ``parse`` receives the category tree (``start_urls``), stores it in
       ``self.categories`` and schedules the two follow-up requests in ``URLS``.
    2. ``parse_offers`` reads the offers embedded in the ``__NEXT_DATA__``
       script of the offers page.
    3. ``parse_products`` reads the JSON product listing.

    ``get_image_item`` and ``insert_item`` are not defined in this class; they
    are presumably inherited from ``SupermarketSpider`` (verify against base).
    """

    name = 'kritikos'
    allowed_domains = ['kritikos-sm.gr', 'kritikos-cxm-production.herokuapp.com']
    start_urls = [
        'https://kritikos-cxm-production.herokuapp.com/api/v2/categories/tree?collectionType=900&countProduct=true']
    # Category documents keyed by their ``_id``; replaced per instance in ``parse``.
    categories = {}

    # Prefix for product images that are referenced only by file name.
    _IMAGE_BASE_URL = 'https://s3.eu-central-1.amazonaws.com/w4ve/kritikos/products/'
    # Products whose computed price exceeds this value (in euros) are skipped.
    # NOTE(review): presumably a sanity filter against mis-scaled prices - confirm.
    _MAX_PRICE = 100
    # Only offers whose sticker contains this label are stored.
    _OFFER_LABEL = 'Προσφορά'
    # Captures the JSON payload of the Next.js data script. DOTALL lets the
    # payload span several lines.
    _NEXT_DATA_RE = re.compile(
        r'<script id="__NEXT_DATA__" type="application/json">(.*?)</script>',
        re.DOTALL,
    )

    def format_price(self, p):
        """Return ``p`` unchanged, or ``None`` when ``p`` is falsy (0, '', None)."""
        return p or None

    @staticmethod
    def _to_euros(cents):
        """Convert a price given in hundredths of a euro to euros (2 decimals)."""
        return round(cents / 100, 2)

    def _product_image(self, item):
        """Return the image URL of a product dict, or ``None`` if it has none."""
        image = item.get('image')
        if image:
            return image.get('mobileLarge')
        primary = (item.get('images') or {}).get('primary')
        if primary:
            return f'{self._IMAGE_BASE_URL}{primary}'
        return None

    @staticmethod
    def _product_name(item):
        """Build the display name as "<brand> <kind> <type> <quantity>".

        Empty parts are left out, so a missing ``categoryKind`` no longer
        produces a literal "None" in the name.
        """
        brand = item['brand']
        if brand:
            # The API uses '^' where an apostrophe is meant.
            brand = brand.replace('^', "'")
        parts = (brand, item['categoryKind'], item['type'], item['quantity'])
        return ' '.join(str(part) for part in parts if part)

    def _resolve_prices(self, item):
        """Return ``(begin_cents, final_cents)`` for a product, or ``None``.

        ``None`` means the product has no usable price at all. Otherwise
        ``final_cents`` is always set, and ``begin_cents`` is either ``None``
        or a value strictly lower than ``final_cents``.
        """
        begin_cents = self.format_price(item['beginPrice'])
        final_cents = self.format_price(item['finalPrice'])
        if begin_cents is None and final_cents is None:
            return None
        # The explicit None checks avoid the TypeError that comparing None
        # with a number raises on Python 3 when only one price is present.
        # NOTE(review): when beginPrice >= finalPrice the begin price is kept
        # as the selling price and the lower final price is dropped. This is
        # the original behaviour; confirm it is intended.
        if final_cents is None or (begin_cents is not None and begin_cents >= final_cents):
            final_cents = begin_cents
            begin_cents = None
        return begin_cents, final_cents

    def parse(self, response):
        """Store the category tree and schedule the offers and products requests."""
        payload = json.loads(response.body)['payload']
        self.categories = {category['_id']: category for category in payload['categories']}
        yield Request(url=URLS[0], callback=self.parse_offers)
        yield Request(url=URLS[1], callback=self.parse_products)

    def parse_products(self, response):
        """Yield an image item and store a record for every priced product.

        Products without any price, or priced above ``_MAX_PRICE``, are skipped.
        """
        data = json.loads(response.body)
        for item in data['payload']['products']:
            prices = self._resolve_prices(item)
            if prices is None:
                continue
            begin_cents, final_cents = prices

            step = item['stepToUom']
            uom = item['uom']

            # Price for one unit of measure, in euros.
            unit_sale_price = self._to_euros(final_cents)
            unit_price = self._to_euros(begin_cents) if begin_cents else None

            # Price for one purchasable step (``stepToUom`` units of measure).
            sale_price = round(unit_sale_price * step, 2)
            price = round(unit_price * step, 2) if unit_price else None

            if sale_price > self._MAX_PRICE or (price and price > self._MAX_PRICE):
                continue

            per_step = f"{sale_price}€ / {step} {uom}"
            per_unit = f"{unit_sale_price}€ / 1 {uom}"
            cost_per_unit = per_step if per_step == per_unit else f'{per_step}, {per_unit}'

            # Only products that are enabled and available get a product URL.
            if item['enabled'] and item['available']:
                url = f'https://kritikos-sm.gr/{item["slug"]}'
            else:
                url = None

            image_item, image_hash = self.get_image_item(self._product_image(item))
            yield image_item
            self.insert_item(
                item=self._product_name(item),
                key=item['sku'],
                url=url,
                price=price,
                sale_price=sale_price,
                image=image_hash,
                cost_per_unit=cost_per_unit
            )

    def parse_offers(self, response):
        """Yield an image item and store a record for every labelled offer.

        The offers are read from the JSON in the page's ``__NEXT_DATA__``
        script. Offers whose sticker lacks ``_OFFER_LABEL``, or that have no
        price, are skipped.

        Raises:
            ValueError: if the page has no ``__NEXT_DATA__`` script.
        """
        body = response.body.decode()
        match = self._NEXT_DATA_RE.search(body)
        if match is None:
            raise ValueError(f'__NEXT_DATA__ script not found in {response.url}')
        data = json.loads(match.group(1))

        for item in data['props']['pageProps']['offers']:
            sticker = item.get('webSticker') or ''
            if self._OFFER_LABEL not in sticker:
                continue
            offer = sticker.replace(f'{self._OFFER_LABEL} ', '')

            cents = self.format_price(item.get('price'))
            if cents is None:
                # No usable price; dividing None by 100 would raise TypeError.
                continue

            title = item['name']
            image_url = (item.get('image') or {}).get('mobileLarge')
            url = f'https://kritikos-sm.gr/offers/{urllib.parse.quote(title)}-{item["externalId"]}'

            image_item, image_hash = self.get_image_item(image_url)
            yield image_item
            self.insert_item(
                item=title,
                key=item['_id'],
                url=url,
                price=None,
                sale_price=self._to_euros(cents),
                image=image_hash,
                offer=offer
            )
        # The product listing is already scheduled by ``parse``, so it is not
        # requested a second time here.
# (GitHub page footer: "Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment")