Skip to content

Instantly share code, notes, and snippets.

@cpouldev
Created December 27, 2023 07:29
Show Gist options
  • Save cpouldev/773c15c730cb8bc672cf262059515882 to your computer and use it in GitHub Desktop.
Save cpouldev/773c15c730cb8bc672cf262059515882 to your computer and use it in GitHub Desktop.
AB
# -*- coding: utf-8 -*-
import json
from scrapy import Request
from src.scraper.shops.spiders.base import SupermarketSpider
# URL template for the storefront's "loadMore" search endpoint; ``{cat}`` and
# ``{page}`` are str.format placeholders.
# NOTE(review): not referenced in the visible part of this module -- the
# spider below posts GraphQL queries to api.ab.gr instead. Confirm before removing.
BASE_URL = 'https://www.ab.gr/search/{cat}/loadMore?pageSize=20&pageNumber={page}&sort=relevance'
# Prefix prepended to the image ``url`` values returned for each product
# (see AbSpider.parse_catalog).
CDN_URL = 'https://d3hz4baxchepgp.cloudfront.net'
# GraphQL document for the "GetCategoryProductSearch" operation.  It is kept as
# adjacent string literals (joined by the parser at compile time) so that the
# literal can never again be split by a stray physical line break.
_CATEGORY_PRODUCT_SEARCH_QUERY = (
    "query GetCategoryProductSearch($anonymousCartCookie: String, $lang: String, "
    "$searchQuery: String, $pageSize: Int, $pageNumber: Int, $category: String, "
    "$sort: String, $filterFlag: Boolean) {\n"
    " categoryProductSearch(anonymousCartCookie: $anonymousCartCookie, lang: $lang, "
    "searchQuery: $searchQuery, pageSize: $pageSize, pageNumber: $pageNumber, "
    "category: $category, sort: $sort, filterFlag: $filterFlag) {\n"
    " products {\n ...ProductBlockDetails\n __typename\n }\n"
    " breadcrumbs {\n ...Breadcrumbs\n __typename\n }\n"
    " facets {\n ...Facets\n __typename\n }\n"
    " sorts {\n name\n selected\n code\n __typename\n }\n"
    " pagination {\n ...Pagination\n __typename\n }\n"
    " currentQuery {\n query {\n value\n __typename\n }\n __typename\n }\n"
    " categorySearchTree {\n categoryDataList {\n categoryCode\n categoryData {\n"
    " facetData {\n count\n name\n query {\n query {\n value\n __typename\n }\n"
    " url\n __typename\n }\n selected\n __typename\n }\n"
    " subCategories\n __typename\n }\n __typename\n }\n"
    " level\n __typename\n }\n __typename\n }\n}\n\n"
    "fragment ProductBlockDetails on Product {\n available\n averageRating\n"
    " numberOfReviews\n manufacturerName\n manufacturerSubBrandName\n code\n"
    " freshnessDuration\n freshnessDurationTipFormatted\n frozen\n recyclable\n"
    " images {\n format\n imageType\n url\n __typename\n }\n"
    " maxOrderQuantity\n limitedAssortment\n name\n onlineExclusive\n"
    " potentialPromotions {\n alternativePromotionMessage\n code\n priceToBurn\n"
    " promotionType\n range\n redemptionLevel\n toDisplay\n description\n title\n"
    " promoBooster\n simplePromotionMessage\n __typename\n }\n"
    " price {\n approximatePriceSymbol\n currencySymbol\n formattedValue\n"
    " priceType\n supplementaryPriceLabel1\n supplementaryPriceLabel2\n"
    " showStrikethroughPrice\n discountedPriceFormatted\n unit\n unitCode\n"
    " unitPrice\n value\n __typename\n }\n"
    " purchasable\n productProposedPackaging\n productProposedPackaging2\n"
    " stock {\n inStock\n inStockBeforeMaxAdvanceOrderingDate\n partiallyInStock\n"
    " availableFromDate\n __typename\n }\n"
    " url\n previouslyBought\n nutriScoreLetter\n __typename\n}\n\n"
    "fragment Breadcrumbs on SearchBreadcrumb {\n facetCode\n facetName\n"
    " facetValueName\n facetValueCode\n"
    " removeQuery {\n query {\n value\n __typename\n }\n __typename\n }\n"
    " __typename\n}\n\n"
    "fragment Facets on Facet {\n code\n name\n category\n facetUiType\n"
    " values {\n code\n count\n name\n query {\n query {\n value\n __typename\n }\n"
    " __typename\n }\n selected\n __typename\n }\n"
    " __typename\n}\n\n"
    "fragment Pagination on Pagination {\n currentPage\n totalResults\n totalPages\n"
    " sort\n __typename\n}\n"
)


def gen_gql_query(cat_id, page, page_size=20):
    """Build the GraphQL payload that lists the products of one category page.

    Args:
        cat_id: Category code, sent as the ``category`` variable.
        page: Page number, sent as the ``pageNumber`` variable.
        page_size: Products requested per page (``pageSize`` variable).
            Defaults to 20, the value that used to be hard-coded.

    Returns:
        A JSON-serialisable dict with ``operationName``, ``variables`` and
        ``query`` keys.  The ``$sort`` and ``$anonymousCartCookie`` variables
        declared by the query are intentionally left unset.
    """
    return {
        "operationName": "GetCategoryProductSearch",
        "variables": {
            "lang": "gr",
            "searchQuery": "",
            "category": cat_id,
            "pageNumber": page,
            "pageSize": page_size,
            "filterFlag": True,
        },
        "query": _CATEGORY_PRODUCT_SEARCH_QUERY,
    }
def gen_initial_gql_query():
    """Return the GraphQL payload that fetches the category navigation tree.

    The payload runs the ``LeftHandNavigationBar`` operation with an empty
    root category code, a cut-off level of ``"4"`` and language ``"gr"``.
    """
    document = (
        "query LeftHandNavigationBar($rootCategoryCode: String, $cutOffLevel: String, "
        "$lang: String, $topLevelCategoriesToHideIfEmpty: String, "
        "$anonymousCartCookie: String) {\n"
        " leftHandNavigationBar(rootCategoryCode: $rootCategoryCode, "
        "cutOffLevel: $cutOffLevel, lang: $lang, "
        "topLevelCategoriesToHideIfEmpty: $topLevelCategoriesToHideIfEmpty, "
        "anonymousCartCookie: $anonymousCartCookie) {\n"
        " categoryTreeList {\n categoriesInfo {\n categoryCode\n"
        " levelInfo {\n ...CategoryFields\n __typename\n }\n"
        " __typename\n }\n level\n __typename\n }\n"
        " levelInfo {\n ...CategoryFields\n __typename\n }\n"
        " __typename\n }\n}\n\n"
        "fragment CategoryFields on CategoryLevelInfo {\n name\n productCount\n"
        " url\n code\n __typename\n}\n"
    )
    variables = dict(rootCategoryCode="", cutOffLevel="4", lang="gr")

    payload = {}
    payload["operationName"] = "LeftHandNavigationBar"
    payload["variables"] = variables
    payload["query"] = document
    return payload
def format_price(price):
    """Convert a formatted euro amount such as ``'1,50 €'`` into a float.

    The euro sign is dropped and the last comma is treated as the decimal
    separator.  Dots that appear before that comma are thousands separators
    (``'1.234,56 €'`` -> ``1234.56``) and are removed.  A string without a
    comma is handed to ``float`` unchanged, so ``'3.99'`` stays ``3.99``.

    Args:
        price: The formatted price string, or ``None`` when the source has
            no value for the field.

    Returns:
        The amount as a float, or ``None`` when ``price`` is not a string or
        does not contain a parsable number.
    """
    try:
        cleaned = price.replace('€', '')
    except (AttributeError, TypeError):
        # None (field absent) or any other non-str value.
        return None

    whole, comma, fraction = cleaned.rpartition(',')
    if comma:
        # Only the integer part may carry thousands dots; leaving the
        # fraction untouched keeps malformed input (e.g. '1,234.56') invalid.
        cleaned = whole.replace('.', '') + '.' + fraction

    try:
        return float(cleaned)
    except ValueError:
        return None
class AbSpider(SupermarketSpider):
    """Spider that reads the AB (ab.gr) catalogue through its GraphQL endpoint.

    Flow: ``start_requests`` asks for the category navigation tree,
    ``parse_categories`` schedules page 0 of every category found there, and
    ``parse_catalog`` stores the products of a page and schedules the next one.
    """

    name = 'ab'
    allowed_domains = ['ab.gr']
    start_urls = ['https://www.ab.gr/']  # unused for crawling: start_requests is overridden
    download_delay = 5

    def _graphql_request(self, payload, callback, meta=None):
        """Build the JSON POST request shared by every call to the API.

        Args:
            payload: JSON-serialisable GraphQL payload (operation, variables, query).
            callback: Method that will parse the response.
            meta: Optional dict carried over to ``response.meta``.
        """
        return Request(url='https://api.ab.gr/',
                       method='POST',
                       headers={'content-type': 'application/json'},
                       body=json.dumps(payload),
                       callback=callback,
                       meta=meta)

    def catalog_request(self, cat_id, page, cat_name):
        """Return the request for one page of a category's product listing.

        ``cat_id``, ``page`` and ``cat_name`` are stored in the request meta so
        that ``parse_catalog`` can schedule the following page.
        """
        return self._graphql_request(
            gen_gql_query(cat_id, page),
            self.parse_catalog,
            meta={
                'cat_id': cat_id,
                'page': page,
                'cat_name': cat_name,
            })

    def start_requests(self):
        """Start the crawl by requesting the category navigation tree."""
        yield self._graphql_request(gen_initial_gql_query(), self.parse_categories)

    def parse_categories(self, response):
        """Schedule page 0 of every category listed in the navigation tree.

        Only the first entry of ``categoryTreeList`` is read; each of its
        ``categoriesInfo`` groups contributes the categories in ``levelInfo``.
        """
        payload = json.loads(response.body)
        groups = payload['data']['leftHandNavigationBar']['categoryTreeList'][0]['categoriesInfo']
        for group in groups:
            for cat in group['levelInfo']:
                yield self.catalog_request(cat['code'], 0, cat['name'])

    def parse_catalog(self, response):
        """Store the products of one listing page and request the next page.

        For each product the image item returned by ``self.get_image_item`` is
        yielded and the product is recorded through ``self.insert_item``.
        Products without any parsable price are skipped.
        """
        result = json.loads(response.body)['data']['categoryProductSearch']
        total_pages = result['pagination']['totalPages']
        meta = response.meta

        for item in result['products']:
            price_info = item['price']
            cost_per_unit = price_info['supplementaryPriceLabel1']
            # NOTE(review): the original indentation was lost, so the extent of
            # ``if cost_per_unit:`` is reconstructed as "skip products without a
            # per-unit price label" -- confirm against the upstream source.
            if not cost_per_unit:
                continue

            price = format_price(price_info['formattedValue'])
            sale_price = format_price(price_info['discountedPriceFormatted'])
            if not price and not sale_price:
                continue

            brand = item['manufacturerName']
            name = item['name']
            # '-' is treated as "no brand".
            title = f'{brand} {name}' if brand and brand != '-' else name

            # First image in the list-grid rendition, if the product has one.
            image_url = next(
                (f"{CDN_URL}{img['url']}"
                 for img in (item['images'] or [])
                 if img['format'] == 'respListGrid'),
                None)
            # Title of the first promotion; the field gets the same null guard
            # as ``images`` so a missing list does not abort the whole page.
            offer = next(
                (promo['title'] for promo in (item['potentialPromotions'] or [])),
                None)

            image_item, image_hash = self.get_image_item(image_url)
            yield image_item
            self.insert_item(
                item=title,
                url='https://www.ab.gr' + item['url'],
                key=item['code'],
                price=price,
                sale_price=sale_price,
                offer=offer,
                cost_per_unit=cost_per_unit,
                image=image_hash)

        # Pages are numbered from 0 (the first request uses page 0), so the last
        # valid page is total_pages - 1.  Comparing the *next* page avoids one
        # request past the end for every category.
        next_page = meta['page'] + 1
        if next_page >= total_pages:
            return
        yield self.catalog_request(meta['cat_id'], next_page, meta['cat_name'])
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment