Created
December 27, 2023 07:29
-
-
Save cpouldev/773c15c730cb8bc672cf262059515882 to your computer and use it in GitHub Desktop.
AB
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*- | |
import json | |
from scrapy import Request | |
from src.scraper.shops.spiders.base import SupermarketSpider | |
# URL template with ``{cat}`` and ``{page}`` placeholders for the site's
# "load more" search endpoint. Not referenced by the code visible in this file.
BASE_URL = (
    'https://www.ab.gr/search/{cat}/loadMore'
    '?pageSize=20&pageNumber={page}&sort=relevance'
)

# Prefix prepended to the image paths found in product records
# (see AbSpider.parse_catalog).
CDN_URL = 'https://d3hz4baxchepgp.cloudfront.net'
def gen_gql_query(cat_id, page):
    """Build the GraphQL payload that lists one page of a category's products.

    Args:
        cat_id: Category code, sent as the ``category`` variable.
        page: Page index, sent as the ``pageNumber`` variable.

    Returns:
        A JSON-serialisable dict with the keys ``operationName``,
        ``variables`` and ``query`` (in that order). The page size is fixed
        at 20 and the search query is empty.
    """
    variables = {
        "lang": "gr",
        "searchQuery": "",
        "category": cat_id,
        "pageNumber": page,
        "pageSize": 20,
        "filterFlag": True,
    }
    # The GraphQL document: the operation itself followed by the four
    # fragments it references. Adjacent literals are concatenated by the
    # compiler, so the resulting text is a single string.
    document = (
        "query GetCategoryProductSearch($anonymousCartCookie: String, $lang: String, $searchQuery: String, $pageSize: Int, $pageNumber: Int, $category: String, $sort: String, $filterFlag: Boolean) {\n categoryProductSearch(anonymousCartCookie: $anonymousCartCookie, lang: $lang, searchQuery: $searchQuery, pageSize: $pageSize, pageNumber: $pageNumber, category: $category, sort: $sort, filterFlag: $filterFlag) {\n products {\n ...ProductBlockDetails\n __typename\n }\n breadcrumbs {\n ...Breadcrumbs\n __typename\n }\n facets {\n ...Facets\n __typename\n }\n sorts {\n name\n selected\n code\n __typename\n }\n pagination {\n ...Pagination\n __typename\n }\n currentQuery {\n query {\n value\n __typename\n }\n __typename\n }\n categorySearchTree {\n categoryDataList {\n categoryCode\n categoryData {\n facetData {\n count\n name\n query {\n query {\n value\n __typename\n }\n url\n __typename\n }\n selected\n __typename\n }\n subCategories\n __typename\n }\n __typename\n }\n level\n __typename\n }\n __typename\n }\n}\n\n"
        "fragment ProductBlockDetails on Product {\n available\n averageRating\n numberOfReviews\n manufacturerName\n manufacturerSubBrandName\n code\n freshnessDuration\n freshnessDurationTipFormatted\n frozen\n recyclable\n images {\n format\n imageType\n url\n __typename\n }\n maxOrderQuantity\n limitedAssortment\n name\n onlineExclusive\n potentialPromotions {\n alternativePromotionMessage\n code\n priceToBurn\n promotionType\n range\n redemptionLevel\n toDisplay\n description\n title\n promoBooster\n simplePromotionMessage\n __typename\n }\n price {\n approximatePriceSymbol\n currencySymbol\n formattedValue\n priceType\n supplementaryPriceLabel1\n supplementaryPriceLabel2\n showStrikethroughPrice\n discountedPriceFormatted\n unit\n unitCode\n unitPrice\n value\n __typename\n }\n purchasable\n productProposedPackaging\n productProposedPackaging2\n stock {\n inStock\n inStockBeforeMaxAdvanceOrderingDate\n partiallyInStock\n availableFromDate\n __typename\n }\n url\n previouslyBought\n nutriScoreLetter\n __typename\n}\n\n"
        "fragment Breadcrumbs on SearchBreadcrumb {\n facetCode\n facetName\n facetValueName\n facetValueCode\n removeQuery {\n query {\n value\n __typename\n }\n __typename\n }\n __typename\n}\n\n"
        "fragment Facets on Facet {\n code\n name\n category\n facetUiType\n values {\n code\n count\n name\n query {\n query {\n value\n __typename\n }\n __typename\n }\n selected\n __typename\n }\n __typename\n}\n\n"
        "fragment Pagination on Pagination {\n currentPage\n totalResults\n totalPages\n sort\n __typename\n}\n"
    )
    return {
        "operationName": "GetCategoryProductSearch",
        "variables": variables,
        "query": document,
    }
def gen_initial_gql_query():
    """Build the GraphQL payload that fetches the category navigation tree.

    Returns:
        A JSON-serialisable dict with the keys ``operationName``,
        ``variables`` and ``query`` (in that order). The root category code
        is empty, the cut-off level is ``"4"`` and the language is ``"gr"``.
    """
    variables = {"rootCategoryCode": "", "cutOffLevel": "4", "lang": "gr"}
    # The operation followed by the single fragment it uses; adjacent
    # literals are joined into one string at compile time.
    document = (
        "query LeftHandNavigationBar($rootCategoryCode: String, $cutOffLevel: String, $lang: String, $topLevelCategoriesToHideIfEmpty: String, $anonymousCartCookie: String) {\n leftHandNavigationBar(rootCategoryCode: $rootCategoryCode, cutOffLevel: $cutOffLevel, lang: $lang, topLevelCategoriesToHideIfEmpty: $topLevelCategoriesToHideIfEmpty, anonymousCartCookie: $anonymousCartCookie) {\n categoryTreeList {\n categoriesInfo {\n categoryCode\n levelInfo {\n ...CategoryFields\n __typename\n }\n __typename\n }\n level\n __typename\n }\n levelInfo {\n ...CategoryFields\n __typename\n }\n __typename\n }\n}\n\n"
        "fragment CategoryFields on CategoryLevelInfo {\n name\n productCount\n url\n code\n __typename\n}\n"
    )
    return {
        "operationName": "LeftHandNavigationBar",
        "variables": variables,
        "query": document,
    }
def format_price(price):
    """Convert a formatted euro price such as ``'€1,99'`` into a float.

    The euro sign is removed and the decimal comma is replaced with a dot
    before the text is handed to ``float``.

    Args:
        price: The formatted price text. ``None`` and other non-string
            values are tolerated.

    Returns:
        The price as a float, or ``None`` when the value is missing or
        cannot be parsed as a number.
    """
    try:
        return float(price.replace('€', '').replace(',', '.'))
    except (AttributeError, TypeError, ValueError):
        # AttributeError/TypeError: value is not a str (e.g. None).
        # ValueError: text that is not a number once cleaned up.
        # Anything else (KeyboardInterrupt, SystemExit, ...) must propagate.
        return None
class AbSpider(SupermarketSpider):
    """Spider that crawls the ab.gr product catalogue through its GraphQL API.

    The crawl first requests the category navigation tree, then pages
    through the product listing of every category it finds. Products are
    stored with ``self.insert_item`` and their images are resolved with
    ``self.get_image_item``; both helpers are defined outside this file
    (presumably on ``SupermarketSpider`` -- confirm).
    """

    name = 'ab'
    allowed_domains = ['ab.gr']
    start_urls = ['https://www.ab.gr/']
    download_delay = 5

    def catalog_request(self, cat_id, page, cat_name):
        """Build the POST request for one page of a category's products.

        Args:
            cat_id: Category code passed to the GraphQL query.
            page: Page index to request.
            cat_name: Category name, carried along in the request meta.

        Returns:
            A ``Request`` whose response is handled by ``parse_catalog``.
            ``cat_id``, ``page`` and ``cat_name`` are stored in ``meta`` so
            the callback can request the following page.
        """
        return Request(url='https://api.ab.gr/',
                       method='POST',
                       headers={'content-type': 'application/json'},
                       body=json.dumps(gen_gql_query(cat_id, page)),
                       callback=self.parse_catalog,
                       meta={
                           'cat_id': cat_id,
                           'page': page,
                           'cat_name': cat_name,
                       })

    def start_requests(self):
        """Yield the single request that fetches the category tree."""
        yield Request(url='https://api.ab.gr/',
                      method='POST',
                      headers={'content-type': 'application/json'},
                      body=json.dumps(gen_initial_gql_query()),
                      callback=self.parse_categories)

    def parse_categories(self, response):
        """Yield a first-page catalog request for every listed category.

        Only the first entry of ``categoryTreeList`` is walked; each of its
        ``categoriesInfo`` groups contributes the categories in its
        ``levelInfo`` list.
        """
        payload = json.loads(response.body)
        tree = payload['data']['leftHandNavigationBar']['categoryTreeList']
        for group in tree[0]['categoriesInfo']:
            for cat in group['levelInfo']:
                # Paging starts at index 0.
                yield self.catalog_request(cat['code'], 0, cat['name'])

    def parse_catalog(self, response):
        """Store the products of one catalog page and request the next page.

        Yields:
            For every stored product, the image item returned by
            ``self.get_image_item``; finally, when more pages remain, the
            request for the next page of the same category.
        """
        search = json.loads(response.body)['data']['categoryProductSearch']
        total_pages = search['pagination']['totalPages']
        meta = response.meta

        for item in search['products']:
            price_info = item['price']
            cost_per_unit = price_info['supplementaryPriceLabel1']
            # NOTE(review): products without a unit-price label are skipped
            # entirely -- confirm this is the intended behaviour.
            if not cost_per_unit:
                continue

            price = format_price(price_info['formattedValue'])
            sale_price = format_price(price_info['discountedPriceFormatted'])
            if not price and not sale_price:
                # Nothing usable to record for this product.
                continue

            brand = item['manufacturerName']
            name = item['name']
            # '-' is treated as "no brand", the same as an empty value.
            title = f'{brand} {name}' if brand and brand != '-' else name

            # First image in the 'respListGrid' format, if there is one.
            image_url = next(
                (f"{CDN_URL}{img['url']}"
                 for img in (item['images'] or [])
                 if img['format'] == 'respListGrid'),
                None,
            )
            # Title of the first promotion; a missing/None promotion list is
            # handled the same way as a missing image list.
            offer = next(
                (promo['title'] for promo in (item['potentialPromotions'] or [])),
                None,
            )

            image_item, image_hash = self.get_image_item(image_url)
            yield image_item
            self.insert_item(
                item=title,
                url='https://www.ab.gr' + item['url'],
                key=item['code'],
                price=price,
                sale_price=sale_price,
                offer=offer,
                cost_per_unit=cost_per_unit,
                image=image_hash)

        # The crawl starts at page 0, so the last page index is
        # total_pages - 1; only continue while the next index is in range.
        # NOTE(review): assumes pageNumber is zero-based and totalPages is a
        # page count -- confirm against the API.
        next_page = meta['page'] + 1
        if next_page < total_pages:
            yield self.catalog_request(meta['cat_id'], next_page, meta['cat_name'])
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment