Skip to content

Instantly share code, notes, and snippets.

@cpouldev
Created December 27, 2023 07:34
Show Gist options
  • Save cpouldev/e0628908771071f715388abf361f1193 to your computer and use it in GitHub Desktop.
Save cpouldev/e0628908771071f715388abf361f1193 to your computer and use it in GitHub Desktop.
marketin
# -*- coding: utf-8 -*-
from random import randint
from scrapy import Request
from w3lib import html
from src.scraper.shops.spiders.base import SupermarketSpider
def format_price(p):
    """Convert a scraped price string to a float.

    The text is whitespace-trimmed, spaces and the euro sign are removed, and
    a comma is treated as the decimal separator before parsing.

    Args:
        p: Price text such as ``' 1,50 €'``. May be ``None`` when the
            selector that produced it matched nothing.

    Returns:
        The parsed price as a ``float``, or ``None`` when ``p`` is not a
        string or does not parse as a number.
    """
    try:
        return float(p.strip().replace(' ', '').replace(',', '.').replace('€', ''))
    except (AttributeError, TypeError, ValueError):
        # AttributeError/TypeError: p is None or not a str;
        # ValueError: the cleaned text is not numeric.
        # Anything else (e.g. KeyboardInterrupt) is allowed to propagate.
        return None
class MarketInSpider(SupermarketSpider):
    """Spider that walks market-in.gr category listings and records products.

    Flow: the landing page yields the category links, every category page is
    parsed for product tiles, and pagination is followed while a "next" link
    is present.
    """

    name = 'market-in'
    allowed_domains = ['market-in.gr']

    @staticmethod
    def _spoofed_headers():
        """Return request headers with a randomised ``X-Forwarded-For`` address."""
        return {'X-Forwarded-For': f'46.246.{randint(128, 255)}.{randint(0, 255)}'}

    def start_requests(self):
        """Yield the request for the landing page, handled by ``get_cats``."""
        yield Request(url='https://www.market-in.gr/el-gr/', callback=self.get_cats,
                      headers=self._spoofed_headers())

    def get_cats(self, response):
        """Yield one catalog request per top-level category menu link."""
        cats = response.css('nav.category-menu > div > ul > li > a').xpath('@href').getall()
        for cat in cats:
            # urljoin leaves absolute hrefs untouched and resolves relative
            # ones, which Request would otherwise reject as missing a scheme.
            yield Request(url=response.urljoin(cat), callback=self.parse_catalog,
                          headers=self._spoofed_headers())

    def parse_catalog(self, response):
        """Parse one catalog page.

        For every product tile that has a price, yields its image item and
        hands the product fields to ``self.insert_item``. Finally yields a
        request for the next page when one is linked.
        """
        items = response.css('.product-item')

        # The last pagination entry is treated as the "next" link only when
        # its markup contains 'material-icons'. ``.get()`` returns None when
        # the page has no pagination at all, so fall back to an empty string.
        last_page_item = response.css('.pagination ul li:last-child')
        next_page = None
        if 'material-icons' in (last_page_item.get() or ''):
            next_page = last_page_item.css('a').xpath('@href').get()

        for item in items:
            title = item.css('.product-title a:last-child::text').get()
            url = item.css('a.product-thumb').xpath('@href').get()
            cost_per_unit = None

            price_wrappers = item.css('.product-price')
            if len(price_wrappers) == 2:
                # Two wrappers: the first carries the per-unit cost, the
                # second the regular/sale prices.
                unit_html = price_wrappers[0].css('span.new-price').get()
                if unit_html is not None:
                    cost_per_unit = html.remove_tags(unit_html).strip().replace('\n', '')
                price = price_wrappers[1].css('span.old-price::text').get()
                sale_price = price_wrappers[1].css('span.new-price::text').get()
            else:
                price = item.css('span.old-price::text').get()
                sale_price = item.css('span.new-price::text').get()

            price = format_price(price)
            sale_price = format_price(sale_price)

            if price and not sale_price:
                # No discounted price shown: the sale price equals the regular one.
                sale_price = price
            if not price and not sale_price:
                # Nothing usable to record for this tile.
                continue

            image_url = item.css('a.product-thumb img').xpath('@src').get()
            # NOTE(review): a tile without an <img src> produces
            # 'https://market-in.grNone' here; confirm how get_image_item
            # should treat products that have no image.
            image_url = f'https://market-in.gr{image_url}'
            image_item, image_hash = self.get_image_item(image_url)
            yield image_item

            self.insert_item(
                key=url,
                item=title,
                url=url,
                price=price,
                sale_price=sale_price,
                image=image_hash,
                cost_per_unit=cost_per_unit,
            )

        if next_page:
            # Send the same spoofed header as the other requests and resolve
            # a possibly relative href against the current page.
            yield Request(url=response.urljoin(next_page), callback=self.parse_catalog,
                          headers=self._spoofed_headers())
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment