# KareemMAX/scrapper.py

## scrapper.py
import urllib.request
import scrapy
import re


class MohmScrapper(scrapy.Spider):
    """Spider that crawls mohmfurniture.com and yields one dict per product item.

    The crawl descends through four callbacks, forwarding the names picked up
    at each level via ``cb_kwargs``:

    ``parse`` -> ``parse_category`` -> ``parse_product`` -> ``parse_product_item``

    Besides yielding items, ``parse_product_item`` downloads every linked
    image into the current working directory.
    """

    name = 'mohm'
    start_urls = ['https://mohmfurniture.com/products/']
    # Sequential counter embedded in generated product ids; incremented once
    # per page handled by ``parse_product_item``.
    idNum = 1

    def _link_request(self, response, node, callback, **cb_kwargs):
        """Build a request for the first ``<a href>`` found inside *node*.

        Returns ``None`` when *node* has no link, because ``scrapy.Request``
        raises on a ``None`` URL. The href is resolved against the page URL
        so relative links are accepted as well as absolute ones.
        """
        href = node.css("a::attr(href)").get()
        if href is None:
            return None
        return scrapy.Request(
            response.urljoin(href),
            callback=callback,
            cb_kwargs=cb_kwargs,
        )

    def parse(self, response):
        """Yield a ``parse_category`` request for each ``li.eg-monroe-wrapper``.

        The text of the element's ``div.esg-center`` is forwarded as
        ``category_name`` (``None`` when that element has no text).
        """
        for category in response.css("li.eg-monroe-wrapper"):
            request = self._link_request(
                response,
                category,
                self.parse_category,
                category_name=category.css("div.esg-center::text").get(),
            )
            if request is not None:
                yield request

    def parse_category(self, response, category_name):
        """Yield a ``parse_product`` request for each ``li.product`` on the page.

        The ``h2`` text of each element is forwarded as ``product_name``.
        """
        for product in response.css("li.product"):
            request = self._link_request(
                response,
                product,
                self.parse_product,
                category_name=category_name,
                product_name=product.css("h2::text").get(),
            )
            if request is not None:
                yield request

    def parse_product(self, response, category_name, product_name):
        """Yield a ``parse_product_item`` request for each listed product item.

        When the first ``li.product`` carries no ``h3 a`` text, the page is
        handled as a category listing instead (see ``parse_category``).
        """
        product_items = response.css("li.product")
        if not product_items:
            # Nothing to index into; previously this raised IndexError.
            self.logger.warning("No li.product elements on %s", response.url)
            return
        if product_items[0].css("h3 a::text").get() is None:
            # Re-requesting response.url would be discarded by Scrapy's
            # duplicate filter (this URL has just been fetched), so run the
            # category callback on the response that is already in hand.
            yield from self.parse_category(response, category_name)
            return
        for product_item in product_items:
            request = self._link_request(
                response,
                product_item,
                self.parse_product_item,
                category_name=category_name,
                product_name=product_name,
                product_item_name=product_item.css("h3 a::text").get(),
            )
            if request is not None:
                yield request

    def parse_product_item(self, response, category_name, product_name, product_item_name):
        """Download the page's images and yield the scraped product record.

        The id has the form ``<first letter of category>MO<3-digit counter>``.
        Images are saved as ``<id>_<3-digit index>.jpg`` in the working
        directory. ``dimensions`` holds the raw HTML from the first
        "Dimensions" occurrence (case-insensitive) up to the next ``</tr>``
        inside ``div.woo-short-description``, or ``""`` if there is none.
        """
        final_product_name = response.css("div.page-header h1::text").get()
        description = response.css("div.woo-short-description").get()
        images = response.css("a.photoswipe::attr(href)").getall()

        # Strip before taking the initial so surrounding whitespace in the
        # scraped text does not become the id prefix; slicing (not indexing)
        # keeps an empty name from raising IndexError.
        prefix = str(category_name).strip()[:1].upper()
        product_id = prefix + "MO" + "%03i" % self.idNum
        self.idNum += 1

        for img_id, image in enumerate(images, start=1):
            filename = product_id + "_%03i" % img_id + ".jpg"
            # NOTE(review): urlretrieve is a synchronous download performed
            # inside the callback; Scrapy's media pipelines may be a better fit.
            try:
                urllib.request.urlretrieve(response.urljoin(image), filename)
            except (OSError, ValueError) as err:
                # URLError/HTTPError derive from OSError; ValueError covers
                # unsupported URL types. One bad image must not drop the item.
                self.logger.warning("Could not download %s: %s", image, err)

        match = re.search(
            r"Dimensions.*?</tr>",
            str(description).replace('\n', ''),
            re.I,
        )
        dimensions = match.group(0) if match else ""

        yield {
            "id": product_id,
            "category_name": str(category_name),
            "product_name": str(product_name).strip(),
            "product_item_name": str(product_item_name).strip(),
            "final_product_name": str(final_product_name),
            "dimensions": dimensions,
        }
	import urllib.request
	import scrapy
	import re


	class MohmScrapper(scrapy.Spider):
	    """Spider that crawls mohmfurniture.com and yields one dict per product item.

	    The crawl descends through four callbacks, forwarding the names picked up
	    at each level via ``cb_kwargs``:

	    ``parse`` -> ``parse_category`` -> ``parse_product`` -> ``parse_product_item``

	    Besides yielding items, ``parse_product_item`` downloads every linked
	    image into the current working directory.
	    """

	    name = 'mohm'
	    start_urls = ['https://mohmfurniture.com/products/']
	    # Sequential counter embedded in generated product ids; incremented once
	    # per page handled by ``parse_product_item``.
	    idNum = 1

	    def _link_request(self, response, node, callback, **cb_kwargs):
	        """Build a request for the first ``<a href>`` found inside *node*.

	        Returns ``None`` when *node* has no link, because ``scrapy.Request``
	        raises on a ``None`` URL. The href is resolved against the page URL
	        so relative links are accepted as well as absolute ones.
	        """
	        href = node.css("a::attr(href)").get()
	        if href is None:
	            return None
	        return scrapy.Request(
	            response.urljoin(href),
	            callback=callback,
	            cb_kwargs=cb_kwargs,
	        )

	    def parse(self, response):
	        """Yield a ``parse_category`` request for each ``li.eg-monroe-wrapper``.

	        The text of the element's ``div.esg-center`` is forwarded as
	        ``category_name`` (``None`` when that element has no text).
	        """
	        for category in response.css("li.eg-monroe-wrapper"):
	            request = self._link_request(
	                response,
	                category,
	                self.parse_category,
	                category_name=category.css("div.esg-center::text").get(),
	            )
	            if request is not None:
	                yield request

	    def parse_category(self, response, category_name):
	        """Yield a ``parse_product`` request for each ``li.product`` on the page.

	        The ``h2`` text of each element is forwarded as ``product_name``.
	        """
	        for product in response.css("li.product"):
	            request = self._link_request(
	                response,
	                product,
	                self.parse_product,
	                category_name=category_name,
	                product_name=product.css("h2::text").get(),
	            )
	            if request is not None:
	                yield request

	    def parse_product(self, response, category_name, product_name):
	        """Yield a ``parse_product_item`` request for each listed product item.

	        When the first ``li.product`` carries no ``h3 a`` text, the page is
	        handled as a category listing instead (see ``parse_category``).
	        """
	        product_items = response.css("li.product")
	        if not product_items:
	            # Nothing to index into; previously this raised IndexError.
	            self.logger.warning("No li.product elements on %s", response.url)
	            return
	        if product_items[0].css("h3 a::text").get() is None:
	            # Re-requesting response.url would be discarded by Scrapy's
	            # duplicate filter (this URL has just been fetched), so run the
	            # category callback on the response that is already in hand.
	            yield from self.parse_category(response, category_name)
	            return
	        for product_item in product_items:
	            request = self._link_request(
	                response,
	                product_item,
	                self.parse_product_item,
	                category_name=category_name,
	                product_name=product_name,
	                product_item_name=product_item.css("h3 a::text").get(),
	            )
	            if request is not None:
	                yield request

	    def parse_product_item(self, response, category_name, product_name, product_item_name):
	        """Download the page's images and yield the scraped product record.

	        The id has the form ``<first letter of category>MO<3-digit counter>``.
	        Images are saved as ``<id>_<3-digit index>.jpg`` in the working
	        directory. ``dimensions`` holds the raw HTML from the first
	        "Dimensions" occurrence (case-insensitive) up to the next ``</tr>``
	        inside ``div.woo-short-description``, or ``""`` if there is none.
	        """
	        final_product_name = response.css("div.page-header h1::text").get()
	        description = response.css("div.woo-short-description").get()
	        images = response.css("a.photoswipe::attr(href)").getall()

	        # Strip before taking the initial so surrounding whitespace in the
	        # scraped text does not become the id prefix; slicing (not indexing)
	        # keeps an empty name from raising IndexError.
	        prefix = str(category_name).strip()[:1].upper()
	        product_id = prefix + "MO" + "%03i" % self.idNum
	        self.idNum += 1

	        for img_id, image in enumerate(images, start=1):
	            filename = product_id + "_%03i" % img_id + ".jpg"
	            # NOTE(review): urlretrieve is a synchronous download performed
	            # inside the callback; Scrapy's media pipelines may be a better fit.
	            try:
	                urllib.request.urlretrieve(response.urljoin(image), filename)
	            except (OSError, ValueError) as err:
	                # URLError/HTTPError derive from OSError; ValueError covers
	                # unsupported URL types. One bad image must not drop the item.
	                self.logger.warning("Could not download %s: %s", image, err)

	        match = re.search(
	            r"Dimensions.*?</tr>",
	            str(description).replace('\n', ''),
	            re.I,
	        )
	        dimensions = match.group(0) if match else ""

	        yield {
	            "id": product_id,
	            "category_name": str(category_name),
	            "product_name": str(product_name).strip(),
	            "product_item_name": str(product_item_name).strip(),
	            "final_product_name": str(final_product_name),
	            "dimensions": dimensions,
	        }