Skip to content

Instantly share code, notes, and snippets.

@KareemMAX
Created December 10, 2019 09:54
Show Gist options
  • Save KareemMAX/7429092632f3764c3a91bfa656e0307b to your computer and use it in GitHub Desktop.
Save KareemMAX/7429092632f3764c3a91bfa656e0307b to your computer and use it in GitHub Desktop.
Furniture Scraper
import urllib.request
import scrapy
import re
class MohmScrapper(scrapy.Spider):
    """Crawl the MOHM furniture catalogue and emit one item per product page.

    The crawl descends through up to four page levels:
    products landing page -> category listing -> product listing ->
    product item page.  For every product item page the spider downloads the
    linked images into the current working directory (named
    ``<id>_<nnn>.jpg``) and yields a dict describing the product.
    """

    name = 'mohm'
    start_urls = ['https://mohmfurniture.com/products/']
    # Running counter used to build product ids.  Ids are assigned in the
    # order item pages are processed, which depends on response arrival order.
    idNum = 1

    def parse(self, response):
        """Yield one request per category tile found on the landing page."""
        for category in response.css("li.eg-monroe-wrapper"):
            link = category.css("a::attr(href)").get()
            if link is None:
                # A tile without a link cannot be followed; Request(None) would raise.
                continue
            yield scrapy.Request(
                response.urljoin(link),
                callback=self.parse_category,
                cb_kwargs=dict(
                    category_name=category.css("div.esg-center::text").get(),
                ),
            )

    def parse_category(self, response, category_name):
        """Yield one request per product listed on a category page.

        ``category_name`` is carried along unchanged; the product name is the
        text of the entry's ``h2``.
        """
        for product in response.css("li.product"):
            link = product.css("a::attr(href)").get()
            if link is None:
                continue
            yield scrapy.Request(
                response.urljoin(link),
                callback=self.parse_product,
                cb_kwargs=dict(
                    category_name=category_name,
                    product_name=product.css("h2::text").get(),
                ),
            )

    def parse_product(self, response, category_name, product_name):
        """Yield one request per product item listed on a product page.

        If the first entry has no ``h3 a`` text the page is treated as another
        category-style listing and handed to :meth:`parse_category`.
        """
        product_items = response.css("li.product")
        if not product_items:
            # Nothing to follow; indexing [0] below would raise IndexError.
            self.logger.warning("No product entries found on %s", response.url)
            return

        if product_items[0].css("h3 a::text").get() is None:
            # Re-parse the response already in hand.  Issuing a new Request for
            # response.url would be dropped by Scrapy's duplicate filter,
            # because this URL has already been fetched.
            yield from self.parse_category(response, category_name)
            return

        for product_item in product_items:
            link = product_item.css("a::attr(href)").get()
            if link is None:
                continue
            yield scrapy.Request(
                response.urljoin(link),
                callback=self.parse_product_item,
                cb_kwargs=dict(
                    category_name=category_name,
                    product_name=product_name,
                    product_item_name=product_item.css("h3 a::text").get(),
                ),
            )

    def parse_product_item(self, response, category_name, product_name, product_item_name):
        """Download the page's images and yield the scraped product record.

        The product id is ``<first letter of category, upper-cased>MO<nnn>``
        where ``nnn`` is the zero-padded value of ``idNum``.  The yielded dict
        has the keys ``id``, ``category_name``, ``product_name``,
        ``product_item_name``, ``final_product_name`` and ``dimensions``.
        """
        final_product_name = response.css("div.page-header h1::text").get()
        description = response.css("div.woo-short-description").get()
        images = response.css("a.photoswipe::attr(href)").getall()

        # Strip before taking the initial so padded text does not put
        # whitespace into the id; slicing tolerates an empty name.
        prefix = str(category_name).strip()[:1].upper()
        product_id = f"{prefix}MO{self.idNum:03d}"
        self.idNum += 1

        # NOTE(review): urlretrieve is a synchronous download and will stall
        # the crawl while it runs; Scrapy's media pipelines may be a better
        # fit -- left as-is to keep the spider self-contained.
        for img_id, image in enumerate(images, start=1):
            filename = f"{product_id}_{img_id:03d}.jpg"
            try:
                urllib.request.urlretrieve(response.urljoin(image), filename)
            except (OSError, ValueError) as err:
                # URLError/HTTPError derive from OSError; ValueError covers
                # malformed URLs.  One bad image should not lose the item.
                self.logger.warning("Could not download %s: %s", image, err)

        # Grab the table row that starts at "Dimensions" (case-insensitive)
        # from the raw description HTML, with newlines removed so the
        # non-greedy match can span what were separate lines.
        match = re.search(
            r"Dimensions.*?</tr>",
            str(description).replace('\n', ''),
            re.I,
        )
        dimensions = match.group(0) if match else ""

        yield {
            "id": product_id,
            "category_name": str(category_name),
            "product_name": str(product_name).strip(),
            "product_item_name": str(product_item_name).strip(),
            "final_product_name": str(final_product_name),
            "dimensions": dimensions,
        }
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment