Skip to content

Instantly share code, notes, and snippets.

@rsarai
Last active August 27, 2017 01:53
Show Gist options
  • Save rsarai/6f3a24e55f03c7be4c4d2d4e752ac096 to your computer and use it in GitHub Desktop.
Save rsarai/6f3a24e55f03c7be4c4d2d4e752ac096 to your computer and use it in GitHub Desktop.
Spider for retrieving all the products from a web site, and another one for downloading images and renaming them.
import scrapy
from scrapymercado.items import ProductsItem
from scrapymercado.constants import URL
class ProductsSpider(scrapy.Spider):
    """Crawl the paginated product listing and yield one dict per product.

    The first listing page is requested without a query string; pages 2
    through ``max_page_number`` are requested with ``?page=<n>``.
    """

    name = "products"

    # Defined once at class level so both methods agree and the values can
    # be overridden (e.g. ``scrapy crawl products -a max_page_number=10``).
    site_url = 'http://supersecretsite.com'
    max_page_number = 67

    def start_requests(self):
        """Yield one request per listing page; every page goes to ``parse``."""
        listing_url = self.site_url + '/products'
        yield scrapy.Request(url=listing_url, callback=self.parse)
        # int(): spider arguments given with ``-a`` arrive as strings.
        for page_number in range(2, int(self.max_page_number) + 1):
            page_url = '{}?page={}'.format(listing_url, page_number)
            yield scrapy.Request(url=page_url, callback=self.parse)

    def parse(self, response):
        """Yield a dict for each ``div.product-list-item`` in the response.

        'preco-cheio' and 'categoria-nome-nivel-1' are constant values;
        'nome' and 'imagem-1' are read from the page. 'imagem-1' is None
        when the product has no ``<img src>``.
        """
        for product in response.css("div.product-list-item"):
            name = product.css("span.info::text").extract_first()
            # extract_first() returns None instead of raising IndexError, so
            # one product without an image no longer aborts the whole page.
            image_src = product.css("img").xpath('@src').extract_first()
            if image_src is None:
                self.logger.warning(
                    "Product %r on %s has no image", name, response.url
                )
                image_url = None
            else:
                # urljoin handles absolute and relative src values alike,
                # unlike plain concatenation with the site root.
                image_url = response.urljoin(image_src)
            yield {
                'nome': name,
                'preco-cheio': '3000',
                'imagem-1': image_url,
                'categoria-nome-nivel-1': "Produtos",
            }
class ProductImgSpider(scrapy.Spider):
    """Emit a ``ProductsItem`` whose ``image_urls`` field is the ``URL`` constant.

    NOTE(review): presumably an images pipeline enabled in the project
    settings consumes ``image_urls`` and performs the download/rename;
    that configuration is not visible here -- confirm.
    """

    name = "productsimg"

    def start_requests(self):
        """Yield a single request whose only purpose is to trigger ``parse``."""
        # ``parse`` never reads the response, so one request is enough.
        # Requesting 67 paginated URLs (as the products spider does) only
        # produced 67 identical items.
        yield scrapy.Request(url='http://supersecretsite.com', callback=self.parse)

    def parse(self, response):
        """Return one item carrying the image URLs from ``constants.URL``.

        The ``response`` argument is intentionally unused.
        """
        item = ProductsItem()
        # NOTE(review): assumes URL is a list of URL strings -- confirm in
        # scrapymercado.constants.
        item['image_urls'] = URL
        return item
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment