Skip to content

Instantly share code, notes, and snippets.

@scdekov
Created June 21, 2019 12:52
Show Gist options
  • Save scdekov/4b47d8dfc46592403f3e2505970f9f99 to your computer and use it in GitHub Desktop.
Save scdekov/4b47d8dfc46592403f3e2505970f9f99 to your computer and use it in GitHub Desktop.
import re
import scrapy
# The single product page this script requests.
product = dict(
    url='https://www.amazon.co.uk/Asmodee-ASMDOBB01EN-Dobble-Card-Game/dp/B0031QBHMA/',
)
# Maps each output field name to a callable that takes the product page
# response and returns the extracted value for that field.
data_selectors_map = {
    # `.get()` pulls the first matched text node out of the selector result
    # so that `.strip()` is applied to a string rather than to the selector
    # list itself.
    'title': lambda response: response.xpath('//*[@id="productTitle"]/text()').get().strip(),
    # TODO: validate that all matches found in the page are the same.
    # The character class is spelled out as `A-Za-z0-9`: the range `A-z`
    # also covers the punctuation characters between `Z` and `a`
    # ([ \ ] ^ _ `), which would let malformed values through.
    'parent_asin': lambda response: re.findall(
        r'["\']parentAsin["\']\s*:\s*["\']([A-Za-z0-9]{10})',
        response.body.decode('utf8'),
    )[0],
    # Every text node under the feature-bullets list items, each stripped
    # and joined one per line.
    'bullets': lambda response: "\n".join(
        map(str.strip, response.xpath('//*[@id="feature-bullets"]//li//text()').getall())
    ),
    # Drops the first character of the price text (presumably the currency
    # symbol -- confirm against the page).
    'price': lambda response: response.xpath('//*[@id="priceblock_ourprice"]//text()').get()[1:],
    # TODO
    # 'stock': lambda response:
    # 'is_prime': ''
    'brand': lambda response: response.xpath('//*[@id="bylineInfo"]/text()').get(),
    # First run of digits in the sales-rank value cell.
    # NOTE(review): a rank written with a thousands separator would be cut
    # at the separator -- confirm the page format.
    'seller_rank': lambda response: re.findall(
        r'(\d+)',
        response.xpath('//*[@id="SalesRank"]//td[@class="value"]/text()').get().strip(),
    )[0],
    # First category-rank entry, minus its first character.
    'category_rank': lambda response: response.xpath(
        '//*[@id="SalesRank"]//td[@class="value"]//*[@class="zg_hrsr_rank"]/text()'
    ).get()[1:],
    # Second category-rank entry as-is, or '' when there are fewer than two.
    'other_category_rank': lambda response: (
        response.xpath(
            '//*[@id="SalesRank"]//td[@class="value"]//*[@class="zg_hrsr_rank"]/text()'
        ).getall()[1:2]
        or ['']
    )[0],
    # Placeholder: unlike the other entries this is not a callable yet.
    'description_length': '',
}
class ProductDataSpidere(scrapy.Spider):
    """Spider that requests the page at ``product['url']``."""

    name = 'product-data'

    def start_requests(self):
        """Yield the one request for the configured product URL."""
        target_url = product['url']
        request = scrapy.Request(url=target_url, callback=self.parse)
        yield request

    def parse(self, response):
        """Receive the product page response; no extraction is done yet."""
        return None
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment