Skip to content

Instantly share code, notes, and snippets.

@scdekov
Created June 21, 2019 13:38
Show Gist options
  • Save scdekov/e0e6775e97626605feb77d558d2bb2d9 to your computer and use it in GitHub Desktop.
Save scdekov/e0e6775e97626605feb77d558d2bb2d9 to your computer and use it in GitHub Desktop.
class ProductDataExtractor:
    """Pull individual product fields out of a scraped product page.

    ``response`` must provide ``xpath(query)`` returning a selector list with
    ``get()`` / ``getall()`` methods, and a ``body`` attribute holding the raw
    page bytes (presumably a Scrapy response -- confirm against the caller).

    Getters that rely on a single page element return ``None`` when that
    element is absent instead of raising.
    """

    # Text nodes of the per-category rank entries inside the sales-rank cell.
    _CATEGORY_RANK_XPATH = (
        '//*[@id="SalesRank"]//td[@class="value"]//*[@class="zg_hrsr_rank"]/text()'
    )

    def __init__(self, response):
        """Store the page response that all getters query."""
        self.response = response

    def extract_data(self):
        """Not implemented yet; currently does nothing and returns ``None``."""

    def get_title(self):
        """Return the product title with surrounding whitespace removed.

        Returns ``None`` when the page has no ``productTitle`` text node.
        """
        # ``.get()`` extracts the first matching string; the selector list
        # itself has no ``strip`` method.
        title = self.response.xpath('//*[@id="productTitle"]/text()').get()
        return title.strip() if title is not None else None

    def get_parent_asin(self):
        """Return the first ``parentAsin`` value embedded in the page source.

        The raw body is searched for ``"parentAsin": "<10 alphanumerics>"``
        pairs. If the page carries conflicting values a warning is logged and
        the first one is still returned. Returns ``None`` when no value is
        found.
        """
        import logging
        import re

        page_source = self.response.body.decode('utf8')
        # ``A-Za-z`` rather than ``A-z``: the latter also matches the ASCII
        # punctuation that sits between "Z" and "a".
        parent_asins = re.findall(
            r'["\']parentAsin["\']\s*:\s*["\']([A-Za-z0-9]{10})', page_source)
        if not parent_asins:
            return None

        distinct = set(parent_asins)
        if len(distinct) > 1:
            logging.getLogger(__name__).warning(
                'conflicting parentAsin values on page: %s', sorted(distinct))
        return parent_asins[0]

    def get_bullets(self):
        """Return the feature-bullet text nodes, stripped, one per line."""
        texts = self.response.xpath('//*[@id="feature-bullets"]//li//text()').getall()
        return "\n".join(text.strip() for text in texts)

    def get_price(self):
        """Return the price text without its first character.

        Returns ``None`` when the ``priceblock_ourprice`` element is missing.
        """
        # TODO: should we store what the currency is
        price = self.response.xpath('//*[@id="priceblock_ourprice"]//text()').get()
        if price is None:
            return None
        # The first character is presumably the currency symbol (e.g. "$").
        return price[1:]

    def get_stock(self):
        """Placeholder: always returns ``0``."""
        # TODO
        return 0

    def get_is_prime(self):
        """Placeholder: always returns ``False``."""
        # TODO: where to find this
        return False

    def get_brand(self):
        """Return the first ``bylineInfo`` text node, or ``None`` if absent."""
        return self.response.xpath('//*[@id="bylineInfo"]/text()').get()

    def get_seller_rank(self):
        """Return the first number in the sales-rank cell as a digit string.

        Comma thousands separators are removed, so ``"#12,345 in Toys"``
        yields ``"12345"``. Returns ``None`` when the cell is missing or
        contains no digits.
        """
        import re

        rank_text = self.response.xpath(
            '//*[@id="SalesRank"]//td[@class="value"]/text()').get()
        if rank_text is None:
            return None
        # A bare ``\d+`` would stop at the first comma and truncate the rank.
        match = re.search(r'\d+(?:,\d{3})*', rank_text)
        if match is None:
            return None
        return match.group().replace(',', '')

    def get_category_rank(self):
        """Return the first category rank without its first character.

        Returns ``None`` when the page lists no category rank.
        """
        rank = self.response.xpath(self._CATEGORY_RANK_XPATH).get()
        if rank is None:
            return None
        # The first character is presumably a "#" prefix -- TODO confirm.
        return rank[1:]

    def get_other_category_rank(self):
        """Return the second category rank text, or ``''`` if there is none.

        NOTE(review): unlike ``get_category_rank`` the first character is not
        removed here -- confirm whether that difference is intended.
        """
        ranks = self.response.xpath(self._CATEGORY_RANK_XPATH).getall()
        if len(ranks) > 1:
            return ranks[1]
        return ''

    def get_description_length(self):
        """Return the total length of the stripped description text nodes.

        Text that sits directly inside ``script`` or ``style`` elements is
        excluded by the query.
        """
        description = self.response.xpath(
            '//*[@id="productDescription_feature_div"]'
            '//*[not(self::script or self::style)]/text()').getall()
        return sum(len(line.strip()) for line in description)

    def get_best_seller_badge(self):
        """Return ``True`` if the best-seller badge container has a child element."""
        # TODO: check if this is correct
        return bool(self.response.xpath('//*[@id="zeitgeistBadge_feature_div"][*]').get())

    def get_has_amazon_choices_badge(self):
        """Return ``True`` if the badge container holds any non-script element."""
        return bool(self.response.xpath('//*[@id="acBadge_feature_div"]//*[not(self::script)]').get())
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment