scdekov/extractor.py

## extractor.py
class ProductDataExtractor:
    """Extract individual product fields from a scraped product page.

    ``response`` must provide ``xpath(query)`` returning an object with
    ``get()`` and ``getall()``, and a ``body`` attribute holding the raw page
    bytes (presumably a Scrapy response -- confirm against the caller).

    Getters whose source element is missing return ``None`` instead of
    raising; ``get_other_category_rank`` keeps its ``''`` fallback.
    """

    # Text nodes of the per-category rank entries inside the sales-rank cell.
    _CATEGORY_RANKS_XPATH = (
        '//*[@id="SalesRank"]//td[@class="value"]//*[@class="zg_hrsr_rank"]/text()'
    )

    def __init__(self, response):
        """Store the page response that every getter queries."""
        self.response = response

    def extract_data(self):
        """Placeholder: combining the individual getters is not implemented."""

    def _first(self, query):
        """Return the first result of ``query`` or ``None`` when nothing matches."""
        return self.response.xpath(query).get()

    def get_title(self):
        """Return the whitespace-trimmed ``productTitle`` text, or ``None``."""
        # Extract the string first: the selector list itself has no ``strip``.
        title = self._first('//*[@id="productTitle"]/text()')
        return None if title is None else title.strip()

    def get_parent_asin(self):
        """Return the first ``parentAsin`` value found in the page body.

        The value must be 10 ASCII letters/digits. Returns ``None`` when the
        body contains no such entry.
        """
        import re  # function-scope import: the file has no import block

        page = self.response.body.decode('utf8')
        # ``A-Za-z`` rather than ``A-z``: the latter also accepts "[\]^_`".
        parent_asins = re.findall(
            r'["\']parentAsin["\']\s*:\s*["\']([A-Za-z0-9]{10})', page)
        if not parent_asins:
            return None
        # TODO: log an error when the matches are not all identical
        return parent_asins[0]

    def get_bullets(self):
        """Return the stripped ``feature-bullets`` list-item texts, newline-joined."""
        texts = self.response.xpath('//*[@id="feature-bullets"]//li//text()').getall()
        return "\n".join(map(str.strip, texts))

    def get_price(self):
        """Return the ``priceblock_ourprice`` text minus its first character.

        Returns ``None`` when the price element is absent.
        """
        # TODO: should we store what the currency is
        price = self._first('//*[@id="priceblock_ourprice"]//text()')
        # The dropped first character is presumably the currency symbol.
        return None if price is None else price[1:]

    def get_stock(self):
        """Placeholder: always returns 0."""
        # TODO
        return 0

    def get_is_prime(self):
        """Placeholder: always returns False."""
        # TODO: where to find this
        return False

    def get_brand(self):
        """Return the ``bylineInfo`` text, or ``None`` when absent."""
        return self._first('//*[@id="bylineInfo"]/text()')

    def get_seller_rank(self):
        """Return the first run of digits in the sales-rank cell as a string.

        Returns ``None`` when the cell is missing or holds no digits.
        """
        import re  # function-scope import: the file has no import block

        text = self._first('//*[@id="SalesRank"]//td[@class="value"]/text()')
        if text is None:
            return None
        # NOTE(review): only the first digit run is kept, so a rank written
        # with thousands separators would be truncated -- confirm intent.
        digits = re.search(r'\d+', text)
        return digits.group(0) if digits else None

    def get_category_rank(self):
        """Return the first category rank without its leading character.

        Returns ``None`` when no category rank is present.
        """
        rank = self._first(self._CATEGORY_RANKS_XPATH)
        return None if rank is None else rank[1:]

    def get_other_category_rank(self):
        """Return the second category rank text unmodified, or ``''`` if absent."""
        ranks = self.response.xpath(self._CATEGORY_RANKS_XPATH).getall()
        if len(ranks) > 1:
            return ranks[1]
        return ''

    def get_description_length(self):
        """Return the total length of the stripped description text nodes.

        ``script`` and ``style`` elements are excluded by the query.
        """
        description = self.response.xpath(
            '//*[@id="productDescription_feature_div"]'
            '//*[not(self::script or self::style)]/text()').getall()
        return sum(len(line.strip()) for line in description)

    def get_best_seller_badge(self):
        """Return True when ``zeitgeistBadge_feature_div`` has a child element."""
        # TODO: check if this is correct
        return bool(self._first('//*[@id="zeitgeistBadge_feature_div"][*]'))

    def get_has_amazon_choices_badge(self):
        """Return True when ``acBadge_feature_div`` has a non-script descendant."""
        return bool(self._first('//*[@id="acBadge_feature_div"]//*[not(self::script)]'))
	# NOTE(review): this repeats the ProductDataExtractor class defined earlier
	# in the file and looks like a paste artifact -- confirm which copy to keep.
	class ProductDataExtractor:
	    """Extract individual product fields from a scraped product page.

	    ``response`` must provide ``xpath(query)`` returning an object with
	    ``get()`` and ``getall()``, and a ``body`` attribute holding the raw
	    page bytes (presumably a Scrapy response -- confirm against the caller).

	    Getters whose source element is missing return ``None`` instead of
	    raising; ``get_other_category_rank`` keeps its ``''`` fallback.
	    """

	    # Text nodes of the per-category rank entries inside the sales-rank cell.
	    _CATEGORY_RANKS_XPATH = (
	        '//*[@id="SalesRank"]//td[@class="value"]//*[@class="zg_hrsr_rank"]/text()'
	    )

	    def __init__(self, response):
	        """Store the page response that every getter queries."""
	        self.response = response

	    def extract_data(self):
	        """Placeholder: combining the individual getters is not implemented."""

	    def _first(self, query):
	        """Return the first result of ``query`` or ``None`` when nothing matches."""
	        return self.response.xpath(query).get()

	    def get_title(self):
	        """Return the whitespace-trimmed ``productTitle`` text, or ``None``."""
	        # Extract the string first: the selector list itself has no ``strip``.
	        title = self._first('//*[@id="productTitle"]/text()')
	        return None if title is None else title.strip()

	    def get_parent_asin(self):
	        """Return the first ``parentAsin`` value found in the page body.

	        The value must be 10 ASCII letters/digits. Returns ``None`` when the
	        body contains no such entry.
	        """
	        import re  # function-scope import: the file has no import block

	        page = self.response.body.decode('utf8')
	        # ``\s*`` quantifiers restored (optional whitespace around the colon);
	        # ``A-Za-z`` rather than ``A-z``, which also accepts "[\]^_`".
	        parent_asins = re.findall(
	            r'["\']parentAsin["\']\s*:\s*["\']([A-Za-z0-9]{10})', page)
	        if not parent_asins:
	            return None
	        # TODO: log an error when the matches are not all identical
	        return parent_asins[0]

	    def get_bullets(self):
	        """Return the stripped ``feature-bullets`` list-item texts, newline-joined."""
	        texts = self.response.xpath('//*[@id="feature-bullets"]//li//text()').getall()
	        return "\n".join(map(str.strip, texts))

	    def get_price(self):
	        """Return the ``priceblock_ourprice`` text minus its first character.

	        Returns ``None`` when the price element is absent.
	        """
	        # TODO: should we store what the currency is
	        price = self._first('//*[@id="priceblock_ourprice"]//text()')
	        # The dropped first character is presumably the currency symbol.
	        return None if price is None else price[1:]

	    def get_stock(self):
	        """Placeholder: always returns 0."""
	        # TODO
	        return 0

	    def get_is_prime(self):
	        """Placeholder: always returns False."""
	        # TODO: where to find this
	        return False

	    def get_brand(self):
	        """Return the ``bylineInfo`` text, or ``None`` when absent."""
	        return self._first('//*[@id="bylineInfo"]/text()')

	    def get_seller_rank(self):
	        """Return the first run of digits in the sales-rank cell as a string.

	        Returns ``None`` when the cell is missing or holds no digits.
	        """
	        import re  # function-scope import: the file has no import block

	        text = self._first('//*[@id="SalesRank"]//td[@class="value"]/text()')
	        if text is None:
	            return None
	        # NOTE(review): only the first digit run is kept, so a rank written
	        # with thousands separators would be truncated -- confirm intent.
	        digits = re.search(r'\d+', text)
	        return digits.group(0) if digits else None

	    def get_category_rank(self):
	        """Return the first category rank without its leading character.

	        Returns ``None`` when no category rank is present.
	        """
	        rank = self._first(self._CATEGORY_RANKS_XPATH)
	        return None if rank is None else rank[1:]

	    def get_other_category_rank(self):
	        """Return the second category rank text unmodified, or ``''`` if absent."""
	        ranks = self.response.xpath(self._CATEGORY_RANKS_XPATH).getall()
	        if len(ranks) > 1:
	            return ranks[1]
	        return ''

	    def get_description_length(self):
	        """Return the total length of the stripped description text nodes.

	        ``script`` and ``style`` elements are excluded by the query.
	        """
	        description = self.response.xpath(
	            '//*[@id="productDescription_feature_div"]'
	            '//*[not(self::script or self::style)]/text()').getall()
	        return sum(len(line.strip()) for line in description)

	    def get_best_seller_badge(self):
	        """Return True when ``zeitgeistBadge_feature_div`` has a child element."""
	        # TODO: check if this is correct
	        return bool(self._first('//*[@id="zeitgeistBadge_feature_div"][*]'))

	    def get_has_amazon_choices_badge(self):
	        """Return True when ``acBadge_feature_div`` has a non-script descendant."""
	        return bool(self._first('//*[@id="acBadge_feature_div"]//*[not(self::script)]'))