# markus2120/amazon_seller_listing_scrape.py

## amazon_seller_listing_scrape.py
import requests
from lxml import html
from lxml.etree import ParserError
import json
from time import sleep
import argparse
import unicodecsv as csv
import traceback


def parse_offer_details(url, asin=None, retries=5, timeout=30, retry_delay=2):
    '''
    Function to parse seller details from amazon offer listing page
    eg:https://www.amazon.de/gp/offer-listing/<asin>/
    :param url:offer listing url
    :param asin:product id stored with every offer. When omitted, a module
        level ``asin`` is used if one is defined, otherwise the id is read
        from the ``/offer-listing/<asin>/`` part of the url
    :param retries:maximum number of download attempts
    :param timeout:seconds to wait for a response before an attempt fails
    :param retry_delay:base pause in seconds between two attempts; the pause
        grows linearly with the attempt number
    :return: list of dicts with the keys ``price``, ``condition``, ``seller``
        and ``asin`` (empty list when the page lists no sellers); ``None``
        when the page could not be parsed or every attempt failed
    '''
    # Add some recent user agent to prevent blocking from amazon
    headers = {
        'user-agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'
    }
    base_url = "https://www.amazon.de/"

    # The xpath expressions are constant, so they are defined once up front
    # rather than once per listing.
    XPATH_PRODUCT_LISTINGS = "//div[contains(@class, 'a-row a-spacing-mini olpOffer')]"
    XPATH_PRODUCT_PRICE = ".//span[contains(@class, 'olpOfferPrice')]//text()"
    XPATH_PRODUCT_CONDITION = ".//span[contains(@class, 'olpCondition')]//text()"
    XPATH_PRODUCT_SELLER1 = ".//h3[contains(@class, 'olpSellerName')]//a/text()"
    XPATH_PRODUCT_SELLER2 = ".//h3[contains(@class, 'olpSellerName')]//img//@alt"

    # Resolve the product id without depending on script-only state: prefer
    # the argument, then a module level ``asin`` (set when the file is run as
    # a script), then the id embedded in the url. This keeps the function
    # usable when the module is imported.
    if asin is None:
        asin = globals().get('asin')
    if asin is None:
        _, marker, tail = url.partition('/offer-listing/')
        if marker:
            asin = tail.split('/', 1)[0].split('?', 1)[0] or None

    for attempt in range(retries):
        if attempt:
            # Give amazon some time before asking again; immediate retries
            # after a captcha response tend to be blocked as well.
            sleep(retry_delay * attempt)
        try:
            print("Downloading and processing page :", url)
            # Without a timeout a stalled connection would hang forever.
            response = requests.get(url, headers=headers, timeout=timeout)
            if response.status_code == 403:
                raise ValueError("Captcha found. Retrying")

            parser = html.fromstring(response.text)
            parser.make_links_absolute(base_url)
            # Parsing seller list
            listings = parser.xpath(XPATH_PRODUCT_LISTINGS)

            if not listings:
                print("no sellers found")
                return []

            offer_list = []
            # parsing individual seller
            for listing in listings:
                price_parts = listing.xpath(XPATH_PRODUCT_PRICE)
                condition_parts = listing.xpath(XPATH_PRODUCT_CONDITION)
                seller1 = listing.xpath(XPATH_PRODUCT_SELLER1)
                seller2 = listing.xpath(XPATH_PRODUCT_SELLER2)

                # cleaning parsed data
                # A listing without a price must not discard the whole page,
                # so a missing price is reported as None.
                product_price = price_parts[0].strip() if price_parts else None
                # Drop every whitespace character from the condition text.
                product_condition = ''.join(''.join(condition_parts).split()) if condition_parts else None
                # The seller is either a text link or, failing that, the alt
                # text of a logo image.
                product_seller = ''.join(seller1).strip() if seller1 else ''.join(seller2).strip()

                offer_details = {
                    'price': product_price,
                    'condition': product_condition,
                    'seller': product_seller,
                    'asin': asin,
                }
                print(product_price, product_condition, product_seller, asin)
                offer_list.append(offer_details)
            return offer_list

        except ParserError:
            # lxml could not build a document from the response; another
            # download is not attempted in this case.
            print("empty page found")
            break
        except Exception:
            # Anything else (network error, captcha, ...) is logged and the
            # download is attempted again. KeyboardInterrupt and SystemExit
            # are deliberately not caught so the script can be aborted.
            print(traceback.format_exc())
            print("retying :", url)

    return None

if __name__ == '__main__':
    # Command line interface: one positional argument carrying the product id.
    parser = argparse.ArgumentParser()
    parser.add_argument(
        'asin',
        help='unique product id, eg "B01DQ2B8UY"',
    )
    args = parser.parse_args()
    asin = args.asin

    # Offer listing page of the product on amazon.de, restricted to used items.
    url = ''.join([
        'https://www.amazon.de/gp/offer-listing/',
        asin,
        '/ref=dp_olp_used?ie=UTF8&condition=used',
    ])
    data = parse_offer_details(url)
	import requests
	from lxml import html
	from lxml.etree import ParserError
	import json
	from time import sleep
	import argparse
	import unicodecsv as csv
	import traceback


	def parse_offer_details(url, asin=None, retries=5, timeout=30, retry_delay=2):
		'''
		Function to parse seller details from amazon offer listing page
		eg:https://www.amazon.de/gp/offer-listing/<asin>/
		:param url:offer listing url
		:param asin:product id stored with every offer. When omitted, a module
			level ``asin`` is used if one is defined, otherwise the id is read
			from the ``/offer-listing/<asin>/`` part of the url
		:param retries:maximum number of download attempts
		:param timeout:seconds to wait for a response before an attempt fails
		:param retry_delay:base pause in seconds between two attempts; the pause
			grows linearly with the attempt number
		:return: list of dicts with the keys ``price``, ``condition``, ``seller``
			and ``asin`` (empty list when the page lists no sellers); ``None``
			when the page could not be parsed or every attempt failed
		'''
		# NOTE(review): this repeats the definition of the same name earlier in
		# the file; the copy looks like a paste artifact and can probably be
		# removed - confirm before deleting.

		# Add some recent user agent to prevent blocking from amazon
		headers = {
			'user-agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'
		}
		base_url = "https://www.amazon.de/"

		# The xpath expressions are constant, so they are defined once up front
		# rather than once per listing.
		XPATH_PRODUCT_LISTINGS = "//div[contains(@class, 'a-row a-spacing-mini olpOffer')]"
		XPATH_PRODUCT_PRICE = ".//span[contains(@class, 'olpOfferPrice')]//text()"
		XPATH_PRODUCT_CONDITION = ".//span[contains(@class, 'olpCondition')]//text()"
		XPATH_PRODUCT_SELLER1 = ".//h3[contains(@class, 'olpSellerName')]//a/text()"
		XPATH_PRODUCT_SELLER2 = ".//h3[contains(@class, 'olpSellerName')]//img//@alt"

		# Resolve the product id without depending on script-only state: prefer
		# the argument, then a module level ``asin`` (set when the file is run
		# as a script), then the id embedded in the url. This keeps the
		# function usable when the module is imported.
		if asin is None:
			asin = globals().get('asin')
		if asin is None:
			_, marker, tail = url.partition('/offer-listing/')
			if marker:
				asin = tail.split('/', 1)[0].split('?', 1)[0] or None

		for attempt in range(retries):
			if attempt:
				# Give amazon some time before asking again; immediate retries
				# after a captcha response tend to be blocked as well.
				sleep(retry_delay * attempt)
			try:
				print("Downloading and processing page :", url)
				# Without a timeout a stalled connection would hang forever.
				response = requests.get(url, headers=headers, timeout=timeout)
				if response.status_code == 403:
					raise ValueError("Captcha found. Retrying")

				parser = html.fromstring(response.text)
				parser.make_links_absolute(base_url)
				# Parsing seller list
				listings = parser.xpath(XPATH_PRODUCT_LISTINGS)

				if not listings:
					print("no sellers found")
					return []

				offer_list = []
				# parsing individual seller
				for listing in listings:
					price_parts = listing.xpath(XPATH_PRODUCT_PRICE)
					condition_parts = listing.xpath(XPATH_PRODUCT_CONDITION)
					seller1 = listing.xpath(XPATH_PRODUCT_SELLER1)
					seller2 = listing.xpath(XPATH_PRODUCT_SELLER2)

					# cleaning parsed data
					# A listing without a price must not discard the whole
					# page, so a missing price is reported as None.
					product_price = price_parts[0].strip() if price_parts else None
					# Drop every whitespace character from the condition text.
					product_condition = ''.join(''.join(condition_parts).split()) if condition_parts else None
					# The seller is either a text link or, failing that, the
					# alt text of a logo image.
					product_seller = ''.join(seller1).strip() if seller1 else ''.join(seller2).strip()

					offer_details = {
						'price': product_price,
						'condition': product_condition,
						'seller': product_seller,
						'asin': asin,
					}
					print(product_price, product_condition, product_seller, asin)
					offer_list.append(offer_details)
				return offer_list

			except ParserError:
				# lxml could not build a document from the response; another
				# download is not attempted in this case.
				print("empty page found")
				break
			except Exception:
				# Anything else (network error, captcha, ...) is logged and
				# the download is attempted again. KeyboardInterrupt and
				# SystemExit are deliberately not caught so the script can be
				# aborted.
				print(traceback.format_exc())
				print("retying :", url)

		return None

	if __name__ == '__main__':
		# NOTE(review): this repeats the script entry point found earlier in
		# the file; the copy looks like a paste artifact and can probably be
		# removed - confirm before deleting.

		# defining arguments
		parser = argparse.ArgumentParser()
		parser.add_argument('asin', help='unique product id, eg "B01DQ2B8UY"')
		args = parser.parse_args()
		asin = args.asin

		# Offer listing page of the product on amazon.de, restricted to used items.
		url = 'https://www.amazon.de/gp/offer-listing/'+asin+'/ref=dp_olp_used?ie=UTF8&condition=used'
		data = parse_offer_details(url)