Skip to content

Instantly share code, notes, and snippets.

@markus2120
Created September 26, 2018 13:54
Show Gist options
  • Save markus2120/ba934c68166e0c06c5e0997de5a0d650 to your computer and use it in GitHub Desktop.
Save markus2120/ba934c68166e0c06c5e0997de5a0d650 to your computer and use it in GitHub Desktop.
import requests
from lxml import html
from lxml.etree import ParserError
import json
from time import sleep
import argparse
import unicodecsv as csv
import traceback
def parse_offer_details(url, asin=None):
    '''
    Function to parse seller details from an amazon offer listing page
    eg:https://www.amazon.com/gp/offer-listing/

    The page is requested up to 5 times. A 403 response is treated as a
    captcha/block page and, like any other unexpected error, triggers a
    retry after a short pause. A page that lxml rejects as empty
    (ParserError) aborts immediately without further retries.

    :param url: offer listing url
    :param asin: optional product id stored with every offer. When omitted
        it is taken from the path segment following "/offer-listing/" in
        ``url`` (None if the url has no such segment).
    :return: list of dicts with the keys 'price', 'condition', 'seller' and
        'asin' (one dict per offer); an empty list when the page lists no
        sellers; None when the page was empty or every attempt failed.
    '''
    # Local import so this function does not depend on a file-level import.
    import re

    MAX_ATTEMPTS = 5
    REQUEST_TIMEOUT = 30  # seconds; without a timeout requests may block forever
    BASE_URL = "https://www.amazon.de/"
    XPATH_PRODUCT_LISTINGS = "//div[contains(@class, 'a-row a-spacing-mini olpOffer')]"
    XPATH_PRODUCT_PRICE = ".//span[contains(@class, 'olpOfferPrice')]//text()"
    XPATH_PRODUCT_CONDITION = ".//span[contains(@class, 'olpCondition')]//text()"
    XPATH_PRODUCT_SELLER1 = ".//h3[contains(@class, 'olpSellerName')]//a/text()"
    XPATH_PRODUCT_SELLER2 = ".//h3[contains(@class, 'olpSellerName')]//img//@alt"

    # Add some recent user agent to prevent blocking from amazon
    headers = {
        'user-agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'
    }

    # Do not rely on a module-level global named `asin`: derive the id from
    # the url so the function also works when imported from another module.
    if asin is None:
        asin_match = re.search(r'/offer-listing/([^/?#]+)', url)
        asin = asin_match.group(1) if asin_match else None

    for attempt in range(1, MAX_ATTEMPTS + 1):
        try:
            print("Downloading and processing page :", url)
            response = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
            if response.status_code == 403:
                raise ValueError("Captcha found. Retrying")

            parser = html.fromstring(response.text)
            parser.make_links_absolute(BASE_URL)

            # Parsing seller list
            listings = parser.xpath(XPATH_PRODUCT_LISTINGS)
            if not listings:
                print("no sellers found")
                return []

            offer_list = []
            # parsing individual seller
            for listing in listings:
                price_nodes = listing.xpath(XPATH_PRODUCT_PRICE)
                product_condition = listing.xpath(XPATH_PRODUCT_CONDITION)
                seller1 = listing.xpath(XPATH_PRODUCT_SELLER1)
                seller2 = listing.xpath(XPATH_PRODUCT_SELLER2)

                # cleaning parsed data
                # A listing without a price node must not discard the whole
                # page (and trigger a re-download); record the price as None.
                product_price = price_nodes[0].strip() if price_nodes else None
                # Removes *all* whitespace from the condition text.
                product_condition = ''.join(''.join(product_condition).split()) if product_condition else None
                # Prefer the seller link text; fall back to the logo's alt text.
                product_seller = ''.join(seller1).strip() if seller1 else ''.join(seller2).strip()

                offer_details = {
                    'price': product_price,
                    'condition': product_condition,
                    'seller': product_seller,
                    'asin': asin,
                }
                print(product_price, product_condition, product_seller, asin)
                offer_list.append(offer_details)
            return offer_list
        except ParserError:
            # lxml could not build a document from the response: retrying
            # the same url is not attempted.
            print("empty page found")
            break
        except Exception:
            # `Exception` instead of a bare except so Ctrl-C / SystemExit
            # still terminate the script.
            print(traceback.format_exc())
            print("retying :", url)
            if attempt < MAX_ATTEMPTS:
                # Short, growing pause before the next attempt.
                sleep(attempt)
    return None
if __name__ == '__main__':
    # Command line interface: one positional argument, the product id.
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument('asin', help='unique product id, eg "B01DQ2B8UY"')

    # Bound at module level under this exact name so any code in the file
    # that looks up a global `asin` keeps working.
    asin = arg_parser.parse_args().asin

    # Offer listing page of the product, restricted to used offers.
    url = ''.join([
        'https://www.amazon.de/gp/offer-listing/',
        asin,
        '/ref=dp_olp_used?ie=UTF8&condition=used',
    ])
    data = parse_offer_details(url)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment