Skip to content

Instantly share code, notes, and snippets.

@victor-shelepen
Created April 28, 2013 12:34
Show Gist options
  • Save victor-shelepen/5476759 to your computer and use it in GitHub Desktop.
Save victor-shelepen/5476759 to your computer and use it in GitHub Desktop.
This script scrapes pages and returns product info from a user's wish list. I know that eBay has its own REST API, but I had some problems during registration, so using the eBay REST API will be the next step. I am sharing this code to get advice from other programmers on how to write it properly. Thank you.
#!/usr/bin/env python
import logging
import urllib2
from lxml.html import etree
from lxml.html.clean import Cleaner
class EbayParser(object):
    '''
    - Scrape product information from an eBay wish list.
      GetData() loads a wish list page, collects the product links found
      on it and calls GetProductData() for every link.
    '''

    def __init__(self, logger_name='ebay_parser', logger_level=logging.DEBUG):
        '''
        - Prepare the request headers, the XPath expressions, the HTML
          cleaner and the logger.
          logger_name  - name handed to logging.getLogger().
          logger_level - level set on that logger.
        '''
        # Kept for backward compatibility; no method of this class fills it.
        self.product_list = []
        self.headers = {
            "User-Agent": "Opera/9.64 (Windows NT 5.1; U; en) Presto/2.1.1",
            "Accept": "text/html, application/xml;q=0.9, application/xhtml+xml, image/png, image/jpeg, image/gif, image/x-xbitmap, */*;q=0.1",
            "Accept-Language": "ru,uk-UA;q=0.9,uk;q=0.8,en;q=0.7",
            "Accept-Charset": "iso-8859-1, utf-8, utf-16, *;q=0.1",
            "Accept-Encoding": "identity, *;q=0",
            "Connection": "Keep-Alive",
        }
        self.pathes = {
            'list': {
                # The class name must be a quoted string literal. Unquoted,
                # XPath reads it as a child-element test that evaluates to
                # an empty string, which makes contains() always true.
                'link': '//div[contains(@class, "wlgd-row")]//a[contains(@class, "item-tle")]/@href',
            },
            'item': {
                'title': '//h1[contains(@id, "itemTitle")]/text()',
                'block': '//form[contains(@name, "viactiondetails")]',
                'price': '//span[contains(@id, "prcIsum")]/text()',
                'available': '//span[contains(@id, "qtySubTxt")]/text()',
                'shipping_summary': '//div[contains(@id, "shippingSummary")]',
            },
        }
        # NOTE(review): allow_tags=[''] presumably whitelists no real tag so
        # that the cleaner strips all markup - confirm against lxml docs.
        self.cleaner = Cleaner(allow_tags=[''], remove_unknown_tags=False)
        self.logger = logging.getLogger(logger_name)
        self.logger.setLevel(logger_level)

    @staticmethod
    def _as_list(result):
        '''
        - Normalise a _try_xpath() result to a list.
          None becomes [], a list is returned unchanged and any other
          value (a single match) is wrapped in a one-element list.
        '''
        if result is None:
            return []
        if isinstance(result, list):
            return result
        return [result]

    def _try_xpath(self, dom_item, path):
        '''
        - Try to get items from the dom model according to the path.
          Returns the match itself when there is exactly one, the list of
          matches otherwise (possibly empty), or None when the evaluation
          raised. Use _as_list() when a uniform list is needed.
        '''
        try:
            result = dom_item.xpath(path)
            if len(result) == 1:
                return result[0]
            return result
        except Exception:
            # Deliberately broad: any failure is logged and reported as
            # None so that one bad lookup does not abort the whole scrape.
            self.logger.error("Wrong XPath - %s", path)
            return None

    def _load(self, url, headers):
        '''
        - Load data from the url using the headers.
          Returns the raw response body. Errors raised by urllib2
          propagate to the caller.
        '''
        request = urllib2.Request(url=url, headers=headers)
        response = urllib2.urlopen(url=request)
        try:
            return response.read()
        finally:
            # Release the connection even when read() fails.
            response.close()

    def _get_clean_text(self, dom):
        '''
        - Convert a dom element to clean text: serialise it, run it
          through the cleaner and collapse every whitespace run to one space.
        '''
        html = etree.tostring(dom)
        html = self.cleaner.clean_html(html)
        return ' '.join(html.split())

    def GetData(self, url):
        '''
        - Load a wish list page and scrape the product urls from it.
          Then load every product page and scrape the product information.
          Returns a list of product dictionaries (see GetProductData).
        '''
        raw_html = self._load(url, self.headers)
        list_tree = etree.HTML(raw_html)
        # _try_xpath() returns a bare string for exactly one link and None
        # on failure; normalise so we always iterate over whole urls.
        href_list = self._as_list(
            self._try_xpath(list_tree, self.pathes['list']['link']))
        return [self.GetProductData(href) for href in href_list]

    def GetProductData(self, url):
        '''
        - Load a product page and scrape the product information.
          Returns a dictionary with the keys product_url, title, price,
          available and shipping_summary. 'available' and
          'shipping_summary' are '' when the page has no such element;
          'title' and 'price' hold whatever _try_xpath() returned.
        '''
        raw_html = self._load(url, self.headers)
        tree = etree.HTML(raw_html)
        paths = self.pathes['item']

        # The item paths start with '//', so they search the whole document
        # whatever the context node is. Falling back to the tree when the
        # form is missing therefore gives the same matches.
        blocks = self._as_list(self._try_xpath(tree, paths['block']))
        context = blocks[0] if blocks else tree

        available = self._as_list(
            self._try_xpath(context, paths['available']))
        shipping = self._as_list(
            self._try_xpath(context, paths['shipping_summary']))

        return {
            'product_url': url,
            'title': self._try_xpath(tree, paths['title']),
            'price': self._try_xpath(context, paths['price']),
            # Use the first match; a missing element yields '' instead of
            # an AttributeError/TypeError.
            'available': (available[0].replace('available', '').strip()
                          if available else ''),
            'shipping_summary': (self._get_clean_text(shipping[0])
                                 if shipping else ''),
        }
def get_ebay_wishlist(url):
    '''
    - Convenience wrapper so that callers do not have to deal with the
      parser class: build an EbayParser with its default settings and
      return what its GetData() yields for the given wish list url.
    '''
    return EbayParser().GetData(url)
if __name__ == "__main__":
    # Script entry point: configure the log line format, then scrape the
    # sample wish list and print the resulting product list.
    logging.basicConfig(format='%(asctime)s %(levelname)s %(message)s')
    print(get_ebay_wishlist("http://my.ebay.com/wishlist/?userid=vlikin"))
@victor-shelepen
Copy link
Author

This script scrapes pages and returns product info from a user's wish list. I know that eBay has its own REST API, but I had some problems during registration, so using the eBay REST API will be the next step. I am sharing this code to get advice from other programmers on how to write it properly. Thank you.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment