victor-shelepen/eBay Wish List Parser

## eBay Wish List Parser
#!/usr/bin/env python
import logging
import urllib2

from lxml.html import etree
from lxml.html.clean import Cleaner


class EbayParser(object):
  '''
    - Scrapes an eBay wish list page and the product pages it links to.
    GetData(url) is the entry point; it returns a list of product
    dictionaries built by GetProductData.
  '''

  # HTTP headers sent with every request. Copied per instance in __init__.
  DEFAULT_HEADERS = {
    "User-Agent": "Opera/9.64 (Windows NT 5.1; U; en) Presto/2.1.1",
    "Accept": "text/html, application/xml;q=0.9, application/xhtml+xml, image/png, image/jpeg, image/gif, image/x-xbitmap, */*;q=0.1",
    "Accept-Language": "ru,uk-UA;q=0.9,uk;q=0.8,en;q=0.7",
    "Accept-Charset": "iso-8859-1, utf-8, utf-16, *;q=0.1",
    "Accept-Encoding": "identity, *;q=0",
    "Connection": "Keep-Alive"
  }

  # XPath expressions grouped by the page they are applied to.
  DEFAULT_PATHES = {
    'list': {
      # The class name has to be a quoted literal. Unquoted, XPath reads it as
      # a child-element test whose string value is '', and contains(x, '') is
      # always true, so the div filter would match every div.
      'link': '//div[contains(@class, "wlgd-row")]//a[contains(@class, "item-tle")]/@href'
    },
    'item': {
      'title': '//h1[contains(@id, "itemTitle")]/text()',
      'block': '//form[contains(@name, "viactiondetails")]',
      'price': '//span[contains(@id, "prcIsum")]/text()',
      'available': '//span[contains(@id, "qtySubTxt")]/text()',
      'shipping_summary': '//div[contains(@id, "shippingSummary")]'
    }
  }

  def __init__(self, logger_name='ebay_parser', logger_level=logging.DEBUG):
    '''
      - logger_name: name of the logger the parser reports to.
      - logger_level: level set on that logger.
    '''
    self.product_list = []
    # Per-instance copies, so changing one parser never affects another.
    self.headers = dict(self.DEFAULT_HEADERS)
    self.pathes = dict(
      (section, dict(paths)) for section, paths in self.DEFAULT_PATHES.items()
    )
    self.cleaner = Cleaner(allow_tags=[''], remove_unknown_tags=False)
    self.logger = logging.getLogger(logger_name)
    self.logger.setLevel(logger_level)

  def _try_xpath(self, dom_item, path):
    '''
      - Try to get items from the dom model according the path.
      Returns the only match when there is exactly one, otherwise the raw
      result (possibly an empty list). Returns None when the query fails.
    '''
    try:
      result = dom_item.xpath(path)
    except Exception:
      # Best effort: report the failure with its cause and let the caller
      # deal with the missing value.
      self.logger.error("Wrong XPath - %s", path, exc_info=True)
      return None
    if isinstance(result, list) and len(result) == 1:
      return result[0]
    return result

  def _xpath_all(self, dom_item, path):
    '''
      - Same query as _try_xpath, but the answer is always a list.
      It is empty when nothing matched, the query failed or dom_item is None.
    '''
    if dom_item is None:
      return []
    result = self._try_xpath(dom_item, path)
    if result is None:
      return []
    if isinstance(result, list):
      return result
    # _try_xpath unwrapped a single match; wrap it again.
    return [result]

  def _first_match(self, dom_item, path, default=''):
    '''
      - Returns the first match of the path or default when there is none.
    '''
    matches = self._xpath_all(dom_item, path)
    if matches:
      return matches[0]
    return default

  def _load(self, url, headers):
    '''
      - load data from the url using the headers.
      This is a library function. Returns the raw response body.
      Errors raised by urllib2 are not caught here.
    '''
    request = urllib2.Request(url=url, headers=headers)
    response = urllib2.urlopen(url=request)
    try:
      return response.read()
    finally:
      # Close the connection even when read() fails.
      response.close()

  def _get_clean_text(self, dom):
    '''
      - It converts dom element to clean text: the element is serialized,
      passed through the cleaner and runs of whitespace become one space.
      Returns '' when dom is None.
    '''
    if dom is None:
      return ''
    html = etree.tostring(dom)
    html = self.cleaner.clean_html(html)
    return ' '.join(html.split())

  def GetData(self, url):
    '''
      - It loads a wishlist page. It scraps product urls from.
      It also loads product pages. It scraps the product information.
      Returns a list with one product dictionary per link found.
    '''
    raw_html = self._load(url, self.headers)
    list_tree = etree.HTML(raw_html)
    # Always a list here: a single link must not be iterated character by
    # character, and a failed lookup must not raise.
    href_list = self._xpath_all(list_tree, self.pathes['list']['link'])
    return [self.GetProductData(href) for href in href_list]

  def GetProductData(self, url):
    '''
      - It loads a product page. It scraps the product information.
      Returns a dictionary with the keys product_url, title, price,
      available and shipping_summary. A value that is not found on the page
      is ''; when a path matches several nodes the first one is used.
    '''
    raw_html = self._load(url, self.headers)
    page = etree.HTML(raw_html)
    paths = self.pathes['item']

    # NOTE: the item paths start with '//', which XPath resolves from the
    # document root, so the lookups below search the whole page even when
    # they are given the block element. Falling back to the page when the
    # block is missing therefore gives the same matches.
    block = self._first_match(page, paths['block'], None)
    if block is None:
      self.logger.warning("Product block not found - %s", url)
      block = page

    available = self._first_match(block, paths['available'])
    shipping = self._first_match(block, paths['shipping_summary'], None)
    return {
      'product_url': url,
      'title': self._first_match(page, paths['title']),
      'price': self._first_match(block, paths['price']),
      'available': available.replace('available', '').strip(),
      'shipping_summary': self._get_clean_text(shipping)
    }


def get_ebay_wishlist(url):
  '''
    - Function style entry point, so callers do not have to deal with the
    parser class. It builds an EbayParser with its default settings and
    returns what GetData gives for the wish list url.
  '''
  return EbayParser().GetData(url)

if __name__ == "__main__":
  # Demo run: scrape one hard-coded public wish list and dump the result.
  logging.basicConfig(format='%(asctime)s %(levelname)s %(message)s')
  url = "http://my.ebay.com/wishlist/?userid=vlikin"
  wishlist = get_ebay_wishlist(url)
  # Call form of print: same output on Python 2, and it also parses on
  # Python 3 where the print statement is a syntax error.
  print(wishlist)
	#!/usr/bin/env python
	import logging
	import urllib2

	from lxml.html import etree
	from lxml.html.clean import Cleaner


	class EbayParser(object):
	  '''
	    - Scrapes an eBay wish list page and the product pages it links to.
	    GetData(url) is the entry point; it returns a list of product
	    dictionaries built by GetProductData.
	  '''

	  # HTTP headers sent with every request. Copied per instance in __init__.
	  DEFAULT_HEADERS = {
	    "User-Agent": "Opera/9.64 (Windows NT 5.1; U; en) Presto/2.1.1",
	    "Accept": "text/html, application/xml;q=0.9, application/xhtml+xml, image/png, image/jpeg, image/gif, image/x-xbitmap, */*;q=0.1",
	    "Accept-Language": "ru,uk-UA;q=0.9,uk;q=0.8,en;q=0.7",
	    "Accept-Charset": "iso-8859-1, utf-8, utf-16, *;q=0.1",
	    "Accept-Encoding": "identity, *;q=0",
	    "Connection": "Keep-Alive"
	  }

	  # XPath expressions grouped by the page they are applied to.
	  DEFAULT_PATHES = {
	    'list': {
	      # The class name has to be a quoted literal. Unquoted, XPath reads it as
	      # a child-element test whose string value is '', and contains(x, '') is
	      # always true, so the div filter would match every div.
	      'link': '//div[contains(@class, "wlgd-row")]//a[contains(@class, "item-tle")]/@href'
	    },
	    'item': {
	      'title': '//h1[contains(@id, "itemTitle")]/text()',
	      'block': '//form[contains(@name, "viactiondetails")]',
	      'price': '//span[contains(@id, "prcIsum")]/text()',
	      'available': '//span[contains(@id, "qtySubTxt")]/text()',
	      'shipping_summary': '//div[contains(@id, "shippingSummary")]'
	    }
	  }

	  def __init__(self, logger_name='ebay_parser', logger_level=logging.DEBUG):
	    '''
	      - logger_name: name of the logger the parser reports to.
	      - logger_level: level set on that logger.
	    '''
	    self.product_list = []
	    # Per-instance copies, so changing one parser never affects another.
	    self.headers = dict(self.DEFAULT_HEADERS)
	    self.pathes = dict(
	      (section, dict(paths)) for section, paths in self.DEFAULT_PATHES.items()
	    )
	    self.cleaner = Cleaner(allow_tags=[''], remove_unknown_tags=False)
	    self.logger = logging.getLogger(logger_name)
	    self.logger.setLevel(logger_level)

	  def _try_xpath(self, dom_item, path):
	    '''
	      - Try to get items from the dom model according the path.
	      Returns the only match when there is exactly one, otherwise the raw
	      result (possibly an empty list). Returns None when the query fails.
	    '''
	    try:
	      result = dom_item.xpath(path)
	    except Exception:
	      # Best effort: report the failure with its cause and let the caller
	      # deal with the missing value.
	      self.logger.error("Wrong XPath - %s", path, exc_info=True)
	      return None
	    if isinstance(result, list) and len(result) == 1:
	      return result[0]
	    return result

	  def _xpath_all(self, dom_item, path):
	    '''
	      - Same query as _try_xpath, but the answer is always a list.
	      It is empty when nothing matched, the query failed or dom_item is None.
	    '''
	    if dom_item is None:
	      return []
	    result = self._try_xpath(dom_item, path)
	    if result is None:
	      return []
	    if isinstance(result, list):
	      return result
	    # _try_xpath unwrapped a single match; wrap it again.
	    return [result]

	  def _first_match(self, dom_item, path, default=''):
	    '''
	      - Returns the first match of the path or default when there is none.
	    '''
	    matches = self._xpath_all(dom_item, path)
	    if matches:
	      return matches[0]
	    return default

	  def _load(self, url, headers):
	    '''
	      - load data from the url using the headers.
	      This is a library function. Returns the raw response body.
	      Errors raised by urllib2 are not caught here.
	    '''
	    request = urllib2.Request(url=url, headers=headers)
	    response = urllib2.urlopen(url=request)
	    try:
	      return response.read()
	    finally:
	      # Close the connection even when read() fails.
	      response.close()

	  def _get_clean_text(self, dom):
	    '''
	      - It converts dom element to clean text: the element is serialized,
	      passed through the cleaner and runs of whitespace become one space.
	      Returns '' when dom is None.
	    '''
	    if dom is None:
	      return ''
	    html = etree.tostring(dom)
	    html = self.cleaner.clean_html(html)
	    return ' '.join(html.split())

	  def GetData(self, url):
	    '''
	      - It loads a wishlist page. It scraps product urls from.
	      It also loads product pages. It scraps the product information.
	      Returns a list with one product dictionary per link found.
	    '''
	    raw_html = self._load(url, self.headers)
	    list_tree = etree.HTML(raw_html)
	    # Always a list here: a single link must not be iterated character by
	    # character, and a failed lookup must not raise.
	    href_list = self._xpath_all(list_tree, self.pathes['list']['link'])
	    return [self.GetProductData(href) for href in href_list]

	  def GetProductData(self, url):
	    '''
	      - It loads a product page. It scraps the product information.
	      Returns a dictionary with the keys product_url, title, price,
	      available and shipping_summary. A value that is not found on the page
	      is ''; when a path matches several nodes the first one is used.
	    '''
	    raw_html = self._load(url, self.headers)
	    page = etree.HTML(raw_html)
	    paths = self.pathes['item']

	    # NOTE: the item paths start with '//', which XPath resolves from the
	    # document root, so the lookups below search the whole page even when
	    # they are given the block element. Falling back to the page when the
	    # block is missing therefore gives the same matches.
	    block = self._first_match(page, paths['block'], None)
	    if block is None:
	      self.logger.warning("Product block not found - %s", url)
	      block = page

	    available = self._first_match(block, paths['available'])
	    shipping = self._first_match(block, paths['shipping_summary'], None)
	    return {
	      'product_url': url,
	      'title': self._first_match(page, paths['title']),
	      'price': self._first_match(block, paths['price']),
	      'available': available.replace('available', '').strip(),
	      'shipping_summary': self._get_clean_text(shipping)
	    }


	def get_ebay_wishlist(url):
	  '''
	    - Function style entry point, so callers do not have to deal with the
	    parser class. It builds an EbayParser with its default settings and
	    returns what GetData gives for the wish list url.
	  '''
	  parser = EbayParser()
	  return parser.GetData(url)

	if __name__ == "__main__":
	  # Demo run: scrape one hard-coded public wish list and dump the result.
	  logging.basicConfig(format='%(asctime)s %(levelname)s %(message)s')
	  url = "http://my.ebay.com/wishlist/?userid=vlikin"
	  wishlist = get_ebay_wishlist(url)
	  # Call form of print: same output on Python 2, and it also parses on
	  # Python 3 where the print statement is a syntax error.
	  print(wishlist)