Created
April 28, 2013 12:34
-
-
Save victor-shelepen/5476759 to your computer and use it in GitHub Desktop.
This script scrapes pages and returns product info from a user's wish list. I know that eBay has its own REST API, but I had some problems during registration, so using the eBay REST API will be the next step. I am sharing this code to get advice from other programmers on how to write it properly. Thank you.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python
import contextlib
import logging
import urllib2

from lxml.html import etree
from lxml.html.clean import Cleaner
class EbayParser(object):
    """Scrape product information from an eBay wish-list page.

    The parser downloads a wish-list page, collects the product links found
    on it, then downloads every product page and extracts a few fields
    (title, price, availability, shipping summary) with XPath expressions.
    """

    def __init__(self, logger_name='ebay_parser', logger_level=logging.DEBUG):
        """Set up request headers, XPath expressions, HTML cleaner and logger.

        logger_name  -- name passed to ``logging.getLogger``.
        logger_level -- level set on that logger.
        """
        # Kept for backward compatibility; the methods below do not use it.
        self.product_list = []
        self.headers = {
            "User-Agent": "Opera/9.64 (Windows NT 5.1; U; en) Presto/2.1.1",
            # "image/png" used to be spelled "image/ png", which is not a
            # valid media type.
            "Accept": "text/html, application/xml;q=0.9, application/xhtml+xml, image/png, image/jpeg, image/gif, image/x-xbitmap, */*;q=0.1",
            "Accept-Language": "ru,uk-UA;q=0.9,uk;q=0.8,en;q=0.7",
            "Accept-Charset": "iso-8859-1, utf-8, utf-16, *;q=0.1",
            "Accept-Encoding": "identity, *;q=0",
            "Connection": "Keep-Alive",
        }
        # NOTE: every expression starts with '//', so it is evaluated against
        # the whole document even when it is run on a sub-element.
        self.pathes = {
            'list': {
                # The class name must be a quoted string literal; unquoted it
                # is a child-element name test that yields an empty string,
                # which makes contains() true for every <div>.
                'link': '//div[contains(@class, "wlgd-row")]//a[contains(@class, "item-tle")]/@href',
            },
            'item': {
                'title': '//h1[contains(@id, "itemTitle")]/text()',
                'block': '//form[contains(@name, "viactiondetails")]',
                'price': '//span[contains(@id, "prcIsum")]/text()',
                'available': '//span[contains(@id, "qtySubTxt")]/text()',
                'shipping_summary': '//div[contains(@id, "shippingSummary")]',
            },
        }
        self.cleaner = Cleaner(allow_tags=[''], remove_unknown_tags=False)
        self.logger = logging.getLogger(logger_name)
        self.logger.setLevel(logger_level)

    @staticmethod
    def _as_list(result):
        """Normalize a ``_try_xpath`` result to a list.

        ``None`` becomes ``[]``, a list is returned unchanged and any other
        value (a single match) is wrapped in a one-element list.
        """
        if result is None:
            return []
        if isinstance(result, list):
            return result
        return [result]

    @staticmethod
    def _first(result, default=None):
        """Return the first match of a ``_try_xpath`` result, or ``default``."""
        items = EbayParser._as_list(result)
        return items[0] if items else default

    def _try_xpath(self, dom_item, path):
        """Evaluate ``path`` on ``dom_item``.

        Returns the single match when there is exactly one, otherwise the
        list of matches (possibly empty). Returns ``None`` and logs an error
        when the evaluation raises.
        """
        try:
            result = dom_item.xpath(path)
            if len(result) == 1:
                return result[0]
            return result
        except Exception:
            # Deliberately best-effort: a failing expression must not abort
            # the whole scrape.
            self.logger.error("Wrong XPath - %s", path)
            return None

    def _load(self, url, headers):
        """Download ``url`` with the given request ``headers``.

        Returns the raw response body. Errors raised by ``urllib2`` are
        propagated to the caller.
        """
        request = urllib2.Request(url=url, headers=headers)
        # closing() guarantees the response is closed even if read() raises.
        with contextlib.closing(urllib2.urlopen(request)) as response:
            return response.read()

    def _get_clean_text(self, dom):
        """Convert a DOM element to whitespace-normalized text.

        Returns an empty string when ``dom`` is ``None`` (element not found).
        """
        if dom is None:
            return ''
        html = etree.tostring(dom)
        html = self.cleaner.clean_html(html)
        # Collapse every run of whitespace into a single space.
        return ' '.join(html.split())

    def GetData(self, url):
        """Load a wish-list page and scrape every product linked from it.

        Returns a list with one product dict (see ``GetProductData``) per
        link found; the list is empty when no link is found.
        """
        raw_html = self._load(url, self.headers)
        list_tree = etree.HTML(raw_html)
        # _try_xpath returns a bare string for a single link and None on
        # failure, so normalize before iterating.
        href_list = self._as_list(
            self._try_xpath(list_tree, self.pathes['list']['link']))
        return [self.GetProductData(href) for href in href_list]

    def GetProductData(self, url):
        """Load a product page and scrape the product information.

        Returns a dict with the keys ``product_url``, ``title``, ``price``,
        ``available`` and ``shipping_summary``. ``title`` and ``price`` hold
        the raw ``_try_xpath`` result; ``available`` and ``shipping_summary``
        are strings, empty when the element is missing.
        """
        raw_html = self._load(url, self.headers)
        product = {
            'product_url': url,
            'title': '',
            'price': '',
            'available': '',
            'shipping_summary': '',
        }
        tree = etree.HTML(raw_html)
        paths = self.pathes['item']
        product['title'] = self._try_xpath(tree, paths['title'])

        # Fall back to the whole document when the details form is missing
        # or ambiguous; the expressions are absolute, so the matches are the
        # same either way.
        block = self._try_xpath(tree, paths['block'])
        scope = tree if block is None or isinstance(block, list) else block

        product['price'] = self._try_xpath(scope, paths['price'])
        available = self._first(self._try_xpath(scope, paths['available']), '')
        product['available'] = available.replace('available', '').strip()
        product['shipping_summary'] = self._get_clean_text(
            self._first(self._try_xpath(scope, paths['shipping_summary'])))
        return product
def get_ebay_wishlist(url):
    """Return the products scraped from the wish-list page at ``url``.

    Function-style entry point: it builds an ``EbayParser`` with default
    settings and hands back whatever its ``GetData`` method returns, so
    callers can reuse the scraping logic without touching the class.
    """
    return EbayParser().GetData(url)
if __name__ == "__main__":
    # Script entry point: configure log output, scrape the sample wish list
    # and dump the resulting product list to stdout.
    logging.basicConfig(format='%(asctime)s %(levelname)s %(message)s')
    wishlist = get_ebay_wishlist("http://my.ebay.com/wishlist/?userid=vlikin")
    print(wishlist)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
This script scrapes pages and returns product info from a user's wish list. I know that eBay has its own REST API, but I had some problems during registration, so using the eBay REST API will be the next step. I am sharing this code to get advice from other programmers on how to write it properly. Thank you.