Skip to content

Instantly share code, notes, and snippets.

@lovesh
Last active August 29, 2015 14:06
Show Gist options
  • Save lovesh/1f5e413581ffdf45de08 to your computer and use it in GitHub Desktop.
Save lovesh/1f5e413581ffdf45de08 to your computer and use it in GitHub Desktop.
playing with HTML DOM using lxml
import lxml.html
import urllib2
class DOM(object):
    """Small wrapper around an lxml HTML document with XPath helpers.

    The document is built either from HTML fetched from ``url`` or from
    the ``html`` string passed in.  When parsing fails (or no HTML is
    available) ``self.document`` is set to ``False`` and the query
    methods that depend on it return ``False`` instead of raising.
    """

    # Characters trimmed from both ends of table cell text.
    _CELL_STRIP_CHARS = ':\n\t\r '

    def __init__(self, url=None, html=None, utf8=False):
        """Fetch and/or parse HTML into ``self.document``.

        url  -- if truthy, the page is downloaded with urllib2 and its
                body replaces any ``html`` argument.
        html -- raw HTML to parse when no ``url`` is given.
        utf8 -- if true, the HTML is decoded as ISO-8859-1 and
                re-encoded as UTF-8 before parsing.
        """
        if url:
            page = urllib2.urlopen(url)
            try:
                html = page.read()
            finally:
                # Close the connection even when read() fails.
                page.close()

        # NOTE(review): the original indentation was lost; this assumes the
        # re-encoding applies to HTML from either source -- confirm.
        if utf8 and html is not None:
            html = html.decode('iso-8859-1').encode('utf8')

        self.html = html
        self.document = False
        if html is not None:
            try:
                self.document = lxml.html.document_fromstring(html)
            except Exception:
                # Best effort: an unparsable page leaves document as False.
                self.document = False

    def getNodesWithXpath(self, xpath):
        """Return a copy of the nodes matching ``xpath``.

        The matches are also stored on ``self.nodes``.  Returns ``False``
        when no document was parsed.
        """
        if self.document is False:
            return False
        self.nodes = self.document.xpath(xpath)
        return self.nodes[:]

    def getLinksWithXpath(self, xpath):
        """Return ``[text, href]`` pairs for the nodes matching ``xpath``.

        The matches are also stored on ``self.nodes``.  Returns ``False``
        when no document was parsed.
        """
        if self.document is False:
            return False
        self.nodes = self.document.xpath(xpath)
        return [[anchor.text_content(), anchor.get('href')]
                for anchor in self.nodes]

    def getImgUrlWithXpath(self, xpath):
        """Return the ``src`` attribute of every node matching ``xpath``.

        The matches are also stored on ``self.nodes``.  Returns ``False``
        when no document was parsed, like the other query methods.
        """
        if self.document is False:
            return False
        self.nodes = self.document.xpath(xpath)
        return [img.get('src') for img in self.nodes]

    def parseTBody(self, tbody_xpath):
        """Parse the first <tbody> matching ``tbody_xpath`` into a dict.

        Each <tr> with at least two <td> contributes one entry: the key
        is the first <td>'s content and the value is the second <td>'s
        content (see parseTBodyNode for the normalisation applied).

        Returns ``False`` when no document was parsed and an empty dict
        when the XPath matches nothing.
        """
        if self.document is False:
            return False
        matches = self.document.xpath(tbody_xpath)
        if not matches:
            return {}
        return self.parseTBodyNode(matches[0])

    def parseTBodyNode(self, tbody):
        """Parse a <tbody> node into a dict.

        Each <tr> with at least two <td> contributes one entry.  The key
        is the first <td>'s text, stripped of surrounding colons and
        whitespace, lower-cased and with dots removed; the value is the
        second <td>'s text with the same stripping.  Rows with fewer than
        two cells are skipped; later duplicate keys overwrite earlier ones.
        """
        strip_chars = self._CELL_STRIP_CHARS
        data = {}
        for tr in tbody.xpath('tr'):
            tds = tr.xpath('td')
            if len(tds) < 2:
                continue
            key = tds[0].text_content().strip(strip_chars).lower().replace('.', '')
            data[key] = tds[1].text_content().strip(strip_chars)
        return data
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment