Skip to content

Instantly share code, notes, and snippets.

@jirwin
Created March 5, 2012 09:15
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save jirwin/1977593 to your computer and use it in GitHub Desktop.
Save jirwin/1977593 to your computer and use it in GitHub Desktop.
from lxml.html import parse
# Scrape knife names and prices out of an HTML page and print them as
# "name, price" lines.
#
# The source passed to parse() can also be a full web URL.
doc = parse('knives.html').getroot()

# Maps knife name -> price text (including the leading '$').
knives = {}

# Start by iterating through every table cell in the document.
for td in doc.cssselect('td'):
    # Only cells with align=center, valign=top and non-blank text are
    # treated as knife entries; skip everything else.
    if (td.get('align') != 'center' or td.get('valign') != 'top'
            or not td.text_content().strip()):
        continue

    # The knife's name is the text of the first <b> element in the cell.
    # A matching cell without any <b> cannot be named, so skip it rather
    # than aborting the whole scrape with an IndexError.
    bold_elements = td.cssselect('b')
    if not bold_elements:
        continue
    name = bold_elements[0].text_content().strip()

    # Descend further to find the price.
    # NOTE(review): this selector only matches markup that contains an
    # explicit <tbody> element -- confirm against the real page.
    for new_td in td.cssselect('div table tbody tr td'):
        price = new_td.text_content().strip()
        if price.startswith('$'):
            # Use price[1:] here instead to store the price without the
            # dollar sign. If several cells start with '$', the last one
            # wins.
            knives[name] = price

# Print the values we saved. print(...) with a single pre-formatted string
# and dict.items() behave the same on Python 2 and Python 3.
for name, price in knives.items():
    print("%s, %s" % (name, price))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment