Skip to content

Instantly share code, notes, and snippets.

@pudo
Created December 5, 2014 08:20
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save pudo/dee44aa06e1fce584fe1 to your computer and use it in GitHub Desktop.
Save pudo/dee44aa06e1fce584fe1 to your computer and use it in GitHub Desktop.
import requests
from lxml import html
# Report page to scrape; the form found on it is posted back to this same
# address below.
URL = 'http://neas.environment.gov.za/portal/ApplicationsPerEAP_Report.aspx'

# A single session is used for both requests so that any cookies set by the
# first response are sent along with the second.
sess = requests.Session()
r = sess.get(URL)
doc = html.fromstring(r.content)

# Collect the fields to post back: every <input> whose name starts with '__'
# or contains '$Content$'.
# NOTE(review): presumably these are the page's hidden state fields plus the
# report's own controls -- confirm against the live page.
form = {}
for field in doc.findall('.//input'):
    name = field.get('name')
    # An <input> without a name attribute yields None here; skip it rather
    # than fail on None.startswith().
    if name is None:
        continue
    if name.startswith('__') or '$Content$' in name:
        form[name] = field.get('value')
# Parenthesised single-argument form prints the same under the Python 2
# print statement and the Python 3 print function.
print(form)

# Submit the collected fields; from here on `doc` is the parsed response to
# the POST, not the initial page.
r = sess.post(URL, data=form)
doc = html.fromstring(r.content)
def els_to_text(els):
    """Return the stripped text content of each element in *els*.

    Each element must provide a ``text_content()`` method returning a
    string (as lxml HTML elements do). The result is a new list with one
    entry per element, in the same order, with leading and trailing
    whitespace removed. An empty iterable gives an empty list.
    """
    return [el.text_content().strip() for el in els]
# Column names taken from the most recent row that contained <th> cells.
# Initialised once, so the value carries over between rows (and tables) until
# another header row replaces it.
headers = []
for table in doc.findall('.//table'):
    for tr in table.findall('.//tr'):
        header_cells = tr.findall('.//th')
        if header_cells:
            headers = els_to_text(header_cells)
        cells = els_to_text(tr.findall('.//td'))
        # A row with no <td> cells (e.g. a pure header row) carries no data;
        # skip it instead of printing an empty dict.
        if not cells:
            continue
        # zip() stops at the shorter of the two sequences, so surplus headers
        # or surplus cells are dropped from the record.
        record = dict(zip(headers, cells))
        # Parenthesised single-argument form prints the same under the
        # Python 2 print statement and the Python 3 print function.
        print(record)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment