Mechanize. Should automatically handle sessions
import mechanize
url = r'http://waterquality.lcra.org/parameter.aspx?qrySite=12159'
request = mechanize.Request(url)
response = mechanize.urlopen(request)
forms = mechanize.ParseResponse(response, backwards_compat=False)
response.close()
form = forms[0]
print type(form)
print form.controls
for formkey in form.controls:
    print formkey
    #print formkey.value
    try:
        print formkey.options
    except Exception, e:
        print e
        pass
#print form
#print mechanize.urlopen(form.click()).read()
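If the form actually needs to be submitted, a minimal sketch building on the commented-out form.click() line above, assuming the parameter fields are plain checkbox controls (as the page's checkAll() JavaScript suggests); the ASP.NET hidden fields that ParseResponse picked up are posted along automatically:
for control in form.controls:
    # tick every checkbox option before submitting, mirroring checkAll(..., true)
    if control.type == "checkbox":
        for item in control.items:
            item.selected = True
print mechanize.urlopen(form.click()).read()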
RoboBrowser
from robobrowser import RoboBrowser
from requests import Session
import requests
session = Session()
# use real HTTP header names (User-Agent, Accept) rather than CGI-style HTTP_* keys
headers = {'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.0.13) Gecko/2009073022 Firefox/3.0.13',
           'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
           'Content-Type': 'application/x-www-form-urlencoded'}
session.headers.update(headers)
browser = RoboBrowser(history=True, session=session, user_agent=headers['User-Agent'])
browser.open('http://waterquality.lcra.org/parameter.aspx?qrySite=12159')
print ""
print browser.session.cookies.get_dict()
print browser.session.headers
print browser.response.headers
print browser.response.cookies.get_dict()
print requests.utils.dict_from_cookiejar(browser.session.cookies)
print ""
form = browser.get_form(action="events.aspx")
print form.keys()
for formkey in form.keys():
    try:
        #print formitem
        #print type(formitem)
        #will fail if the item is not an option
        print type(form[formkey])
        print int(form[formkey].options[0])
        form[formkey].value = form[formkey].options[0]
    except Exception, e:
        print form[formkey].value
        print "skip", formkey
        print "because", e
        pass
print form.serialize().to_requests(form.method.upper())
browser.submit_form(form)
print browser.response
print browser.response.text
grid = browser.select('#GridView1')
print grid
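For comparison, a rough sketch of the same round trip with the bare requests.Session, which is more or less what submit_form does underneath: copy every named <input> out of the form (that is what carries the ASP.NET __VIEWSTATE/__EVENTVALIDATION hidden fields) and POST it back to the form's action URL. It only re-posts the default values and ignores <select>/<textarea> fields, so picking specific options would still need the per-field loop above.
import urlparse
from bs4 import BeautifulSoup

page = session.get('http://waterquality.lcra.org/parameter.aspx?qrySite=12159')
soup = BeautifulSoup(page.text, 'html.parser')
form_tag = soup.find('form', action='events.aspx')
# the hidden ASP.NET state fields come along with the rest of the inputs
payload = {inp.get('name'): inp.get('value', '') for inp in form_tag.find_all('input') if inp.get('name')}
post_url = urlparse.urljoin(page.url, form_tag['action'])
result = session.post(post_url, data=payload)
print result.status_code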
Working ghost.py, simplified
from ghost import Ghost
from bs4 import BeautifulSoup
ghost = Ghost()
with ghost.start() as session:
    page, extra_resources = session.open("http://waterquality.lcra.org/parameter.aspx?qrySite=12159")
    result, resources = session.evaluate("checkAll(document.form1.multiple,true);checkAll(document.form1.SelectCategories,true);")
    #session.capture_to("gotcha.png")
    result, resources = session.evaluate("document.getElementById('form1').submit();", expect_loading=True)
    session.wait_for_page_loaded()
    #session.capture_to("res.png")
    print "!!!!!", result.headers
    soup = BeautifulSoup(result.content, 'html.parser')
    gridview = soup.find(id="GridView1")
    results = []
    #get the column headers from the <th> cells
    headers = [head.string for head in gridview.findAll('th')]
    #blank cells come through as \xa0
    for row in gridview.findAll('tr'):
        vals = [aux.string for aux in row.findAll('td')]
        results.append(dict(zip(headers, vals)))
    print results
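A small follow-up sketch for the scraped rows: the header <tr> has no <td> cells, so it lands in results as an empty dict, and blank cells are the \xa0 placeholders noted above. Dropping those and writing the rest to CSV (the filename is arbitrary, and Python 2's csv module assumes the cell text is plain ASCII):
import csv

rows = [dict((k, None if v == u'\xa0' else v) for k, v in row.items()) for row in results if row]
with open('waterquality_12159.csv', 'wb') as f:
    writer = csv.DictWriter(f, fieldnames=headers)
    writer.writeheader()
    for row in rows:
        writer.writerow(row)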