Skip to content

Instantly share code, notes, and snippets.

@nathanhilbert
Last active August 29, 2015 14:25
Show Gist options
  • Save nathanhilbert/79bc915a002b985ca027 to your computer and use it in GitHub Desktop.
Save nathanhilbert/79bc915a002b985ca027 to your computer and use it in GitHub Desktop.
waterquality.lcra.org attempts

Mechanize: should handle sessions automatically.


import mechanize

# Fetch the LCRA parameter page with mechanize and dump the controls of its
# first form so the available fields can be inspected by hand.
url = r'http://waterquality.lcra.org/parameter.aspx?qrySite=12159'
request = mechanize.Request(url)
response = mechanize.urlopen(request)
try:
    forms = mechanize.ParseResponse(response, backwards_compat=False)
finally:
    # Close the response even when parsing the page raises.
    response.close()

if not forms:
    # Fail with a readable message instead of an IndexError on forms[0].
    raise SystemExit("no forms found at %s" % url)

form = forms[0]
# Single-argument print(...) behaves the same as the print statement on
# Python 2 and is also valid Python 3 syntax.
print(type(form))
print(form.controls)

for control in form.controls:
    print(control)
    try:
        print(control.options)
    except Exception as e:
        # Best-effort dump: report why this control has no printable
        # options and move on to the next one.
        print(e)

# Next step, not enabled yet: submit the form and read the result with
# mechanize.urlopen(form.click()).read()

RoboBrowser


from robobrowser import RoboBrowser 
from requests import Session
import requests

# Open the LCRA parameter page with RoboBrowser, pick the first option of
# every select-like field of the "events.aspx" form, submit it and print the
# resulting grid.
session = Session()

user_agent = ('Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.0.13) '
              'Gecko/2009073022 Firefox/3.0.13')

# Use the real HTTP header names. The 'HTTP_USER_AGENT' / 'HTTP_ACCEPT'
# spellings are CGI/WSGI environ keys; put in a request they are sent as
# headers literally named that way rather than as User-Agent / Accept.
headers = {
    'User-Agent': user_agent,
    'Accept': 'text/html,application/xhtml+xml,application/xml; q=0.9,*/*; q=0.8',
    'Content-Type': 'application/x-www-form-urlencoded',
}

session.headers.update(headers)

browser = RoboBrowser(history=True, session=session, user_agent=user_agent)

browser.open('http://waterquality.lcra.org/parameter.aspx?qrySite=12159')

# Debug dump of the cookies and headers exchanged so far.
print("")
print(browser.session.cookies.get_dict())
print(browser.session.headers)
print(browser.response.headers)
print(browser.response.cookies.get_dict())
print(requests.utils.dict_from_cookiejar(browser.session.cookies))
print("")

form = browser.get_form(action="events.aspx")
if form is None:
    # Fail with a readable message instead of an AttributeError below.
    raise SystemExit("form with action 'events.aspx' not found")
print(form.keys())

for formkey in form.keys():
    field = form[formkey]
    try:
        # This fails on purpose for fields that have no options, or whose
        # first option is not an integer; those fields keep their value.
        print(type(field))
        print(int(field.options[0]))
        field.value = field.options[0]
    except Exception as e:
        print(field.value)
        print("skip %s" % (formkey,))
        print("because %s" % (e,))

print(form.serialize().to_requests(form.method.upper()))

browser.submit_form(form)

print(browser.response)
print(browser.response.text)

grid = browser.select('#GridView1')
print(grid)

Working ghost.py, simplified

from ghost import Ghost
from bs4 import BeautifulSoup


# Drive the LCRA parameter page in a headless ghost.py session: tick every
# checkbox through the page's own checkAll() JavaScript helper, submit the
# form, then parse the GridView1 table of the result into a list of dicts.
ghost = Ghost()

with ghost.start() as session:
    page, extra_resources = session.open("http://waterquality.lcra.org/parameter.aspx?qrySite=12159")
    result, resources = session.evaluate("checkAll(document.form1.multiple,true);checkAll(document.form1.SelectCategories,true);")

    # `result` from this submit call is what gets parsed below; its
    # .headers and .content attributes are read after the session closes.
    result, resources = session.evaluate("document.getElementById('form1').submit();", expect_loading=True)
    session.wait_for_page_loaded()

print("!!!!! %s" % (result.headers,))
soup = BeautifulSoup(result.content, 'html.parser')

gridview = soup.find(id="GridView1")
if gridview is None:
    # Fail with a readable message instead of an AttributeError on None.
    raise SystemExit("element with id 'GridView1' not found in the response")

# Column names, in table order.
headers = [head.string for head in gridview.findAll('th')]

# Per the original note, the page uses \xa0 for blank cells.
results = []
for row in gridview.findAll('tr'):
    cells = row.findAll('td')
    if not cells:
        # Rows without <td> cells (e.g. the <th> header row) would only
        # contribute an empty dict.
        continue
    results.append(dict(zip(headers, [cell.string for cell in cells])))

print(results)


Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment