@lossyrob
Last active January 15, 2017 02:49
Scraper for the EPA Greenhouse Gas Program data site. Implemented as part of DataRefugePhilly (http://www.ppehlab.org/datarefuge)
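A few usage notes, inferred from the imports and shell calls below (the gist itself doesn't name the file or list prerequisites):

# Requires the mechanicalsoup package, patched as in the diff at the bottom
# of this gist, plus `wget` on the PATH (downloads shell out to it).
# Outputs: the downloaded .CSV files, file_directory.csv (an index of what
# was fetched), and fails.csv (pages that failed; on the next run only
# those are retried).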
import os, csv, sys, traceback
import mechanicalsoup
from subprocess import call
browser = mechanicalsoup.Browser()
def pull_table_links(s):
    tables = s.select("table")
    if tables and len(tables[0].select('tr')) > 1:
        # The original referenced the global `page` here; use the soup that
        # was passed in.
        return map(lambda x: x.select("a")[0].attrs['href'],
                   s.select("tr")[0].select('td'))
    else:
        return None
def process_facility_info(p):
    # Unfinished stub; defined but never called below.
    table_links = pull_table_links(p.soup)
    pass
def process_step_3(p):
    def get_form(p):
        # The form whose submit button reads "STEP 4: Enter Search Criteria"
        # is the column-selection form.
        form = list(filter(
            lambda x: any([i for i in x.select('input')
                           if 'value' in i.attrs
                           and 'STEP 4: Enter Search Criteria' in i.attrs['value']]),
            p.soup.select('form')))[0]
        table = form.select('table')[0]
        rows = list(filter(
            lambda r: any(filter(lambda i: i.attrs.get('type') == 'checkbox'
                                 and 'value' in i.attrs,
                                 r.select('input'))),
            table.select('tr')))
        columns, extras = [], []
        for r in rows:
            cells = r.select('td')
            checkbox = cells[0].select('input')[0]
            i_name = checkbox.attrs['name']
            i_value = checkbox.attrs['value']
            name = cells[1].contents[0]
            desc = cells[2].contents[0]
            columns.append((name, desc))
            extras.append((i_name, i_value))
        return (form, columns, extras)

    # Select every column checkbox, then ask the results form for CSV output.
    (form, columns, extras) = get_form(p)
    step4_page = browser.submit(form, p.url, extra=extras)
    form = list(filter(lambda x: 'name' in x.attrs and x.attrs['name'] == "QForm",
                       step4_page.soup.select('form')))[0]
    extras = [('csv_output', 'Output to CSV File')]
    final_page = browser.submit(form, p.url, extra=extras)
    links = final_page.soup.select('a')
    csv_a = list(filter(lambda x: 'href' in x.attrs and x.attrs['href'].endswith(".CSV"),
                        links))
    if len(csv_a) != 1:
        print(final_page.soup)
        raise Exception("Couldn't get CSV!")
    return csv_a[0].attrs['href']
def process_step_2(generate_page):
    def get_radios(p):
        # The form whose submit button reads "Step 3: Select Columns" is the
        # table-selection form. (The original indexed i.attrs['value'] without
        # checking it exists, which raises KeyError on value-less inputs.)
        form = list(filter(
            lambda x: any([i for i in x.select('input')
                           if 'value' in i.attrs
                           and 'Step 3: Select Columns' in i.attrs['value']]),
            p.soup.select('form')))[0]
        table = form.select('table')[0]
        rows = list(filter(
            lambda r: any(filter(lambda i: i.attrs.get('type') == 'radio',
                                 r.select('input'))),
            table.select('tr')))
        table_selections = []
        for r in rows:
            cells = r.select('td')
            # Skip greyed-out and disabled table options.
            if cells[0].attrs.get('style') == "font-style: italic; color: grey;":
                continue
            if any(filter(lambda x: 'disabled' in x.attrs, cells[0].select('input'))):
                continue
            radio = cells[0].select('input')[0]
            i_name = radio.attrs['name']
            i_value = radio.attrs['value']
            name = cells[1].contents[0]
            desc = ""
            if len(cells[2].contents) > 0:
                desc = cells[2].contents[0]
            table_selections.append((name, desc, i_name, i_value))
        return (form, table_selections)

    p = generate_page()
    try:
        count = len(get_radios(p)[1])
        print("  STEP 2 TABLE COUNT %d" % count)
        results = []
        # Re-fetch the step 2 page for each table option, then submit its
        # radio button to reach step 3.
        for i in range(0, count):
            p = generate_page()
            radios = get_radios(p)
            form = radios[0]
            (name, desc, i_name, i_value) = radios[1][i]
            print("    %s" % name)
            step3_page = browser.submit(form, p.url, extra=[(i_name, i_value)])
            csv_link = process_step_3(step3_page)
            results.append((name, desc, csv_link))
    except:
        print(p.soup)
        raise
    return results
if __name__ == "__main__":
    top_map = {}
    if not os.path.exists('fails.csv'):
        # First run: collect every top-level dataset from the search page.
        page = browser.get("https://www.epa.gov/enviro/greenhouse-gas-customized-search")
        rows = filter(lambda x: x.select('th'), page.soup.select("table")[0].select('tr'))
        for row in rows:
            a = row.select('th')[0].select('a')[0]
            title = a.contents[0]
            href = a.attrs['href']
            desc = row.select('td')[0].contents[0]
            # Handle facility page subpages
            if href == "https://ofmpub.epa.gov/enviro/ad_hoc_table_column_select_v2.retrieval_list?database_type=GHG&selected_subjects=Facility+Information&subject_selection=+&table_1=+":
                p = browser.get(href)
                try:
                    t = p.soup.select('table')[0]
                except:
                    print(p)
                    raise
                for sub_row in t.select("tr"):
                    a = sub_row.select('a')[0]
                    sub_title = a.contents[0]
                    sub_href = "https:" + a.attrs['href']
                    sub_desc = sub_row.select('td')[1].contents[0]
                    top_map[sub_href] = ("%s: %s" % (title, sub_title),
                                         "%s | %s" % (desc, sub_desc))
            else:
                top_map[href] = (title, desc)
    else:
        # Retry run: fails.csv exists, so only reprocess the pages that failed.
        with open('fails.csv', 'r') as csvfile:
            reader = csv.reader(csvfile)
            for row in reader:
                top_map[row[0]] = (row[1], row[2])
    in_reporting_program = []
    results = []
    fails = []
    for href in top_map:
        (main_title, main_desc) = top_map[href]
        print("Processing top level: %s" % main_title)
        print("  %s" % href)
        p = browser.get(href)
        try:
            # Check to see if it actually has the data
            if any(filter(lambda e: 'This data cannot be downloaded from the Envirofacts interface.' in e.contents[0],
                          p.soup.select('h5'))):
                print("CANNOT PROCESS, IN 'GHG Reporting Program Data Sets'")
                in_reporting_program.append(href)
            else:
                # Find the step 2 button form
                form = list(filter(
                    lambda x: any([i for i in x.select('input')
                                   if 'value' in i.attrs and 'Step 2' in i.attrs['value']]),
                    p.soup.select('form')))[0]
                print("  ENTERING STEP 2")
                def generate_page():
                    return browser.submit(form, p.url)
                # Carry the dataset title/description along with each table
                # result; the original left them out, so the output loop below
                # wrote whatever main_title/main_desc held after the last
                # iteration.
                results.extend([(main_title, main_desc) + r
                                for r in process_step_2(generate_page)])
        except Exception:
            print("FAILED!")
            traceback.print_exc(file=sys.stdout)
            fails.append((href, main_title, main_desc))
    txt = []
    # The original checked for "file_descriptions.csv" here but appended to
    # "file_directory.csv" below; use one name consistently.
    if not os.path.exists("file_directory.csv"):
        txt.append('"Title","Description","Table_Title","Table_Description","File"')
    for (main_title, main_desc, name, desc, csv_link) in results:
        l = '"%s","%s","%s","%s","%s"' % (main_title, main_desc, name, desc,
                                          os.path.basename(csv_link))
        txt.append(l)
        print(l)
        call("wget %s" % csv_link, shell=True)
    open("file_directory.csv", 'a').write('\n'.join(txt) + '\n')

    if fails:
        def m(f):
            (href, title, desc) = f
            return '"%s","%s","%s"' % (href, title, desc)
        failed_csv = '\n'.join(map(m, fails))
        open("fails.csv", 'w').write(failed_csv + "\n")
        print("%d FAILED" % len(fails))
diff --git a/mechanicalsoup/browser.py b/mechanicalsoup/browser.py
index c5741f2..2942fe5 100644
--- a/mechanicalsoup/browser.py
+++ b/mechanicalsoup/browser.py
@@ -106,7 +106,17 @@ class Browser(object):
         if method.lower() == "get":
             kwargs["params"] = data
         else:
-            kwargs["data"] = data
+            d = []
+            d.extend(list(data.items()))
+            if "extra" in kwargs:
+                d.extend(kwargs['extra'])
+                del kwargs['extra']
+
+            kwargs["data"] = d
+
+            print("REQUEST DATA")
+            for (k, v) in kwargs["data"]:
+                print("    %s = %s" % (k, v))
         return requests.Request(method, url, files=files, **kwargs)

     def _prepare_request(self, form, url=None, **kwargs):
@lossyrob (Author) commented:
The MechanicalSoup patch is necessary because the EPA site serves forms that repeat the name "table_1" across many inputs, while MechanicalSoup (sensibly) assumes form input names are unique and stores them in a dict, so all but one of the repeated values would be dropped.
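A sketch of the failure mode and the workaround (the field values here are made up; only the "table_1" name and the `extra` mechanism come from the gist):

# Stock MechanicalSoup serializes a form into a dict keyed by input name,
# so repeated names collapse to a single value:
fields = [("table_1", "FACILITY"), ("table_1", "EMISSIONS")]  # hypothetical values
data = {}
for name, value in fields:
    data[name] = value
assert data == {"table_1": "EMISSIONS"}  # the first value is lost

# The patched submit() rebuilds the POST body as a list of tuples and
# appends anything passed via `extra`, so duplicates survive; requests
# encodes a list of tuples with repeats intact:
post_data = list(data.items()) + fields
# requests.Request("POST", url, data=post_data) encodes the body as
#   table_1=EMISSIONS&table_1=FACILITY&table_1=EMISSIONS
# which is why the scraper passes the checkbox/radio pairs via extra=...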
