Last active
January 15, 2017 02:49
-
-
Save lossyrob/db5ebc427e43bf211f7874a0b7fdb3c6 to your computer and use it in GitHub Desktop.
Scraper for the EPA Greenhouse Gas Program data site. Implemented as part of DataRefugePhilly (http://www.ppehlab.org/datarefuge)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os, csv, sys, traceback | |
import mechanicalsoup | |
from subprocess import call | |
browser = mechanicalsoup.Browser() | |
def pull_table_links(s):
    """Return the hrefs of the links in the first row of the first table
    found in soup *s*, or None when no table with at least two rows exists.
    """
    tables = s.select("table")
    if tables and len(tables[0].select('tr')) > 1:
        # BUG FIX: the original read the module-global ``page.soup`` here
        # instead of the table located in *s*, so it silently depended on
        # whatever page the main script had fetched last.
        first_row = tables[0].select("tr")[0]
        return [cell.select("a")[0].attrs['href'] for cell in first_row.select('td')]
    else:
        return None
def process_facility_info(p):
    # NOTE(review): stub — pulls the sub-table links from the facility page
    # but never uses them. Presumably meant to recurse into each facility
    # sub-table like the main flow does; TODO confirm intent or remove.
    table_links = pull_table_links(p.soup)
    pass
def process_step_3(p):
    """Drive 'STEP 4: Enter Search Criteria' on the column-selection page *p*
    and return the href of the generated CSV download link.

    Selects every available column checkbox, submits, then requests CSV
    output on the query form. Relies on the patched mechanicalsoup
    ``Browser.submit`` accepting an ``extra`` list of (name, value) pairs,
    because the EPA forms repeat input names (e.g. ``table_1``).

    Raises Exception when exactly one ``.CSV`` link cannot be found.
    """
    def get_form(p):
        # Locate the form whose submit button is labelled for step 4; some
        # inputs carry no 'value' attribute, hence the guard.
        form = list(filter(lambda x: any([i for i in x.select('input') if 'value' in i.attrs and 'STEP 4: Enter Search Criteria' in i.attrs['value']]), p.soup.select('form')))[0]
        table = form.select('table')[0]
        # Keep only rows whose first cell holds a value-carrying checkbox
        # (one row per selectable output column).
        rows = list(filter(lambda r: any(filter(lambda i: i.attrs['type'] == 'checkbox' and 'value' in i.attrs, r.select('input'))), table.select('tr')))
        columns, extras = [], []
        for r in rows:
            cells = r.select('td')
            checkbox = cells[0].select('input')[0]
            i_name = checkbox.attrs['name']
            i_value = checkbox.attrs['value']
            name = cells[1].contents[0]
            desc = cells[2].contents[0]
            columns.append((name, desc))
            extras.append((i_name, i_value))
        return (form, columns, extras)
    (form, columns, extras) = get_form(p)
    # Select every column, then ask the resulting query form for CSV output.
    step4_page = browser.submit(form, p.url, extra=extras)
    form = list(filter(lambda x: 'name' in x.attrs and x.attrs['name'] == "QForm", step4_page.soup.select('form')))[0]
    extras = [('csv_output', 'Output to CSV File')]
    final_page = browser.submit(form, p.url, extra=extras)
    links = final_page.soup.select('a')
    # (removed leftover debug prints that dumped every anchor on the page)
    csv_a = list(filter(lambda x: 'href' in x.attrs and x.attrs['href'].endswith(".CSV"), links))
    if len(csv_a) != 1:
        print(final_page.soup)  # dump the page for post-mortem debugging
        raise Exception("Couldn't get CSV!")
    return csv_a[0].attrs['href']
def process_step_2(generate_page):
    """Iterate every selectable table on the 'Step 3: Select Columns' page.

    *generate_page* is a zero-argument callable returning a freshly fetched
    page, because submitting the radio form consumes its state, so the page
    must be regenerated for each table.

    Returns a list of (table_name, table_description, csv_link) tuples, one
    per selectable table, where csv_link comes from process_step_3.
    """
    def get_radios(p):
        # Locate the form whose submit button is labelled for step 3.
        # BUG FIX: guard with ``'value' in i.attrs`` — some inputs carry no
        # value attribute and raised KeyError here (the equivalent locator in
        # process_step_3.get_form already had this guard).
        form = list(filter(lambda x: any([i for i in x.select('input') if 'value' in i.attrs and 'Step 3: Select Columns' in i.attrs['value']]), p.soup.select('form')))[0]
        table = form.select('table')[0]
        rows = list(filter(lambda r: any(filter(lambda i: i.attrs['type'] == 'radio', r.select('input'))), table.select('tr'))) 
        table_selections = []
        for r in rows:
            cells = r.select('td')
            # Greyed-out (italic) or disabled rows are not selectable tables.
            if 'style' in cells[0].attrs and cells[0].attrs['style'] == "font-style: italic; color: grey;":
                continue
            if any(filter(lambda x: 'disabled' in x.attrs, cells[0].select('input'))):
                continue
            radio = cells[0].select('input')[0]
            i_name = radio.attrs['name']
            i_value = radio.attrs['value']
            name = cells[1].contents[0]
            desc = ""
            if len(cells[2].contents) > 0:
                desc = cells[2].contents[0]
            table_selections.append((name, desc, i_name, i_value))
        return (form, table_selections)
    p = generate_page()
    try:
        count = len(get_radios(p)[1])
        results = []
        for i in range(0, count):
            # Re-fetch the page each pass: the previous submit invalidated it.
            p = generate_page()
            radios = get_radios(p)
            form = radios[0]
            (name, desc, i_name, i_value) = radios[1][i]
            step3_page = browser.submit(form, p.url, extra=[(i_name, i_value)])
            csv_link = process_step_3(step3_page)
            results.append((name, desc, csv_link))
            print(" %s" % name)
        print(" STEP 2 TABLE COUNT %d" % count)
    except:
        print(p.soup)  # dump the failing page before propagating
        raise
    return results
if __name__ == "__main__":
    # Map of top-level table URL -> (title, description).
    top_map = { }
    if not os.path.exists('fails.csv'):
        # Fresh run: scrape the full list of tables from the EPA search page.
        page = browser.get("https://www.epa.gov/enviro/greenhouse-gas-customized-search")
        rows = filter(lambda x: x.select('th'), page.soup.select("table")[0].select('tr'))
        count = 0
        for row in rows:
            a = row.select('th')[0].select('a')[0]
            title = a.contents[0]
            href = a.attrs['href']
            desc = row.select('td')[0].contents[0]
            # Handle facility page subpages
            if href == "https://ofmpub.epa.gov/enviro/ad_hoc_table_column_select_v2.retrieval_list?database_type=GHG&selected_subjects=Facility+Information&subject_selection=+&table_1=+":
                p = browser.get(href)
                try:
                    t = p.soup.select('table')[0]
                except:
                    print(p)  # dump the raw page before propagating
                    raise
                for row in t.select("tr"):
                    a = row.select('a')[0]
                    sub_title = a.contents[0]
                    sub_href = "https:" + a.attrs['href']
                    sub_desc = row.select('td')[1].contents[0]
                    top_map[sub_href] = ("%s: %s" % (title, sub_title), "%s | %s" % (desc, sub_desc))
            else:
                top_map[href] = (title, desc)
    else:
        # Resume: only retry the URLs recorded as failures by a previous run.
        with open('fails.csv', 'r') as csvfile:
            reader = csv.reader(csvfile)
            for row in reader:
                top_map[row[0]] = (row[1], row[2])
    in_reporting_program = []
    # Each entry: (main_title, main_desc, table_name, table_desc, csv_link).
    results = []
    fails = []
    for href in top_map:
        (main_title, main_desc) = top_map[href]
        print("Processing top level: %s" % top_map[href][0])
        print(" %s" % href)
        p = browser.get(href)
        try:
            # Check to see if it actually has the data
            if any(filter(lambda e: 'This data cannot be downloaded from the Envirofacts interface.' in e.contents[0], p.soup.select('h5'))):
                print("CANNOT PROCESS, IN 'GHG Reporting Program Data Sets'")
                in_reporting_program.append(href)
            else:
                # Find the step 2 button form (guard: some inputs lack 'value')
                form = list(filter(lambda x: any([i for i in x.select('input') if 'value' in i.attrs and 'Step 2' in i.attrs['value']]), p.soup.select('form')))[0]
                print(" ENTERING STEP 2")
                def generate_page(form=form, p=p):
                    return browser.submit(form, p.url)
                # BUG FIX: record the top-level title/desc with each result;
                # previously the CSV rows written below all reused the
                # main_title/main_desc left over from the LAST loop iteration.
                for (name, desc, csv_link) in process_step_2(generate_page):
                    results.append((main_title, main_desc, name, desc, csv_link))
        except Exception:
            print("FAILED!")
            traceback.print_exc(file=sys.stdout)
            fails.append((href, main_title, main_desc))
    txt = []
    # BUG FIX: the header-existence check previously tested
    # 'file_descriptions.csv' (never created), so a header row was appended
    # on every run; check the file actually written to instead.
    if not os.path.exists("file_directory.csv"):
        txt.append('"Title","Description","Table_Title","Table_Description","File"')
    for (main_title, main_desc, name, desc, csv_link) in results:
        l = '"%s","%s","%s","%s","%s"' % (main_title, main_desc, name, desc, os.path.basename(csv_link))
        txt.append(l)
        print(l)
        # List form avoids shell interpolation of a scraped URL.
        call(["wget", csv_link])
    with open("file_directory.csv", 'a') as out:
        out.write('\n'.join(txt) + '\n')
    if fails:
        def m(f):
            (href, title, desc) = f
            return '"%s","%s","%s"' % (href, title, desc)
        failed_csv = '\n'.join(map(m, fails))
        with open("fails.csv", 'w') as out:
            out.write(failed_csv + "\n")
        print("%d FAILED" % len(fails))
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
diff --git a/mechanicalsoup/browser.py b/mechanicalsoup/browser.py | |
index c5741f2..2942fe5 100644 | |
--- a/mechanicalsoup/browser.py | |
+++ b/mechanicalsoup/browser.py | |
@@ -106,7 +106,17 @@ class Browser(object): | |
if method.lower() == "get": | |
kwargs["params"] = data | |
else: | |
- kwargs["data"] = data | |
+ d = [] | |
+ d.extend(list(data.items())) | |
+ if "extra" in kwargs: | |
+ d.extend(kwargs['extra']) | |
+ del kwargs['extra'] | |
+ | |
+ kwargs["data"] = d | |
+ | |
+ print("REQUEST DATA") | |
+ for (k, v) in kwargs["data"]: | |
+ print(" %s = %s" % (k, v)) | |
return requests.Request(method, url, files=files, **kwargs) | |
def _prepare_request(self, form, url=None, **kwargs): |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
The MechanicalSoup patch is necessary because the EPA site's forms repeat the input name "table_1" across many values, whereas MechanicalSoup (sanely) assumes that form input names are unique; the patch lets `submit` pass extra duplicate (name, value) pairs through to the request.