Last active
January 15, 2017 02:49
-
-
Save lossyrob/db5ebc427e43bf211f7874a0b7fdb3c6 to your computer and use it in GitHub Desktop.
Scraper for the EPA Greenhouse Gas Program data site. Implemented as part of DataRefugePhilly (http://www.ppehlab.org/datarefuge)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os, csv, sys, traceback | |
import mechanicalsoup | |
from subprocess import call | |
browser = mechanicalsoup.Browser() | |
def pull_table_links(s):
    """Return the hrefs of the links in the first row of the first table
    found in soup *s*, or None when no table with at least two rows exists.
    """
    tables = s.select("table")
    if tables and len(tables[0].select('tr')) > 1:
        # BUG FIX: the original read the module-global ``page.soup`` here
        # instead of the table located in *s*, so it silently depended on
        # whatever page the main script had fetched last.
        first_row = tables[0].select("tr")[0]
        return [cell.select("a")[0].attrs['href'] for cell in first_row.select('td')]
    else:
        return None
def process_facility_info(p):
    # NOTE(review): stub — pulls the sub-table links from the facility page
    # but never uses them. Presumably meant to recurse into each facility
    # sub-table like the main flow does; TODO confirm intent or remove.
    table_links = pull_table_links(p.soup)
    pass
def process_step_3(p):
    """Drive 'STEP 4: Enter Search Criteria' on the column-selection page *p*
    and return the href of the generated CSV download link.

    Selects every available column checkbox, submits, then requests CSV
    output on the query form. Relies on the patched mechanicalsoup
    ``Browser.submit`` accepting an ``extra`` list of (name, value) pairs,
    because the EPA forms repeat input names (e.g. ``table_1``).

    Raises Exception when exactly one ``.CSV`` link cannot be found.
    """
    def get_form(p):
        # Locate the form whose submit button is labelled for step 4; some
        # inputs carry no 'value' attribute, hence the guard.
        form = list(filter(lambda x: any([i for i in x.select('input') if 'value' in i.attrs and 'STEP 4: Enter Search Criteria' in i.attrs['value']]), p.soup.select('form')))[0]
        table = form.select('table')[0]
        # Keep only rows whose first cell holds a value-carrying checkbox
        # (one row per selectable output column).
        rows = list(filter(lambda r: any(filter(lambda i: i.attrs['type'] == 'checkbox' and 'value' in i.attrs, r.select('input'))), table.select('tr')))
        columns, extras = [], []
        for r in rows:
            cells = r.select('td')
            checkbox = cells[0].select('input')[0]
            i_name = checkbox.attrs['name']
            i_value = checkbox.attrs['value']
            name = cells[1].contents[0]
            desc = cells[2].contents[0]
            columns.append((name, desc))
            extras.append((i_name, i_value))
        return (form, columns, extras)
    (form, columns, extras) = get_form(p)
    # Select every column, then ask the resulting query form for CSV output.
    step4_page = browser.submit(form, p.url, extra=extras)
    form = list(filter(lambda x: 'name' in x.attrs and x.attrs['name'] == "QForm", step4_page.soup.select('form')))[0]
    extras = [('csv_output', 'Output to CSV File')]
    final_page = browser.submit(form, p.url, extra=extras)
    links = final_page.soup.select('a')
    # (removed leftover debug prints that dumped every anchor on the page)
    csv_a = list(filter(lambda x: 'href' in x.attrs and x.attrs['href'].endswith(".CSV"), links))
    if len(csv_a) != 1:
        print(final_page.soup)  # dump the page for post-mortem debugging
        raise Exception("Couldn't get CSV!")
    return csv_a[0].attrs['href']
def process_step_2(generate_page):
    """Iterate every selectable table on the 'Step 3: Select Columns' page.

    *generate_page* is a zero-argument callable returning a freshly fetched
    page, because submitting the radio form consumes its state, so the page
    must be regenerated for each table.

    Returns a list of (table_name, table_description, csv_link) tuples, one
    per selectable table, where csv_link comes from process_step_3.
    """
    def get_radios(p):
        # Locate the form whose submit button is labelled for step 3.
        # BUG FIX: guard with ``'value' in i.attrs`` — some inputs carry no
        # value attribute and raised KeyError here (the equivalent locator in
        # process_step_3.get_form already had this guard).
        form = list(filter(lambda x: any([i for i in x.select('input') if 'value' in i.attrs and 'Step 3: Select Columns' in i.attrs['value']]), p.soup.select('form')))[0]
        table = form.select('table')[0]
        rows = list(filter(lambda r: any(filter(lambda i: i.attrs['type'] == 'radio', r.select('input'))), table.select('tr'))) 
        table_selections = []
        for r in rows:
            cells = r.select('td')
            # Greyed-out (italic) or disabled rows are not selectable tables.
            if 'style' in cells[0].attrs and cells[0].attrs['style'] == "font-style: italic; color: grey;":
                continue
            if any(filter(lambda x: 'disabled' in x.attrs, cells[0].select('input'))):
                continue
            radio = cells[0].select('input')[0]
            i_name = radio.attrs['name']
            i_value = radio.attrs['value']
            name = cells[1].contents[0]
            desc = ""
            if len(cells[2].contents) > 0:
                desc = cells[2].contents[0]
            table_selections.append((name, desc, i_name, i_value))
        return (form, table_selections)
    p = generate_page()
    try:
        count = len(get_radios(p)[1])
        results = []
        for i in range(0, count):
            # Re-fetch the page each pass: the previous submit invalidated it.
            p = generate_page()
            radios = get_radios(p)
            form = radios[0]
            (name, desc, i_name, i_value) = radios[1][i]
            step3_page = browser.submit(form, p.url, extra=[(i_name, i_value)])
            csv_link = process_step_3(step3_page)
            results.append((name, desc, csv_link))
            print(" %s" % name)
        print(" STEP 2 TABLE COUNT %d" % count)
    except:
        print(p.soup)  # dump the failing page before propagating
        raise
    return results
if __name__ == "__main__":
    # Map of top-level table URL -> (title, description).
    top_map = { }
    if not os.path.exists('fails.csv'):
        # Fresh run: scrape the full list of tables from the EPA search page.
        page = browser.get("https://www.epa.gov/enviro/greenhouse-gas-customized-search")
        rows = filter(lambda x: x.select('th'), page.soup.select("table")[0].select('tr'))
        count = 0
        for row in rows:
            a = row.select('th')[0].select('a')[0]
            title = a.contents[0]
            href = a.attrs['href']
            desc = row.select('td')[0].contents[0]
            # Handle facility page subpages
            if href == "https://ofmpub.epa.gov/enviro/ad_hoc_table_column_select_v2.retrieval_list?database_type=GHG&selected_subjects=Facility+Information&subject_selection=+&table_1=+":
                p = browser.get(href)
                try:
                    t = p.soup.select('table')[0]
                except:
                    print(p)  # dump the raw page before propagating
                    raise
                for row in t.select("tr"):
                    a = row.select('a')[0]
                    sub_title = a.contents[0]
                    sub_href = "https:" + a.attrs['href']
                    sub_desc = row.select('td')[1].contents[0]
                    top_map[sub_href] = ("%s: %s" % (title, sub_title), "%s | %s" % (desc, sub_desc))
            else:
                top_map[href] = (title, desc)
    else:
        # Resume: only retry the URLs recorded as failures by a previous run.
        with open('fails.csv', 'r') as csvfile:
            reader = csv.reader(csvfile)
            for row in reader:
                top_map[row[0]] = (row[1], row[2])
    in_reporting_program = []
    # Each entry: (main_title, main_desc, table_name, table_desc, csv_link).
    results = []
    fails = []
    for href in top_map:
        (main_title, main_desc) = top_map[href]
        print("Processing top level: %s" % top_map[href][0])
        print(" %s" % href)
        p = browser.get(href)
        try:
            # Check to see if it actually has the data
            if any(filter(lambda e: 'This data cannot be downloaded from the Envirofacts interface.' in e.contents[0], p.soup.select('h5'))):
                print("CANNOT PROCESS, IN 'GHG Reporting Program Data Sets'")
                in_reporting_program.append(href)
            else:
                # Find the step 2 button form (guard: some inputs lack 'value')
                form = list(filter(lambda x: any([i for i in x.select('input') if 'value' in i.attrs and 'Step 2' in i.attrs['value']]), p.soup.select('form')))[0]
                print(" ENTERING STEP 2")
                def generate_page(form=form, p=p):
                    return browser.submit(form, p.url)
                # BUG FIX: record the top-level title/desc with each result;
                # previously the CSV rows written below all reused the
                # main_title/main_desc left over from the LAST loop iteration.
                for (name, desc, csv_link) in process_step_2(generate_page):
                    results.append((main_title, main_desc, name, desc, csv_link))
        except Exception:
            print("FAILED!")
            traceback.print_exc(file=sys.stdout)
            fails.append((href, main_title, main_desc))
    txt = []
    # BUG FIX: the header-existence check previously tested
    # 'file_descriptions.csv' (never created), so a header row was appended
    # on every run; check the file actually written to instead.
    if not os.path.exists("file_directory.csv"):
        txt.append('"Title","Description","Table_Title","Table_Description","File"')
    for (main_title, main_desc, name, desc, csv_link) in results:
        l = '"%s","%s","%s","%s","%s"' % (main_title, main_desc, name, desc, os.path.basename(csv_link))
        txt.append(l)
        print(l)
        # List form avoids shell interpolation of a scraped URL.
        call(["wget", csv_link])
    with open("file_directory.csv", 'a') as out:
        out.write('\n'.join(txt) + '\n')
    if fails:
        def m(f):
            (href, title, desc) = f
            return '"%s","%s","%s"' % (href, title, desc)
        failed_csv = '\n'.join(map(m, fails))
        with open("fails.csv", 'w') as out:
            out.write(failed_csv + "\n")
        print("%d FAILED" % len(fails))
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
diff --git a/mechanicalsoup/browser.py b/mechanicalsoup/browser.py | |
index c5741f2..2942fe5 100644 | |
--- a/mechanicalsoup/browser.py | |
+++ b/mechanicalsoup/browser.py | |
@@ -106,7 +106,17 @@ class Browser(object): | |
if method.lower() == "get": | |
kwargs["params"] = data | |
else: | |
- kwargs["data"] = data | |
+ d = [] | |
+ d.extend(list(data.items())) | |
+ if "extra" in kwargs: | |
+ d.extend(kwargs['extra']) | |
+ del kwargs['extra'] | |
+ | |
+ kwargs["data"] = d | |
+ | |
+ print("REQUEST DATA") | |
+ for (k, v) in kwargs["data"]: | |
+ print(" %s = %s" % (k, v)) | |
return requests.Request(method, url, files=files, **kwargs) | |
def _prepare_request(self, form, url=None, **kwargs): |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
The MechanicalSoup patch is necessary because the EPA site's forms repeat the input name "table_1" across many values, whereas MechanicalSoup (sanely) assumes that form input names are unique; the patch lets `submit` pass extra duplicate (name, value) pairs through to the request.