@CloudCray
Last active August 29, 2015 14:14
Download CSV of NDCA judges
import urllib.parse
import urllib.request
import bs4
import csv

p = urllib.parse
request = urllib.request
url = 'http://www.ndca.org/directories/'
urls = []
for i in range(18):  # 436 adjudicators right now; they load in sets of 25, so 18 pages are needed
    query = {"categories": "Adjudicator",
             "results": "{0}-500".format(str(i * 25 + 1))}  # fewer than 500 results, returned 25 at a time
    url_q = url + "?" + p.urlencode(query)
    urls.append(url_q)
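
# Convert an HTML <table> into a list of dicts keyed by the column headers.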
def table_to_list_of_dicts(table):
    rows = table.findChildren(name="tr")
    data = []
    # Use the header row inside <thead> if present, otherwise fall back to the first row
    thead = table.findChild(name="thead")
    first_row = thead.findChild(name="tr") if thead else None
    row_index = 0
    if not first_row:
        first_row = rows[0]
        row_index = 1
    keys = [str(x.text).strip() for x in first_row.children if x.name in ("td", "th")]
    for row in rows[row_index:]:
        vals = [str(x.text).strip() for x in row.children if x.name in ("td", "th")]
        d = {}
        # Skip malformed rows and repeated header rows
        if len(vals) == len(keys) and vals != keys:
            for i in range(len(keys)):
                d[keys[i]] = vals[i]
            data.append(d)
    return data
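
# Fetch one directory page and parse its adjudicator table.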
def get_page_judges(url):
    req = request.Request(url)
    resp = request.urlopen(req).read()
    soup = bs4.BeautifulSoup(resp, "html.parser")  # explicit parser avoids bs4's "no parser specified" warning
    table = soup.find("table", {"class": "data compact"})
    data = table_to_list_of_dicts(table)
    return data
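
# Scrape every page and accumulate the rows.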
data = []
for x in urls:
    l = get_page_judges(x)
    data += l
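
# Write the collected rows to a CSV, using the first row's keys as the header.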
file_out = open("ndca-adjudicators.csv", "w", newline="\n")
writer = csv.writer(file_out)
keys = list(data[0].keys())
writer.writerow(keys)
for row in data:
    writer.writerow([row[x] for x in keys])
file_out.close()