Created
October 12, 2010 23:33
-
-
Save paulschreiber/623122 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python
# By Paul Schreiber <paulschreiber at gmail.com>
# Free to use as you see fit. Credit appreciated.
#
# version 1.3
#
# Retrieved 2010-10-12:
# City and Town Clerks: http://gist.github.com/623148
# County Clerks: http://gist.github.com/623121
import os
import re
import sys
import urllib

try:  # Python 3 moved urlretrieve into urllib.request
    from urllib.request import urlretrieve
except ImportError:  # Python 2
    from urllib import urlretrieve
# URL templates for the clerk lookup pages on webapps.sos.state.mi.us; the
# "%s" placeholder takes a county index or a local jurisdiction id.
countyBaseUrl = "https://webapps.sos.state.mi.us/mivote/CountyClerk.aspx?cty=%s"
localBaseUrl = "https://webapps.sos.state.mi.us/mivote/LocalClerk.aspx?jd=%s"

# Downloaded pages and the generated text files live under this directory.
cachePath = "mi"
countyCachePath = "%s/counties" % (cachePath)
cityCachePath = "%s/cities" % (cachePath)

# Patterns that pull individual fields out of a cached clerk page. They are
# raw strings so regex escapes such as "\?" are never treated as (invalid)
# string escapes.
countyRe = re.compile(r'<span id="lblJuridName" class="jurisdictionName">(.+?)</span>')
addressRe = re.compile(r'<span id="lblAddress" class="pollText">(.*?)</span><br>')
cszRe = re.compile(r'<span id="lblCityStateZip" class="pollText">(.+?) MI (.+?)</span><br>')
phoneFaxRe = re.compile(r'<span id="lblPhoneFax" class="pollText">Ph: ?(.*?), ?Fax: ?(.*?)</span><br>')
emailRe = re.compile(r'<span id="lblEmail" class="pollText">E-mail:<BR>(.*?)</span>')
# Captures the numeric jurisdiction id from links to local clerk pages.
localClerkRe = re.compile(r'LocalClerk.aspx\?jd=([0-9]+)')

# Make sure both cache directories exist before anything reads or writes them.
if not os.path.exists(countyCachePath):
    os.makedirs(countyCachePath)
if not os.path.exists(cityCachePath):
    os.makedirs(cityCachePath)
def cacheCountyClerks():
    """Download every county clerk page into the county cache directory.

    County indexes 1 through 83 are zero-padded to two digits, substituted
    into ``countyBaseUrl`` and saved as ``<countyCachePath>/<index>.html``.
    Each URL is printed before it is fetched.
    """
    # Plain decimal bounds: a leading-zero literal such as "01" is octal
    # notation in Python 2 and a syntax error in Python 3.
    for i in range(1, 84):
        countyIndex = "%02d" % i
        url = countyBaseUrl % countyIndex
        filePath = "%s/%s.html" % (countyCachePath, countyIndex)
        print(url)
        # The (filename, headers) return value is not needed here.
        urlretrieve(url, filePath)
def parseCountyClerkData():
    """Run parseData over the cached county pages.

    The result is written to ``<cachePath>/counties.txt``.
    """
    outputFile = "%s/counties.txt" % cachePath
    parseData(countyCachePath, outputFile, "county")
def parseCityClerkData():
    """Run parseData over the cached city pages.

    The result is written to ``<cachePath>/cities.txt``.
    """
    outputFile = "%s/cities.txt" % cachePath
    parseData(cityCachePath, outputFile, "city")
def _singleMatch(pattern, html, errorMessage):
    """Return the only match of pattern in html, or report and exit.

    If the pattern does not match exactly once, errorMessage is printed and
    the script exits with a non-zero status.
    """
    matches = pattern.findall(html)
    if len(matches) != 1:
        print(errorMessage)
        sys.exit(1)
    return matches[0]


def parseData(aCachePath, outputPath, dataType):
    """Extract clerk contact details from cached pages into a tab-separated file.

    Every ``*.html`` file in aCachePath yields one output line holding the
    jurisdiction name, address, city, zip, phone, fax and e-mail. When
    dataType is "city" each line is additionally prefixed with the county
    name, looked up by the file's base name in ``<cachePath>/cities-ids.txt``.

    Args:
        aCachePath: directory containing the cached ``.html`` pages.
        outputPath: file to write; lines are joined with "\\n" and there is
            no trailing newline.
        dataType: "city" or "county"; selects the county prefix and is used
            in the first error message.

    Raises:
        SystemExit: if any field pattern does not match exactly once.
        KeyError: if dataType is "city" and a page's id is not listed in
            cities-ids.txt.
    """
    # Map each local clerk id to the county named in the first column of
    # its cities-ids.txt row. Only city pages need this lookup.
    cityHash = {}
    if dataType == "city":
        with open("%s/cities-ids.txt" % cachePath) as idFile:
            for idLine in idFile.read().split("\n"):
                columns = idLine.split("\t")
                county = columns[0]
                for clerkId in columns[1:]:
                    cityHash[clerkId] = county

    data = []
    # Sorted so the output order does not depend on directory listing order.
    for fileName in sorted(os.listdir(aCachePath)):
        if not fileName.endswith(".html"):
            continue

        with open("%s/%s" % (aCachePath, fileName)) as htmlFile:
            html = htmlFile.read()

        line = []
        if dataType == "city":
            # Start with the county name; the file's base name is the clerk id.
            line.append(cityHash[fileName[:-5]])

        ccName = _singleMatch(
            countyRe, html, "Can't find %s (%s)" % (dataType, fileName))
        if ccName.endswith(" City"):
            ccName = ccName[:-5]
        line.append(ccName)

        line.append(_singleMatch(
            addressRe, html,
            "%s: Can't find address (%s)" % (ccName, fileName)))

        # cszRe captures the text before and after " MI ".
        city, zipCode = _singleMatch(
            cszRe, html,
            "%s: Can't find city/state/zip (%s)" % (ccName, fileName))
        line.append(city)
        line.append(zipCode)

        phone, fax = _singleMatch(
            phoneFaxRe, html,
            "%s: Can't find phone + fax (%s)" % (ccName, fileName))
        line.append(phone)
        line.append(fax)

        line.append(_singleMatch(
            emailRe, html,
            "%s: Can't find email (%s)" % (ccName, fileName)))

        data.append("\t".join(line))

    with open(outputPath, "w") as outputFile:
        outputFile.write("\n".join(data))
def cacheCityClerksIdList():
    """Build ``<cachePath>/cities-ids.txt`` from the cached county pages.

    For every ``*.html`` file in ``countyCachePath`` one tab-separated row is
    written: the county name (without a trailing " County") followed by each
    local clerk id linked from that page, in the order the links appear.
    Rows are joined with "\\n" and there is no trailing newline.

    Raises:
        SystemExit: if a page does not contain exactly one jurisdiction name
            or contains no local clerk links.
    """
    data = []
    # Sorted so the output order does not depend on directory listing order.
    for fileName in sorted(os.listdir(countyCachePath)):
        if not fileName.endswith(".html"):
            continue

        with open("%s/%s" % (countyCachePath, fileName)) as htmlFile:
            html = htmlFile.read()

        counties = countyRe.findall(html)
        if len(counties) != 1:
            print("Can't find county (%s)" % (fileName))
            sys.exit(1)
        countyName = counties[0]
        # Strip the suffix only when it is really there, rather than
        # chopping seven characters off whatever name was matched.
        if countyName.endswith(" County"):
            countyName = countyName[:-7]

        localClerks = localClerkRe.findall(html)
        if not localClerks:
            print("Can't find clerks (%s)" % (fileName))
            sys.exit(1)

        data.append("\t".join([countyName] + localClerks))

    with open("%s/cities-ids.txt" % cachePath, "w") as idFile:
        idFile.write("\n".join(data))
def cacheCityClerks():
    """Download every local clerk page listed in cities-ids.txt.

    Each row of ``<cachePath>/cities-ids.txt`` is tab-separated; the first
    column is skipped and every remaining non-empty column is treated as a
    clerk id. The id is substituted into ``localBaseUrl`` and the page is
    saved as ``<cityCachePath>/<id>.html``. Each URL is printed before it is
    fetched.
    """
    with open("%s/cities-ids.txt" % cachePath) as idFile:
        lines = idFile.read().split("\n")

    for line in lines:
        for clerkId in line.split("\t")[1:]:
            if not clerkId:
                continue
            url = localBaseUrl % clerkId
            filePath = "%s/%s.html" % (cityCachePath, clerkId)
            print(url)
            # The (filename, headers) return value is not needed here.
            urlretrieve(url, filePath)
# cacheCountyClerks()
# parseCountyClerkData()
# cacheCityClerksIdList()
# cacheCityClerks()
# parseCityClerkData()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment