Skip to content

Instantly share code, notes, and snippets.

@paulschreiber
Created October 12, 2010 23:33
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save paulschreiber/623122 to your computer and use it in GitHub Desktop.
Save paulschreiber/623122 to your computer and use it in GitHub Desktop.
#!/usr/bin/python
# By Paul Schreiber <paulschreiber at gmail.com>
# Free to use as you see fit. Credit appreciated.
#
# version 1.3
#
# Retrieved 2010-10-12:
# City and Town Clerks: http://gist.github.com/623148
# County Clerks: http://gist.github.com/623121
import os
import urllib
import sys
import re
# URL templates on webapps.sos.state.mi.us; "%s" is replaced with the
# zero-padded county index (county pages) or a numeric clerk id (local pages).
countyBaseUrl = "https://webapps.sos.state.mi.us/mivote/CountyClerk.aspx?cty=%s"
localBaseUrl = "https://webapps.sos.state.mi.us/mivote/LocalClerk.aspx?jd=%s"

# Downloaded pages and generated text files all live under cachePath.
cachePath = "mi"
countyCachePath = "%s/counties" % (cachePath)
cityCachePath = "%s/cities" % (cachePath)

# Patterns that pull individual fields out of a cached clerk page.
countyRe = re.compile(r'<span id="lblJuridName" class="jurisdictionName">(.+?)</span>')
addressRe = re.compile(r'<span id="lblAddress" class="pollText">(.*?)</span><br>')
# Two groups: the text before " MI " and the text after it.
cszRe = re.compile(r'<span id="lblCityStateZip" class="pollText">(.+?) MI (.+?)</span><br>')
# Two groups: the phone number and the fax number.
phoneFaxRe = re.compile(r'<span id="lblPhoneFax" class="pollText">Ph: ?(.*?), ?Fax: ?(.*?)</span><br>')
emailRe = re.compile(r'<span id="lblEmail" class="pollText">E-mail:<BR>(.*?)</span>')
# Raw string: "\?" is not a valid string escape, and the "." must be escaped
# so it only matches a literal dot in the link target.
localClerkRe = re.compile(r'LocalClerk\.aspx\?jd=([0-9]+)')
# Create both cache directories up front so later downloads have somewhere
# to write; a path that already exists is left untouched.
for _cacheDir in (countyCachePath, cityCachePath):
    if not os.path.exists(_cacheDir):
        os.makedirs(_cacheDir)
def cacheCountyClerks(firstCounty=1, lastCounty=83):
    """Download county clerk pages into the county cache directory.

    For every county index from firstCounty through lastCounty (inclusive)
    the page at countyBaseUrl is saved as "<countyCachePath>/<NN>.html",
    where NN is the zero-padded two-digit index. Each URL is printed before
    it is fetched.

    The defaults reproduce the range that used to be hard-coded (01-83).
    """
    # urlretrieve lives in urllib.request on Python 3 and in urllib on
    # Python 2; import it locally so the function works on either.
    try:
        from urllib.request import urlretrieve
    except ImportError:
        from urllib import urlretrieve

    for i in range(firstCounty, lastCounty + 1):
        countyIndex = "%02d" % i
        url = countyBaseUrl % countyIndex
        filePath = "%s/%s.html" % (countyCachePath, countyIndex)
        print(url)
        # The (filename, headers) return value is not needed.
        urlretrieve(url, filePath)
def parseCountyClerkData():
    """Parse the cached county pages into "<cachePath>/counties.txt"."""
    countyOutputPath = "%s/counties.txt" % cachePath
    parseData(countyCachePath, countyOutputPath, "county")
def parseCityClerkData():
    """Parse the cached city pages into "<cachePath>/cities.txt"."""
    cityOutputPath = "%s/cities.txt" % cachePath
    parseData(cityCachePath, cityOutputPath, "city")
def _findSingleMatch(pattern, html, errorMessage):
    """Return the only match of pattern in html; otherwise print errorMessage and exit."""
    matches = pattern.findall(html)
    if len(matches) != 1:
        print(errorMessage)
        sys.exit(1)
    return matches[0]


def parseData(aCachePath, outputPath, dataType):
    """Extract clerk contact details from cached HTML pages into a TSV file.

    Every "*.html" file in aCachePath becomes one tab-separated row in
    outputPath with the columns: jurisdiction name (a trailing " City" is
    removed), address, the two parts of the city/state/zip line (before and
    after " MI "), phone, fax and e-mail. Rows are written in sorted
    file-name order so repeated runs produce the same output.

    aCachePath -- directory holding the cached pages
    outputPath -- file to write; overwritten if it exists
    dataType   -- "city" or "county". For "city" each row is prefixed with
                  the county name looked up in "<cachePath>/cities-ids.txt",
                  keyed by the page's file name without ".html".

    If a page does not contain exactly one match for a field (or, for
    "city", its id is missing from cities-ids.txt) a message is printed and
    the process exits with status 1.
    """
    # Map clerk id -> county name. Only city pages need it; county pages
    # carry their own name, so the map stays empty for them.
    cityHash = {}
    if dataType == "city":
        with open("%s/cities-ids.txt" % cachePath) as idFile:
            idLines = idFile.read().split("\n")
        for idLine in idLines:
            columns = idLine.split("\t")
            county = columns[0]
            for clerkId in columns[1:]:
                cityHash[clerkId] = county

    data = []
    for fileName in sorted(os.listdir(aCachePath)):
        if not fileName.endswith(".html"):
            continue
        with open("%s/%s" % (aCachePath, fileName)) as htmlFile:
            html = htmlFile.read()

        line = []
        if dataType == "city":
            # start with the county name
            clerkId = fileName[:-5]
            if clerkId not in cityHash:
                print("Can't find county for %s (%s)" % (dataType, fileName))
                sys.exit(1)
            line.append(cityHash[clerkId])

        ccName = _findSingleMatch(
            countyRe, html, "Can't find %s (%s)" % (dataType, fileName))
        if ccName.endswith(" City"):
            ccName = ccName[:-5]
        line.append(ccName)

        line.append(_findSingleMatch(
            addressRe, html,
            "%s: Can't find address (%s)" % (ccName, fileName)))

        # cszRe yields (text before " MI ", text after " MI ").
        cszBefore, cszAfter = _findSingleMatch(
            cszRe, html,
            "%s: Can't find city/state/zip (%s)" % (ccName, fileName))
        line.append(cszBefore)
        line.append(cszAfter)

        phone, fax = _findSingleMatch(
            phoneFaxRe, html,
            "%s: Can't find phone + fax (%s)" % (ccName, fileName))
        line.append(phone)
        line.append(fax)

        line.append(_findSingleMatch(
            emailRe, html,
            "%s: Can't find email (%s)" % (ccName, fileName)))

        data.append("\t".join(line))

    with open(outputPath, "w") as outFile:
        outFile.write("\n".join(data))
def cacheCityClerksIdList():
    """Build "<cachePath>/cities-ids.txt" from the cached county pages.

    One line is written per "*.html" file in countyCachePath (in sorted
    file-name order): the county name with a trailing " County" removed,
    followed by every local clerk id linked from that page, all separated
    by tabs.

    If a page does not contain exactly one jurisdiction name, or links to
    no local clerks, a message is printed and the process exits with
    status 1.
    """
    data = []
    for fileName in sorted(os.listdir(countyCachePath)):
        if not fileName.endswith(".html"):
            continue
        with open("%s/%s" % (countyCachePath, fileName)) as htmlFile:
            html = htmlFile.read()

        counties = countyRe.findall(html)
        if len(counties) != 1:
            print("Can't find county (%s)" % (fileName))
            sys.exit(1)
        countyName = counties[0]
        # Only strip the suffix when it is really there, so an unexpected
        # name is kept intact instead of losing its last seven characters.
        if countyName.endswith(" County"):
            countyName = countyName[:-7]

        localClerks = localClerkRe.findall(html)
        if not localClerks:
            print("Can't find clerks (%s)" % (fileName))
            sys.exit(1)

        data.append("\t".join([countyName] + localClerks))

    with open("%s/cities-ids.txt" % cachePath, "w") as outFile:
        outFile.write("\n".join(data))
def cacheCityClerks():
    """Download every local clerk page listed in cities-ids.txt.

    Reads "<cachePath>/cities-ids.txt", where each line is tab-separated
    and every column after the first is a clerk id. For each non-empty id
    the page at localBaseUrl is saved as "<cityCachePath>/<id>.html". Each
    URL is printed before it is fetched.
    """
    # urlretrieve lives in urllib.request on Python 3 and in urllib on
    # Python 2; import it locally so the function works on either.
    try:
        from urllib.request import urlretrieve
    except ImportError:
        from urllib import urlretrieve

    with open("%s/cities-ids.txt" % cachePath) as idFile:
        lines = idFile.read().split("\n")

    for line in lines:
        # The first column is skipped; only the ids that follow are fetched.
        for clerkId in line.split("\t")[1:]:
            if not clerkId:
                continue
            url = localBaseUrl % clerkId
            filePath = "%s/%s.html" % (cityCachePath, clerkId)
            print(url)
            # The (filename, headers) return value is not needed.
            urlretrieve(url, filePath)
# cacheCountyClerks()
# parseCountyClerkData()
# cacheCityClerksIdList()
# cacheCityClerks()
# parseCityClerkData()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment