# paulschreiber/mi-clerk-fetcher.py

## mi-clerk-fetcher.py
#!/usr/bin/python

# By Paul Schreiber <paulschreiber at gmail.com>
# Free to use as you see fit. Credit appreciated.
#
# version 1.3
#
# Retrieved 2010-10-12:
# City and Town Clerks: http://gist.github.com/623148
# County Clerks: http://gist.github.com/623121

import os
import urllib
import sys
import re

# URL templates for the clerk lookup pages; "%s" is replaced with the county
# index (CountyClerk.aspx) or the local clerk id (LocalClerk.aspx).
countyBaseUrl = "https://webapps.sos.state.mi.us/mivote/CountyClerk.aspx?cty=%s"
localBaseUrl = "https://webapps.sos.state.mi.us/mivote/LocalClerk.aspx?jd=%s"

# Cache layout: downloaded pages and generated .txt files live under cachePath.
cachePath = "mi"
countyCachePath = "%s/counties" % (cachePath)
cityCachePath = "%s/cities" % (cachePath)

# Patterns that pull one field each out of a cached clerk page.
# Raw strings keep regex escapes away from Python's own string escaping.
countyRe = re.compile(r'<span id="lblJuridName" class="jurisdictionName">(.+?)</span>')
addressRe = re.compile(r'<span id="lblAddress" class="pollText">(.*?)</span><br>')
# Two groups: the text before and after the literal " MI ".
cszRe = re.compile(r'<span id="lblCityStateZip" class="pollText">(.+?) MI (.+?)</span><br>')
# Two groups: the text after "Ph:" and the text after "Fax:".
phoneFaxRe = re.compile(r'<span id="lblPhoneFax" class="pollText">Ph: ?(.*?), ?Fax: ?(.*?)</span><br>')

emailRe = re.compile(r'<span id="lblEmail" class="pollText">E-mail:<BR>(.*?)</span>')

# Captures the numeric id of each LocalClerk.aspx link.  The dot and the
# question mark are escaped so they only match themselves.
localClerkRe = re.compile(r'LocalClerk\.aspx\?jd=([0-9]+)')

# Make sure both cache directories exist before anything reads or writes them.
for _cacheDir in (countyCachePath, cityCachePath):
	if os.path.exists(_cacheDir):
		continue
	os.makedirs(_cacheDir)

def cacheCountyClerks():
	"""Download every county clerk page into the county cache directory.

	Requests county indexes 01 through 83 from countyBaseUrl and saves each
	response as "<countyCachePath>/<index>.html".  Each URL is printed
	before it is fetched.
	"""
	# urlretrieve lives in urllib.request on Python 3 and in urllib on Python 2.
	try:
		from urllib.request import urlretrieve
	except ImportError:
		from urllib import urlretrieve

	for i in range(1, 84):
		countyIndex = "%02d" % i
		url = countyBaseUrl % countyIndex
		filePath = "%s/%s.html" % (countyCachePath, countyIndex)

		print(url)
		urlretrieve(url, filePath)

def parseCountyClerkData():
	"""Run parseData over the county cache, targeting "<cachePath>/counties.txt"."""
	outputPath = "%s/counties.txt" % cachePath
	parseData(countyCachePath, outputPath, "county")

def parseCityClerkData():
	"""Run parseData over the city cache, targeting "<cachePath>/cities.txt"."""
	outputPath = "%s/cities.txt" % cachePath
	parseData(cityCachePath, outputPath, "city")


def _abortParse(message):
	"""Print message and stop the script with a failing exit status."""
	print(message)
	sys.exit(1)


def parseData(aCachePath, outputPath, dataType):
	"""Extract clerk contact details from cached HTML pages into a TSV file.

	Every "*.html" file in aCachePath becomes one tab-separated row holding
	the jurisdiction name followed by the groups captured by addressRe,
	cszRe, phoneFaxRe and emailRe.  When dataType is "city" the row is
	prefixed with the county name looked up in "<cachePath>/cities-ids.txt".

	Arguments:
	aCachePath -- directory holding the cached pages
	outputPath -- file to write; rows are joined with newlines
	dataType   -- "city" adds the county prefix; the value is also used in
	              the "Can't find ..." message

	Prints a message and exits the script with status 1 when a page does not
	contain exactly one match for a field, or when a city page has no entry
	in cities-ids.txt.
	"""
	# Map local clerk id -> county name; only city rows need it.
	cityHash = {}
	if dataType == "city":
		with open("%s/cities-ids.txt" % cachePath) as idFile:
			idLines = idFile.read().split("\n")
		for idLine in idLines:
			columns = idLine.split("\t")
			county = columns[0]
			for clerkId in columns[1:]:
				cityHash[clerkId] = county

	# Fields that follow the name, in output order.  Patterns with several
	# groups contribute one column per group.
	fieldPatterns = (
		("address", addressRe),
		("city/state/zip", cszRe),
		("phone + fax", phoneFaxRe),
		("email", emailRe),
	)

	data = []
	# Sorted so the row order does not depend on os.listdir().
	for f in sorted(os.listdir(aCachePath)):
		if f[-5:] != ".html":
			continue

		filePath = "%s/%s" % (aCachePath, f)
		with open(filePath) as htmlFile:
			html = htmlFile.read()

		line = []
		if dataType == "city":
			# start with the county name; the file name (minus ".html") is the clerk id
			clerkId = f[:-5]
			if clerkId not in cityHash:
				_abortParse("Can't find county for %s (%s)" % (dataType, f))
			line.append(cityHash[clerkId])

		countyCityList = countyRe.findall(html)
		if len(countyCityList) != 1:
			_abortParse("Can't find %s (%s)" % (dataType, f))
		ccName = countyCityList[0]
		if ccName[-5:] == " City":
			ccName = ccName[:-5]
		line.append(ccName)

		for label, pattern in fieldPatterns:
			matches = pattern.findall(html)
			if len(matches) != 1:
				_abortParse("%s: Can't find %s (%s)" % (ccName, label, f))
			if isinstance(matches[0], tuple):
				line.extend(matches[0])
			else:
				line.append(matches[0])

		data.append("\t".join(line))

	with open(outputPath, "w") as outputFile:
		outputFile.write("\n".join(data))

def cacheCityClerksIdList():
	"""Build "<cachePath>/cities-ids.txt" from the cached county pages.

	Each output line is tab-separated: the county name followed by the id of
	every LocalClerk.aspx link found on that county's page.

	Prints a message and exits the script with status 1 when a page does not
	contain exactly one jurisdiction name or contains no local clerk links.
	"""
	# NOTE(review): the original sliced off the last 7 characters
	# unconditionally; " County" is presumed to be the suffix it targeted.
	countySuffix = " County"
	data = []

	# Sorted so the line order does not depend on os.listdir().
	for f in sorted(os.listdir(countyCachePath)):
		if f[-5:] != ".html":
			continue

		filePath = "%s/%s" % (countyCachePath, f)
		with open(filePath) as htmlFile:
			html = htmlFile.read()

		counties = countyRe.findall(html)
		if len(counties) != 1:
			print("Can't find county (%s)" % (f))
			sys.exit(1)

		countyName = counties[0]
		# Only strip the suffix when it is present, so other names stay intact.
		if countyName.endswith(countySuffix):
			countyName = countyName[:-len(countySuffix)]

		localClerks = localClerkRe.findall(html)
		if not localClerks:
			print("Can't find clerks (%s)" % (f))
			sys.exit(1)

		data.append("\t".join([countyName] + localClerks))

	with open("%s/cities-ids.txt" % cachePath, "w") as outputFile:
		outputFile.write("\n".join(data))


def cacheCityClerks():
	"""Download the page of every local clerk listed in cities-ids.txt.

	Reads "<cachePath>/cities-ids.txt", skips the first column of each line,
	and treats the remaining columns as clerk ids.  Each clerk's page is
	saved as "<cityCachePath>/<id>.html"; the URL is printed before it is
	fetched.
	"""
	# urlretrieve lives in urllib.request on Python 3 and in urllib on Python 2.
	try:
		from urllib.request import urlretrieve
	except ImportError:
		from urllib import urlretrieve

	with open("%s/cities-ids.txt" % cachePath) as idFile:
		lines = idFile.read().split("\n")

	for line in lines:
		for clerkId in line.split("\t")[1:]:
			# Blank columns (e.g. from a trailing tab) carry no id.
			if not clerkId:
				continue
			url = localBaseUrl % clerkId
			filePath = "%s/%s.html" % (cityCachePath, clerkId)

			print(url)
			urlretrieve(url, filePath)


# cacheCountyClerks()
# parseCountyClerkData()
# cacheCityClerksIdList()
# cacheCityClerks()
# parseCityClerkData()
	#!/usr/bin/python

	# By Paul Schreiber <paulschreiber at gmail.com>
	# Free to use as you see fit. Credit appreciated.
	#
	# version 1.3
	#
	# Retrieved 2010-10-12:
	# City and Town Clerks: http://gist.github.com/623148
	# County Clerks: http://gist.github.com/623121

	import os
	import urllib
	import sys
	import re

	# URL templates for the clerk lookup pages; "%s" is replaced with the county
	# index (CountyClerk.aspx) or the local clerk id (LocalClerk.aspx).
	countyBaseUrl = "https://webapps.sos.state.mi.us/mivote/CountyClerk.aspx?cty=%s"
	localBaseUrl = "https://webapps.sos.state.mi.us/mivote/LocalClerk.aspx?jd=%s"

	# Cache layout: downloaded pages and generated .txt files live under cachePath.
	cachePath = "mi"
	countyCachePath = "%s/counties" % (cachePath)
	cityCachePath = "%s/cities" % (cachePath)

	# Patterns that pull one field each out of a cached clerk page.
	# Raw strings keep regex escapes away from Python's own string escaping.
	countyRe = re.compile(r'<span id="lblJuridName" class="jurisdictionName">(.+?)</span>')
	addressRe = re.compile(r'<span id="lblAddress" class="pollText">(.*?)</span><br>')
	# Two groups: the text before and after the literal " MI ".
	cszRe = re.compile(r'<span id="lblCityStateZip" class="pollText">(.+?) MI (.+?)</span><br>')
	# Two groups: the text after "Ph:" and the text after "Fax:".  The groups
	# are ".*?" so they can hold a whole number rather than a single character.
	phoneFaxRe = re.compile(r'<span id="lblPhoneFax" class="pollText">Ph: ?(.*?), ?Fax: ?(.*?)</span><br>')

	emailRe = re.compile(r'<span id="lblEmail" class="pollText">E-mail:<BR>(.*?)</span>')

	# Captures the numeric id of each LocalClerk.aspx link.  The dot and the
	# question mark are escaped so they only match themselves.
	localClerkRe = re.compile(r'LocalClerk\.aspx\?jd=([0-9]+)')

	# Make sure both cache directories exist before anything reads or writes them.
	if not os.path.exists(countyCachePath):
		os.makedirs(countyCachePath)

	if not os.path.exists(cityCachePath):
		os.makedirs(cityCachePath)

	def cacheCountyClerks():
		"""Download every county clerk page into the county cache directory.

		Requests county indexes 01 through 83 from countyBaseUrl and saves each
		response as "<countyCachePath>/<index>.html".  Each URL is printed
		before it is fetched.
		"""
		# urlretrieve lives in urllib.request on Python 3 and in urllib on Python 2.
		try:
			from urllib.request import urlretrieve
		except ImportError:
			from urllib import urlretrieve

		for i in range(1, 84):
			countyIndex = "%02d" % i
			url = countyBaseUrl % countyIndex
			filePath = "%s/%s.html" % (countyCachePath, countyIndex)

			print(url)
			urlretrieve(url, filePath)

	def parseCountyClerkData():
		"""Run parseData over the county cache, targeting "<cachePath>/counties.txt"."""
		parseData(countyCachePath, ("%s/counties.txt" % cachePath), "county")

	def parseCityClerkData():
		"""Run parseData over the city cache, targeting "<cachePath>/cities.txt"."""
		parseData(cityCachePath, ("%s/cities.txt" % cachePath), "city")


	def _abortParse(message):
		"""Print message and stop the script with a failing exit status."""
		print(message)
		sys.exit(1)


	def parseData(aCachePath, outputPath, dataType):
		"""Extract clerk contact details from cached HTML pages into a TSV file.

		Every "*.html" file in aCachePath becomes one tab-separated row holding
		the jurisdiction name followed by the groups captured by addressRe,
		cszRe, phoneFaxRe and emailRe.  When dataType is "city" the row is
		prefixed with the county name looked up in "<cachePath>/cities-ids.txt".

		Arguments:
		aCachePath -- directory holding the cached pages
		outputPath -- file to write; rows are joined with newlines
		dataType   -- "city" adds the county prefix; the value is also used in
		              the "Can't find ..." message

		Prints a message and exits the script with status 1 when a page does not
		contain exactly one match for a field, or when a city page has no entry
		in cities-ids.txt.
		"""
		# Map local clerk id -> county name; only city rows need it.
		cityHash = {}
		if dataType == "city":
			with open("%s/cities-ids.txt" % cachePath) as idFile:
				idLines = idFile.read().split("\n")
			for idLine in idLines:
				columns = idLine.split("\t")
				county = columns[0]
				for clerkId in columns[1:]:
					cityHash[clerkId] = county

		# Fields that follow the name, in output order.  Patterns with several
		# groups contribute one column per group.
		fieldPatterns = (
			("address", addressRe),
			("city/state/zip", cszRe),
			("phone + fax", phoneFaxRe),
			("email", emailRe),
		)

		data = []
		# Sorted so the row order does not depend on os.listdir().
		for f in sorted(os.listdir(aCachePath)):
			if f[-5:] != ".html":
				continue

			filePath = "%s/%s" % (aCachePath, f)
			with open(filePath) as htmlFile:
				html = htmlFile.read()

			line = []
			if dataType == "city":
				# start with the county name; the file name (minus ".html") is the clerk id
				clerkId = f[:-5]
				if clerkId not in cityHash:
					_abortParse("Can't find county for %s (%s)" % (dataType, f))
				line.append(cityHash[clerkId])

			countyCityList = countyRe.findall(html)
			if len(countyCityList) != 1:
				_abortParse("Can't find %s (%s)" % (dataType, f))
			ccName = countyCityList[0]
			if ccName[-5:] == " City":
				ccName = ccName[:-5]
			line.append(ccName)

			for label, pattern in fieldPatterns:
				matches = pattern.findall(html)
				if len(matches) != 1:
					_abortParse("%s: Can't find %s (%s)" % (ccName, label, f))
				if isinstance(matches[0], tuple):
					line.extend(matches[0])
				else:
					line.append(matches[0])

			data.append("\t".join(line))

		with open(outputPath, "w") as outputFile:
			outputFile.write("\n".join(data))

	def cacheCityClerksIdList():
		"""Build "<cachePath>/cities-ids.txt" from the cached county pages.

		Each output line is tab-separated: the county name followed by the id of
		every LocalClerk.aspx link found on that county's page.

		Prints a message and exits the script with status 1 when a page does not
		contain exactly one jurisdiction name or contains no local clerk links.
		"""
		# NOTE(review): the original sliced off the last 7 characters
		# unconditionally; " County" is presumed to be the suffix it targeted.
		countySuffix = " County"
		data = []

		# Sorted so the line order does not depend on os.listdir().
		for f in sorted(os.listdir(countyCachePath)):
			if f[-5:] != ".html":
				continue

			filePath = "%s/%s" % (countyCachePath, f)
			with open(filePath) as htmlFile:
				html = htmlFile.read()

			counties = countyRe.findall(html)
			if len(counties) != 1:
				print("Can't find county (%s)" % (f))
				sys.exit(1)

			countyName = counties[0]
			# Only strip the suffix when it is present, so other names stay intact.
			if countyName.endswith(countySuffix):
				countyName = countyName[:-len(countySuffix)]

			localClerks = localClerkRe.findall(html)
			if not localClerks:
				print("Can't find clerks (%s)" % (f))
				sys.exit(1)

			data.append("\t".join([countyName] + localClerks))

		with open("%s/cities-ids.txt" % cachePath, "w") as outputFile:
			outputFile.write("\n".join(data))


	def cacheCityClerks():
		"""Download the page of every local clerk listed in cities-ids.txt.

		Reads "<cachePath>/cities-ids.txt", skips the first column of each line,
		and treats the remaining columns as clerk ids.  Each clerk's page is
		saved as "<cityCachePath>/<id>.html"; the URL is printed before it is
		fetched.
		"""
		# urlretrieve lives in urllib.request on Python 3 and in urllib on Python 2.
		try:
			from urllib.request import urlretrieve
		except ImportError:
			from urllib import urlretrieve

		with open("%s/cities-ids.txt" % cachePath) as idFile:
			lines = idFile.read().split("\n")

		for line in lines:
			for clerkId in line.split("\t")[1:]:
				# Blank columns (e.g. from a trailing tab) carry no id.
				if not clerkId:
					continue
				url = localBaseUrl % clerkId
				filePath = "%s/%s.html" % (cityCachePath, clerkId)

				print(url)
				urlretrieve(url, filePath)


	# cacheCountyClerks()
	# parseCountyClerkData()
	# cacheCityClerksIdList()
	# cacheCityClerks()
	# parseCityClerkData()