Create a gist now

Instantly share code, notes, and snippets.

Embed
Script to convert tables of legislative votes in several MS Word documents into pipe-delimited documents.
import os
import re
from glob import glob

import win32com.client as win32
# Gather the Word documents for each chamber up front; they are opened
# one at a time through win32 further down.
house_files = glob("House/*.Doc")
senate_files = glob("Senate/*.Doc")

# Single output file that receives every converted row.
oFile = open("legVoting.txt", "w+")

# Start Word through COM, keeping its window hidden while we work.
word = win32.gencache.EnsureDispatch('Word.Application')
word.Visible = False

# The first line of the pipe-delimited output names the columns.
oText = (
    "LEGISLATOR|HOUSE|LEGISLATION|LONG_TITLE|VOTE_DATE|RESP|PASS_REQ|YES|NO|ABSENT|NOT_VOTING\n"
)
oFile.write(oText)
def writeVotes(f, house="HOUSE"):
    """Append every row of a document's first table to the output file.

    Opens ``f`` in the module-level ``word`` application, reads columns 1-9
    of each row of the document's first table, and writes one pipe-delimited
    line per row to the module-level ``oFile``. Each line is prefixed with
    the upper-cased legislator name parsed from the filename and with
    ``house``.

    Args:
        f: Path to a Word document whose filename ends in ``_<NAME>.Doc``
            or ``.<NAME>.Doc``.
        house: Chamber label written as the second field of every line.

    Raises:
        ValueError: If no legislator name can be parsed from ``f``.
    """
    # Pull the legislator's last name from the filename using RE.
    # The dot before "Doc" is escaped so it only matches a literal dot, and
    # the match ignores case because glob matches case-insensitively on
    # Windows, so ".doc" / ".DOC" files can reach this function too.
    match = re.search(r'[_\.]([A-Za-z\-\.]+)\.Doc$', f, re.IGNORECASE)
    if match is None:
        raise ValueError("Cannot parse a legislator name from %r" % (f,))
    name = match.group(1).upper()

    # Word resolves relative paths against its own working directory rather
    # than this script's, so always hand it an absolute path. Use the
    # document object returned by Open instead of relying on ActiveDocument.
    doc = word.Documents.Open(os.path.abspath(f))
    try:
        table = doc.Tables(1)
        # Loop through the table and pull out the text.
        for row in range(1, table.Rows.Count + 1):
            # Columns 1-9 in order: legislation, long title, vote date,
            # resp, pass requirement, yes, no, absent, not voting.
            cells = [table.Cell(Row=row, Column=col).Range.Text
                     for col in range(1, 10)]
            # Join the output, separating each element with a pipe.
            line = "|".join([name, house] + cells)
            # Word ends each cell's text with an end-of-cell marker
            # (carriage return + BEL) and uses CR / vertical tab for breaks
            # inside a cell; strip all of them so each record stays on a
            # single line with no stray control characters.
            for junk in ('\a', '\r', '\x0b', '\n'):
                line = line.replace(junk, '')
            # Make sure it's utf-8 before it goes to the output file.
            oFile.write(line.encode('utf-8') + "\n")
    finally:
        # Always release the document, even if a cell read failed. Nothing
        # was modified, so close without saving to leave the source file
        # untouched.
        doc.Close(False)
def collectVotes():
    """Convert every House and Senate document, then release resources.

    Calls ``writeVotes`` for each path in the module-level ``house_files``
    (labelled "HOUSE") and then ``senate_files`` (labelled "SENATE").
    Whether or not a document fails, the output file is closed and the
    hidden Word instance is shut down afterwards.
    """
    try:
        # House documents first, then Senate, each tagged with its chamber.
        for house, files in (("HOUSE", house_files), ("SENATE", senate_files)):
            for f in files:
                writeVotes(f, house)
    finally:
        oFile.close()
        # The Word window is hidden, so nobody can close it by hand; quit it
        # here (discarding any changes) so no background Word process is
        # left running.
        word.Quit(False)
# Run the conversion only when executed as a script, not when imported.
if __name__ == "__main__":
    collectVotes()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment