Navigation Menu

Skip to content

Instantly share code, notes, and snippets.

@shaunagm
Last active March 23, 2022 16:29
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save shaunagm/2392b35468cce34291a56201717e4de5 to your computer and use it in GitHub Desktop.
Save shaunagm/2392b35468cce34291a56201717e4de5 to your computer and use it in GitHub Desktop.
from parsons import Table
# Download NC Voter data file from: https://s3.amazonaws.com/dl.ncsbe.gov/data/ncvoter_Statewide.zip
# Extract (very large, I had to use an external haard drive)
# load the file into a Parsons table
# note #1 - need to specify encoding as it's not default UTF-8
# used "file -i ncvoter_Statewide.txt" on Linux to find file's encoding which was charset=iso-8859-1
# python codecs list says charset=iso-8859-1 is called latin_1
# note #2 - tab-delimited, so specify delimiter="\t" (default is comma, aka CSV)
table = Table.from_csv("/media/shauna/DeepThought/parsons/ncvoter_Statewide.txt", encoding="latin_1", delimiter="\t")
print(table.first) # check it worked
# this is a huge table and takes a long time so for now let's use a subset, skip this line for doing real analysis
subset = Table(table.table.head(n=50000))
# answering questions like "how many people in a given city have active voter status and are Black?"
# looked at head of file in excel to get column names and value formats for use below
# less performant way:
# note: variables within brackets are column names, it {voter_status_desc} is checking the column voter_status_desc
selected_rows = subset.select_rows("{voter_status_desc} == 'ACTIVE' and {res_city_desc} == 'BURLINGTON' and {race_code} == 'B'")
selected_rows.num_rows
# more performant way, using petl aggregate function:
count_by_county = subset.table.aggregate(['res_city_desc', 'race_code', 'voter_status_desc'], len)
count_table = Table(count_by_county) # turn Petl table back into a Parsons table
# hack to view all rows created by aggregate function
for row in count_table:
print(row)
# Question: can we compare the frequency of denied voter registration status by race?
# There's probably a way to do this by generating new columns based on existing columns but none of us knew :(
# So here's how to do it in plain Python
race_by_vote_status = Table(subset.table.aggregate(['race_code', 'voter_status_desc'], len))
summary_dict = {}
for row in race_by_vote_status:
# if key for a given race code doesn't exist, create it, with counts initialized to 0
if row['race_code'] not in summary_dict:
summary_dict[row['race_code']] = {"denied_count": 0, "total_count": 0}
# if this row is for 'denied' status
if row["voter_status_desc"] == "DENIED":
if row["value"]: # if there's a non-null value
summary_dict[row['race_code']]["denied_count"] = row["value"] # set denied_count to row value
# for all rows, add to total count for their race code
summary_dict[row["race_code"]]["total_count"] += row["value"]
# get actual percentage
for row, value in summary_dict.items():
percentage = value['denied_count']/value['total_count']
print(f"{row}: {percentage}")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment