Last active
March 23, 2022 16:29
-
-
Save shaunagm/2392b35468cce34291a56201717e4de5 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from parsons import Table | |
# Download NC Voter data file from: https://s3.amazonaws.com/dl.ncsbe.gov/data/ncvoter_Statewide.zip | |
# Extract (very large, I had to use an external haard drive) | |
# load the file into a Parsons table | |
# note #1 - need to specify encoding as it's not default UTF-8 | |
# used "file -i ncvoter_Statewide.txt" on Linux to find file's encoding which was charset=iso-8859-1 | |
# python codecs list says charset=iso-8859-1 is called latin_1 | |
# note #2 - tab-delimited, so specify delimiter="\t" (default is comma, aka CSV) | |
table = Table.from_csv("/media/shauna/DeepThought/parsons/ncvoter_Statewide.txt", encoding="latin_1", delimiter="\t") | |
print(table.first) # check it worked | |
# this is a huge table and takes a long time so for now let's use a subset, skip this line for doing real analysis | |
subset = Table(table.table.head(n=50000)) | |
# answering questions like "how many people in a given city have active voter status and are Black?" | |
# looked at head of file in excel to get column names and value formats for use below | |
# less performant way: | |
# note: variables within brackets are column names, it {voter_status_desc} is checking the column voter_status_desc | |
selected_rows = subset.select_rows("{voter_status_desc} == 'ACTIVE' and {res_city_desc} == 'BURLINGTON' and {race_code} == 'B'") | |
selected_rows.num_rows | |
# more performant way, using petl aggregate function: | |
count_by_county = subset.table.aggregate(['res_city_desc', 'race_code', 'voter_status_desc'], len) | |
count_table = Table(count_by_county) # turn Petl table back into a Parsons table | |
# hack to view all rows created by aggregate function | |
for row in count_table: | |
print(row) | |
# Question: can we compare the frequency of denied voter registration status by race? | |
# There's probably a way to do this by generating new columns based on existing columns but none of us knew :( | |
# So here's how to do it in plain Python | |
race_by_vote_status = Table(subset.table.aggregate(['race_code', 'voter_status_desc'], len)) | |
summary_dict = {} | |
for row in race_by_vote_status: | |
# if key for a given race code doesn't exist, create it, with counts initialized to 0 | |
if row['race_code'] not in summary_dict: | |
summary_dict[row['race_code']] = {"denied_count": 0, "total_count": 0} | |
# if this row is for 'denied' status | |
if row["voter_status_desc"] == "DENIED": | |
if row["value"]: # if there's a non-null value | |
summary_dict[row['race_code']]["denied_count"] = row["value"] # set denied_count to row value | |
# for all rows, add to total count for their race code | |
summary_dict[row["race_code"]]["total_count"] += row["value"] | |
# get actual percentage | |
for row, value in summary_dict.items(): | |
percentage = value['denied_count']/value['total_count'] | |
print(f"{row}: {percentage}") | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment