shaunagm/voter_file.py

## voter_file.py
from parsons import Table

# Download NC Voter data file from: https://s3.amazonaws.com/dl.ncsbe.gov/data/ncvoter_Statewide.zip
# Extract (very large, I had to use an external haard drive)

# load the file into a Parsons table

# note #1 - need to specify encoding as it's not default UTF-8
# used "file -i ncvoter_Statewide.txt" on Linux to find file's encoding which was charset=iso-8859-1
# python codecs list says charset=iso-8859-1 is called latin_1

# note #2 - tab-delimited, so specify delimiter="\t" (default is comma, aka CSV)

table = Table.from_csv("/media/shauna/DeepThought/parsons/ncvoter_Statewide.txt", encoding="latin_1", delimiter="\t")
print(table.first)  # check it worked

# this is a huge table and takes a long time so for now let's use a subset, skip this line for doing real analysis
subset = Table(table.table.head(n=50000))

# answering questions like "how many people in a given city have active voter status and are Black?"
# looked at head of file in excel to get column names and value formats for use below

# less performant way:
# note: variables within brackets are column names, it {voter_status_desc} is checking the column voter_status_desc
selected_rows = subset.select_rows("{voter_status_desc} == 'ACTIVE' and {res_city_desc} == 'BURLINGTON' and {race_code} == 'B'")
selected_rows.num_rows

# more performant way, using petl aggregate function:
count_by_county = subset.table.aggregate(['res_city_desc', 'race_code', 'voter_status_desc'], len)
count_table = Table(count_by_county)  # turn Petl table back into a Parsons table

# hack to view all rows created by aggregate function
for row in count_table:
  print(row)

# Question: can we compare the frequency of denied voter registration status by race?
# There's probably a way to do this by generating new columns based on existing columns but none of us knew :(
# So here's how to do it in plain Python

race_by_vote_status = Table(subset.table.aggregate(['race_code', 'voter_status_desc'], len))

summary_dict = {}
for row in race_by_vote_status:
  # if key for a given race code doesn't exist, create it, with counts initialized to 0
  if row['race_code'] not in summary_dict:
    summary_dict[row['race_code']] = {"denied_count": 0, "total_count": 0}
  # if this row is for 'denied' status
  if row["voter_status_desc"] == "DENIED":
    if row["value"]:  # if there's a non-null value
      summary_dict[row['race_code']]["denied_count"] = row["value"]  # set denied_count to row value
  # for all rows, add to total count for their race code
  summary_dict[row["race_code"]]["total_count"] += row["value"]

# get actual percentage
for row, value in summary_dict.items():
  percentage = value['denied_count']/value['total_count']
  print(f"{row}: {percentage}")
	from parsons import Table

	# Download NC Voter data file from: https://s3.amazonaws.com/dl.ncsbe.gov/data/ncvoter_Statewide.zip
	# Extract (very large, I had to use an external haard drive)

	# load the file into a Parsons table

	# note #1 - need to specify encoding as it's not default UTF-8
	# used "file -i ncvoter_Statewide.txt" on Linux to find file's encoding which was charset=iso-8859-1
	# python codecs list says charset=iso-8859-1 is called latin_1

	# note #2 - tab-delimited, so specify delimiter="\t" (default is comma, aka CSV)

	table = Table.from_csv("/media/shauna/DeepThought/parsons/ncvoter_Statewide.txt", encoding="latin_1", delimiter="\t")
	print(table.first) # check it worked

	# this is a huge table and takes a long time so for now let's use a subset, skip this line for doing real analysis
	subset = Table(table.table.head(n=50000))

	# answering questions like "how many people in a given city have active voter status and are Black?"
	# looked at head of file in excel to get column names and value formats for use below

	# less performant way:
	# note: variables within brackets are column names, it {voter_status_desc} is checking the column voter_status_desc
	selected_rows = subset.select_rows("{voter_status_desc} == 'ACTIVE' and {res_city_desc} == 'BURLINGTON' and {race_code} == 'B'")
	selected_rows.num_rows

	# more performant way, using petl aggregate function:
	count_by_county = subset.table.aggregate(['res_city_desc', 'race_code', 'voter_status_desc'], len)
	count_table = Table(count_by_county) # turn Petl table back into a Parsons table

	# hack to view all rows created by aggregate function
	for row in count_table:
	print(row)

	# Question: can we compare the frequency of denied voter registration status by race?
	# There's probably a way to do this by generating new columns based on existing columns but none of us knew :(
	# So here's how to do it in plain Python

	race_by_vote_status = Table(subset.table.aggregate(['race_code', 'voter_status_desc'], len))

	summary_dict = {}
	for row in race_by_vote_status:
	# if key for a given race code doesn't exist, create it, with counts initialized to 0
	if row['race_code'] not in summary_dict:
	summary_dict[row['race_code']] = {"denied_count": 0, "total_count": 0}
	# if this row is for 'denied' status
	if row["voter_status_desc"] == "DENIED":
	if row["value"]: # if there's a non-null value
	summary_dict[row['race_code']]["denied_count"] = row["value"] # set denied_count to row value
	# for all rows, add to total count for their race code
	summary_dict[row["race_code"]]["total_count"] += row["value"]

	# get actual percentage
	for row, value in summary_dict.items():
	percentage = value['denied_count']/value['total_count']
	print(f"{row}: {percentage}")