Skip to content

Instantly share code, notes, and snippets.

@mekhami
Last active April 10, 2018 18:36
Show Gist options
  • Save mekhami/3760135cded4373f7baea1b3a58ff981 to your computer and use it in GitHub Desktop.
Save mekhami/3760135cded4373f7baea1b3a58ff981 to your computer and use it in GitHub Desktop.
import os
from typing import Generator, List
from time import perf_counter
# This establishes a Type alias, so our annotations can be clearer
Filepath = str
class CensusRow():
    """
    A single record (one town) from a census data file.

    CensusRow objects are produced by Census.generate_rows() and collected
    in the Census.rows list.

    Keyword Arguments:
    state -- The name of the state
    town -- The name of the town
    population -- The number of people who live in the town
    """

    def __init__(self, state: str, town: str, population: int):
        self.state = state
        self.town = town
        self.population = population

    def __repr__(self) -> str:
        """
        Returns a constructor-style representation, useful when debugging
        """
        return 'CensusRow(state={!r}, town={!r}, population={!r})'.format(
            self.state, self.town, self.population)
class Census():
    """
    The Census Object allows efficient analysis of a census data file.

    The file should be a whitespace-separated table of values such as a .tsv.
    Its first line is treated as a header row; every following non-blank line
    is parsed into a CensusRow.

    Keyword Arguments:
    data -- A String representation of a full file path pointing to a census data file

    Attributes:
    data -- The file path that was passed in
    headers -- The whitespace-split column names read from the first line
    rows -- A List of CensusRow objects, one per data line of the file
    """

    def __init__(self, data: Filepath):
        self.data = data
        with open(self.data, 'r') as f:
            self.headers = f.readline().split()
        # generate_rows() opens the file itself, so the header handle above is
        # closed first instead of keeping two handles open on the same file.
        self.rows = list(self.generate_rows())

    def _parse_row(self, rowstring: str) -> CensusRow:
        """
        Converts one line of the data file into a CensusRow.

        The last field is the population and the field before it is the town.
        Every leading field is joined with single spaces to form the state
        name, so multi-word states such as "New York" are kept whole.
        NOTE(review): a multi-word town cannot be told apart from a multi-word
        state in a whitespace-separated file -- confirm against the real data.

        Keyword Arguments:
        rowstring -- A single line of text from the census data file

        Raises:
        ValueError -- if the line has fewer than three fields, or if the
                      population field is not an integer
        """
        fields = rowstring.split()
        if len(fields) < 3:
            raise ValueError(
                'Expected at least 3 whitespace-separated fields, got {}: {!r}'.format(
                    len(fields), rowstring))
        *state_words, town, population = fields
        return CensusRow(' '.join(state_words), town, int(population))

    def generate_rows(self) -> Generator[CensusRow, None, None]:
        """
        Yields each row of the Census Data File as a CensusRow object
        """
        with open(self.data, 'r') as f:
            # The first line is headers, so disregard that one
            f.readline()
            for line in f:
                # Blank lines (e.g. a trailing newline at the end of the file)
                # carry no data, so skip them rather than fail to parse them
                if line.strip():
                    yield self._parse_row(line)

    def benford_frequencies(self) -> List[int]:
        """
        Returns a List indexed 0-9, indicating the number of occurrences of each digit
        as the first digit of a listed population number.

        Technically, the List could be returned indexed 0-8, but since its function is to
        lookup the frequency of a digit, I thought it best to pad the start of the list and
        then use it as 1-indexed instead of 0-indexed.
        """
        digit_frequency = [0] * 10
        for row in self.rows:
            first_digit = int(str(row.population)[0])
            digit_frequency[first_digit] += 1
        return digit_frequency
if __name__ == '__main__':
    # Script entry point: print a table of first-digit counts and percentages
    # for the populations in the 'census_2009' file next to this script,
    # followed by the elapsed run time.
    start = perf_counter()

    # First, locate the census data file
    cwd = os.path.dirname(__file__)
    data_file = os.path.join(cwd, 'census_2009')

    # Initialize a new Census object from that data file
    census = Census(data_file)

    # Calculate the frequency of occurrence for the first digit of population numbers
    digit_frequency = census.benford_frequencies()

    # Since we also need a percentage, we will need the total number of occurrences
    total = sum(digit_frequency)

    # Print out the table headers
    print('\n{:<8s}{:<8s}{:<8s}'.format('Digit', 'Count', '%'))

    # Print out the frequencies
    for digit in range(1, 10):
        count = digit_frequency[digit]
        # A data file with no rows gives total == 0; report 0.0% instead of
        # raising ZeroDivisionError
        percent = (count / total) * 100 if total else 0.0
        print('{:<8}{:<8}{:0.1f}'.format(digit, count, percent))

    end = perf_counter()
    time_in_ms = (end - start) * 1000
    print('\nTime Elapsed: {:0.2f} ms'.format(time_in_ms))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment