Last active
April 10, 2018 18:36
-
-
Save mekhami/3760135cded4373f7baea1b3a58ff981 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os
from time import perf_counter
from typing import Generator, List
# This establishes a Type alias, so our annotations can be clearer.
# A Filepath is simply a str holding a path to a file on disk.
Filepath = str
class CensusRow:
    """
    A single parsed line of a census data file.

    A CensusRow object is yielded by Census.generate_rows()

    Keyword Arguments:
    state -- The name of the state
    town -- The name of the town
    population -- The number of people who live in the town
    """

    def __init__(self, state: str, town: str, population: int):
        self.state = state
        self.town = town
        self.population = population

    def __repr__(self) -> str:
        # A readable repr makes rows easy to inspect while debugging
        return 'CensusRow(state={!r}, town={!r}, population={!r})'.format(
            self.state, self.town, self.population
        )
class Census:
    """
    The Census Object allows efficient analysis of a census data file.

    The file should be a whitespace-separated table of values such as a .tsv.
    Its first line is treated as the column headers; every following
    non-blank line is parsed as one state / town / population record.

    Keyword Arguments:
    data -- A String representation of a full file path pointing to a census data file

    Attributes:
    data -- The file path that was passed in
    headers -- The whitespace-split fields of the first line of the file
    rows -- A list of CensusRow objects, one per data line of the file
    """

    def __init__(self, data: Filepath):
        self.data = data
        with open(self.data, 'r') as f:
            self.headers = f.readline().split()
        # generate_rows() opens the file itself and skips the header line,
        # so it is called once the header handle has been closed.
        self.rows = list(self.generate_rows())

    def _parse_row(self, rowstring: str) -> CensusRow:
        """
        Converts one line of the data file into a CensusRow.

        The last whitespace-separated field is the population, the field
        before it is the town, and everything before that is the state name,
        so multi-word states such as "New York" are kept whole.

        Raises ValueError if the line has fewer than three fields or if the
        population field is not an integer.
        """
        fields = rowstring.split()
        if len(fields) < 3:
            raise ValueError(
                'Expected at least 3 fields (state, town, population), got {}: {!r}'.format(
                    len(fields), rowstring
                )
            )
        *state_words, town, population = fields
        return CensusRow(' '.join(state_words), town, int(population))

    def generate_rows(self) -> Generator[CensusRow, None, None]:
        """
        Yields each row of the Census Data File as a CensusRow object
        """
        with open(self.data, 'r') as f:
            # The first line is headers, so disregard that one
            f.readline()
            for line in f:
                # Blank lines (e.g. a trailing newline at the end of the
                # file) carry no record, so skip them instead of failing.
                if line.strip():
                    yield self._parse_row(line)

    def benford_frequencies(self) -> List[int]:
        """
        Returns a List indexed 0-9, indicating the number of occurrences of each digit
        as the first digit of a listed population number.

        Technically, the List could be returned indexed 0-8, but since its function is to
        lookup the frequency of a digit, it is padded at the start so that it can be
        used as 1-indexed instead of 0-indexed. Index 0 counts populations of exactly 0.
        """
        digit_frequency = [0] * 10
        for row in self.rows:
            # abs() keeps a stray leading '-' from being read as the first digit
            first_digit = int(str(abs(row.population))[0])
            digit_frequency[first_digit] += 1
        return digit_frequency
if __name__ == '__main__':
    # Script entry point: print a table of first-digit counts and percentages
    # for the census data file, followed by the elapsed run time.
    start = perf_counter()

    # First, locate the census data file, expected alongside this script
    cwd = os.path.dirname(__file__)
    data_file = os.path.join(cwd, 'census_2009')

    # Initialize a new Census object from that data file
    census = Census(data_file)

    # Calculate the frequency of occurrence for the first digit of population numbers
    digit_frequency = census.benford_frequencies()

    # Since we also need a percentage, we will need the total number of occurrences
    total = sum(digit_frequency)

    # Print out the table headers
    print('\n{:<8s}{:<8s}{:<8s}'.format('Digit', 'Count', '%'))

    # Print out the frequencies
    for digit in range(1, 10):
        count = digit_frequency[digit]
        # A data file with no rows gives total == 0; report 0.0% rather than
        # dividing by zero.
        percent = (count / total) * 100 if total else 0.0
        print('{:<8}{:<8}{:0.1f}'.format(digit, count, percent))

    end = perf_counter()
    time_in_ms = (end - start) * 1000
    print('\nTime Elapsed: {:0.2f} ms'.format(time_in_ms))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment