# mekhami/census.py

## census.py
import os
from typing import Generator, List
from time import perf_counter

# This establishes a Type alias, so our annotations can be clearer
Filepath = str

class CensusRow:
    """
    One record of census data: a town, the state it belongs to, and its headcount.
    Census.generate_rows() yields these objects.

    Keyword Arguments:
    state -- The name of the state
    town -- The name of the town
    population -- The number of people who live in the town
    """
    def __init__(self, state: str, town: str, population: int):
        # Store the three fields exactly as given; no validation or conversion happens here
        self.state, self.town, self.population = state, town, population


class Census():
    """
    The Census object loads a census data file and allows analysis of its rows.
    The file should be a whitespace-separated table of values such as a .tsv:
    one header line, then one data row per line.

    Keyword Arguments:
    data -- A String representation of a full file path pointing to a census data file

    Attributes:
    data -- The file path that was passed in
    headers -- The whitespace-split fields of the first line of the file
    rows -- A List of CensusRow objects, one per non-blank data line
    """
    def __init__(self, data: Filepath):
        self.data = data

        # Only the header line is read from this handle; generate_rows() opens
        # the file itself and skips the header.
        with open(self.data, 'r') as f:
            self.headers = f.readline().split()

        self.rows = list(self.generate_rows())

    def _parse_row(self, rowstring: str) -> CensusRow:
        """
        Converts one line of the data file into a CensusRow.

        The last two whitespace-separated fields are the town and the population.
        Every field before them is part of the state name, so a multi-word state
        such as "New York" is kept whole.

        Raises ValueError if the line has fewer than three fields, or if the
        population field is not an integer.
        """
        fields = rowstring.split()
        if len(fields) < 3:
            raise ValueError(
                'Expected at least 3 fields (state, town, population), got {!r}'.format(rowstring)
            )

        *state_words, town, population = fields
        return CensusRow(' '.join(state_words), town, int(population))

    def generate_rows(self) -> Generator[CensusRow, None, None]:
        """
        Yields each row of the Census Data File as a CensusRow object
        """
        with open(self.data, 'r') as f:
            # The first line is headers, so disregard that one
            f.readline()

            for line in f:
                # Whitespace-only lines (e.g. a blank line at the end of the file) hold no data
                if line.strip():
                    yield self._parse_row(line)

    def benford_frequencies(self) -> List[int]:
        """
        Returns a List indexed 0-9, indicating the number of occurrences of each digit
        as the first digit of a listed population number.

        Index 0 is kept so that the list can be looked up directly by digit
        (digit_frequency[1] is the count for leading digit 1, and so on); it only
        counts rows whose population is 0.
        """
        digit_frequency = [0] * 10

        for row in self.rows:
            first_digit = int(str(row.population)[0])
            digit_frequency[first_digit] += 1

        return digit_frequency


if __name__ == '__main__':
    # Script entry point: print how often each digit 1-9 leads a population
    # figure in the census file, then how long the whole run took.
    start = perf_counter()
    # First, locate the census data file, which is expected next to this script
    cwd = os.path.dirname(__file__)
    data_file = os.path.join(cwd, 'census_2009')

    # Initialize a new Census object from that data file
    census = Census(data_file)

    # Calculate the frequency of occurrence for the first digit of population numbers
    digit_frequency = census.benford_frequencies()

    # Since we also need a percentage, we will need the total number of occurrences
    total = sum(digit_frequency)

    # Print out the table headers
    print('\n{:<8s}{:<8s}{:<8s}'.format('Digit', 'Count', '%'))

    # Print out the frequencies
    for digit in range(1, 10):
        count = digit_frequency[digit]
        # A data file with no rows gives total == 0; report 0.0 instead of dividing by zero
        percent = (count / total) * 100 if total else 0.0
        print('{:<8}{:<8}{:0.1f}'.format(digit, count, percent))

    end = perf_counter()

    # perf_counter() returns seconds; convert the difference to milliseconds
    time_in_ms = (end - start) * 1000

    print('\nTime Elapsed: {:0.2f} ms'.format(time_in_ms))
	import os
	from typing import Generator, List
	from time import perf_counter

	# This establishes a Type alias, so our annotations can be clearer
	Filepath = str

	class CensusRow():
		"""
		One record of census data: a town, the state it belongs to, and its headcount.

		Keyword Arguments:
		state -- The name of the state
		town -- The name of the town
		population -- The number of people who live in the town
		"""
		def __init__(self, state: str, town: str, population: int):
			# Store the three fields exactly as given; no validation or conversion happens here
			self.state = state
			self.town = town
			self.population = population


	class Census():
		"""
		The Census object loads a census data file and allows analysis of its rows.
		The file should be a whitespace-separated table of values such as a .tsv:
		one header line, then one data row per line.

		Keyword Arguments:
		data -- A String representation of a full file path pointing to a census data file

		Attributes:
		data -- The file path that was passed in
		headers -- The whitespace-split fields of the first line of the file
		rows -- A List of CensusRow objects, one per non-blank data line
		"""
		def __init__(self, data: Filepath):
			self.data = data

			# Only the header line is read from this handle; generate_rows() opens
			# the file itself and skips the header.
			with open(self.data, 'r') as f:
				self.headers = f.readline().split()

			self.rows = list(self.generate_rows())

		def _parse_row(self, rowstring: str) -> CensusRow:
			"""
			Converts one line of the data file into a CensusRow.

			The last two whitespace-separated fields are the town and the population.
			Every field before them is part of the state name, so a multi-word state
			such as "New York" is kept whole.

			Raises ValueError if the line has fewer than three fields, or if the
			population field is not an integer.
			"""
			fields = rowstring.split()
			if len(fields) < 3:
				raise ValueError(
					'Expected at least 3 fields (state, town, population), got {!r}'.format(rowstring)
				)

			*state_words, town, population = fields
			return CensusRow(' '.join(state_words), town, int(population))

		def generate_rows(self) -> Generator[CensusRow, None, None]:
			"""
			Yields each row of the Census Data File as a CensusRow object
			"""
			with open(self.data, 'r') as f:
				# The first line is headers, so disregard that one
				f.readline()

				for line in f:
					# Whitespace-only lines (e.g. a blank line at the end of the file) hold no data
					if line.strip():
						yield self._parse_row(line)

		def benford_frequencies(self) -> List[int]:
			"""
			Returns a List indexed 0-9, indicating the number of occurrences of each digit
			as the first digit of a listed population number.

			Index 0 is kept so that the list can be looked up directly by digit
			(digit_frequency[1] is the count for leading digit 1, and so on); it only
			counts rows whose population is 0.
			"""
			digit_frequency = [0] * 10

			for row in self.rows:
				first_digit = int(str(row.population)[0])
				digit_frequency[first_digit] += 1

			return digit_frequency


	if __name__ == '__main__':
		# Script entry point: print how often each digit 1-9 leads a population
		# figure in the census file, then how long the whole run took.
		start = perf_counter()
		# First, locate the census data file, which is expected next to this script
		cwd = os.path.dirname(__file__)
		data_file = os.path.join(cwd, 'census_2009')

		# Initialize a new Census object from that data file
		census = Census(data_file)

		# Calculate the frequency of occurrence for the first digit of population numbers
		digit_frequency = census.benford_frequencies()

		# Since we also need a percentage, we will need the total number of occurrences
		total = sum(digit_frequency)

		# Print out the table headers
		print('\n{:<8s}{:<8s}{:<8s}'.format('Digit', 'Count', '%'))

		# Print out the frequencies
		for digit in range(1, 10):
			count = digit_frequency[digit]
			# A data file with no rows gives total == 0; report 0.0 instead of dividing by zero
			percent = (count / total) * 100 if total else 0.0
			print('{:<8}{:<8}{:0.1f}'.format(digit, count, percent))

		end = perf_counter()

		# perf_counter() returns seconds; convert the difference to milliseconds
		time_in_ms = (end - start) * 1000

		print('\nTime Elapsed: {:0.2f} ms'.format(time_in_ms))