lieuzhenghong/not_any_slower.py

## not_any_slower.py
def convert_point_distances_to_tract_distances(pttm, DM_PATH, SAVE_FILE_TO):
    '''Returns a JSON file giving driving durations from all points in the Census Tract
    to all other points in every other Census Tract.
    Takes in three things:

        1. A mapping from points to tracts (Dict[PointId : TractId]
        2. The distance matrix file path
        3. The path to save the output tract distance file to

    Reads line by line
    Returns a JSON file with the following schema:
    {
        tractid: {tractid: distance, ... },
        tractid: {tractid: distance, ...},
        ...
    }
    Each entry in the dictionary is the sum of driving durations from each point in
    the tract to each other point in the tract.

    Note that the duration from tract_id to tract_id can (in fact is almost always nonzero)
    because it measures the sum of durations from all points in the tract
    '''
    tract_distances = {}

    with open(DM_PATH, 'rt') as fh:
        line = fh.readline()
        # This is currently looking at the durations from point i to all other points
        i = 0
        while line:
            # Get the distances from point i; two spaces is not a typo
            dist = [float(x) for x in line.split('  ')]
            # Very rarely, points may not lie within tracts. This is strange, but we'll ignore it
            print(f'Now processing line {i}..')
            if i not in pttm:
                print(f'Point ID not in point_to_tract_mapping: {i}')
            # Otherwise, update the tract-pairwise-distance matrix with the pointwise distances
            else:
                for j in range(len(dist)):
                    if j not in pttm:
                        print(f'Point ID not in point_to_tract_mapping: {j}')
                    elif pttm[i] not in tract_distances:
                        tract_distances[pttm[i]] = {pttm[j]: dist[j]}
                    elif pttm[j] not in tract_distances[pttm[i]]:
                        tract_distances[pttm[i]][pttm[j]] = dist[j]
                    else:
                        tract_distances[pttm[i]][pttm[j]] += dist[j]
                    # print(tract_distances)
            i += 1
            line = fh.readline()

## slower_and_slower.py
def calc_knn_sum_durations_by_tract(pttm, N, DM_PATH, SAVE_FILE_TO):
    '''Returns a dictionary and prints a file of
    {
        tractid: Float
    }
    where the value is the sum of k-nearest-neighbour driving durations for
    each point in the tract'''
    knn_sum_dd_by_tract = {}

    # I know I'm violating DRY here, but refactoring would take too much time
    with open(DM_PATH, 'rt') as fh:
        line = fh.readline()
        # This is currently looking at the durations from point i to all other points
        i = 0
        while line:
            # Get the distances from point i; two spaces is not a typo
            dist = [float(x) for x in line.split('  ')]

            # Very rarely, points may not lie within tracts. This is strange, but we'll ignore it
            print(f'Now processing line {i}..')
            if i not in pttm:
                print(f'Point ID not in point_to_tract_mapping: {i}')
            else:
                # We want to get the nearest N neighbours, but at
                # the same time we want to make sure the tracts are in
                # the point-to-tract mapping.

                # So we sort, but maintain the index
                # enumerate gives a enumeration giving indexes and values
                # key=lambda i:i[1] sorts based on the original values

                sorted_dist = [i for i in sorted(
                    enumerate(dist), key=lambda a:a[1])]

                num_neighbours = 0
                j = 0

                while num_neighbours < N:
                    if j not in pttm:
                        print(f'Point ID not in point_to_tract_mapping: {j}')
                    else:
                        if pttm[i] not in knn_sum_dd_by_tract:
                            knn_sum_dd_by_tract[pttm[i]] = sorted_dist[j]
                        else:
                            knn_sum_dd_by_tract[pttm[i]] += sorted_dist[j]
                        num_neighbours += 1
                    j += 1

                print(f"Sum of KNN distances for point {i} in tract {pttm[i]}: {knn_sum_dd_by_tract[pttm[i]}")
            i += 1
            line = fh.readline()

    with open(f'{SAVE_FILE_TO}', 'w') as f:
        f.write(json.dumps(knn_sum_dd_by_tract))

    return knn_sum_dd_by_tract
	def convert_point_distances_to_tract_distances(pttm, DM_PATH, SAVE_FILE_TO):
	'''Returns a JSON file giving driving durations from all points in the Census Tract
	to all other points in every other Census Tract.
	Takes in three things:

	1. A mapping from points to tracts (Dict[PointId : TractId]
	2. The distance matrix file path
	3. The path to save the output tract distance file to

	Reads line by line
	Returns a JSON file with the following schema:
	{
	tractid: {tractid: distance, ... },
	tractid: {tractid: distance, ...},
	...
	}
	Each entry in the dictionary is the sum of driving durations from each point in
	the tract to each other point in the tract.

	Note that the duration from tract_id to tract_id can (in fact is almost always nonzero)
	because it measures the sum of durations from all points in the tract
	'''
	tract_distances = {}

	with open(DM_PATH, 'rt') as fh:
	line = fh.readline()
	# This is currently looking at the durations from point i to all other points
	i = 0
	while line:
	# Get the distances from point i; two spaces is not a typo
	dist = [float(x) for x in line.split(' ')]
	# Very rarely, points may not lie within tracts. This is strange, but we'll ignore it
	print(f'Now processing line {i}..')
	if i not in pttm:
	print(f'Point ID not in point_to_tract_mapping: {i}')
	# Otherwise, update the tract-pairwise-distance matrix with the pointwise distances
	else:
	for j in range(len(dist)):
	if j not in pttm:
	print(f'Point ID not in point_to_tract_mapping: {j}')
	elif pttm[i] not in tract_distances:
	tract_distances[pttm[i]] = {pttm[j]: dist[j]}
	elif pttm[j] not in tract_distances[pttm[i]]:
	tract_distances[pttm[i]][pttm[j]] = dist[j]
	else:
	tract_distances[pttm[i]][pttm[j]] += dist[j]
	# print(tract_distances)
	i += 1
	line = fh.readline()
	def calc_knn_sum_durations_by_tract(pttm, N, DM_PATH, SAVE_FILE_TO):
	'''Returns a dictionary and prints a file of
	{
	tractid: Float
	}
	where the value is the sum of k-nearest-neighbour driving durations for
	each point in the tract'''
	knn_sum_dd_by_tract = {}

	# I know I'm violating DRY here, but refactoring would take too much time
	with open(DM_PATH, 'rt') as fh:
	line = fh.readline()
	# This is currently looking at the durations from point i to all other points
	i = 0
	while line:
	# Get the distances from point i; two spaces is not a typo
	dist = [float(x) for x in line.split(' ')]

	# Very rarely, points may not lie within tracts. This is strange, but we'll ignore it
	print(f'Now processing line {i}..')
	if i not in pttm:
	print(f'Point ID not in point_to_tract_mapping: {i}')
	else:
	# We want to get the nearest N neighbours, but at
	# the same time we want to make sure the tracts are in
	# the point-to-tract mapping.

	# So we sort, but maintain the index
	# enumerate gives a enumeration giving indexes and values
	# key=lambda i:i[1] sorts based on the original values

	sorted_dist = [i for i in sorted(
	enumerate(dist), key=lambda a:a[1])]

	num_neighbours = 0
	j = 0

	while num_neighbours < N:
	if j not in pttm:
	print(f'Point ID not in point_to_tract_mapping: {j}')
	else:
	if pttm[i] not in knn_sum_dd_by_tract:
	knn_sum_dd_by_tract[pttm[i]] = sorted_dist[j]
	else:
	knn_sum_dd_by_tract[pttm[i]] += sorted_dist[j]
	num_neighbours += 1
	j += 1

	print(f"Sum of KNN distances for point {i} in tract {pttm[i]}: {knn_sum_dd_by_tract[pttm[i]}")
	i += 1
	line = fh.readline()

	with open(f'{SAVE_FILE_TO}', 'w') as f:
	f.write(json.dumps(knn_sum_dd_by_tract))

	return knn_sum_dd_by_tract