Skip to content

Instantly share code, notes, and snippets.

@lieuzhenghong
Last active February 3, 2020 18:23
Show Gist options
  • Save lieuzhenghong/498d6c66e6537342d404ac2d6e0e11c9 to your computer and use it in GitHub Desktop.
Save lieuzhenghong/498d6c66e6537342d404ac2d6e0e11c9 to your computer and use it in GitHub Desktop.
why is this code running slower and slower, but very similar code isn't?
def convert_point_distances_to_tract_distances(pttm, DM_PATH, SAVE_FILE_TO):
'''Returns a JSON file giving driving durations from all points in the Census Tract
to all other points in every other Census Tract.
Takes in three things:
1. A mapping from points to tracts (Dict[PointId : TractId]
2. The distance matrix file path
3. The path to save the output tract distance file to
Reads line by line
Returns a JSON file with the following schema:
{
tractid: {tractid: distance, ... },
tractid: {tractid: distance, ...},
...
}
Each entry in the dictionary is the sum of driving durations from each point in
the tract to each other point in the tract.
Note that the duration from tract_id to tract_id can (in fact is almost always nonzero)
because it measures the sum of durations from all points in the tract
'''
tract_distances = {}
with open(DM_PATH, 'rt') as fh:
line = fh.readline()
# This is currently looking at the durations from point i to all other points
i = 0
while line:
# Get the distances from point i; two spaces is not a typo
dist = [float(x) for x in line.split(' ')]
# Very rarely, points may not lie within tracts. This is strange, but we'll ignore it
print(f'Now processing line {i}..')
if i not in pttm:
print(f'Point ID not in point_to_tract_mapping: {i}')
# Otherwise, update the tract-pairwise-distance matrix with the pointwise distances
else:
for j in range(len(dist)):
if j not in pttm:
print(f'Point ID not in point_to_tract_mapping: {j}')
elif pttm[i] not in tract_distances:
tract_distances[pttm[i]] = {pttm[j]: dist[j]}
elif pttm[j] not in tract_distances[pttm[i]]:
tract_distances[pttm[i]][pttm[j]] = dist[j]
else:
tract_distances[pttm[i]][pttm[j]] += dist[j]
# print(tract_distances)
i += 1
line = fh.readline()
def calc_knn_sum_durations_by_tract(pttm, N, DM_PATH, SAVE_FILE_TO):
'''Returns a dictionary and prints a file of
{
tractid: Float
}
where the value is the sum of k-nearest-neighbour driving durations for
each point in the tract'''
knn_sum_dd_by_tract = {}
# I know I'm violating DRY here, but refactoring would take too much time
with open(DM_PATH, 'rt') as fh:
line = fh.readline()
# This is currently looking at the durations from point i to all other points
i = 0
while line:
# Get the distances from point i; two spaces is not a typo
dist = [float(x) for x in line.split(' ')]
# Very rarely, points may not lie within tracts. This is strange, but we'll ignore it
print(f'Now processing line {i}..')
if i not in pttm:
print(f'Point ID not in point_to_tract_mapping: {i}')
else:
# We want to get the nearest N neighbours, but at
# the same time we want to make sure the tracts are in
# the point-to-tract mapping.
# So we sort, but maintain the index
# enumerate gives a enumeration giving indexes and values
# key=lambda i:i[1] sorts based on the original values
sorted_dist = [i for i in sorted(
enumerate(dist), key=lambda a:a[1])]
num_neighbours = 0
j = 0
while num_neighbours < N:
if j not in pttm:
print(f'Point ID not in point_to_tract_mapping: {j}')
else:
if pttm[i] not in knn_sum_dd_by_tract:
knn_sum_dd_by_tract[pttm[i]] = sorted_dist[j]
else:
knn_sum_dd_by_tract[pttm[i]] += sorted_dist[j]
num_neighbours += 1
j += 1
print(f"Sum of KNN distances for point {i} in tract {pttm[i]}: {knn_sum_dd_by_tract[pttm[i]}")
i += 1
line = fh.readline()
with open(f'{SAVE_FILE_TO}', 'w') as f:
f.write(json.dumps(knn_sum_dd_by_tract))
return knn_sum_dd_by_tract
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment