Last active
February 3, 2020 18:23
-
-
Save lieuzhenghong/498d6c66e6537342d404ac2d6e0e11c9 to your computer and use it in GitHub Desktop.
why is this code running slower and slower, but very similar code isn't?
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def convert_point_distances_to_tract_distances(pttm, DM_PATH, SAVE_FILE_TO): | |
'''Returns a JSON file giving driving durations from all points in the Census Tract | |
to all other points in every other Census Tract. | |
Takes in three things: | |
1. A mapping from points to tracts (Dict[PointId : TractId] | |
2. The distance matrix file path | |
3. The path to save the output tract distance file to | |
Reads line by line | |
Returns a JSON file with the following schema: | |
{ | |
tractid: {tractid: distance, ... }, | |
tractid: {tractid: distance, ...}, | |
... | |
} | |
Each entry in the dictionary is the sum of driving durations from each point in | |
the tract to each other point in the tract. | |
Note that the duration from tract_id to tract_id can (in fact is almost always nonzero) | |
because it measures the sum of durations from all points in the tract | |
''' | |
tract_distances = {} | |
with open(DM_PATH, 'rt') as fh: | |
line = fh.readline() | |
# This is currently looking at the durations from point i to all other points | |
i = 0 | |
while line: | |
# Get the distances from point i; two spaces is not a typo | |
dist = [float(x) for x in line.split(' ')] | |
# Very rarely, points may not lie within tracts. This is strange, but we'll ignore it | |
print(f'Now processing line {i}..') | |
if i not in pttm: | |
print(f'Point ID not in point_to_tract_mapping: {i}') | |
# Otherwise, update the tract-pairwise-distance matrix with the pointwise distances | |
else: | |
for j in range(len(dist)): | |
if j not in pttm: | |
print(f'Point ID not in point_to_tract_mapping: {j}') | |
elif pttm[i] not in tract_distances: | |
tract_distances[pttm[i]] = {pttm[j]: dist[j]} | |
elif pttm[j] not in tract_distances[pttm[i]]: | |
tract_distances[pttm[i]][pttm[j]] = dist[j] | |
else: | |
tract_distances[pttm[i]][pttm[j]] += dist[j] | |
# print(tract_distances) | |
i += 1 | |
line = fh.readline() |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def calc_knn_sum_durations_by_tract(pttm, N, DM_PATH, SAVE_FILE_TO): | |
'''Returns a dictionary and prints a file of | |
{ | |
tractid: Float | |
} | |
where the value is the sum of k-nearest-neighbour driving durations for | |
each point in the tract''' | |
knn_sum_dd_by_tract = {} | |
# I know I'm violating DRY here, but refactoring would take too much time | |
with open(DM_PATH, 'rt') as fh: | |
line = fh.readline() | |
# This is currently looking at the durations from point i to all other points | |
i = 0 | |
while line: | |
# Get the distances from point i; two spaces is not a typo | |
dist = [float(x) for x in line.split(' ')] | |
# Very rarely, points may not lie within tracts. This is strange, but we'll ignore it | |
print(f'Now processing line {i}..') | |
if i not in pttm: | |
print(f'Point ID not in point_to_tract_mapping: {i}') | |
else: | |
# We want to get the nearest N neighbours, but at | |
# the same time we want to make sure the tracts are in | |
# the point-to-tract mapping. | |
# So we sort, but maintain the index | |
# enumerate gives a enumeration giving indexes and values | |
# key=lambda i:i[1] sorts based on the original values | |
sorted_dist = [i for i in sorted( | |
enumerate(dist), key=lambda a:a[1])] | |
num_neighbours = 0 | |
j = 0 | |
while num_neighbours < N: | |
if j not in pttm: | |
print(f'Point ID not in point_to_tract_mapping: {j}') | |
else: | |
if pttm[i] not in knn_sum_dd_by_tract: | |
knn_sum_dd_by_tract[pttm[i]] = sorted_dist[j] | |
else: | |
knn_sum_dd_by_tract[pttm[i]] += sorted_dist[j] | |
num_neighbours += 1 | |
j += 1 | |
print(f"Sum of KNN distances for point {i} in tract {pttm[i]}: {knn_sum_dd_by_tract[pttm[i]}") | |
i += 1 | |
line = fh.readline() | |
with open(f'{SAVE_FILE_TO}', 'w') as f: | |
f.write(json.dumps(knn_sum_dd_by_tract)) | |
return knn_sum_dd_by_tract |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment