Skip to content

Instantly share code, notes, and snippets.

What would you like to do?
Code for calculating distances between all shops on
import json
from geopy.distance import vincenty
from scipy.spatial.distance import pdist, squareform
from sklearn.cluster import DBSCAN
import numpy as np
from collections import Counter
# markers.json includes lat and lng data on all of the shops on Vinylhub
with open("./markers.json") as f:
shops = json.load(f)
# Filter shops that have not been closed
open_shops = filter(lambda shop: "(Permanently Closed)" not in shop["title"], shops["results"])
def calculate_distances(shops):
Calculate the distance between each pair of shops using the vincenty distance
# ToDo this takes a LONG time (~10 minutes). Any possible optimizations?
:param shops: list of shop objects with lat and lng information
:return: a numpy array with the distance between each pair of shop
l = map(lambda a: (a["latlng"]["lat"], a["latlng"]["lng"]), shops)
matrix = np.asanyarray(l)
return squareform(pdist(matrix, (lambda u, v: vincenty(u, v, iterations=100).kilometers)))
distance_matrix = calculate_distances(open_shops)
# save distance matrix so we can avoid expensive calculations"./distance_matrix.npy", distance_matrix)
# distance_matrix = np.load("./distance_matrix.npy") # if pre-calculated
def find_most_remote_shop(shops, distance_matrix):
Find the most remote shop
:param shops: A list of shop items
:param distance_matrix: A numpy array mapping the distances from one shop to the next
:return: The shop that is furthest away from any of the other shops, the shop closest to it
and the distance between them
minimum_distances = []
for row in distance_matrix:
# find the shortest non-zero distance for each shop
shortest = 40075 # earths circumference
for column in row:
if column != 0.0 and column < shortest:
shortest = column
# find the max min and the shop that it belongs too
max_distance = max(minimum_distances)
indices = np.where(distance_matrix == max_distance)
return shops[indices[0][0]], shops[indices[0][1]], max_distance
print(find_most_remote_shop(open_shops, distance_matrix))
def find_most_distant_pair(shops, distance_matrix):
Find the two shops that are the most distant from one another
:param shops: A list of shop items
:param distance_matrix: A numpy array mapping the distances from one shop to the next
:return: the two shops and the distance between them
indices = np.where(distance_matrix == distance_matrix.max())
return shops[indices[0][0]], distance_matrix.max(), shops[indices[0][1]]
print(find_most_distant_pair(open_shops, distance_matrix))
# use the dbscan algo to calculate clusters
db = DBSCAN(eps=1, min_samples=5, metric="precomputed").fit(distance_matrix)
cluster_counter = Counter(db.labels_)
biggest_clusters = []
# exclude the most common number, -1, represents shops that are not in any clusters
for i, c in enumerate(cluster_counter.most_common(11)[-10:]):
cluster_markers = []
labels = db.labels_.tolist()
cluster_indices = [i for i, x in enumerate(labels) if x == c[0]]
j = 0
for index in cluster_indices:
j += 1
with open("./clusters.json", "w") as fp:
json.dump(biggest_clusters, fp)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment