Instantly share code, notes, and snippets.

Embed
What would you like to do?
Code for calculating distances between all shops on Vinylhub.com
scikit-learn==0.19.0
geopy==1.11.0
numpy==1.13.1
scipy==0.19.1
import json
from geopy.distance import vincenty
from scipy.spatial.distance import pdist, squareform
from sklearn.cluster import DBSCAN
import numpy as np
from collections import Counter
# markers.json includes lat and lng data on all of the shops on Vinylhub
with open("./markers.json") as f:
shops = json.load(f)
# Filter shops that have not been closed
open_shops = filter(lambda shop: "(Permanently Closed)" not in shop["title"], shops["results"])
def calculate_distances(shops):
"""
Calculate the distance between each pair of shops using the vincenty distance
http://geopy.readthedocs.io/en/1.10.0/index.html?highlight=vincenty#module-geopy.distance
# ToDo this takes a LONG time (~10 minutes). Any possible optimizations?
:param shops: list of shop objects with lat and lng information
:return: a numpy array with the distance between each pair of shop
"""
l = map(lambda a: (a["latlng"]["lat"], a["latlng"]["lng"]), shops)
matrix = np.asanyarray(l)
return squareform(pdist(matrix, (lambda u, v: vincenty(u, v, iterations=100).kilometers)))
distance_matrix = calculate_distances(open_shops)
# save distance matrix so we can avoid expensive calculations
np.save("./distance_matrix.npy", distance_matrix)
# distance_matrix = np.load("./distance_matrix.npy") # if pre-calculated
def find_most_remote_shop(shops, distance_matrix):
"""
Find the most remote shop
:param shops: A list of shop items
:param distance_matrix: A numpy array mapping the distances from one shop to the next
:return: The shop that is furthest away from any of the other shops, the shop closest to it
and the distance between them
"""
minimum_distances = []
for row in distance_matrix:
# find the shortest non-zero distance for each shop
shortest = 40075 # earths circumference
for column in row:
if column != 0.0 and column < shortest:
shortest = column
minimum_distances.append(shortest)
# find the max min and the shop that it belongs too
max_distance = max(minimum_distances)
indices = np.where(distance_matrix == max_distance)
return shops[indices[0][0]], shops[indices[0][1]], max_distance
print(find_most_remote_shop(open_shops, distance_matrix))
def find_most_distant_pair(shops, distance_matrix):
"""
Find the two shops that are the most distant from one another
:param shops: A list of shop items
:param distance_matrix: A numpy array mapping the distances from one shop to the next
:return: the two shops and the distance between them
"""
indices = np.where(distance_matrix == distance_matrix.max())
return shops[indices[0][0]], distance_matrix.max(), shops[indices[0][1]]
print(find_most_distant_pair(open_shops, distance_matrix))
# use the dbscan algo to calculate clusters http://scikit-learn.org/stable/modules/clustering.html#dbscan
db = DBSCAN(eps=1, min_samples=5, metric="precomputed").fit(distance_matrix)
cluster_counter = Counter(db.labels_)
biggest_clusters = []
# exclude the most common number, -1, represents shops that are not in any clusters
for i, c in enumerate(cluster_counter.most_common(11)[-10:]):
cluster_markers = []
labels = db.labels_.tolist()
cluster_indices = [i for i, x in enumerate(labels) if x == c[0]]
j = 0
for index in cluster_indices:
cluster_markers.append(open_shops[index])
j += 1
biggest_clusters.append(cluster_markers)
with open("./clusters.json", "w") as fp:
json.dump(biggest_clusters, fp)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment