Created
October 9, 2017 13:59
-
-
Save Kalli/273f600e3f8219afc17ced957acc6555 to your computer and use it in GitHub Desktop.
Code for calculating distances between all shops on Vinylhub.com
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
scikit-learn==0.19.0 | |
geopy==1.11.0 | |
numpy==1.13.1 | |
scipy==0.19.1 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import json | |
from geopy.distance import vincenty | |
from scipy.spatial.distance import pdist, squareform | |
from sklearn.cluster import DBSCAN | |
import numpy as np | |
from collections import Counter | |
# markers.json includes lat and lng data on all of the shops on Vinylhub | |
with open("./markers.json") as f: | |
shops = json.load(f) | |
# Filter shops that have not been closed | |
open_shops = filter(lambda shop: "(Permanently Closed)" not in shop["title"], shops["results"]) | |
def calculate_distances(shops): | |
""" | |
Calculate the distance between each pair of shops using the vincenty distance | |
http://geopy.readthedocs.io/en/1.10.0/index.html?highlight=vincenty#module-geopy.distance | |
# ToDo this takes a LONG time (~10 minutes). Any possible optimizations? | |
:param shops: list of shop objects with lat and lng information | |
:return: a numpy array with the distance between each pair of shop | |
""" | |
l = map(lambda a: (a["latlng"]["lat"], a["latlng"]["lng"]), shops) | |
matrix = np.asanyarray(l) | |
return squareform(pdist(matrix, (lambda u, v: vincenty(u, v, iterations=100).kilometers))) | |
distance_matrix = calculate_distances(open_shops) | |
# save distance matrix so we can avoid expensive calculations | |
np.save("./distance_matrix.npy", distance_matrix) | |
# distance_matrix = np.load("./distance_matrix.npy") # if pre-calculated | |
def find_most_remote_shop(shops, distance_matrix): | |
""" | |
Find the most remote shop | |
:param shops: A list of shop items | |
:param distance_matrix: A numpy array mapping the distances from one shop to the next | |
:return: The shop that is furthest away from any of the other shops, the shop closest to it | |
and the distance between them | |
""" | |
minimum_distances = [] | |
for row in distance_matrix: | |
# find the shortest non-zero distance for each shop | |
shortest = 40075 # earths circumference | |
for column in row: | |
if column != 0.0 and column < shortest: | |
shortest = column | |
minimum_distances.append(shortest) | |
# find the max min and the shop that it belongs too | |
max_distance = max(minimum_distances) | |
indices = np.where(distance_matrix == max_distance) | |
return shops[indices[0][0]], shops[indices[0][1]], max_distance | |
print(find_most_remote_shop(open_shops, distance_matrix)) | |
def find_most_distant_pair(shops, distance_matrix): | |
""" | |
Find the two shops that are the most distant from one another | |
:param shops: A list of shop items | |
:param distance_matrix: A numpy array mapping the distances from one shop to the next | |
:return: the two shops and the distance between them | |
""" | |
indices = np.where(distance_matrix == distance_matrix.max()) | |
return shops[indices[0][0]], distance_matrix.max(), shops[indices[0][1]] | |
print(find_most_distant_pair(open_shops, distance_matrix)) | |
# use the dbscan algo to calculate clusters http://scikit-learn.org/stable/modules/clustering.html#dbscan | |
db = DBSCAN(eps=1, min_samples=5, metric="precomputed").fit(distance_matrix) | |
cluster_counter = Counter(db.labels_) | |
biggest_clusters = [] | |
# exclude the most common number, -1, represents shops that are not in any clusters | |
for i, c in enumerate(cluster_counter.most_common(11)[-10:]): | |
cluster_markers = [] | |
labels = db.labels_.tolist() | |
cluster_indices = [i for i, x in enumerate(labels) if x == c[0]] | |
j = 0 | |
for index in cluster_indices: | |
cluster_markers.append(open_shops[index]) | |
j += 1 | |
biggest_clusters.append(cluster_markers) | |
with open("./clusters.json", "w") as fp: | |
json.dump(biggest_clusters, fp) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment