Skip to content

Instantly share code, notes, and snippets.

@ozgurozkok
Created January 30, 2023 17:18
Show Gist options
  • Save ozgurozkok/8b2d5efee88f87fa4666ec8f671813e7 to your computer and use it in GitHub Desktop.
Save ozgurozkok/8b2d5efee88f87fa4666ec8f671813e7 to your computer and use it in GitHub Desktop.
Clustering locations with Python, MySQL, and KMeans from the scikit-learn library, under the condition that two points with the same title that lie within 100 m of each other must end up in the same cluster.
import mysql.connector
import numpy as np
from sklearn.cluster import KMeans
from scipy.spatial import distance
def haversine_distance(lat1, lon1, lat2, lon2, radius=6371.0):
    """
    Calculate the haversine (great-circle) distance between two points.

    Parameters
    ----------
    lat1, lon1 : latitude and longitude of the first point, in decimal degrees.
    lat2, lon2 : latitude and longitude of the second point, in decimal degrees.
    radius : radius of the sphere; the result is expressed in the same unit.
        Defaults to 6371.0 (mean Earth radius in kilometres), so the default
        result is in kilometres.

    Inputs may be scalars or array-likes; array-likes are combined
    element-wise following NumPy broadcasting rules.

    Returns
    -------
    The distance along the sphere's surface, as a NumPy float (or array).
    """
    # Coerce to float first: np.deg2rad cannot handle objects such as
    # decimal.Decimal, which database drivers commonly return for DECIMAL columns.
    lat1, lon1, lat2, lon2 = (
        np.deg2rad(np.asarray(value, dtype=float))
        for value in (lat1, lon1, lat2, lon2)
    )
    a = (
        np.sin((lat2 - lat1) / 2.0) ** 2
        + np.cos(lat1) * np.cos(lat2) * np.sin((lon2 - lon1) / 2.0) ** 2
    )
    # Floating-point rounding can push `a` marginally outside [0, 1]
    # (e.g. near-antipodal points), which would make arcsin return NaN.
    a = np.clip(a, 0.0, 1.0)
    return radius * 2 * np.arcsin(np.sqrt(a))
# Connect to MySQL database and fetch data.
# NOTE: the connection arguments below are placeholders; supply real
# credentials (preferably from configuration, not source) before running.
cnx = mysql.connector.connect(user='user', password='password',
                              host='host', database='database')
try:
    cursor = cnx.cursor()
    try:
        query = "SELECT title, latitude, longitude FROM locations"
        cursor.execute(query)
        # Each row is a (title, latitude, longitude) tuple.
        locations = cursor.fetchall()
    finally:
        # Release the cursor even if the query fails.
        cursor.close()
finally:
    # All rows are already in memory, so the connection is no longer needed.
    cnx.close()
# Group locations so that a location joins an existing cluster when it has the
# same title as, and lies within 100 m of, any member of that cluster.
# `clusters` maps a cluster index to the list of (title, latitude, longitude)
# rows assigned to it.
clusters = {}
cluster_index = 0
# A location only ever joins a cluster through a same-title match, so every
# cluster holds a single title. Indexing cluster ids by title lets each
# location be compared only against clusters that could possibly match.
cluster_ids_by_title = {}
# Loop through each location
for location in locations:
    title, latitude, longitude = location
    candidate_ids = cluster_ids_by_title.setdefault(title, [])
    # Candidates are visited in creation order, so the first matching cluster wins.
    for ci in candidate_ids:
        # haversine_distance returns kilometres, so 0.1 means 100 m.
        if any(
            haversine_distance(latitude, longitude, member_lat, member_lon) <= 0.1
            for _member_title, member_lat, member_lon in clusters[ci]
        ):
            clusters[ci].append(location)
            break
    else:
        # No same-title cluster within 100 m: create a new cluster for this location
        clusters[cluster_index] = [location]
        candidate_ids.append(cluster_index)
        cluster_index += 1
# Flatten the groups into one list so that row i of the coordinate array,
# entry i of the KMeans labels and entry i of this list all describe the
# same location.
grouped_locations = [loc for cl in clusters.values() for loc in cl]
# Convert the clusters to a numpy array of [latitude, longitude] rows
clusters_np = np.array(
    [[latitude, longitude] for _title, latitude, longitude in grouped_locations],
    dtype=float,
)
if grouped_locations:
    # Perform k-means clustering, using one cluster per pre-computed group.
    # NOTE(review): KMeans is fitted on the individual points, so it does not
    # by itself guarantee that members of one pre-computed group receive the
    # same label - confirm this matches the intended constraint.
    kmeans = KMeans(n_clusters=len(clusters))
    kmeans.fit(clusters_np)
    # Get the cluster labels for each location
    labels = kmeans.labels_
else:
    # KMeans cannot be fitted on zero samples; there is nothing to label.
    labels = np.array([], dtype=int)
# Print the result, pairing every location with its own label
for (title, latitude, longitude), label in zip(grouped_locations, labels):
    print("Location:", title, "Coordinate:", (latitude, longitude), "Cluster:", label)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment