Created
January 30, 2023 17:18
-
-
Save ozgurozkok/8b2d5efee88f87fa4666ec8f671813e7 to your computer and use it in GitHub Desktop.
Clustering locations using Python, MySQL, and KMeans from the scikit-learn library, with the condition that two points with the same title and within 100 m of each other must be placed in the same cluster.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import mysql.connector | |
import numpy as np | |
from sklearn.cluster import KMeans | |
from scipy.spatial import distance | |
# Mean Earth radius in kilometres; this fixes the unit of the returned distance.
EARTH_RADIUS_KM = 6371


def haversine_distance(lat1, lon1, lat2, lon2):
    """Return the haversine (great-circle) distance between two points, in km.

    Args:
        lat1, lon1: Latitude and longitude of the first point, in decimal
            degrees.
        lat2, lon2: Latitude and longitude of the second point, in decimal
            degrees.

    Returns:
        The distance in kilometres, computed on a sphere of radius
        ``EARTH_RADIUS_KM``.
    """
    lat1, lon1, lat2, lon2 = map(np.deg2rad, [lat1, lon1, lat2, lon2])
    a = (
        np.sin((lat2 - lat1) / 2.0) ** 2
        + np.cos(lat1) * np.cos(lat2) * np.sin((lon2 - lon1) / 2.0) ** 2
    )
    # Floating-point rounding can push `a` marginally outside [0, 1] (e.g. for
    # near-antipodal points); clamp so sqrt/arcsin never produce NaN.
    a = np.clip(a, 0.0, 1.0)
    return EARTH_RADIUS_KM * 2 * np.arcsin(np.sqrt(a))
# Two locations with the same title that lie within this distance (in km, the
# unit returned by haversine_distance) are put into the same pre-group.
SAME_PLACE_RADIUS_KM = 0.1  # 100 m

# Connect to the MySQL database and fetch (title, latitude, longitude) rows.
# The cursor and the connection are closed even if the query fails.
cnx = mysql.connector.connect(user='user', password='password',
                              host='host', database='database')
try:
    cursor = cnx.cursor()
    try:
        query = "SELECT title, latitude, longitude FROM locations"
        cursor.execute(query)
        locations = cursor.fetchall()
    finally:
        cursor.close()
finally:
    cnx.close()

# Pre-group the locations: maps a group index to the list of location rows
# that share a title and are within SAME_PLACE_RADIUS_KM of a group member.
clusters = {}
cluster_index = 0

for location in locations:
    title, latitude, longitude = location
    # Join the first existing group that already holds a location with the
    # same title within the radius.
    for members in clusters.values():
        if any(
            title == other_title
            and haversine_distance(
                latitude, longitude, other_latitude, other_longitude
            ) <= SAME_PLACE_RADIUS_KM
            for other_title, other_latitude, other_longitude in members
        ):
            members.append(location)
            break
    else:
        # No matching group was found: start a new one for this location.
        clusters[cluster_index] = [location]
        cluster_index += 1

# Flatten the groups once so that the coordinate rows and the k-means labels
# can be matched back to their locations by position.
grouped_locations = [loc for members in clusters.values() for loc in members]
clusters_np = np.array(
    [[latitude, longitude] for _, latitude, longitude in grouped_locations]
)

# Perform k-means clustering with one cluster per pre-group.
# NOTE(review): k-means is fitted on the individual points, so nothing here
# forces members of the same pre-group to receive the same label -- confirm
# whether that is the intended behaviour.
if clusters:
    kmeans = KMeans(n_clusters=len(clusters))
    kmeans.fit(clusters_np)
    labels = kmeans.labels_
else:
    # No rows were fetched: KMeans cannot be fitted with zero clusters.
    kmeans = None
    labels = np.array([], dtype=int)

# Print the result. Each location is paired with its own label; the previous
# version never advanced the label index and printed labels[0] for every row.
for (title, latitude, longitude), label in zip(grouped_locations, labels):
    print("Location:", title, "Coordinate:", (latitude, longitude), "Cluster:", label)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment