# ozgurozkok/clustering.py

## clustering.py
import mysql.connector
import numpy as np
from sklearn.cluster import KMeans
from scipy.spatial import distance

def haversine_distance(lat1, lon1, lat2, lon2, radius=6371.0):
    """
    Calculate the haversine (great-circle) distance between two points on a
    sphere, with both points specified in decimal degrees.

    Parameters:
        lat1, lon1: latitude and longitude of the first point, in degrees.
        lat2, lon2: latitude and longitude of the second point, in degrees.
            Each coordinate may be a scalar or an array-like; array inputs
            are combined element-wise following NumPy broadcasting. Any
            value convertible to float is accepted (e.g. decimal.Decimal).
        radius: radius of the sphere. The default, 6371.0, is the value the
            function has always used (mean Earth radius in kilometres).

    Returns:
        The distance along the sphere's surface, in the same unit as
        ``radius`` (kilometres with the default).
    """
    # Coerce to float first: np.deg2rad cannot operate on objects such as
    # decimal.Decimal.
    lat1, lon1, lat2, lon2 = (
        np.deg2rad(np.asarray(value, dtype=float))
        for value in (lat1, lon1, lat2, lon2)
    )

    a = (np.sin((lat2 - lat1) / 2.0) ** 2
         + np.cos(lat1) * np.cos(lat2) * np.sin((lon2 - lon1) / 2.0) ** 2)

    # Floating-point rounding can push `a` marginally outside [0, 1] for
    # (nearly) antipodal points, which would make sqrt/arcsin return NaN.
    a = np.clip(a, 0.0, 1.0)

    return radius * 2 * np.arcsin(np.sqrt(a))

# Connect to the MySQL database and fetch every (title, latitude, longitude)
# row from the `locations` table.
# NOTE(review): the credentials below are hard-coded placeholders; confirm
# where the real values should come from (config file, environment, ...).
cnx = mysql.connector.connect(user='user', password='password',
                              host='host', database='database')
try:
    cursor = cnx.cursor()
    try:
        query = "SELECT title, latitude, longitude FROM locations"
        cursor.execute(query)
        locations = cursor.fetchall()
    finally:
        # Release the cursor even if the query fails.
        cursor.close()
finally:
    # Always return the connection, whether or not the fetch succeeded.
    cnx.close()

# Group the rows: `clusters` maps a running integer index to the list of
# location rows that belong to that group.
clusters = {}
cluster_index = 0

for location in locations:
    title, latitude, longitude = location

    # A location joins the first existing group that already holds a row with
    # the same title lying within 0.1 km (100 m) of it.
    for members in clusters.values():
        if any(
            title == member_title
            and haversine_distance(latitude, longitude,
                                   member_latitude, member_longitude) <= 0.1
            for member_title, member_latitude, member_longitude in members
        ):
            members.append(location)
            break
    else:
        # No matching group was found: start a new one for this location.
        clusters[cluster_index] = [location]
        cluster_index += 1

# Flatten the groups, in dict order, into one list of rows. The coordinate
# array and the printed report are both built from this list, so row k of the
# array and labels[k] always refer to the same location.
grouped_locations = [loc for members in clusters.values() for loc in members]

# Convert the grouped coordinates to an (n_locations, 2) float array.
clusters_np = np.array(
    [[latitude, longitude] for _title, latitude, longitude in grouped_locations],
    dtype=float,
)

# Perform k-means clustering, asking for as many clusters as there are
# title/proximity groups.
# NOTE(review): the k-means label numbers are assigned by KMeans and are not
# the keys of `clusters`; confirm this second pass is intended.
if grouped_locations:
    kmeans = KMeans(n_clusters=len(clusters))
    kmeans.fit(clusters_np)

    # Get the cluster label for each location (same order as clusters_np).
    labels = kmeans.labels_
else:
    # KMeans cannot be fitted on zero samples; there is nothing to label.
    kmeans = None
    labels = np.array([], dtype=int)

# Print the result, pairing every location with its own k-means label.
for label_index, (title, latitude, longitude) in enumerate(grouped_locations):
    print("Location:", title, "Coordinate:", (latitude, longitude), "Cluster:", labels[label_index])