Skip to content

Instantly share code, notes, and snippets.

@kaotika
Last active August 29, 2015 14:13
Show Gist options
  • Save kaotika/841db0b4949445a47ddf to your computer and use it in GitHub Desktop.
Save kaotika/841db0b4949445a47ddf to your computer and use it in GitHub Desktop.
distance based clustering based on pandas and scipy.cKDTree
import cProfile
import pandas as pd
import numpy as np
from scipy.spatial import cKDTree
def getRandomData(samples):
points = np.random.rand(samples, 2) * 0.5
data = pd.DataFrame(points, columns=['lat', 'lng'])
# place points over Berlin
data.lat += 13.1
data.lng += 52.2
return data
#@profile
def clustering(tree, data, radius, centerMultiplier=0.5):
""" Perform a simple distance based clustering. Add a column clusterNr to original dataset. """
data['clusterNr'] = -1
i = 0
unclusteredIndexes = pd.DataFrame(data.index)
while len(unclusteredIndexes.index) > 0:
index = unclusteredIndexes.iloc[0][0]
point = data.lat[index], data.lng[index]
# get all points in half radius
#clusterGroup = tree.query_ball_point(point, radius*centerMultiplier) # not present in scipy 0.11 and its not faster
nn = tree.query(point, k=len(data.index), distance_upper_bound=radius*centerMultiplier)
clusterGroup = list(np.sort(nn[1][nn[0] != np.inf]))
# set found points to current cluster nr
data.ix[data.index.isin(clusterGroup), 'clusterNr'] = i
# get all points outside half radius and full radius
#clusterForSecondStage = list(
#set(tree.query_ball_point(point, radius)) - set(clusterGroup)
#) # not present in scipy 0.11 and its not faster
nn = tree.query(point, k=len(data.index), distance_upper_bound=radius)
clusterForSecondStage = list(set(np.sort(nn[1][nn[0] != np.inf])) - set(clusterGroup))
# delete all indexes we found
unclusteredIndexes = unclusteredIndexes[
~unclusteredIndexes.index.isin(clusterGroup + clusterForSecondStage)
]
i+=1
# build new tree with found clusters
clusters = data[data.clusterNr != -1].groupby("clusterNr").agg(
{'lat' : np.mean, 'lng': np.mean}
)
clusterTree = cKDTree(zip(clusters.lat, clusters.lng))
# find nearest cluster for every non clustered point
#tempData = data[data.clusterNr == -1]
tempData = pd.DataFrame(data.loc[data.clusterNr == -1])
tempData['_points'] = zip(tempData.lat, tempData.lng)
def assignNearestCluster(x):
return clusterTree.query((x[0], x[1]), k=1, distance_upper_bound=radius)[1]
tempData.clusterNr = tempData._points.map(assignNearestCluster)
del tempData['_points']
# combine both datasets
data = tempData.combine_first(data)
return data
#@profile
def clustering2(tree, data, radius, centerMultiplier=0.5):
""" Perform a simple distance based clustering. Add a column clusterNr to original dataset. """
data['clusterNr'] = -1
i = 0
unclusteredIndexes = sorted(data.index)
while len(unclusteredIndexes) > 0:
index = unclusteredIndexes[0]
point = data.lat[index], data.lng[index]
# get all points in half radius
nn = tree.query(point, k=len(data.index), distance_upper_bound=radius*centerMultiplier)
clusterGroup = list(np.sort(nn[1][nn[0] != np.inf]))
# set found points to current cluster nr
data.ix[data.index.isin(clusterGroup), 'clusterNr'] = i
# get all points outside half radius and full radius
nn = tree.query(point, k=len(data.index), distance_upper_bound=radius)
clusterForSecondStage = list(set(np.sort(nn[1][nn[0] != np.inf])) - set(clusterGroup))
# delete all indexes we found
unclusteredIndexes = sorted(set(unclusteredIndexes) - set(clusterGroup) - set(clusterForSecondStage))
i+=1
# build new tree with found clusters
clusters = data[data.clusterNr != -1].groupby("clusterNr").agg(
{'lat' : np.mean, 'lng': np.mean}
)
clusterTree = cKDTree(zip(clusters.lat, clusters.lng))
# find nearest cluster for every non clustered point
tempData = pd.DataFrame(data.loc[data.clusterNr == -1])
tempData['_points'] = zip(tempData.lat, tempData.lng)
def assignNearestCluster(x):
return clusterTree.query((x[0], x[1]), k=1, distance_upper_bound=radius)[1]
tempData.clusterNr = tempData._points.map(assignNearestCluster)
del tempData['_points']
# combine both datasets
data = tempData.combine_first(data)
return data
#@profile
def main():
samples = 1000
radius = 0.1
data = getRandomData(samples)
tree = cKDTree(zip(data.lat, data.lng))
clusters = clustering(tree, data, radius)
clusters = clustering2(tree, data, radius)
clusters['count'] = -1
print clusters[clusters.clusterNr > -1].groupby("clusterNr").agg({
'lat' : np.mean,
'lng': np.mean,
"count": "count"
})
if __name__ == "__main__":
main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment