Last active
August 29, 2015 14:13
-
-
Save kaotika/841db0b4949445a47ddf to your computer and use it in GitHub Desktop.
distance based clustering based on pandas and scipy.cKDTree
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import cProfile | |
import pandas as pd | |
import numpy as np | |
from scipy.spatial import cKDTree | |
def getRandomData(samples): | |
points = np.random.rand(samples, 2) * 0.5 | |
data = pd.DataFrame(points, columns=['lat', 'lng']) | |
# place points over Berlin | |
data.lat += 13.1 | |
data.lng += 52.2 | |
return data | |
#@profile | |
def clustering(tree, data, radius, centerMultiplier=0.5): | |
""" Perform a simple distance based clustering. Add a column clusterNr to original dataset. """ | |
data['clusterNr'] = -1 | |
i = 0 | |
unclusteredIndexes = pd.DataFrame(data.index) | |
while len(unclusteredIndexes.index) > 0: | |
index = unclusteredIndexes.iloc[0][0] | |
point = data.lat[index], data.lng[index] | |
# get all points in half radius | |
#clusterGroup = tree.query_ball_point(point, radius*centerMultiplier) # not present in scipy 0.11 and its not faster | |
nn = tree.query(point, k=len(data.index), distance_upper_bound=radius*centerMultiplier) | |
clusterGroup = list(np.sort(nn[1][nn[0] != np.inf])) | |
# set found points to current cluster nr | |
data.ix[data.index.isin(clusterGroup), 'clusterNr'] = i | |
# get all points outside half radius and full radius | |
#clusterForSecondStage = list( | |
#set(tree.query_ball_point(point, radius)) - set(clusterGroup) | |
#) # not present in scipy 0.11 and its not faster | |
nn = tree.query(point, k=len(data.index), distance_upper_bound=radius) | |
clusterForSecondStage = list(set(np.sort(nn[1][nn[0] != np.inf])) - set(clusterGroup)) | |
# delete all indexes we found | |
unclusteredIndexes = unclusteredIndexes[ | |
~unclusteredIndexes.index.isin(clusterGroup + clusterForSecondStage) | |
] | |
i+=1 | |
# build new tree with found clusters | |
clusters = data[data.clusterNr != -1].groupby("clusterNr").agg( | |
{'lat' : np.mean, 'lng': np.mean} | |
) | |
clusterTree = cKDTree(zip(clusters.lat, clusters.lng)) | |
# find nearest cluster for every non clustered point | |
#tempData = data[data.clusterNr == -1] | |
tempData = pd.DataFrame(data.loc[data.clusterNr == -1]) | |
tempData['_points'] = zip(tempData.lat, tempData.lng) | |
def assignNearestCluster(x): | |
return clusterTree.query((x[0], x[1]), k=1, distance_upper_bound=radius)[1] | |
tempData.clusterNr = tempData._points.map(assignNearestCluster) | |
del tempData['_points'] | |
# combine both datasets | |
data = tempData.combine_first(data) | |
return data | |
#@profile | |
def clustering2(tree, data, radius, centerMultiplier=0.5): | |
""" Perform a simple distance based clustering. Add a column clusterNr to original dataset. """ | |
data['clusterNr'] = -1 | |
i = 0 | |
unclusteredIndexes = sorted(data.index) | |
while len(unclusteredIndexes) > 0: | |
index = unclusteredIndexes[0] | |
point = data.lat[index], data.lng[index] | |
# get all points in half radius | |
nn = tree.query(point, k=len(data.index), distance_upper_bound=radius*centerMultiplier) | |
clusterGroup = list(np.sort(nn[1][nn[0] != np.inf])) | |
# set found points to current cluster nr | |
data.ix[data.index.isin(clusterGroup), 'clusterNr'] = i | |
# get all points outside half radius and full radius | |
nn = tree.query(point, k=len(data.index), distance_upper_bound=radius) | |
clusterForSecondStage = list(set(np.sort(nn[1][nn[0] != np.inf])) - set(clusterGroup)) | |
# delete all indexes we found | |
unclusteredIndexes = sorted(set(unclusteredIndexes) - set(clusterGroup) - set(clusterForSecondStage)) | |
i+=1 | |
# build new tree with found clusters | |
clusters = data[data.clusterNr != -1].groupby("clusterNr").agg( | |
{'lat' : np.mean, 'lng': np.mean} | |
) | |
clusterTree = cKDTree(zip(clusters.lat, clusters.lng)) | |
# find nearest cluster for every non clustered point | |
tempData = pd.DataFrame(data.loc[data.clusterNr == -1]) | |
tempData['_points'] = zip(tempData.lat, tempData.lng) | |
def assignNearestCluster(x): | |
return clusterTree.query((x[0], x[1]), k=1, distance_upper_bound=radius)[1] | |
tempData.clusterNr = tempData._points.map(assignNearestCluster) | |
del tempData['_points'] | |
# combine both datasets | |
data = tempData.combine_first(data) | |
return data | |
#@profile | |
def main(): | |
samples = 1000 | |
radius = 0.1 | |
data = getRandomData(samples) | |
tree = cKDTree(zip(data.lat, data.lng)) | |
clusters = clustering(tree, data, radius) | |
clusters = clustering2(tree, data, radius) | |
clusters['count'] = -1 | |
print clusters[clusters.clusterNr > -1].groupby("clusterNr").agg({ | |
'lat' : np.mean, | |
'lng': np.mean, | |
"count": "count" | |
}) | |
if __name__ == "__main__": | |
main() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment