kaotika/pandas_kdtree_clustering.py

## pandas_kdtree_clustering.py
import cProfile
import pandas as pd
import numpy as np
from scipy.spatial import cKDTree

def getRandomData(samples):
    points = np.random.rand(samples, 2) * 0.5
    data = pd.DataFrame(points, columns=['lat', 'lng'])
    # place points over Berlin
    data.lat += 13.1
    data.lng += 52.2
    return data

#@profile
def clustering(tree, data, radius, centerMultiplier=0.5):
    """ Perform a simple distance based clustering. Add a column clusterNr to original dataset. """
    data['clusterNr'] = -1
    i = 0

    unclusteredIndexes = pd.DataFrame(data.index)
    while len(unclusteredIndexes.index) > 0:
        index = unclusteredIndexes.iloc[0][0]
        point = data.lat[index], data.lng[index]

        # get all points in half radius
        #clusterGroup = tree.query_ball_point(point, radius*centerMultiplier) # not present in scipy 0.11 and its not faster
        nn = tree.query(point, k=len(data.index), distance_upper_bound=radius*centerMultiplier)
        clusterGroup = list(np.sort(nn[1][nn[0] != np.inf]))

        # set found points to current cluster nr
        data.ix[data.index.isin(clusterGroup), 'clusterNr'] = i

        # get all points outside half radius and full radius
        #clusterForSecondStage = list(
            #set(tree.query_ball_point(point, radius)) - set(clusterGroup)
        #)      # not present in scipy 0.11 and its not faster
        nn = tree.query(point, k=len(data.index), distance_upper_bound=radius)
        clusterForSecondStage = list(set(np.sort(nn[1][nn[0] != np.inf])) - set(clusterGroup))

        # delete all indexes we found
        unclusteredIndexes = unclusteredIndexes[
            ~unclusteredIndexes.index.isin(clusterGroup + clusterForSecondStage)
        ]

        i+=1

    # build new tree with found clusters
    clusters = data[data.clusterNr != -1].groupby("clusterNr").agg(
        {'lat' : np.mean, 'lng': np.mean}
    )
    clusterTree = cKDTree(zip(clusters.lat, clusters.lng))

    # find nearest cluster for every non clustered point
    #tempData = data[data.clusterNr == -1]
    tempData = pd.DataFrame(data.loc[data.clusterNr == -1])
    tempData['_points'] = zip(tempData.lat, tempData.lng)

    def assignNearestCluster(x):
        return clusterTree.query((x[0], x[1]), k=1, distance_upper_bound=radius)[1]

    tempData.clusterNr = tempData._points.map(assignNearestCluster)
    del tempData['_points']

    # combine both datasets
    data = tempData.combine_first(data)
    return data

#@profile
def clustering2(tree, data, radius, centerMultiplier=0.5):
    """ Perform a simple distance based clustering. Add a column clusterNr to original dataset. """
    data['clusterNr'] = -1
    i = 0

    unclusteredIndexes = sorted(data.index)
    while len(unclusteredIndexes) > 0:
        index = unclusteredIndexes[0]
        point = data.lat[index], data.lng[index]

        # get all points in half radius
        nn = tree.query(point, k=len(data.index), distance_upper_bound=radius*centerMultiplier)
        clusterGroup = list(np.sort(nn[1][nn[0] != np.inf]))

        # set found points to current cluster nr
        data.ix[data.index.isin(clusterGroup), 'clusterNr'] = i

        # get all points outside half radius and full radius
        nn = tree.query(point, k=len(data.index), distance_upper_bound=radius)
        clusterForSecondStage = list(set(np.sort(nn[1][nn[0] != np.inf])) - set(clusterGroup))

        # delete all indexes we found
        unclusteredIndexes = sorted(set(unclusteredIndexes) - set(clusterGroup) - set(clusterForSecondStage))
        i+=1

    # build new tree with found clusters
    clusters = data[data.clusterNr != -1].groupby("clusterNr").agg(
        {'lat' : np.mean, 'lng': np.mean}
    )
    clusterTree = cKDTree(zip(clusters.lat, clusters.lng))

    # find nearest cluster for every non clustered point
    tempData = pd.DataFrame(data.loc[data.clusterNr == -1])
    tempData['_points'] = zip(tempData.lat, tempData.lng)

    def assignNearestCluster(x):
        return clusterTree.query((x[0], x[1]), k=1, distance_upper_bound=radius)[1]

    tempData.clusterNr = tempData._points.map(assignNearestCluster)
    del tempData['_points']

    # combine both datasets
    data = tempData.combine_first(data)
    return data

#@profile
def main():
    samples = 1000
    radius = 0.1
    data = getRandomData(samples)
    tree = cKDTree(zip(data.lat, data.lng))
    clusters = clustering(tree, data, radius)
    clusters = clustering2(tree, data, radius)
    clusters['count'] = -1
    print clusters[clusters.clusterNr > -1].groupby("clusterNr").agg({
        'lat' : np.mean,
        'lng': np.mean,
        "count": "count"
    })

if __name__ == "__main__":
    main()
	import cProfile
	import pandas as pd
	import numpy as np
	from scipy.spatial import cKDTree

	def getRandomData(samples):
	points = np.random.rand(samples, 2) * 0.5
	data = pd.DataFrame(points, columns=['lat', 'lng'])
	# place points over Berlin
	data.lat += 13.1
	data.lng += 52.2
	return data

	#@profile
	def clustering(tree, data, radius, centerMultiplier=0.5):
	""" Perform a simple distance based clustering. Add a column clusterNr to original dataset. """
	data['clusterNr'] = -1
	i = 0

	unclusteredIndexes = pd.DataFrame(data.index)
	while len(unclusteredIndexes.index) > 0:
	index = unclusteredIndexes.iloc[0][0]
	point = data.lat[index], data.lng[index]

	# get all points in half radius
	#clusterGroup = tree.query_ball_point(point, radius*centerMultiplier) # not present in scipy 0.11 and its not faster
	nn = tree.query(point, k=len(data.index), distance_upper_bound=radius*centerMultiplier)
	clusterGroup = list(np.sort(nn[1][nn[0] != np.inf]))

	# set found points to current cluster nr
	data.ix[data.index.isin(clusterGroup), 'clusterNr'] = i

	# get all points outside half radius and full radius
	#clusterForSecondStage = list(
	#set(tree.query_ball_point(point, radius)) - set(clusterGroup)
	#) # not present in scipy 0.11 and its not faster
	nn = tree.query(point, k=len(data.index), distance_upper_bound=radius)
	clusterForSecondStage = list(set(np.sort(nn[1][nn[0] != np.inf])) - set(clusterGroup))

	# delete all indexes we found
	unclusteredIndexes = unclusteredIndexes[
	~unclusteredIndexes.index.isin(clusterGroup + clusterForSecondStage)
	]

	i+=1

	# build new tree with found clusters
	clusters = data[data.clusterNr != -1].groupby("clusterNr").agg(
	{'lat' : np.mean, 'lng': np.mean}
	)
	clusterTree = cKDTree(zip(clusters.lat, clusters.lng))

	# find nearest cluster for every non clustered point
	#tempData = data[data.clusterNr == -1]
	tempData = pd.DataFrame(data.loc[data.clusterNr == -1])
	tempData['_points'] = zip(tempData.lat, tempData.lng)

	def assignNearestCluster(x):
	return clusterTree.query((x[0], x[1]), k=1, distance_upper_bound=radius)[1]

	tempData.clusterNr = tempData._points.map(assignNearestCluster)
	del tempData['_points']

	# combine both datasets
	data = tempData.combine_first(data)
	return data

	#@profile
	def clustering2(tree, data, radius, centerMultiplier=0.5):
	""" Perform a simple distance based clustering. Add a column clusterNr to original dataset. """
	data['clusterNr'] = -1
	i = 0

	unclusteredIndexes = sorted(data.index)
	while len(unclusteredIndexes) > 0:
	index = unclusteredIndexes[0]
	point = data.lat[index], data.lng[index]

	# get all points in half radius
	nn = tree.query(point, k=len(data.index), distance_upper_bound=radius*centerMultiplier)
	clusterGroup = list(np.sort(nn[1][nn[0] != np.inf]))

	# set found points to current cluster nr
	data.ix[data.index.isin(clusterGroup), 'clusterNr'] = i

	# get all points outside half radius and full radius
	nn = tree.query(point, k=len(data.index), distance_upper_bound=radius)
	clusterForSecondStage = list(set(np.sort(nn[1][nn[0] != np.inf])) - set(clusterGroup))

	# delete all indexes we found
	unclusteredIndexes = sorted(set(unclusteredIndexes) - set(clusterGroup) - set(clusterForSecondStage))
	i+=1

	# build new tree with found clusters
	clusters = data[data.clusterNr != -1].groupby("clusterNr").agg(
	{'lat' : np.mean, 'lng': np.mean}
	)
	clusterTree = cKDTree(zip(clusters.lat, clusters.lng))

	# find nearest cluster for every non clustered point
	tempData = pd.DataFrame(data.loc[data.clusterNr == -1])
	tempData['_points'] = zip(tempData.lat, tempData.lng)

	def assignNearestCluster(x):
	return clusterTree.query((x[0], x[1]), k=1, distance_upper_bound=radius)[1]

	tempData.clusterNr = tempData._points.map(assignNearestCluster)
	del tempData['_points']

	# combine both datasets
	data = tempData.combine_first(data)
	return data

	#@profile
	def main():
	samples = 1000
	radius = 0.1
	data = getRandomData(samples)
	tree = cKDTree(zip(data.lat, data.lng))
	clusters = clustering(tree, data, radius)
	clusters = clustering2(tree, data, radius)
	clusters['count'] = -1
	print clusters[clusters.clusterNr > -1].groupby("clusterNr").agg({
	'lat' : np.mean,
	'lng': np.mean,
	"count": "count"
	})

	if __name__ == "__main__":
	main()