# clrnd/clusters.py

## clusters.py
# coding: utf-8
import csv
import json
import numpy as np


def r_tree(points, K):
    """Label 2-D points with Sort-Tile-Recursive (STR) style groups.

    The points are sorted by x and cut into ``int(sqrt(K))`` slices; each
    slice is then sorted by y and cut into the same number of tiles.  Tiles
    are numbered in the order they are produced, so a tile that ends up
    empty (fewer points than tiles) still consumes a group number.

    http://stats.stackexchange.com/a/40785
    http://www.dtic.mil/dtic/tr/fulltext/u2/a324493.pdf (pag. 8)

    Args:
        points: array-like of shape (n, 2), one (x, y) pair per row.
        K: requested number of groups.  ``int(sqrt(K)) ** 2`` tiles are
            numbered, so K is expected to be a perfect square.

    Returns:
        Integer numpy array of length n; entry i is the group number of
        ``points[i]``.  Empty input yields an empty array.
    """
    # part size
    size = int(np.sqrt(K))

    points = np.asarray(points)
    # np.int was removed from NumPy; the builtin int is the same dtype.
    labels = np.zeros(points.shape[0], dtype=int)
    if points.shape[0] == 0:
        return labels

    # Sort and split row *indices* rather than the rows themselves.  Looking
    # each row up again by value (np.where(points == row)) matched any row
    # that merely shared one coordinate, mislabelled duplicates, and cost a
    # full scan per point.  A stable sort keeps ties in input order.
    by_x = np.argsort(points[:, 0], kind='mergesort')

    group = 0
    for x_slice in np.array_split(by_x, size):
        # order this slice's rows by y, still as indices into `points`
        by_y = x_slice[np.argsort(points[x_slice, 1], kind='mergesort')]
        for tile in np.array_split(by_y, size):
            labels[tile] = group
            group += 1

    return labels


# The csv module needs binary-mode files on Python 2 but text-mode files
# opened with newline='' on Python 3; `str is bytes` only holds on Python 2.
if str is bytes:
    csv_read_args, csv_write_args = {'mode': 'rb'}, {'mode': 'wb'}
else:
    csv_read_args = {'mode': 'r', 'newline': ''}
    csv_write_args = {'mode': 'w', 'newline': ''}

with open('data.csv', **csv_read_args) as f:
    # id,lon,lat
    # XD.1.1,-58.09203,-34.9823
    # ...
    data = list(csv.DictReader(f))

# points = [[x1, y1], [x2, y2], ...]
# Built from a real list because Python 3's map() is lazy and np.array()
# would wrap it in a 0-d object array; the reshape keeps an empty input
# two-dimensional.
points = np.array(
    [(float(e['lon']), float(e['lat'])) for e in data], dtype=float
).reshape(-1, 2)

# labels = [n1, n1, n2, n1, ...] where each n is the cluster number for
# the row at that index
labels = r_tree(points, 81)
print(labels)

# associate each id to its cluster number
clusters = [(row['id'], label) for row, label in zip(data, labels.tolist())]

with open('out.json', 'w') as f:
    json.dump({'data': clusters}, f)

with open('clusters.csv', **csv_write_args) as f:
    w = csv.writer(f)
    w.writerow(['id', 'cluster'])
    w.writerows(clusters)
	# coding: utf-8
	import csv
	import json
	import numpy as np


	def r_tree(points, K):
		"""Label 2-D points with Sort-Tile-Recursive (STR) style groups.

		The points are sorted by x and cut into ``int(sqrt(K))`` slices; each
		slice is then sorted by y and cut into the same number of tiles.
		Tiles are numbered in the order they are produced, so a tile that
		ends up empty (fewer points than tiles) still consumes a group number.

		http://stats.stackexchange.com/a/40785
		http://www.dtic.mil/dtic/tr/fulltext/u2/a324493.pdf (pag. 8)

		Args:
			points: array-like of shape (n, 2), one (x, y) pair per row.
			K: requested number of groups.  ``int(sqrt(K)) ** 2`` tiles are
				numbered, so K is expected to be a perfect square.

		Returns:
			Integer numpy array of length n; entry i is the group number of
			``points[i]``.  Empty input yields an empty array.
		"""
		# part size
		size = int(np.sqrt(K))

		points = np.asarray(points)
		# np.int was removed from NumPy; the builtin int is the same dtype.
		labels = np.zeros(points.shape[0], dtype=int)
		if points.shape[0] == 0:
			return labels

		# Sort and split row *indices* rather than the rows themselves.
		# Looking each row up again by value (np.where(points == row))
		# matched any row that merely shared one coordinate, mislabelled
		# duplicates, and cost a full scan per point.  A stable sort keeps
		# ties in input order.
		by_x = np.argsort(points[:, 0], kind='mergesort')

		group = 0
		for x_slice in np.array_split(by_x, size):
			# order this slice's rows by y, still as indices into `points`
			by_y = x_slice[np.argsort(points[x_slice, 1], kind='mergesort')]
			for tile in np.array_split(by_y, size):
				labels[tile] = group
				group += 1

		return labels


	# The csv module needs binary-mode files on Python 2 but text-mode files
	# opened with newline='' on Python 3; `str is bytes` only holds on Python 2.
	if str is bytes:
		csv_read_args, csv_write_args = {'mode': 'rb'}, {'mode': 'wb'}
	else:
		csv_read_args = {'mode': 'r', 'newline': ''}
		csv_write_args = {'mode': 'w', 'newline': ''}

	with open('data.csv', **csv_read_args) as f:
		# id,lon,lat
		# XD.1.1,-58.09203,-34.9823
		# ...
		data = list(csv.DictReader(f))

	# points = [[x1, y1], [x2, y2], ...]
	# Built from a real list because Python 3's map() is lazy and np.array()
	# would wrap it in a 0-d object array; the reshape keeps an empty input
	# two-dimensional.
	points = np.array(
		[(float(e['lon']), float(e['lat'])) for e in data], dtype=float
	).reshape(-1, 2)

	# labels = [n1, n1, n2, n1, ...] where each n is the cluster number for
	# the row at that index
	labels = r_tree(points, 81)
	print(labels)

	# associate each id to its cluster number
	clusters = [(row['id'], label) for row, label in zip(data, labels.tolist())]

	with open('out.json', 'w') as f:
		json.dump({'data': clusters}, f)

	with open('clusters.csv', **csv_write_args) as f:
		w = csv.writer(f)
		w.writerow(['id', 'cluster'])
		w.writerows(clusters)