
@blogle
Created December 4, 2014 23:02
bvka_nn.py
import numpy as np
import networkx as nx
import heapq
from collections import defaultdict


def distance(u, v):
    """Squared Euclidean distance between points u and v."""
    return np.sum((u - v)**2)

class KDTree(object):

    def __init__(self, data, index=None, depth=0):
        """
        Creates a recursive space-partitioning data structure where each node
        splits its axis at the median of that axis. Similar to a BST, it
        provides O(n log n) creation and O(log n) queries.

        Args:
            data (np.array): dataset with shape (n, k): n observations, k dimensions.

        Optional:
            index (np.array): index corresponding to each row of data;
                if left empty, the data is zero-indexed.
            depth (int): determines the axis on which to first
                partition, e.g. 0 -> x, 1 -> y, 2 -> z.

        Notes:
            http://en.wikipedia.org/wiki/K-d_tree
            http://en.wikipedia.org/wiki/K-d_tree#mediaviewer/File:KDTree-animation.gif
        """
        # Build the index at the top level
        if index is None:
            index = np.arange(data.shape[0])

        self.n = None
        self.k = None
        self.idx = None
        self.node = None
        self.axis = None
        self.left = None
        self.right = None
        self.children = None

        self._build(data, index, depth)
    def _build(self, data, index, depth):
        """Recursively builds the child nodes of the KDTree."""
        # If there is data to partition, create nodes
        if data[index].size:
            # Store the dimensions of the data and the axis to partition on
            self.n, self.k = data[index].shape
            self.axis = (self.k + depth) % self.k

            # Indices of all nodes beneath this node
            self.children = index

            # Sort the data on the current axis and find the median
            # element on which to partition
            idx_data = np.column_stack((data[index], index))
            sort_ax = idx_data[np.argsort(idx_data[:, self.axis]), -1].astype(int)
            partition = sort_ax.size // 2

            # Node index and data
            self.idx = sort_ax[partition]
            self.node = data[self.idx]

            # Build the branches, partitioning on the next axis
            self.left = KDTree(data, sort_ax[:partition], depth + 1)
            self.right = KDTree(data, sort_ax[partition + 1:], depth + 1)
    def near_branch(self, point):
        """Returns the branch nearer to the input point."""
        if point[self.axis] < self.node[self.axis]:
            return self.left
        return self.right

    def far_branch(self, point):
        """Returns the branch farther from the input point."""
        if self.near_branch(point) == self.left:
            return self.right
        return self.left

    def orthogonal_dist(self, point):
        """Computes the distance from a point to the splitting hyperplane."""
        orth_point = np.copy(point)
        orth_point[self.axis] = self.node[self.axis]
        return distance(point, orth_point)
    def query(self, point, best=None):
        """Find the nearest neighbor of point in the KDTree."""
        # Dead end: backtrack up the tree
        if self.node is None:
            return best

        # Initialize best
        if best is None:
            best = (self.idx, self.node)

        # Check if the current node is closer than best
        if distance(self.node, point) < distance(best[1], point):
            best = (self.idx, self.node)

        # Continue traversing the tree
        best = self.near_branch(point).query(point, best)

        # Traverse the away branch if the orthogonal distance is less than best
        if self.orthogonal_dist(point) < distance(best[1], point):
            best = self.far_branch(point).query(point, best)

        return best
    def query_subset(self, point, subset):
        """Find the nearest neighbor of point among the given subset of indices."""
        subset_vec = np.zeros(self.n)
        subset_vec[subset] = 1
        return self._query_subset(point, subset_vec, None)

    def _query_subset(self, point, subset, best=None):
        """Recursively implements constrained nearest neighbor search."""
        # Dead end: backtrack up the tree
        if self.node is None:
            return best

        # Indicator vectors for this node and the subtree beneath it
        idx_vec = np.zeros_like(subset)
        child_vec = np.zeros_like(subset)
        idx_vec[self.idx] = child_vec[self.children] = 1

        # If this node is in the subset, try to update best
        if np.dot(idx_vec, subset) != 0:
            # Update if best is unset or the current node is closer
            if best is None or distance(self.node, point) < distance(best[1], point):
                best = (self.idx, self.node)

        near = self.near_branch(point)
        far = self.far_branch(point)

        # Search the near branch if this subtree intersects the queried subset,
        # otherwise move to the away branch
        if np.dot(child_vec, subset) > 0:
            best = near._query_subset(point, subset, best)
        else:
            best = far._query_subset(point, subset, best)

        # Validate best by ensuring a closer point doesn't exist just beyond the
        # partition; if best has yet to be found, also search the far branch
        if best is None or self.orthogonal_dist(point) < distance(best[1], point):
            best = far._query_subset(point, subset, best)

        return best
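
# A minimal usage sketch of the KDTree interface above: build a tree on random
# 2-D points and compare query / query_subset against a brute-force scan.
# The helper name `_kdtree_example` is illustrative only and is not called
# anywhere else in this file.
def _kdtree_example():
    points = np.random.rand(100, 2)
    tree = KDTree(points)
    target = np.random.rand(2)

    # Unconstrained nearest neighbor
    idx, node = tree.query(target)
    assert idx == np.argmin([distance(p, target) for p in points])

    # Nearest neighbor restricted to a subset of row indices
    subset = [0, 5, 10, 42]
    sub_idx, sub_node = tree.query_subset(target, subset)
    assert sub_idx == min(subset, key=lambda i: distance(points[i], target))
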
class PriorityQueue(object):

    def __init__(self):
        """
        Queue implementing highest-priority-in, first-out.

        Note:
            Priority is cost based, therefore smaller values are prioritized
            over larger values.
        """
        self._queue = []
        self._index = 0

    def push(self, item, priority):
        """
        Push an item into the queue.

        Args:
            item (obj): Item to be stored in the queue
            priority (Num): Priority with which the item will be retrieved from the queue
        """
        heapq.heappush(self._queue, (priority, self._index, item))
        self._index += 1

    def pop(self):
        """
        Removes the highest-priority item from the queue.

        Returns:
            obj: item with the highest priority
        """
        return heapq.heappop(self._queue)[-1]

    def merge(self, other):
        """
        Given another queue, consumes each item in it
        and pushes the item and its priority into its own queue.

        Args:
            other (PriorityQueue): Queue to be merged
        """
        while other._queue:
            priority, i, item = heapq.heappop(other._queue)
            self.push(item, priority)

    def top(self):
        """
        Allows a peek at the top item in the queue without removing it.

        Returns:
            obj: the top item if the queue is not empty, otherwise None
        """
        try:
            return self._queue[0][-1]
        except IndexError:
            return None
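
# A small usage sketch of PriorityQueue: lower costs come out first, and
# merge() drains another queue into this one. The helper name
# `_priority_queue_example` is illustrative only.
def _priority_queue_example():
    a, b = PriorityQueue(), PriorityQueue()
    a.push('far edge', 9.0)
    a.push('near edge', 1.0)
    b.push('middle edge', 4.0)

    a.merge(b)                      # b is consumed into a
    assert a.top() == 'near edge'   # peek without removing
    assert a.pop() == 'near edge'   # smallest cost first
    assert a.pop() == 'middle edge'
    assert b.top() is None          # b is now empty
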
def bvka_mst_edges(G, assume_connected=False, pos='coords'):
    """Computes the edge set of a Euclidean minimum spanning tree of G with a
    Boruvka-style algorithm driven by KD-tree nearest neighbor queries."""
    V = set(G.nodes())
    pos = np.vstack(list(nx.get_node_attributes(G, pos).values()))
    kdtree = KDTree(pos)
    subgraphs = nx.utils.UnionFind()
    # One priority queue of candidate edges per component
    queues = defaultdict(PriorityQueue)

    # Seed each component's queue with the edge to its node's nearest neighbor
    for v in V:
        # TODO: restrict this further to connected edges
        vm, _ = kdtree.query_subset(pos[v], list(V - {v}))
        dm = distance(pos[v], pos[vm])
        root = subgraphs[v]
        queues[root].push((v, vm), dm)

    Et = []
    while len(Et) != len(V) - 1:
        Ep = PriorityQueue()
        for C in set(map(subgraphs.__getitem__, subgraphs.parents.values())):
            (v, vm) = queues[C].top()
            component_set = [child for child in list(subgraphs.parents)
                             if subgraphs[child] == C]
            disjoint_nodes = list(V - set(component_set))

            # Discard stale candidates whose endpoint has been absorbed into C,
            # replacing each with that node's nearest neighbor outside C
            while vm in component_set:
                queues[C].pop()
                um, _ = kdtree.query_subset(pos[v], disjoint_nodes)
                dm = distance(pos[v], pos[um])
                queues[C].push((v, um), dm)
                (v, vm) = queues[C].top()

            dm = distance(pos[v], pos[vm])
            Ep.push((v, vm, dm), dm)

        while Ep._queue:
            (um, vm, dm) = Ep.pop()
            component_i, component_j = subgraphs[um], subgraphs[vm]
            if component_i != component_j:
                # Add the edge and merge the component queues
                Et += [(um, vm)]
                subgraphs.union(um, vm)
                if component_i == subgraphs[um]:
                    major, minor = component_i, component_j
                else:
                    minor, major = component_i, component_j
                queues[major].merge(queues[minor])
                del queues[minor]

    return Et
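
# A usage sketch of bvka_mst_edges. The function expects nodes labeled 0..n-1,
# each carrying a positional attribute (default key 'coords'), so that pos[v]
# lines up with the node label; the random point cloud and graph below are
# illustrative only.
if __name__ == '__main__':
    rng = np.random.RandomState(0)
    points = rng.rand(50, 2)

    # Euclidean MST of a point cloud, modeled as the complete graph on its nodes
    G = nx.complete_graph(len(points))
    for v, xy in enumerate(points):
        G.nodes[v]['coords'] = xy

    mst_edges = bvka_mst_edges(G)
    print('%d nodes -> %d spanning tree edges' % (len(points), len(mst_edges)))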
@patrafter1999

Hi Blogle,
It's awesome work. Much cleaner than scipy.spatial.KDTree. I haven't tested this code yet, but hopefully it's better than the scipy version, as you showed in the comparison graph. I'm trying to modify your code a little so that I can remove some of the nodes as I wish. Do you have a license on this code?

Cheers,
Sean

@blogle
Author

blogle commented Jul 8, 2020

@patrafter1999
5 years later I am stumbling across this message :(
To you and anyone else coming across this gist, please feel free to use the above code as you wish
