Instantly share code, notes, and snippets.

Embed
What would you like to do?
A small set of functions to generate maximum entropy graph hierarchies for aggregating graphs
# Copyright Gabriel Bassett 2018
# Not licensed for reuse
import copy
import operator
import uuid
import networkx as nx
import logging
import pprint
import simplejson as json
#logging.basicConfig(level=logging.DEBUG)
def maximum_entropy_graph(g, size, weight="weight", max_iter = None, ignore_components=False):
    ''' A method for condensing graphs.

    Repeatedly merges the two endpoints of the lowest-scored edge (score =
    sum of the two endpoint weights) into a single "cluster" node, until only
    `size` nodes remain or `max_iter` iterations have run.

    param g A networkx graph
    param size The number of nodes/clusters desired
    param weight The node attribute to use as the weight
    param max_iter The maximum number of iterations. If None, iterations will be set
        slightly over the number of edges to ensure the algorithm has an opportunity
        to run to completion.
    param ignore_components If True, size will be increased by the number of
        components - 1.  If False and size <= # of components, the algorithm
        will raise a ValueError.
    return a networkx graph with nodes representing clusters and edges between them.
        Cluster nodes will have 'nodes' and 'edges' attributes which will be lists of the
        nodes and edges encompassed. Nodes will also have an attribute of the 'weight'
        argument and value of the aggregate weight encompassed by the cluster. Edges will
        have edge attributes representing the edges encompassed.

    NOTE(review): written against the networkx 1.x API (nodes_iter, G.node,
    G.edge, attr_dict) and Python 2 (print statement at the bottom) -- it will
    not run unmodified on networkx >= 2.0 or Python 3.
    '''
    if max_iter is None:
        # algorithm should run for number of edges iterations at most. This ensures completion
        max_iter = g.number_of_edges() * 1.01 + 1
    # merges only happen along edges, so the result can never have fewer
    # nodes than the number of connected components
    if g.is_directed():
        components = nx.components.number_weakly_connected_components(g)
    else:
        components = nx.components.number_connected_components(g)
    if components >= size:
        if not ignore_components:
            raise ValueError("Size is not larger than the number of components in the graph and so maximum entropy will be the list of components.")
        else:
            size = components + size - 1
            logging.warning("Size has been increased to {0} to account for the {1} components in graph.".format(size, components))
    # work on a deep copy so the caller's graph is left untouched
    G = copy.deepcopy(g)
    # delete properties so they don't conflict with ones we create
    for node in G.nodes_iter():
        w = G.node[node].get(weight, 0) # default to 0 weight
        G.node[node] = {weight: w, "cluster":False}
    for s,d in G.edges_iter():
        G.edge[s][d] = {}
    count = 0
    while count <= max_iter:
        count += 1
        logging.debug("Iteration: {0}".format(count))
        # Below method only if nodes can be in multiple clusters
        # test if enough clusters exist to to encompass all nodes in _size_ clusters ( a quick test)
        # test if the existing nodes encompass all nodes in _size_ number of clusters/nodes
        # if so return set of clusters
        if G.number_of_nodes() <= size:
            return G
        # If end condition not met:
        else:
            # Give each edge a score of the sum of the score of it's two nodes.
            for s,d in G.edges_iter():
                try:
                    G.edge[s][d][weight] = G.node[s].get(weight, 0) + G.node[d].get(weight, 0)
                except:
                    # print "source ", s, ", destination ", d
                    raise
                    #logging.debug("Returning with error creating weight for edge {0}->{1}.".format(s, d))
                    #return G
            # Remove the lowest scored edge and cluster the source and destination nodes
            edges = [(s, d, attr[weight]) for s, d, attr in G.edges_iter(data=True)]
            s, d, w = min(iter(edges), key=operator.itemgetter(2))
            logging.debug("Joining nodes {0} and {1} with weight of {2}.".format(s, d, w))
            # Collect the edges that touch either endpoint so they can be
            # re-attached to the new cluster node after the merge
            predecessors = []
            successors = []
            #try:
            if True:
                if G.is_directed():
                    # inbound edges from outside (s, d) become predecessors of
                    # the cluster; outbound edges become successors
                    for pred in G.predecessors(d):
                        if pred not in (d, s):
                            predecessors += get_edges(G, pred, d)
                    for pred in G.predecessors(s):
                        if pred not in (d, s):
                            predecessors += get_edges(G, pred, s)
                    for suc in G.successors(d):
                        if suc not in (d, s):
                            successors += get_edges(G, d, suc)
                    for suc in G.successors(s):
                        if suc not in (d, s):
                            successors += get_edges(G, s, suc)
                else:
                    # undirected: everything is treated as a successor edge
                    for neighbor in G.neighbors(d):
                        if neighbor not in (d, s):
                            #predecessors += get_edges(G, neighbor, d)
                            ret = get_edges(G, d, neighbor)
                            successors += ret
                            # print "adding successor ", ret
                    for neighbor in G.neighbors(s): # neighbors is the same as successors
                        if neighbor not in (d, s):
                            #predecessors += get_edges(G, neighbor, s)
                            ret = get_edges(G, s, neighbor)
                            successors += ret
                            # print "adding successor ", ret
            #except Exception as e:
            ##print "s: ", s, "d: ", d, "w: ", w, "count: ", count
            #print e
            #return G
            #raise e
            nodes = list()
            edges = list()
            # collect the nodes (unpack existing clusters so membership stays flat)
            if G.node[s]["cluster"]:
                nodes += G.node[s]["nodes"]
                edges += G.node[s]["edges"]
            else:
                nodes += [(s, G.node[s])]
            if G.node[d]["cluster"]:
                nodes += G.node[d]["nodes"]
                edges += G.node[d]["edges"]
            else:
                nodes += [(d, G.node[d])]
            #collect edges between nodes: self loops plus both directions between s and d
            edges += get_edges(G, s, s)
            edges += get_edges(G, s, d)
            edges += get_edges(G, d, d)
            edges += get_edges(G, d, s)
            G.remove_node(d)
            G.remove_node(s)
            #node_name = str(uuid.uuid4())
            # NOTE(review): assumes all node names are integers -- max() + 1
            # raises TypeError on mixed/string node names. The commented-out
            # uuid line above was a name-agnostic alternative.
            node_name = max(G.nodes()) + 1
            G.add_node(node_name, {"cluster":True, weight: w, "nodes":nodes, "edges": edges})
            # with open("/Users/lists/tmp/successors.json", 'w') as filehandle: # DEBUG
            #     filehandle.write(json.dumps(successors, indent=4, separators=(',', ': ')))
            # aggregate and add edges. (duplicate edges will be aggregated)
            for edge in predecessors:
                if G.has_edge(edge[0], node_name):
                    #G.edge[edge[0]][node_name][weight] += edge[2][weight]
                    pass # edges only have derived weights.
                else:
                    G.add_edge(edge[0], node_name, attr_dict=edge[2])
                    # G.edge[edge[0]][node_name] = edge[2]
                    # print "Adding edge ", edge[0], " - ", node_name
            for edge in successors:
                if G.has_edge(node_name, edge[1]):
                    #G.edge[node_name][edge[1]][weight] += edge[2][weight]
                    pass
                else:
                    G.add_edge(node_name, edge[1], attr_dict=edge[2])
                    # G.edge[node_name][edge[1]] = edge[2]
                    #print "Adding edge ", node_name, " - ", edge[1]
    logging.debug("Maximum iterations reached.")
    print "Maximum iterations reached."  # NOTE(review): Python 2 print statement
    return G
def get_edges(g2, s, d):
    '''Return the s->d edge(s) of g2 as a list of (s, d, attrs) tuples.

    For a multigraph, one tuple is returned per parallel edge; otherwise at
    most one tuple. A missing edge (or missing node) yields an empty list
    instead of raising.

    param g2 a networkx 1.x style graph (has .edge and .is_multigraph())
    param s source node
    param d destination node
    return list of (s, d, attribute-dict) tuples, possibly empty
    '''
    try:
        attrs = g2.edge[s][d]
    except KeyError:
        # no such edge -- callers treat this as "nothing to collect"
        return []
    if g2.is_multigraph():
        # parallel edges live under per-key attribute dicts
        return [(s, d, a) for a in attrs.values()]
    # plain graph: a single attribute dict for the edge
    return [(s, d, attrs)]
def serialize_graph(G):
    ''' a method to serialize the graph

    Most graph formats don't do well with the json attributes that store
    the aggregated node information so we serialize it.

    param G a networkx graph
    return the same networkx graph with attribute values JSON-encoded
        (the graph is modified in place)
    '''
    # JSON-encode every edge attribute value so exporters that only handle
    # flat string attributes can cope with the nested cluster data.
    for s, d, data in G.edges(data=True):
        # .items() works on both Python 2 and 3; .iteritems() is Python-2-only
        for k, v in data.items():
            data[k] = json.dumps(v)
        # data is normally the live attribute dict; re-assign for safety
        G.edge[s][d] = data
    for node, data in G.nodes(data=True):
        for k, v in data.items():
            data[k] = json.dumps(v)
        G.node[node] = data
    # add plain (un-encoded) id/label attributes for graph viewers;
    # done in a separate pass so they are not JSON-encoded above
    for node in G.nodes():
        G.node[node]['id'] = node
        G.node[node]['label'] = node
    return G
def deserialize_graph(G):
    ''' deserialize a graph serialized with serialize_graph()

    param G a networkx graph
    return the same networkx graph with attribute values JSON-decoded
        (the graph is modified in place)

    NOTE(review): the 'id'/'label' node attributes added by serialize_graph()
    are stored raw (not JSON-encoded), so json.loads() on them can fail unless
    the graph round-tripped through a file format that stringified them --
    confirm against the intended round-trip before relying on this.
    '''
    for s, d, data in G.edges(data=True):
        # .items() works on both Python 2 and 3; .iteritems() is Python-2-only
        for k, v in data.items():
            data[k] = json.loads(v)
        # data is normally the live attribute dict; re-assign for safety
        G.edge[s][d] = data
    for node, data in G.nodes(data=True):
        for k, v in data.items():
            data[k] = json.loads(v)
        G.node[node] = data
    return G
def color_graph_from_entropy_cluster(G, entropy_graph):
    ''' Set a cluster id from the entropy graph on the nodes of the original graph

    param G networkx graph
    param entropy_graph maximum entropy graph based on G
    return G with 'cluster' attribute added to nodes (modified in place)
    '''
    for cluster_id, data in entropy_graph.nodes(data=True):
        cluster_nodes = data['nodes']
        # the 'nodes' attribute may still be JSON-encoded (see serialize_graph)
        if isinstance(cluster_nodes, str):  # was: type(cluster_nodes) == str
            cluster_nodes = json.loads(cluster_nodes)
        for n, _ in cluster_nodes:
            # skip members the entropy graph knows about but G does not have
            if G.has_node(n):
                G.node[n]['cluster'] = cluster_id
    return G
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment