Instantly share code, notes, and snippets.

Embed
What would you like to do?
A small set of functions to generate maximum entropy graph hierarchies for aggregating graphs
# Copyright Gabriel Bassett 2018
# Not licensed for reuse
import copy
import operator
import uuid
import networkx as nx
import logging
import pprint
import simplejson as json
#logging.basicConfig(level=logging.DEBUG)
def maximum_entropy_graph(g, size, weight="weight", max_iter = None, ignore_components=False):
    ''' A method for condensing graphs.

    Repeatedly merges the two endpoints of the lowest-scored edge (score =
    sum of the two endpoint weights) into a single "cluster" node, until only
    `size` nodes remain or `max_iter` iterations have run.

    param g A networkx graph
    param size The number of nodes/clusters desired
    param weight The node attribute to use as the weight
    param max_iter The maximum number of iterations. If None, iterations will be set
        slightly over the number of edges to ensure the algorithm has an opportunity
        to run to completion.
    param ignore_components If True, size will be increased by the number of
        components - 1.  If False and size <= # of components, the algorithm
        will raise a ValueError.
    return a networkx graph with nodes representing clusters and edges between them.
        Cluster nodes will have 'nodes' and 'edges' attributes which will be lists of the
        nodes and edges encompassed. Nodes will also have an attribute of the 'weight'
        argument and value of the aggregate weight encompassed by the cluster. Edges will
        have edge attributes representing the edges encompassed.

    NOTE(review): written against the networkx 1.x API (nodes_iter, G.node,
    G.edge, attr_dict) and Python 2 (print statement at the bottom) -- it will
    not run unmodified on networkx >= 2.0 or Python 3.
    '''
    if max_iter is None:
        # algorithm should run for number of edges iterations at most. This ensures completion
        max_iter = g.number_of_edges() * 1.01 + 1
    # merges only happen along edges, so the result can never have fewer
    # nodes than the number of connected components
    if g.is_directed():
        components = nx.components.number_weakly_connected_components(g)
    else:
        components = nx.components.number_connected_components(g)
    if components >= size:
        if not ignore_components:
            raise ValueError("Size is not larger than the number of components in the graph and so maximum entropy will be the list of components.")
        else:
            size = components + size - 1
            logging.warning("Size has been increased to {0} to account for the {1} components in graph.".format(size, components))
    # work on a deep copy so the caller's graph is left untouched
    G = copy.deepcopy(g)
    # delete properties so they don't conflict with ones we create
    for node in G.nodes_iter():
        w = G.node[node].get(weight, 0) # default to 0 weight
        G.node[node] = {weight: w, "cluster":False}
    for s,d in G.edges_iter():
        G.edge[s][d] = {}
    count = 0
    while count <= max_iter:
        count += 1
        logging.debug("Iteration: {0}".format(count))
        # Below method only if nodes can be in multiple clusters
        # test if enough clusters exist to to encompass all nodes in _size_ clusters ( a quick test)
        # test if the existing nodes encompass all nodes in _size_ number of clusters/nodes
        # if so return set of clusters
        if G.number_of_nodes() <= size:
            return G
        # If end condition not met:
        else:
            # Give each edge a score of the sum of the score of it's two nodes.
            for s,d in G.edges_iter():
                try:
                    G.edge[s][d][weight] = G.node[s].get(weight, 0) + G.node[d].get(weight, 0)
                except:
                    # print "source ", s, ", destination ", d
                    raise
                    #logging.debug("Returning with error creating weight for edge {0}->{1}.".format(s, d))
                    #return G
            # Remove the lowest scored edge and cluster the source and destination nodes
            edges = [(s, d, attr[weight]) for s, d, attr in G.edges_iter(data=True)]
            s, d, w = min(iter(edges), key=operator.itemgetter(2))
            logging.debug("Joining nodes {0} and {1} with weight of {2}.".format(s, d, w))
            # Collect the edges that touch either endpoint so they can be
            # re-attached to the new cluster node after the merge
            predecessors = []
            successors = []
            #try:
            if True:
                if G.is_directed():
                    # inbound edges from outside (s, d) become predecessors of
                    # the cluster; outbound edges become successors
                    for pred in G.predecessors(d):
                        if pred not in (d, s):
                            predecessors += get_edges(G, pred, d)
                    for pred in G.predecessors(s):
                        if pred not in (d, s):
                            predecessors += get_edges(G, pred, s)
                    for suc in G.successors(d):
                        if suc not in (d, s):
                            successors += get_edges(G, d, suc)
                    for suc in G.successors(s):
                        if suc not in (d, s):
                            successors += get_edges(G, s, suc)
                else:
                    # undirected: everything is treated as a successor edge
                    for neighbor in G.neighbors(d):
                        if neighbor not in (d, s):
                            #predecessors += get_edges(G, neighbor, d)
                            ret = get_edges(G, d, neighbor)
                            successors += ret
                            # print "adding successor ", ret
                    for neighbor in G.neighbors(s): # neighbors is the same as successors
                        if neighbor not in (d, s):
                            #predecessors += get_edges(G, neighbor, s)
                            ret = get_edges(G, s, neighbor)
                            successors += ret
                            # print "adding successor ", ret
            #except Exception as e:
            ##print "s: ", s, "d: ", d, "w: ", w, "count: ", count
            #print e
            #return G
            #raise e
            nodes = list()
            edges = list()
            # collect the nodes (unpack existing clusters so membership stays flat)
            if G.node[s]["cluster"]:
                nodes += G.node[s]["nodes"]
                edges += G.node[s]["edges"]
            else:
                nodes += [(s, G.node[s])]
            if G.node[d]["cluster"]:
                nodes += G.node[d]["nodes"]
                edges += G.node[d]["edges"]
            else:
                nodes += [(d, G.node[d])]
            #collect edges between nodes: self loops plus both directions between s and d
            edges += get_edges(G, s, s)
            edges += get_edges(G, s, d)
            edges += get_edges(G, d, d)
            edges += get_edges(G, d, s)
            G.remove_node(d)
            G.remove_node(s)
            #node_name = str(uuid.uuid4())
            # NOTE(review): assumes all node names are integers -- max() + 1
            # raises TypeError on mixed/string node names. The commented-out
            # uuid line above was a name-agnostic alternative.
            node_name = max(G.nodes()) + 1
            G.add_node(node_name, {"cluster":True, weight: w, "nodes":nodes, "edges": edges})
            # with open("/Users/lists/tmp/successors.json", 'w') as filehandle: # DEBUG
            #     filehandle.write(json.dumps(successors, indent=4, separators=(',', ': ')))
            # aggregate and add edges. (duplicate edges will be aggregated)
            for edge in predecessors:
                if G.has_edge(edge[0], node_name):
                    #G.edge[edge[0]][node_name][weight] += edge[2][weight]
                    pass # edges only have derived weights.
                else:
                    G.add_edge(edge[0], node_name, attr_dict=edge[2])
                    # G.edge[edge[0]][node_name] = edge[2]
                    # print "Adding edge ", edge[0], " - ", node_name
            for edge in successors:
                if G.has_edge(node_name, edge[1]):
                    #G.edge[node_name][edge[1]][weight] += edge[2][weight]
                    pass
                else:
                    G.add_edge(node_name, edge[1], attr_dict=edge[2])
                    # G.edge[node_name][edge[1]] = edge[2]
                    #print "Adding edge ", node_name, " - ", edge[1]
    logging.debug("Maximum iterations reached.")
    print "Maximum iterations reached."  # NOTE(review): Python 2 print statement
    return G
def get_edges(g2, s, d):
    '''Return the s->d edge(s) of g2 as a list of (s, d, attrs) tuples.

    For a multigraph, one tuple is returned per parallel edge; otherwise at
    most one tuple. A missing edge (or missing node) yields an empty list
    instead of raising.

    param g2 a networkx 1.x style graph (has .edge and .is_multigraph())
    param s source node
    param d destination node
    return list of (s, d, attribute-dict) tuples, possibly empty
    '''
    try:
        attrs = g2.edge[s][d]
    except KeyError:
        # no such edge -- callers treat this as "nothing to collect"
        return []
    if g2.is_multigraph():
        # parallel edges live under per-key attribute dicts
        return [(s, d, a) for a in attrs.values()]
    # plain graph: a single attribute dict for the edge
    return [(s, d, attrs)]
def serialize_graph(G):
    ''' a method to serialize the graph

    Most graph formats don't do well with the json attributes that store
    the aggregated node information so we serialize it.

    param G a networkx graph
    return the same networkx graph with attribute values JSON-encoded
        (the graph is modified in place)
    '''
    # JSON-encode every edge attribute value so exporters that only handle
    # flat string attributes can cope with the nested cluster data.
    for s, d, data in G.edges(data=True):
        # .items() works on both Python 2 and 3; .iteritems() is Python-2-only
        for k, v in data.items():
            data[k] = json.dumps(v)
        # data is normally the live attribute dict; re-assign for safety
        G.edge[s][d] = data
    for node, data in G.nodes(data=True):
        for k, v in data.items():
            data[k] = json.dumps(v)
        G.node[node] = data
    # add plain (un-encoded) id/label attributes for graph viewers;
    # done in a separate pass so they are not JSON-encoded above
    for node in G.nodes():
        G.node[node]['id'] = node
        G.node[node]['label'] = node
    return G
def deserialize_graph(G):
    ''' deserialize a graph serialized with serialize_graph()

    param G a networkx graph
    return the same networkx graph with attribute values JSON-decoded
        (the graph is modified in place)

    NOTE(review): the 'id'/'label' node attributes added by serialize_graph()
    are stored raw (not JSON-encoded), so json.loads() on them can fail unless
    the graph round-tripped through a file format that stringified them --
    confirm against the intended round-trip before relying on this.
    '''
    for s, d, data in G.edges(data=True):
        # .items() works on both Python 2 and 3; .iteritems() is Python-2-only
        for k, v in data.items():
            data[k] = json.loads(v)
        # data is normally the live attribute dict; re-assign for safety
        G.edge[s][d] = data
    for node, data in G.nodes(data=True):
        for k, v in data.items():
            data[k] = json.loads(v)
        G.node[node] = data
    return G
def color_graph_from_entropy_cluster(G, entropy_graph):
    ''' Set a cluster id from the entropy graph on the nodes of the original graph

    param G networkx graph
    param entropy_graph maximum entropy graph based on G
    return G with 'cluster' attribute added to nodes (modified in place)
    '''
    for cluster_id, data in entropy_graph.nodes(data=True):
        cluster_nodes = data['nodes']
        # the 'nodes' attribute may still be JSON-encoded (see serialize_graph)
        if isinstance(cluster_nodes, str):  # was: type(cluster_nodes) == str
            cluster_nodes = json.loads(cluster_nodes)
        for n, _ in cluster_nodes:
            # skip members the entropy graph knows about but G does not have
            if G.has_node(n):
                G.node[n]['cluster'] = cluster_id
    return G
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment