A small set of functions to generate maximum entropy graph hierarchies for aggregating graphs
# Copyright Gabriel Bassett 2018
# Not licensed for reuse
import copy
import operator
import uuid
import networkx as nx
import logging
import pprint
import simplejson as json
#logging.basicConfig(level=logging.DEBUG)
def maximum_entropy_graph(g, size, weight="weight", max_iter=None, ignore_components=False):
    ''' A method for condensing graphs.

    param g A networkx graph
    param size The number of nodes/clusters desired
    param weight The node attribute to use as the weight
    param max_iter The maximum number of iterations.  If None, iterations will be set
        slightly over the number of edges to ensure the algorithm has an opportunity
        to run to completion.
    param ignore_components If True, size will be increased by the number of components - 1.
        If False and size < # of components, the algorithm will raise a ValueError.
    return a networkx graph with nodes representing clusters and edges between them.
        Cluster nodes will have 'nodes' and 'edges' attributes listing the nodes and
        edges they encompass.  Cluster nodes will also have an attribute named by the
        'weight' argument holding the aggregate weight encompassed by the cluster.
        Edges will have 'edges' attributes representing the edges encompassed.
    '''
    if max_iter is None:
        # The algorithm needs at most one iteration per edge, so this bound ensures completion.
        max_iter = g.number_of_edges() * 1.01 + 1
    if g.is_directed():
        components = nx.components.number_weakly_connected_components(g)
    else:
        components = nx.components.number_connected_components(g)
    if components >= size:
        if not ignore_components:
            raise ValueError("Size must be larger than the number of components in the graph; otherwise the maximum entropy clustering is simply the list of components.")
        else:
            size = components + size - 1
            logging.warning("Size has been increased to {0} to account for the {1} components in the graph.".format(size, components))
    G = copy.deepcopy(g)
    # Delete existing properties so they don't conflict with the ones we create.
    for node in G.nodes_iter():
        w = G.node[node].get(weight, 0)  # default to 0 weight
        G.node[node] = {weight: w, "cluster": False}
    for s, d in G.edges_iter():
        G.edge[s][d].clear()  # clear in place so both directions of an undirected edge stay in sync
    count = 0
    while count <= max_iter:
        count += 1
        logging.debug("Iteration: {0}".format(count))
        # The method below applies only if nodes can be in multiple clusters:
        #   test if enough clusters exist to encompass all nodes in _size_ clusters (a quick test)
        #   test if the existing nodes encompass all nodes in _size_ number of clusters/nodes
        #   if so, return the set of clusters
        if G.number_of_nodes() <= size:
            return G
        # If the end condition is not met:
        else:
            # Give each edge a score of the sum of the scores of its two nodes.
            for s, d in G.edges_iter():
                G.edge[s][d][weight] = G.node[s].get(weight, 0) + G.node[d].get(weight, 0)
            # Remove the lowest scored edge and cluster its source and destination nodes.
            edges = [(s, d, attr[weight]) for s, d, attr in G.edges_iter(data=True)]
            s, d, w = min(edges, key=operator.itemgetter(2))
            logging.debug("Joining nodes {0} and {1} with weight of {2}.".format(s, d, w))
            # Collect the edges incident to the two nodes being merged.
            predecessors = []
            successors = []
            if G.is_directed():
                for pred in G.predecessors(d):
                    if pred not in (d, s):
                        predecessors += get_edges(G, pred, d)
                for pred in G.predecessors(s):
                    if pred not in (d, s):
                        predecessors += get_edges(G, pred, s)
                for suc in G.successors(d):
                    if suc not in (d, s):
                        successors += get_edges(G, d, suc)
                for suc in G.successors(s):
                    if suc not in (d, s):
                        successors += get_edges(G, s, suc)
            else:
                # For undirected graphs, neighbors play the role of successors.
                for neighbor in G.neighbors(d):
                    if neighbor not in (d, s):
                        successors += get_edges(G, d, neighbor)
                for neighbor in G.neighbors(s):
                    if neighbor not in (d, s):
                        successors += get_edges(G, s, neighbor)
            nodes = list()
            edges = list()
            # Collect the nodes, expanding existing clusters rather than nesting them.
            if G.node[s]["cluster"]:
                nodes += G.node[s]["nodes"]
                edges += G.node[s]["edges"]
            else:
                nodes += [(s, G.node[s])]
            if G.node[d]["cluster"]:
                nodes += G.node[d]["nodes"]
                edges += G.node[d]["edges"]
            else:
                nodes += [(d, G.node[d])]
            # Collect edges between the two nodes, including self loops.
            edges += get_edges(G, s, s)
            edges += get_edges(G, s, d)
            edges += get_edges(G, d, d)
            edges += get_edges(G, d, s)
            G.remove_node(d)
            G.remove_node(s)
            #node_name = str(uuid.uuid4())
            node_name = max(G.nodes()) + 1  # assumes integer node labels
            G.add_node(node_name, {"cluster": True, weight: w, "nodes": nodes, "edges": edges})
            # Aggregate and add edges.  Duplicate edges collapse into one; their weights
            # need not be summed because edge weights are re-derived from node weights
            # at the start of each iteration.
            for edge in predecessors:
                if not G.has_edge(edge[0], node_name):
                    G.add_edge(edge[0], node_name, attr_dict=edge[2])
            for edge in successors:
                if not G.has_edge(node_name, edge[1]):
                    G.add_edge(node_name, edge[1], attr_dict=edge[2])
logging.debug("Maximum iterations reached.") | |
print "Maximum iterations reached." | |
return G | |
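
# Illustrative usage sketch (an addition, not part of the original gist): condense a
# small weighted path graph down to three cluster nodes.  Assumes Python 2 and the
# NetworkX 1.x API used throughout this file, plus integer node labels (required by
# the max(G.nodes()) + 1 naming scheme above).  The weights are arbitrary example data.
def _example_maximum_entropy():
    h = nx.path_graph(8)  # nodes 0..7, integer labels
    for n in h.nodes_iter():
        h.node[n]["weight"] = h.degree(n)  # hypothetical weight: node degree
    condensed = maximum_entropy_graph(h, size=3)
    # condensed has <= 3 nodes; cluster nodes carry 'nodes' and 'edges' lists.
    return condensed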
def get_edges(g2, s, d):
    ''' Collect (source, destination, attributes) tuples for the edge(s) from s to d,
    returning an empty list if no such edge exists.
    '''
    edges = list()
    try:
        e = g2.edge[s][d]
        if g2.is_multigraph():
            # Multigraphs store a dict of parallel edges keyed by edge key.
            edges += [(s, d, attr) for attr in e.values()]
        else:  # hopefully this catches everything else
            edges += [(s, d, e)]
    except KeyError:
        pass  # no edge from s to d
    return edges
def serialize_graph(G):
    ''' a method to serialize the graph attributes

    Most graph formats don't handle the list/dict attributes that store the
    aggregated node information, so we serialize each attribute value to a JSON string.

    param G a networkx graph
    return a networkx graph with attributes serialized
    '''
    for s, d, data in G.edges(data=True):
        for k, v in data.iteritems():
            data[k] = json.dumps(v)
        G.edge[s][d] = data
    for node, data in G.nodes(data=True):
        for k, v in data.iteritems():
            data[k] = json.dumps(v)
        G.node[node] = data
    for node in G.nodes():
        G.node[node]['id'] = node
        G.node[node]['label'] = node
    return G
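
# Sketch of writing a condensed graph to disk (an illustrative addition): formats such
# as GraphML reject list/dict attribute values, so the JSON-string serialization above
# makes the graph writable.  The file path is hypothetical.
def _example_write(condensed, path="condensed.graphml"):
    nx.write_graphml(serialize_graph(condensed), path)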
def deserialize_graph(G):
    ''' deserialize a graph serialized with serialize_graph()

    param G a networkx graph
    return a networkx graph with attributes deserialized
    '''
    for s, d, data in G.edges(data=True):
        for k, v in data.iteritems():
            if isinstance(v, basestring):  # only decode JSON-string values
                data[k] = json.loads(v)
        G.edge[s][d] = data
    for node, data in G.nodes(data=True):
        for k, v in data.iteritems():
            if isinstance(v, basestring):  # 'id' and 'label' may be raw (non-JSON) values
                data[k] = json.loads(v)
        G.node[node] = data
    return G
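
# Companion read sketch (illustrative; the path is hypothetical): load a graph written
# by _example_write() above and decode its JSON-string attributes.
def _example_read(path="condensed.graphml"):
    return deserialize_graph(nx.read_graphml(path))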
def color_graph_from_entropy_cluster(G, entropy_graph):
    ''' Set a cluster id from the entropy graph on the nodes of the original graph

    param G a networkx graph
    param entropy_graph a maximum entropy graph based on G
    return G with a 'cluster' attribute added to its nodes
    '''
    for node, data in entropy_graph.nodes(data=True):
        # Unclustered singleton nodes have no 'nodes' attribute; treat them as
        # one-node clusters of themselves.
        cluster_nodes = data.get('nodes', [(node, data)])
        if isinstance(cluster_nodes, basestring):
            cluster_nodes = json.loads(cluster_nodes)
        for n, d in cluster_nodes:
            if G.has_node(n):
                G.node[n]['cluster'] = node
    return G
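
# End-to-end sketch tying the pieces together (an illustrative addition; all names
# here are hypothetical): condense a graph, then push the resulting cluster ids back
# onto the nodes of the original graph.
if __name__ == "__main__":
    original = nx.path_graph(8)
    for n in original.nodes_iter():
        original.node[n]["weight"] = 1  # uniform weights; an arbitrary choice
    entropy = maximum_entropy_graph(original, size=3)  # deep-copies internally
    colored = color_graph_from_entropy_cluster(original, entropy)
    print [(n, d.get("cluster")) for n, d in colored.nodes(data=True)]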