Skip to content

Instantly share code, notes, and snippets.

@aanastasiou
Created April 10, 2020 18:26
Show Gist options
  • Save aanastasiou/57b0e2dc1f9ce12c4610412a817d7635 to your computer and use it in GitHub Desktop.
Save aanastasiou/57b0e2dc1f9ce12c4610412a817d7635 to your computer and use it in GitHub Desktop.
Graphml to Networkx Conversion

Alternative Graphml reader based on Networkx

The code in main.py is an alternative "parser" for instantiating networkx graphs from graphml files.

At the moment it reads in simple graphml files but it can be extended to include files with complex entities attached to the graph, node and edge levels (such as files generated by yEd) with the same technique.

Detailed documentation in main.py comments.

Tested on:

  1. Airlines from: https://github.com/gephi/gephi/wiki/Datasets
  2. yWorks graphml example from: http://docs.yworks.com/yfilessilverlight/developers-guide/reading_writing.html
"""
Reads a Graphml file and instantiates the right type of Networkx graph.
Notes:
1. This is still work in progress. At its current state, the code will comfortably read "simple" graphml files.
2. The next step is to enable the routine to selectively read in (graph, node, edge) level data based on the
namespace used.
3. This will probably be implemented with named tuples and it should support at least simple reads (if not full
complex writes too).
:author:Athansios Anastasiou
:date: April 2020
"""
import sys
import lxml.etree
import networkx
def parse_graphml_boolean(text):
    """
    Converts the text of a graphml ``boolean`` data item to a native bool.

    The builtin ``bool`` cannot be used for this because any non-empty string
    (including "false") is truthy.

    :param text: The text content of the data item.
    :returns: True for "true" / "1", False for "false" / "0" (case insensitive, surrounding whitespace ignored).
    :raises ValueError: If the text is none of the above.
    """
    normalised = text.strip().lower()
    if normalised in ("true", "1"):
        return True
    if normalised in ("false", "0"):
        return False
    raise ValueError(f"'{text}' is not a valid graphml boolean")


# This maps the attr.type of graphml to the appropriate native data type
data_type_conversion_map = {"boolean": parse_graphml_boolean,
                            "int": int,
                            "long": int,
                            "float": float,
                            "double": float,
                            "string": str}


def convert_graphml_value(text, attr_type):
    """
    Converts the raw text of a data item to the native type named by attr_type.

    :param text: The text of the data item (None for an empty element, treated as an empty string).
    :param attr_type: The ``attr.type`` of the key. Types missing from the conversion map are kept as strings.
    :returns: The converted value.
    :raises ValueError: If the text cannot be converted to the requested type.
    """
    converter = data_type_conversion_map.get(attr_type, str)
    return converter("" if text is None else text)


def read_graphml_data(element, keys, glns, nsmap):
    """
    Collects the typed attribute values attached to one graph, node or edge element.

    A ``<data>`` child is matched to its ``<key>`` through the key's ``id``. If
    the element carries no data item for a key, the ``<default>`` of the key is
    used instead and if there is no default either, the attribute is left out.
    Keys without an ``attr.name`` are skipped.

    :param element: The element whose ``<data>`` children should be read.
    :param keys: The ``<key>`` elements that apply to this kind of element.
    :param glns: The prefix bound to the graphml namespace in nsmap.
    :param nsmap: Mapping of namespace prefixes to namespace URIs.
    :returns: dict<attr.name>:<converted value>
    """
    # Index the data items once. If a key is repeated, the first item wins.
    raw_items = {}
    for a_data_item in element.findall(f"{glns}:data", namespaces=nsmap):
        raw_items.setdefault(a_data_item.get("key"), a_data_item.text)
    values = {}
    for a_key in keys:
        attr_name = a_key.get("attr.name")
        if attr_name is None:
            # Nothing to name the value with, it cannot become a simple attribute
            continue
        key_id = a_key.get("id")
        if key_id in raw_items:
            raw_text = raw_items[key_id]
        else:
            default_item = a_key.find(f"{glns}:default", namespaces=nsmap)
            if default_item is None:
                continue
            raw_text = default_item.text
        # A key that does not state its attr.type is read as a string
        values[attr_name] = convert_graphml_value(raw_text, a_key.get("attr.type", "string"))
    return values


if __name__ == "__main__":
    # SETUP THE INPUT FILE HERE
    input_file = "airlines.graphml"
    # input_file = "yworks_graphml.graphml"
    doc = lxml.etree.parse(input_file)
    # Get the namespace map
    root_nsmap = doc.getroot().nsmap
    # Keep the prefixed namespaces only. XPath has no notion of a default namespace,
    # so the None entry is always left out and the graphml namespace is re-bound below.
    nsmap = {prefix: uri for prefix, uri in root_nsmap.items() if prefix is not None}
    # Find the graphml namespace
    default_ns = [item for item in root_nsmap.items() if "http://graphml.graphdrawing.org/xmlns" in item[1]]
    # If you cannot find it, maybe this is not a graphml file
    if not default_ns:
        print(f"{input_file} does not contain the default graphml namespace")
        sys.exit(1)
    # If the graphml namespace has a prefix, use it, otherwise, assign a generic one to it
    if default_ns[0][0] is not None:
        glns = default_ns[0][0]
    else:
        glns = "_q_q"
        # Make sure the generated prefix does not overwrite another key from the nsmap
        while glns in nsmap:
            glns += "_"
        nsmap[glns] = default_ns[0][1]

    # Let's create the graph
    # Get all the graph, node and edge attributes. A key declared for "all" (or without
    # a "for" at all) applies to every kind of element.
    # Note, there are more attributes which could be more telling about the nature of the graph,
    # please see: http://graphml.graphdrawing.org/specification/dtd.html#key
    graph_attr = doc.xpath(f"/{glns}:graphml/{glns}:key[@for='graph' or @for='all' or not(@for)]", namespaces=nsmap)
    node_attr = doc.xpath(f"/{glns}:graphml/{glns}:key[@for='node' or @for='all' or not(@for)]", namespaces=nsmap)
    edge_attr = doc.xpath(f"/{glns}:graphml/{glns}:key[@for='edge' or @for='all' or not(@for)]", namespaces=nsmap)
    # Get the graph element(s), all the nodes and all the edges
    graph_elements = doc.xpath(f"/{glns}:graphml/{glns}:graph", namespaces=nsmap)
    graph_nodes = doc.xpath(f"/{glns}:graphml/{glns}:graph/{glns}:node", namespaces=nsmap)
    graph_edges = doc.xpath(f"/{glns}:graphml/{glns}:graph/{glns}:edge", namespaces=nsmap)

    # Is this a directed or undirected network?
    # This is required before the edges are read because it decides whether (a,b) and (b,a)
    # are the same pair of endpoints. A missing edgedefault is read as undirected.
    edge_default = doc.xpath(f"/{glns}:graphml/{glns}:graph/@edgedefault", namespaces=nsmap)
    is_directed = bool(edge_default) and edge_default[0].lower() == "directed"

    # Create a dictionary dict<node_id>:<node_data>
    node_dict = {}
    for a_node in graph_nodes:
        node_id = a_node.attrib["id"]
        # Some basic validation here
        if node_id in node_dict:
            print(f"Node id {node_id} is not unique in {input_file}")
            sys.exit(1)
        node_dict[node_id] = read_graphml_data(a_node, node_attr, glns, nsmap)

    # Create a dictionary dict<edge_from,edge_to>:dict<edge_id:edge_data>. The structure of this id will also help in
    # determining if the graph is a multigraph
    edge_dict = {}
    for edge_index, an_edge in enumerate(graph_edges):
        edge_from = an_edge.attrib["source"]
        edge_to = an_edge.attrib["target"]
        for an_endpoint in (edge_from, edge_to):
            if an_endpoint not in node_dict:
                print(f"Edge endpoint {an_endpoint} is not a node of {input_file}")
                sys.exit(1)
        # The id of an edge is optional. The position of the edge in the file is an int
        # and therefore can never clash with an id that was read from the file.
        edge_id = an_edge.get("id", edge_index)
        edge_key = (edge_from, edge_to)
        # In an undirected graph, an edge b-a is parallel to an existing edge a-b
        if not is_directed and edge_key not in edge_dict and (edge_to, edge_from) in edge_dict:
            edge_key = (edge_to, edge_from)
        edge_dict.setdefault(edge_key, {})[edge_id] = read_graphml_data(an_edge, edge_attr, glns, nsmap)

    # Is it a multigraph?
    # To determine this check if there is more than one edge between the same pair of endpoints
    is_multigraph = any(len(parallel_edges) > 1 for parallel_edges in edge_dict.values())
    # This mapping determines the right networkx data type based on the available data read from the file.
    graph_type_map = {(False, False): networkx.Graph,
                      (False, True): networkx.MultiGraph,
                      (True, False): networkx.DiGraph,
                      (True, True): networkx.MultiDiGraph}
    # Create the networkx graph
    G = graph_type_map[(is_directed, is_multigraph)]()
    # Attach the graph level data of the first graph element
    if graph_elements:
        G.graph.update(read_graphml_data(graph_elements[0], graph_attr, glns, nsmap))
    # Add every node, so that nodes without any edges are not lost
    G.add_nodes_from(node_dict.items())
    # Now add all edges. The call is identical for Graphs and MultiGraphs: a multigraph
    # uses key to tell parallel edges apart, a simple graph stores it as an edge
    # attribute named "key".
    for (source_node_id, target_node_id), parallel_edges in edge_dict.items():
        for an_edge_id, an_edge_data in parallel_edges.items():
            G.add_edge(source_node_id, target_node_id, key=an_edge_id, **an_edge_data)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment