aanastasiou/main.py

## readme.md

      
    Raw
  

              readme.md
            
          
    Alternative Graphml reader based on Networkx

The code in main.py is an alternative "parser" for instantiating networkx graphs from graphml files.
At the moment it reads in simple graphml files but it can be extended to include files with complex entities
attached to the graph, node and edge levels (such as files generated by yEd) with
the same technique.
Detailed documentation in main.py comments.
Tested on:

Airlines from: https://github.com/gephi/gephi/wiki/Datasets
yWorks graphml example from: http://docs.yworks.com/yfilessilverlight/developers-guide/reading_writing.html


## main.py
"""

Reads a GraphML file and instantiates the right type of Networkx graph.

Notes:
    1. This is still work in progress. At its current state, the code will comfortably read "simple" graphml files.
    2. The next step is to enable the routine to selectively read in (graph, node, edge) level data based on the
       namespace used.
    3. This will probably be implemented with named tuples and it should support at least simple reads (if not full
       complex writes too).

:author:Athansios Anastasiou
:date: April 2020
"""
import sys
import lxml.etree
import networkx

# Namespace URL that identifies a GraphML document.
GRAPHML_NS_URL = "http://graphml.graphdrawing.org/xmlns"


class GraphMLFormatError(ValueError):
    """Raised when the input cannot be interpreted as a simple GraphML document."""


def parse_graphml_boolean(text):
    """
    Convert the text of a GraphML boolean value to a Python bool.

    ``bool()`` cannot be used for this because any non-empty string, including ``"false"``, is truthy.

    :param text: The textual value, one of ``true``, ``false``, ``1`` or ``0`` (case and padding are ignored).
    :returns: The corresponding bool.
    :raises ValueError: If ``text`` is not one of the accepted literals.
    """
    literal = text.strip().lower()
    if literal in ("true", "1"):
        return True
    if literal in ("false", "0"):
        return False
    raise ValueError(f"{text!r} is not a valid GraphML boolean")


# This maps the attr.type of graphml to the callable that parses its textual value
data_type_conversion_map = {"boolean": parse_graphml_boolean,
                            "int": int,
                            "long": int,
                            "float": float,
                            "double": float,
                            "string": str}


def read_element_data(element, keys, ns):
    """
    Collect the typed attribute values attached to one graph, node or edge element.

    :param element: The graph, node or edge element whose ``data`` children are read.
    :param keys: The ``key`` elements that declare the attributes applicable to ``element``.
    :param ns: The namespace of the document in Clark notation (``"{uri}"``).
    :returns: A dict mapping ``attr.name`` to the converted value.
    :raises ValueError: If a value cannot be converted to its declared ``attr.type``.
    """
    # A <data> element points to its declaration through the id of the <key>, not through attr.name.
    # Only the first <data> per key is kept.
    raw_values = {}
    for a_data in element.findall(f"{ns}data"):
        raw_values.setdefault(a_data.get("key"), a_data.text)

    values = {}
    for a_key in keys:
        attr_name = a_key.get("attr.name")
        if attr_name is None:
            # Keys without attr.name describe complex (e.g. yEd) payloads which are not handled yet.
            continue
        key_id = a_key.get("id")
        if key_id in raw_values:
            text = raw_values[key_id]
        else:
            # No explicit value on this element, substitute the default declared on the key (if any)
            default = a_key.find(f"{ns}default")
            if default is None:
                continue
            text = default.text
        # Unknown or missing attr.type values are kept as plain strings
        convert = data_type_conversion_map.get(a_key.get("attr.type"), str)
        text = "" if text is None else text
        if convert is not str and not text.strip():
            # An empty element carries no number / boolean to convert
            continue
        values[attr_name] = convert(text)
    return values


def parse_graphml_tree(root, input_file):
    """
    Extract the graph, node and edge data of a simple GraphML document.

    The function only relies on ``tag``, ``get``, ``attrib``, ``find`` and ``findall`` and therefore works with
    both lxml and xml.etree.ElementTree elements.

    :param root: The root (``graphml``) element of the parsed document.
    :param input_file: Name of the document, only used in error messages.
    :returns: A tuple ``(is_directed, graph_data, node_dict, edge_dict)`` where ``node_dict`` is
              dict<node_id>:<node_data> and ``edge_dict`` is dict<edge_from,edge_to>:dict<edge_id:edge_data>.
    :raises GraphMLFormatError: If the graphml namespace or the graph element is missing, a node id is repeated
                                or an edge refers to an undeclared node.
    """
    # The namespace is taken from the root tag ("{uri}graphml"), this works whether or not it is bound to a prefix
    root_tag = root.tag if isinstance(root.tag, str) else ""
    ns_uri = root_tag[1:].partition("}")[0] if root_tag.startswith("{") else ""
    # If you cannot find the graphml namespace, maybe this is not a graphml file
    if GRAPHML_NS_URL not in ns_uri:
        raise GraphMLFormatError(f"{input_file} does not contain the default graphml namespace")
    ns = f"{{{ns_uri}}}"

    # Note, there are more attributes which could be more telling about the nature of the graph,
    # please see: http://graphml.graphdrawing.org/specification/dtd.html#key
    all_keys = root.findall(f"{ns}key")
    graph_attr = [a_key for a_key in all_keys if a_key.get("for") == "graph"]
    node_attr = [a_key for a_key in all_keys if a_key.get("for") == "node"]
    edge_attr = [a_key for a_key in all_keys if a_key.get("for") == "edge"]

    graphs = root.findall(f"{ns}graph")
    if not graphs:
        raise GraphMLFormatError(f"{input_file} does not contain a graph element")
    # Is this a directed or undirected network? (Decided by the first graph element)
    is_directed = graphs[0].get("edgedefault", "").lower() == "directed"
    graph_data = read_element_data(graphs[0], graph_attr, ns)

    # Create a dictionary dict<node_id>:<node_data>
    node_dict = {}
    for a_graph in graphs:
        for a_node in a_graph.findall(f"{ns}node"):
            node_id = a_node.attrib["id"]
            # Some basic validation here
            if node_id in node_dict:
                raise GraphMLFormatError(f"Node id {node_id} is not unique in {input_file}")
            node_dict[node_id] = read_element_data(a_node, node_attr, ns)

    # Create a dictionary dict<edge_from,edge_to>:dict<edge_id:edge_data>. The structure of this dictionary will
    # also help in determining if the graph is a multigraph
    edge_dict = {}
    edge_position = 0
    for a_graph in graphs:
        for an_edge in a_graph.findall(f"{ns}edge"):
            edge_from = an_edge.attrib["source"]
            edge_to = an_edge.attrib["target"]
            for an_endpoint in (edge_from, edge_to):
                if an_endpoint not in node_dict:
                    raise GraphMLFormatError(
                        f"Edge endpoint {an_endpoint} is not a declared node in {input_file}")
            # The id of an edge is optional. Fall back to the position of the edge in the file, an int can never
            # clash with an explicit (string) id.
            edge_id = an_edge.get("id", edge_position)
            edge_position += 1
            edge_key = (edge_from, edge_to)
            # In an undirected graph a-b and b-a connect the same pair, file the edge under the orientation that
            # was seen first so that such parallel edges are detected.
            if not is_directed and edge_key not in edge_dict and (edge_to, edge_from) in edge_dict:
                edge_key = (edge_to, edge_from)
            edge_dict.setdefault(edge_key, {})[edge_id] = read_element_data(an_edge, edge_attr, ns)

    return is_directed, graph_data, node_dict, edge_dict


def build_networkx_graph(is_directed, graph_data, node_dict, edge_dict):
    """
    Instantiate the networkx graph type that fits the parsed data and populate it.

    :param is_directed: Whether the edges are directed.
    :param graph_data: Graph level attributes, stored in ``G.graph``.
    :param node_dict: dict<node_id>:<node_data>
    :param edge_dict: dict<edge_from,edge_to>:dict<edge_id:edge_data>
    :returns: A networkx Graph, DiGraph, MultiGraph or MultiDiGraph.
    """
    # It is a multigraph if more than one edge connects the same pair of nodes
    is_multigraph = any(len(parallel_edges) > 1 for parallel_edges in edge_dict.values())
    # This mapping determines the right networkx data type based on the available data read from the file.
    graph_type_map = {(False, False): networkx.Graph,
                      (False, True): networkx.MultiGraph,
                      (True, False): networkx.DiGraph,
                      (True, True): networkx.MultiDiGraph}
    # Create the networkx graph
    G = graph_type_map[(is_directed, is_multigraph)]()
    G.graph.update(graph_data)
    # Add every declared node first so that nodes without any edges are not lost
    for node_id, node_data in node_dict.items():
        G.add_node(node_id, **node_data)
    # Now add all edges. On MultiGraphs "key" identifies the parallel edge, on plain Graphs networkx stores it as
    # an ordinary edge attribute called "key".
    for (source_node_id, target_node_id), parallel_edges in edge_dict.items():
        for edge_id, edge_data in parallel_edges.items():
            G.add_edge(source_node_id, target_node_id, key=edge_id, **edge_data)
    return G


if __name__ == "__main__":
    # SETUP THE INPUT FILE HERE
    input_file = "airlines.graphml"
    # input_file = "yworks_graphml.graphml"

    doc = lxml.etree.parse(input_file)
    try:
        is_directed, graph_data, node_dict, edge_dict = parse_graphml_tree(doc.getroot(), input_file)
    except GraphMLFormatError as format_error:
        # Validation problems are reported and terminate the script with a non-zero exit code
        print(format_error)
        sys.exit(1)
    G = build_networkx_graph(is_directed, graph_data, node_dict, edge_dict)
	"""

	Reads a GraphML file and instantiates the right type of Networkx graph.

	Notes:
	1. This is still work in progress. At its current state, the code will comfortably read "simple" graphml files.
	2. The next step is to enable the routine to selectively read in (graph, node, edge) level data based on the
	namespace used.
	3. This will probably be implemented with named tuples and it should support at least simple reads (if not full
	complex writes too).

	:author:Athansios Anastasiou
	:date: April 2020
	"""
	import sys
	import lxml.etree
	import networkx

	# NOTE(review): this span repeats the script body with every line flattened to a single tab, so the
	# if/for/else bodies are no longer nested and the span will not parse until the indentation is restored.
	if __name__ == "__main__":
	# SETUP THE INPUT FILE HERE
	input_file = "airlines.graphml"
	# input_file = "yworks_graphml.graphml"

	doc = lxml.etree.parse(input_file)
	# Get the namespace map
	nsmap = doc.getroot().nsmap
	# Find the default namespace
	default_ns = list(filter(lambda x:"http://graphml.graphdrawing.org/xmlns" in x[1],nsmap.items()))
	# If you cannot find a default namespace, maybe this is not a graphml file
	if len(default_ns)<1:
	print(f"{input_file} does not contain the default graphml namespace")
	sys.exit(1)
	# If the default namespace has a name, use it, otherwise, assign a generic name to it
	# (the None entry is replaced by a named one before the map is passed to the xpath calls below)
	if default_ns[0][0] is not None:
	glns = default_ns[0][0]
	else:
	glns = "_q_q"; # Or something else as absurd, with zero chance of overwriting another key from the nsmap
	nsmap[glns] = default_ns[0][1]
	del(nsmap[None])
	# Let's create the graph
	# Get all the graph attributes
	# Note, there are more attributes which could be more telling about the nature of the graph,
	# please see: http://graphml.graphdrawing.org/specification/dtd.html#key
	# NOTE(review): graph_attr is collected here but not used anywhere in this span.
	graph_attr = doc.xpath(f"/{glns}:graphml/{glns}:key[@for='graph']",namespaces=nsmap)
	# Get all the node attributes
	node_attr = doc.xpath(f"/{glns}:graphml/{glns}:key[@for='node']",namespaces=nsmap)
	# Get all the edge attributes
	edge_attr = doc.xpath(f"/{glns}:graphml/{glns}:key[@for='edge']",namespaces=nsmap)
	# Get all the nodes
	graph_nodes = doc.xpath(f"/{glns}:graphml/{glns}:graph/{glns}:node",namespaces=nsmap)
	# Get all the edges
	graph_edges = doc.xpath(f"/{glns}:graphml/{glns}:graph/{glns}:edge",namespaces=nsmap)
	# We are now ready to build a network. This next step is not strictly
	# required, but it allows us to figure out more information about what sort of
	# networkx network should be created.
	# Setup a data type conversion map
	# This maps the attr.type of graphml to the appropriate native data type
	# NOTE(review): bool() applied to any non-empty string (including "false") returns True.
	data_type_conversion_map = {"boolean":bool,"int":int,"long":int,"float":float, "double":float, "string":str}
	# Create a dictionary dict<node_id>:<node_data>
	node_dict = {}
	for a_node in graph_nodes:
	node_id = a_node.attrib["id"]
	# The following line is a bit long but all the data required for the data marshalling is available on a per
	# data item basis.
	# TODO: HIGH, This is missing the "or" part that substitutes the default values
	# NOTE(review): <data> is matched on the key's attr.name (confirm it equals the key id in the input files)
	# and the [0] raises IndexError when a node carries no matching <data> element.
	node_data = dict(
	map(lambda x:(x.attrib["attr.name"],
	data_type_conversion_map[x.attrib["attr.type"]](
	a_node.xpath(f"{glns}:data[@key='{x.attrib['attr.name']}']",namespaces=nsmap)[0].text)
	),node_attr))

	# Some basic validation here
	if node_id not in node_dict:
	node_dict[node_id] = node_data
	else:
	print(f"Node id {node_id} is not unique in {input_file}")
	sys.exit(1)
	# Create a dictionary dict<edge_from,edge_to>:dict<edge_id:edge_data>. The structure of this dictionary will
	# also help in determining if the graph is a multigraph
	edge_dict = {}
	for an_edge in graph_edges:
	edge_from = an_edge.attrib["source"]
	edge_to = an_edge.attrib["target"]
	edge_id = an_edge.attrib["id"]
	edge_key = (edge_from, edge_to)
	# Same lookup and conversion as for the nodes, applied to the edge level keys
	edge_data = dict(map(lambda x:(x.attrib["attr.name"],data_type_conversion_map[x.attrib["attr.type"]](an_edge.xpath(f"{glns}:data[@key='{x.attrib['attr.name']}']",namespaces=nsmap)[0].text)),edge_attr))

	if edge_key not in edge_dict:
	edge_dict[edge_key] = {}

	edge_dict[edge_key][edge_id] = edge_data

	# Is this a directed or undirected network?
	is_directed = doc.xpath(f"/{glns}:graphml/{glns}:graph/@edgedefault",namespaces=nsmap)[0].lower()=="directed"
	# Is it a multigraph?
	# To determine this check if there is more than one edge towards the same direction between two nodes
	is_multigraph = len(list(filter(lambda x:len(x[1])>1,edge_dict.items())))>0
	# This mapping determines the right networkx data type based on the available data read from the file.
	graph_type_map = {(False, False):networkx.Graph,
	(False,True):networkx.MultiGraph,
	(True, False):networkx.DiGraph,
	(True, True):networkx.MultiDiGraph}
	# Create the networkx graph
	G = graph_type_map[(is_directed,is_multigraph)]()
	# Add both nodes and edges
	# NOTE(review): only the endpoints of edges are added, a node that appears in no edge never reaches G.
	for an_edge in edge_dict.items():
	source_node_id = an_edge[0][0]
	target_node_id = an_edge[0][1]
	# Add the endpoints if they do not yet exist
	if not source_node_id in G.nodes:
	G.add_node(source_node_id, **node_dict[source_node_id])
	if not target_node_id in G.nodes:
	G.add_node(target_node_id, **node_dict[target_node_id])
	# Now add all edges. This step is identical for Graphs and MultiGraphs.
	for an_edge_id in an_edge[1].items():
	G.add_edge(an_edge[0][0], an_edge[0][1], key=an_edge_id[0], **an_edge_id[1])