maetl/subgraph.rb

## subgraph.rb
$LOAD_PATH << "./lib"
require 'mementus'

# Prototype of subgraph isomorphism search using Ullmann’s original 1976
# algorithm [1] as the foundation for a depth-first scan that moves forward
# through each possible outgoing edge, then attempts to confirm the match by
# doing a reverse edge lookup, comparing the query to the target graph.
#
# This is known to be a NP-complete problem but despite being exponential in
# theory, in practice it’s possible to achieve reasonable results by using
# index lookups and pruning rules to drastically limit the number of vertices
# considered at each step of the search.
#
# The procedure is structured using the generic template method approach
# outlined in [2], which provides potential for experimenting with different
# strategies for skipping vertices and reducing the number of steps needed to
# complete a full run of the algorithm. There are a number of possible
# optimisations listed in the paper that can be probably be applied to this
# implementation without needing a drastic rewrite.
#
# Given that this is all in Ruby, it’s likely that the biggest performance sink
# here is in method dispatch rather than iterating through adjacency lists. For
# small scale graph processing there should be no major problems.
#
# [1]  J. R. Ullmann. “An algorithm for subgraph isomorphism.” J. ACM, 23:31–42,
#      January 1976.
# [2]  J. Lee, W.-S. Han, R. Kasperovics, and J.-H. Lee. “An in-depth comparison
#      of subgraph isomorphism algorithms in graph databases.” In Proceedings of
#      the 39th international conference on Very Large Data Bases, PVLDB’13,
#      pages 133–144. VLDB Endowment, 2013.
#
module Mementus
  class Graph
    def match(&block)
      # Build a query graph from the given input structure.
      query = Mementus::Graph.new(&block)

      # If there are zero nodes in either of the graphs there cannot be a
      # matching subgraph so return straight away.
      return @results if query.nodes.empty? || nodes.empty?

      # An empty set of possible candidate nodes in the target graph.
      candidates = {}

      # For each node in the query graph, obtain a list of candidate nodes in
      # the target graph.
      query.nodes.each do |query_node|
        # Find all nodes in the target graph that match the query node by label.
        candidate_nodes = nodes(query_node.label)

        # If we don't find any graph nodes for a given query node we can
        # immediately rule out the possibility of finding a matching subgraph.
        return @results if candidate_nodes.empty?

        # Store a list of possible matching nodes for the given query node.
        candidates[query_node.id] = candidate_nodes
      end

      # A partial embedding that maps query nodes to target nodes.
      matches = {}

      # Final list of matched subgraphs from the target graph.
      @results = []

      # Internal state stack for embeddings at each level of recursion.
      @matches_stack = []

      # Kick off the traversal with the starting set of candidates and the empty
      # set of matches.
      subgraph_search(query, matches, candidates)

      # Return the collected results.
      @results
    end

    def subgraph_search(query, matches, candidates)
      # When each query node maps to a qualified node in the target graph there
      # is a matching subgraph. Store the results and return upwards.
      if matches.count == query.nodes_count
        @results << matches
        return
      end

      # Select the next unmatched query node.
      query_node = next_unmatched_node(query, matches)

      # Stash the current state of potential candidates in the target graph.
      previous_candidates = candidates[query_node.id]

      # Refine the potential matches.
      candidates[query_node.id] = refine_candidates(query_node, matches, candidates)

      # Iterate through the possible candidates for the current query node.
      candidates[query_node.id].each do |candidate_node|
        # Check whether the target graph contains a corresponding edge joining
        # back to a previously matched node.
        if is_joinable(query_node, candidate_node, matches)
          # The nodes are joinable so continue moving down the connected query
          # Push existing state onto a history stack so we can backtrack after
          # the depth-first search bottoms out.
          @matches_stack << matches.dup

          # Update the state to mark the current candidate node
          matches[query_node.id] = candidate_node.id

          # Recursively attempt to match remaining candidates.
          subgraph_search(query, matches, candidates)

          # Restore the previous matching state and move on to the next
          # candidate at the current level.
          matches = @matches_stack.pop
        end
      end

      # Restore the previous list of candidates and exit from this branch.
      candidates[query_node.id] = previous_candidates
    end

    # Select the next unmatched query node.
    def next_unmatched_node(query, matches)
      # TODO: this could be improved by popping from a transient data structure
      # so as not to scan through the entire collection multiple times.
      query.nodes.detect { |query_node| !matches.key?(query_node.id) }
    end

    # Prune the list of candidates for a given query node (as per the Ulmann
    # algorithm).
    #
    # All nodes that have a smaller out degree than the query node or have
    # already been matched for the query node get rejected.
    def refine_candidates(query_node, matches, candidates)
      candidates[query_node.id].reject do |candidate_node|
        # TODO: add `#degree` reader to node proxy
        candidate_node.outgoing.count < query_node.outgoing.count || matches.has_value?(candidate_node.id)
      end
    end

    # Checks whether the edges between the current query node and already
    # matched query nodes have corresponding edges between the candidate node
    # and already matched nodes in the target graph.
    #
    # This method comes from a subroutine introduced in the paper by Lee, Han,
    # Kasperovics and Lee who point to several possible improvements over
    # blindly iterating through each adjacent node. Extracting this into a
    # generic method means that it’s possible to refine and optimise the
    # approach to matching nodes in a pluggable way without ripping apart the
    # main body of the algorithm.
    #
    # This definitely feels like it’s doing more work than it needs to, but
    # I wanted to get the damn thing working correctly before diving down the
    # rabbit hole of possible edge pruning strategies. I’m still not 100%
    # confident I understand the finer points of Ullmann’s algorithm let alone
    # the improvements proposed in the VF2 and QuickSI algorithms.
    def is_joinable(query_node, candidate_node, matches)
      # If there are no matches yet then assume the nodes are joinable.
      return true if matches.empty?

      # TODO: this only supports directed graphs, is there a more general API for undirected neighbours?
      # TODO: we probably want a better join/check API on node and edge proxies
      query_node.incoming.each do |incoming_query_node|
        if matches.key?(incoming_query_node.id)
          candidate_node.incoming.each do |incoming_candidate_node|
            return true if matches[incoming_query_node.id] == incoming_candidate_node.id
          end
        end
      end

      false
    end
  end
end

triple = Mementus::Graph.new do
  add_node(id: 1, label: :a)
  add_node(id: 2, label: :b)
  add_node(id: 3, label: :c)

  create_edge do |edge|
    edge.from = 1
    edge.to = 2
  end

  create_edge do |edge|
    edge.from = 2
    edge.to = 3
  end
end

chain = Mementus::Graph.new do
  add_node(id: 1, label: :a)
  add_node(id: 2, label: :b)
  add_node(id: 3, label: :c)
  add_node(id: 4, label: :a)
  add_node(id: 5, label: :b)
  add_node(id: 6, label: :d)

  create_edge do |edge|
    edge.from = 1
    edge.to = 2
  end

  create_edge do |edge|
    edge.from = 2
    edge.to = 3
  end

  create_edge do |edge|
    edge.from = 3
    edge.to = 4
  end

  create_edge do |edge|
    edge.from = 4
    edge.to = 5
  end

  create_edge do |edge|
    edge.from = 5
    edge.to = 6
  end
end

tree = Mementus::Graph.new do
  add_node(id: 1, label: :a)
  add_node(id: 2, label: :b)
  add_node(id: 3, label: :c)
  add_node(id: 4, label: :a)
  add_node(id: 5, label: :b)
  add_node(id: 6, label: :d)

  create_edge do |edge|
    edge.from = 1
    edge.to = 2
  end

  create_edge do |edge|
    edge.from = 1
    edge.to = 3
  end

  create_edge do |edge|
    edge.from = 3
    edge.to = 4
  end

  create_edge do |edge|
    edge.from = 4
    edge.to = 5
  end

  create_edge do |edge|
    edge.from = 5
    edge.to = 6
  end
end

diamond = Mementus::Graph.new do
  add_node(id: 1, label: :e)
  add_node(id: 2, label: :a)
  add_node(id: 3, label: :b)
  add_node(id: 4, label: :c)
  add_node(id: 5, label: :d)
  add_node(id: 6, label: :e)
  add_node(id: 7, label: :f)
  add_node(id: 8, label: :a)
  add_node(id: 9, label: :b)
  add_node(id: 10, label: :c)
  add_node(id: 11, label: :d)

  create_edge do |edge|
    edge.from = 1
    edge.to = 2
  end

  create_edge do |edge|
    edge.from = 2
    edge.to = 3
  end

  create_edge do |edge|
    edge.from = 2
    edge.to = 4
  end

  create_edge do |edge|
    edge.from = 3
    edge.to = 5
  end

  create_edge do |edge|
    edge.from = 4
    edge.to = 5
  end

  create_edge do |edge|
    edge.from = 5
    edge.to = 6
  end

  create_edge do |edge|
    edge.from = 6
    edge.to = 7
  end

  create_edge do |edge|
    edge.from = 7
    edge.to = 8
  end

  create_edge do |edge|
    edge.from = 8
    edge.to = 9
  end

  create_edge do |edge|
    edge.from = 8
    edge.to = 10
  end

  create_edge do |edge|
    edge.from = 9
    edge.to = 11
  end

  create_edge do |edge|
    edge.from = 10
    edge.to = 11
  end
end

puts "============================================="
puts "match a->b against a->b->c"
result = triple.match do
  add_node(id: 10, label: :a)
  add_node(id: 20, label: :b)

  create_edge do |edge|
    edge.from = 10
    edge.to = 20
  end
end

puts result
puts "============================================="
puts

puts "============================================="
puts "match a->b against a->b->c->a->b->d"
result = chain.match do
  add_node(id: 10, label: :a)
  add_node(id: 20, label: :b)

  create_edge do |edge|
    edge.from = 10
    edge.to = 20
  end
end

p result
puts "============================================="
puts

puts "============================================="
puts "match a->b a->c against a->b a->c->a->b->d"
result = tree.match do
  add_node(id: 10, label: :a)
  add_node(id: 20, label: :b)
  add_node(id: 30, label: :c)

  create_edge do |edge|
    edge.from = 10
    edge.to = 20
  end

  create_edge do |edge|
    edge.from = 10
    edge.to = 30
  end
end

p result
puts "============================================="
puts

puts "============================================="
puts "match a->b a->c b->d c->d against e->a->b a->c b->d c->d d->e->f f->a->b a->b"
result = diamond.match do
  add_node(id: 10, label: :a)
  add_node(id: 20, label: :b)
  add_node(id: 30, label: :c)
  add_node(id: 40, label: :d)

  create_edge do |edge|
    edge.from = 10
    edge.to = 20
  end

  create_edge do |edge|
    edge.from = 10
    edge.to = 30
  end

  create_edge do |edge|
    edge.from = 20
    edge.to = 40
  end

  create_edge do |edge|
    edge.from = 30
    edge.to = 40
  end
end

p result
puts "============================================="
puts
	$LOAD_PATH << "./lib"
	require 'mementus'

	# Prototype of subgraph isomorphism search using Ullmann’s original 1976
	# algorithm [1] as the foundation for a depth-first scan that moves forward
	# through each possible outgoing edge, then attempts to confirm the match by
	# doing a reverse edge lookup, comparing the query to the target graph.
	#
	# This is known to be a NP-complete problem but despite being exponential in
	# theory, in practice it’s possible to achieve reasonable results by using
	# index lookups and pruning rules to drastically limit the number of vertices
	# considered at each step of the search.
	#
	# The procedure is structured using the generic template method approach
	# outlined in [2], which provides potential for experimenting with different
	# strategies for skipping vertices and reducing the number of steps needed to
	# complete a full run of the algorithm. There are a number of possible
	# optimisations listed in the paper that can be probably be applied to this
	# implementation without needing a drastic rewrite.
	#
	# Given that this is all in Ruby, it’s likely that the biggest performance sink
	# here is in method dispatch rather than iterating through adjacency lists. For
	# small scale graph processing there should be no major problems.
	#
	# [1] J. R. Ullmann. “An algorithm for subgraph isomorphism.” J. ACM, 23:31–42,
	# January 1976.
	# [2] J. Lee, W.-S. Han, R. Kasperovics, and J.-H. Lee. “An in-depth comparison
	# of subgraph isomorphism algorithms in graph databases.” In Proceedings of
	# the 39th international conference on Very Large Data Bases, PVLDB’13,
	# pages 133–144. VLDB Endowment, 2013.
	#
	module Mementus
	class Graph
	def match(&block)
	# Build a query graph from the given input structure.
	query = Mementus::Graph.new(&block)

	# If there are zero nodes in either of the graphs there cannot be a
	# matching subgraph so return straight away.
	return @results if query.nodes.empty? \|\| nodes.empty?

	# An empty set of possible candidate nodes in the target graph.
	candidates = {}

	# For each node in the query graph, obtain a list of candidate nodes in
	# the target graph.
	query.nodes.each do \|query_node\|
	# Find all nodes in the target graph that match the query node by label.
	candidate_nodes = nodes(query_node.label)

	# If we don't find any graph nodes for a given query node we can
	# immediately rule out the possibility of finding a matching subgraph.
	return @results if candidate_nodes.empty?

	# Store a list of possible matching nodes for the given query node.
	candidates[query_node.id] = candidate_nodes
	end

	# A partial embedding that maps query nodes to target nodes.
	matches = {}

	# Final list of matched subgraphs from the target graph.
	@results = []

	# Internal state stack for embeddings at each level of recursion.
	@matches_stack = []

	# Kick off the traversal with the starting set of candidates and the empty
	# set of matches.
	subgraph_search(query, matches, candidates)

	# Return the collected results.
	@results
	end

	def subgraph_search(query, matches, candidates)
	# When each query node maps to a qualified node in the target graph there
	# is a matching subgraph. Store the results and return upwards.
	if matches.count == query.nodes_count
	@results << matches
	return
	end

	# Select the next unmatched query node.
	query_node = next_unmatched_node(query, matches)

	# Stash the current state of potential candidates in the target graph.
	previous_candidates = candidates[query_node.id]

	# Refine the potential matches.
	candidates[query_node.id] = refine_candidates(query_node, matches, candidates)

	# Iterate through the possible candidates for the current query node.
	candidates[query_node.id].each do \|candidate_node\|
	# Check whether the target graph contains a corresponding edge joining
	# back to a previously matched node.
	if is_joinable(query_node, candidate_node, matches)
	# The nodes are joinable so continue moving down the connected query
	# Push existing state onto a history stack so we can backtrack after
	# the depth-first search bottoms out.
	@matches_stack << matches.dup

	# Update the state to mark the current candidate node
	matches[query_node.id] = candidate_node.id

	# Recursively attempt to match remaining candidates.
	subgraph_search(query, matches, candidates)

	# Restore the previous matching state and move on to the next
	# candidate at the current level.
	matches = @matches_stack.pop
	end
	end

	# Restore the previous list of candidates and exit from this branch.
	candidates[query_node.id] = previous_candidates
	end

	# Select the next unmatched query node.
	def next_unmatched_node(query, matches)
	# TODO: this could be improved by popping from a transient data structure
	# so as not to scan through the entire collection multiple times.
	query.nodes.detect { \|query_node\| !matches.key?(query_node.id) }
	end

	# Prune the list of candidates for a given query node (as per the Ulmann
	# algorithm).
	#
	# All nodes that have a smaller out degree than the query node or have
	# already been matched for the query node get rejected.
	def refine_candidates(query_node, matches, candidates)
	candidates[query_node.id].reject do \|candidate_node\|
	# TODO: add `#degree` reader to node proxy
	candidate_node.outgoing.count < query_node.outgoing.count \|\| matches.has_value?(candidate_node.id)
	end
	end

	# Checks whether the edges between the current query node and already
	# matched query nodes have corresponding edges between the candidate node
	# and already matched nodes in the target graph.
	#
	# This method comes from a subroutine introduced in the paper by Lee, Han,
	# Kasperovics and Lee who point to several possible improvements over
	# blindly iterating through each adjacent node. Extracting this into a
	# generic method means that it’s possible to refine and optimise the
	# approach to matching nodes in a pluggable way without ripping apart the
	# main body of the algorithm.
	#
	# This definitely feels like it’s doing more work than it needs to, but
	# I wanted to get the damn thing working correctly before diving down the
	# rabbit hole of possible edge pruning strategies. I’m still not 100%
	# confident I understand the finer points of Ullmann’s algorithm let alone
	# the improvements proposed in the VF2 and QuickSI algorithms.
	def is_joinable(query_node, candidate_node, matches)
	# If there are no matches yet then assume the nodes are joinable.
	return true if matches.empty?

	# TODO: this only supports directed graphs, is there a more general API for undirected neighbours?
	# TODO: we probably want a better join/check API on node and edge proxies
	query_node.incoming.each do \|incoming_query_node\|
	if matches.key?(incoming_query_node.id)
	candidate_node.incoming.each do \|incoming_candidate_node\|
	return true if matches[incoming_query_node.id] == incoming_candidate_node.id
	end
	end
	end

	false
	end
	end
	end

	triple = Mementus::Graph.new do
	add_node(id: 1, label: :a)
	add_node(id: 2, label: :b)
	add_node(id: 3, label: :c)

	create_edge do \|edge\|
	edge.from = 1
	edge.to = 2
	end

	create_edge do \|edge\|
	edge.from = 2
	edge.to = 3
	end
	end

	chain = Mementus::Graph.new do
	add_node(id: 1, label: :a)
	add_node(id: 2, label: :b)
	add_node(id: 3, label: :c)
	add_node(id: 4, label: :a)
	add_node(id: 5, label: :b)
	add_node(id: 6, label: :d)

	create_edge do \|edge\|
	edge.from = 1
	edge.to = 2
	end

	create_edge do \|edge\|
	edge.from = 2
	edge.to = 3
	end

	create_edge do \|edge\|
	edge.from = 3
	edge.to = 4
	end

	create_edge do \|edge\|
	edge.from = 4
	edge.to = 5
	end

	create_edge do \|edge\|
	edge.from = 5
	edge.to = 6
	end
	end

	tree = Mementus::Graph.new do
	add_node(id: 1, label: :a)
	add_node(id: 2, label: :b)
	add_node(id: 3, label: :c)
	add_node(id: 4, label: :a)
	add_node(id: 5, label: :b)
	add_node(id: 6, label: :d)

	create_edge do \|edge\|
	edge.from = 1
	edge.to = 2
	end

	create_edge do \|edge\|
	edge.from = 1
	edge.to = 3
	end

	create_edge do \|edge\|
	edge.from = 3
	edge.to = 4
	end

	create_edge do \|edge\|
	edge.from = 4
	edge.to = 5
	end

	create_edge do \|edge\|
	edge.from = 5
	edge.to = 6
	end
	end

	diamond = Mementus::Graph.new do
	add_node(id: 1, label: :e)
	add_node(id: 2, label: :a)
	add_node(id: 3, label: :b)
	add_node(id: 4, label: :c)
	add_node(id: 5, label: :d)
	add_node(id: 6, label: :e)
	add_node(id: 7, label: :f)
	add_node(id: 8, label: :a)
	add_node(id: 9, label: :b)
	add_node(id: 10, label: :c)
	add_node(id: 11, label: :d)

	create_edge do \|edge\|
	edge.from = 1
	edge.to = 2
	end

	create_edge do \|edge\|
	edge.from = 2
	edge.to = 3
	end

	create_edge do \|edge\|
	edge.from = 2
	edge.to = 4
	end

	create_edge do \|edge\|
	edge.from = 3
	edge.to = 5
	end

	create_edge do \|edge\|
	edge.from = 4
	edge.to = 5
	end

	create_edge do \|edge\|
	edge.from = 5
	edge.to = 6
	end

	create_edge do \|edge\|
	edge.from = 6
	edge.to = 7
	end

	create_edge do \|edge\|
	edge.from = 7
	edge.to = 8
	end

	create_edge do \|edge\|
	edge.from = 8
	edge.to = 9
	end

	create_edge do \|edge\|
	edge.from = 8
	edge.to = 10
	end

	create_edge do \|edge\|
	edge.from = 9
	edge.to = 11
	end

	create_edge do \|edge\|
	edge.from = 10
	edge.to = 11
	end
	end

	puts "============================================="
	puts "match a->b against a->b->c"
	result = triple.match do
	add_node(id: 10, label: :a)
	add_node(id: 20, label: :b)

	create_edge do \|edge\|
	edge.from = 10
	edge.to = 20
	end
	end

	puts result
	puts "============================================="
	puts

	puts "============================================="
	puts "match a->b against a->b->c->a->b->d"
	result = chain.match do
	add_node(id: 10, label: :a)
	add_node(id: 20, label: :b)

	create_edge do \|edge\|
	edge.from = 10
	edge.to = 20
	end
	end

	p result
	puts "============================================="
	puts

	puts "============================================="
	puts "match a->b a->c against a->b a->c->a->b->d"
	result = tree.match do
	add_node(id: 10, label: :a)
	add_node(id: 20, label: :b)
	add_node(id: 30, label: :c)

	create_edge do \|edge\|
	edge.from = 10
	edge.to = 20
	end

	create_edge do \|edge\|
	edge.from = 10
	edge.to = 30
	end
	end

	p result
	puts "============================================="
	puts

	puts "============================================="
	puts "match a->b a->c b->d c->d against e->a->b a->c b->d c->d d->e->f f->a->b a->b"
	result = diamond.match do
	add_node(id: 10, label: :a)
	add_node(id: 20, label: :b)
	add_node(id: 30, label: :c)
	add_node(id: 40, label: :d)

	create_edge do \|edge\|
	edge.from = 10
	edge.to = 20
	end

	create_edge do \|edge\|
	edge.from = 10
	edge.to = 30
	end

	create_edge do \|edge\|
	edge.from = 20
	edge.to = 40
	end

	create_edge do \|edge\|
	edge.from = 30
	edge.to = 40
	end
	end

	p result
	puts "============================================="
	puts