Skip to content

Instantly share code, notes, and snippets.

@dennisverspuij
Last active September 14, 2018 09:11
Show Gist options
  • Save dennisverspuij/785fc7128792a2bb939e56b21c67b98b to your computer and use it in GitHub Desktop.
Save dennisverspuij/785fc7128792a2bb939e56b21c67b98b to your computer and use it in GitHub Desktop.
Python 2/3 utility for interactively unraveling structure from a Neo4j graph database.
#!/usr/bin/env python
# -*- coding: utf-8 -*-
'''
Python 2/3 utility for interactively unraveling structure from a Neo4j graph database.
First adjust the DRIVER statement for the database you wish to explore. Then run python neoexplore.py.
The script will first scan the database for unique combinations of labels, and offer you queries to explore these.
E.g. >0 (A, B)*123 presents a query for exploring 123 nodes with both labels A and B by typing >0.
First the possible properties are listed, e.g. .[someprop]*67 means 67 out of 123 nodes have property someprop.
The square brackets emphasize the property does not always exist.
Second possible outgoing relations are listed, e.g. <>5 >10* [somereltype]*50 (C, D)*20 means 10 out of the 123
nodes together have 50 outgoing relations of type somereltype to 20 distinct nodes with both labels C and D.
Again the square brackets emphasize the relation does not always exist.
To explore the target (C, D) nodes you can type >5. Or if you wish to explore the source (A, B) nodes that do
have these somereltype relations to (C, D) nodes type <5.
To iterate source or target nodes and print their property values type <5= or >5= respectively.
You can additionally filter for having or not-having certain properties by adding .someprop or !someprop sequences
after the query number. E.g. >5.someprop!otherprop will list only those that do have someprop but not otherprop.
Special query >5.= lists nodes that have at least one property, and >5=! lists nodes that have no properties at all.
Besides using queries offered by the script you can add and run manual queries as well. First type the subject
variable of the query, e.g. n, and then type your query without RETURN/ORDER clause followed by an extra newline,
e.g. MATCH (n) WHERE id(n) < 10.
'''
from __future__ import absolute_import, division, print_function
import sys
import readline
import re
from collections import OrderedDict
from hashlib import sha256 as hashfunc
from neo4j.v1 import GraphDatabase, basic_auth
# Settings: ADJUST ME!
DRIVER = GraphDatabase.driver(uri=u'bolt://HOST_OR_IP:7687', auth=basic_auth(u'USER', u'PASSWORD'))
# Utils
read_stdin = input if vars(__builtins__).get('raw_input') is None else lambda prompt: raw_input(prompt).decode(sys.stdin.encoding)
# Model
class Query:
VARFORMAT = u'_{var}' # We format our own variables like this, avoid using vars like these in manual queries!
def __init__(self, inspector, index, hash, description, cypher, var_to, cnt_to=0, cnt_rel=0, var_from=None, cnt_from=0, parent=None):
self.inspector = inspector # Neo4jInspector bound to
self.index = index # index in collections this Query is used in
self.hash = hash # hash of this query and its parent
self.description = description # description (unicode)
self.cypher = cypher # cypher query (unicode)
self.var_to = var_to # variable name of the target subject in the cypher query (unicode)
self.cnt_to = cnt_to # last nr matching target nodes (int), 0 if unknown yet
self.cnt_rel = cnt_rel # last nr of relations (int), 0 if unknown or not applicable
self.var_from = var_from # variable name of the originating subject in the cypher query (unicode), 0 if not applicable
self.cnt_from = cnt_from # last nr matching originating nodes (int), 0 if unknown or not applicable
self.parent = parent # parent query (Query)
def var(self, alias):
return self.VARFORMAT.format(var=u'{0}{1}'.format(alias, self.index))
def get_cypher(self):
return (u'' if self.parent is None else (self.parent.get_cypher() + u'\n')) + \
self.cypher # u'{0} /*#1*/'.format(self.cypher, self.index)
def get_description(self, full=True, with_index=True):
description = (u'' if (not full) or self.parent is None else (self.parent.get_description(True, False) + u' ')) + \
self.description.format(cnt_from=self.cnt_from, cnt_rel=self.cnt_rel, cnt_to=self.cnt_to)
if with_index:
return u'{0:<5} {1}'.format(u'{0}>{1}'.format(u' ' if self.var_from is None else u'<', self.index), description)
else:
return description
@staticmethod
def _where_props(subjectvar, propsq):
propsq = { m.group(2).strip(): (m.group(1) == u'.') for m in re.compile(u'([.!])([^.!]*)', re.UNICODE).finditer(propsq) }
where = []
for key, has_or_not in propsq.items():
if key != u'':
where.append(u'({0}EXISTS({1}.{2}))'.format(u'' if has_or_not else u'NOT ', subjectvar, key))
else:
where = [u'properties({0}) {1} {{}}'.format(subjectvar, u'<>' if has_or_not else u'=')]
break
return (u' WHERE ' + u' AND '.join(where)) if len(where) else u''
def inspect(self, use_var_from=False, propsq=u''):
subjectvar = self.var_from if use_var_from else self.var_to
if subjectvar is None:
raise ValueError(u'No source subject in this query')
where = u'WITH *' + self._where_props(subjectvar, propsq)
cypher = u'{0}\n{1}'.format(self.get_cypher(), where)
print(cypher + u'\nRETURN DISTINCT ' + subjectvar)
if self.var_from is None:
cntq = u'{0} RETURN 0 as cnt_from, 0 as cnt_rel, count(DISTINCT {1}) AS cnt_to'.format(cypher, self.var_to)
else:
cntq = u'{0} RETURN count(DISTINCT {1}) AS cnt_from, count(*) AS cnt_rel, count(DISTINCT {2}) AS cnt_to'.format(cypher, self.var_from, self.var_to)
try:
r = next(iter(self.inspector.run_query(cntq)))
subjectcnt = r[u'cnt_from'] if use_var_from else r[u'cnt_to']
if propsq == u'':
self.cnt_from, self.cnt_rel, self.cnt_to = (r[u'cnt_from'], r[u'cnt_rel'], r[u'cnt_to'])
except StopIteration:
subjectcnt = 0
print(u'== {0:=<77}'.format(u'{0}*{1:d} '.format(subjectvar, subjectcnt)))
if subjectcnt == 0:
return
cypher += '\nWITH DISTINCT ' + subjectvar
for key in self.inspector.propertykeys:
cnt = next(iter(self.inspector.run_query(u'{0} WHERE exists({1}.`{2}`) RETURN count(*) AS cnt'.format(cypher, subjectvar, key))))[u'cnt']
if cnt > 0:
print(u' .{0}*{1:d}'.format(key if cnt == subjectcnt else u'[{0}]'.format(key), cnt))
var_rel = self.var(u'r')
var_to = self.var(u'n')
relqueries = []
for r in self.inspector.run_query(u'{0} MATCH ({1})-[{2}]->({3}) WITH {1}, {2}, {3} RETURN DISTINCT type({2}) AS type, labels({3}) AS labels, count(DISTINCT {1}) AS cnt_from, count(DISTINCT {2}) AS cnt_rel, count(DISTINCT {3}) AS cnt_to'.format(cypher, subjectvar, var_rel, var_to)):
if r[u'cnt_rel'] > 0:
labels = self.inspector.sort_labels(r[u'labels'])
relqueries.append((
r[u'type'],
self.inspector.set(
description=u'{propsq}>*{{cnt_from:d}} {type}*{{cnt_rel:d}} ( {labels} )*{{cnt_to:d}}'.format(
propsq=propsq, type=r[u'type'] if r[u'cnt_from'] == subjectcnt else u'[{0}]'.format(r[u'type']),
labels=', '.join(labels)
),
cypher=u'{0} MATCH ({1})-[{2}:`{3}`]->({4}) WHERE {4}:`{5}` AND size(labels({4})) = {6}'.format(
where, subjectvar, var_rel, r[u'type'], var_to, u'` AND {0}:`'.format(var_to).join(labels), len(labels)
),
var_to=var_to,
cnt_to=r[u'cnt_to'],
cnt_rel=r[u'cnt_rel'],
var_from=subjectvar,
cnt_from=r[u'cnt_from'],
parent=self
)
))
for type, relquery in sorted(relqueries, key=lambda r: (self.inspector.relationtypes[r[0]], r[0])):
print(relquery.get_description(False))
print(u'{0:=<80}'.format(u''))
def list(self, use_var_from=False, propsq=u''):
subjectvar = self.var_from if use_var_from else self.var_to
if subjectvar is None:
raise ValueError(u'No source subject in this query')
var_rel = self.var(u'r')
var_to = self.var(u'n')
cypher = u'{0}\nWITH DISTINCT {2}{1}\nOPTIONAL MATCH ({2})-[{3}]->({4})'.format(self.get_cypher(), self._where_props(subjectvar, propsq), subjectvar, var_rel, var_to)
print(cypher + u'\nRETURN DISTINCT ' + subjectvar)
cypher += u'\nRETURN DISTINCT {0}, type({1}) as type, labels({2}) as labels, count(DISTINCT {2}) as cnt_to ORDER BY id({0}), type, labels'.format(subjectvar, var_rel, var_to)
print(u'{0:=<80}'.format(u''))
prev_id = None
for r in self.inspector.run_query(cypher):
node = r[subjectvar]
if node.id != prev_id:
if prev_id is not None:
try:
read_stdin()
except:
break
prev_id = node.id
print(u'\n({1}) {0:d}:'.format(node.id, u', '.join(self.inspector.sort_labels(node.labels))))
for key, value in self.inspector.sort_properties(node.properties).items():
print(u'.{0:<16s}\t{1}'.format(key, repr(value)))
if r[u'type'] is not None:
print(u'> {0} ({1})*{2}'.format(r[u'type'], u', '.join(self.inspector.sort_labels(r[u'labels'])), r[u'cnt_to']))
print(u'\n{0:=<80}'.format(u''))
class Neo4jInspector():
def __init__(self, driver):
self.driver = driver # Neo4j GraphDatabase driver
self.reset()
def reset(self):
self.queriesByIndex = []
self.queriesByHash = {}
self.label_uses = {}
print(u'Scanning labels')
labelsets = [[r[u'labels'], r[u'cnt']] for r in self.run_query(u'MATCH (n) RETURN DISTINCT labels(n) AS labels, count(*) AS cnt')]
for labels, cnt in labelsets:
for label in labels:
self.label_uses[label] = self.label_uses[label] + 1 if label in self.label_uses else 1
for labels, cnt in sorted((self.sort_labels(labels), cnt) for labels, cnt in labelsets):
self.set(
description=u'( {0} )*{{cnt_to:d}}'.format(', '.join(labels)),
cypher=u'MATCH (n) WHERE n:`{0}` AND size(labels(n)) = {1:d}'.format(u'` AND n:`'.join(labels), len(labels)),
var_to=u'n',
cnt_to=cnt
)
print(u'Scanning relation types')
self.relationtypes = OrderedDict(
(r[u'type'], index)
for index, r in enumerate(self.run_query(u'MATCH ()-[r]->(n) RETURN DISTINCT type(r) as type, count(DISTINCT labels(n)) AS cnt ORDER BY cnt DESC, type ASC'))
)
print(u'Scanning property keys')
self.propertykeys = OrderedDict(
(key, index)
for index, (cnt, key) in enumerate(sorted(
(-next(iter(self.run_query(u'MATCH (n) WHERE exists(n.`{0}`) RETURN count(DISTINCT labels(n)) AS cnt'.format(r[u'propertyKey']))))[u'cnt'], r[u'propertyKey'])
for r in self.run_query(u'CALL db.propertyKeys()')
))
)
def sort_labels(self, labels):
return [label for uses, label in sorted(zip([self.label_uses[label] for label in labels], labels))]
def sort_properties(self, properties):
return OrderedDict((key, value) for index, (key, value) in sorted(zip([self.propertykeys[key] for key in properties], properties.items())))
def __len__(self):
return len(self.queriesByIndex)
def __iter__(self):
return iter(self.queriesByIndex)
def get(self, index):
return self.queriesByIndex[index]
def set(self, description, cypher, var_to, cnt_to=0, cnt_rel=0, var_from=None, cnt_from=0, parent=None):
hash = hashfunc((cypher + (u'' if parent is None else parent.hash)).encode('UTF-8')).hexdigest()
if hash not in self.queriesByHash:
query = Query(self, len(self.queriesByIndex), hash, description, cypher, var_to, cnt_to, cnt_rel, var_from, cnt_from, parent)
self.queriesByIndex.append(query)
self.queriesByHash[hash] = query
else:
query = self.queriesByHash[hash]
query.cnt_from, query.cnt_rel, query.cnt_to = (cnt_from, cnt_rel, cnt_to)
return query
def pop(self):
self.queriesByHash.pop(self.queriesByIndex.pop().hash)
def clear(self, till_index=0):
for index in range(till_index, len(self.queriesByIndex)):
self.pop()
def run_query(self, *arg, **kwarg):
with self.driver.session() as session:
return session.run(*arg, **kwarg)
# Main
inspector = Neo4jInspector(DRIVER)
nr_initial_queries = len(inspector)
query_re = re.compile(u'^(?:(?P<var_to>[`a-z][^.!=]*)|(?P<from_or_to>[<>])?(?P<nr>\d+))(?P<propsq>(?:[.!].*?)*)?(?P<list>=)?$', re.UNICODE | re.IGNORECASE)
do_print_queries = True
do_print_help = True
while True:
if do_print_queries:
do_print_queries = False
print(u'Queries (#):')
for index, query in enumerate(inspector):
if index == nr_initial_queries:
print(u'-----')
print(query.get_description())
print(u'')
if do_print_help:
do_print_help = False
print(u'Commands:')
print(u'(<|>)#[(.|!)[prop]...][=]')
print(u' Inspect or list (=) the source (<) or target (>) nodes of given query (#),')
print(u' optionally filtered by having (.) or lacking (!) given properties')
print(u' (all or none if prop omitted).\n')
print(u'subject[(.|!)[prop]...][=]\\ncypher\\n\\n')
print(u' Inspect or list (=) the nodes of given subject from given Cypher query')
print(u' (without RETURN and ORDER BY clauses). The query will also be added to the')
print(u' list of available queries.\n')
print(u':')
print(u' List queries\n')
print(u'~')
print(u' Drop all but the initial queries for unique label combinations\n')
print(u'?')
print(u' Print this help information\n')
try:
line = read_stdin(u'>>> ').strip()
except EOFError as e:
break
except:
print(u'')
continue
if line == u'':
continue
elif line == u':':
do_print_queries = True
elif line == u'?':
do_print_help = True
elif line == u'~':
inspector.clear(nr_initial_queries)
print(u'Reset to initial queries\n')
else:
m = query_re.search(line)
if not m:
print(u'Invalid command (? = help)\n')
else:
try:
# Fetch query
if m.group(u'var_to') is None: # Defined query
query = inspector.get(int(m.group(u'nr')))
use_from = (m.group(u'from_or_to') == u'<')
else: # Manual query
var_to = m.group(u'var_to')
cypher = u''
try:
while True:
line = read_stdin(u'... ').strip()
if line == u'':
break
cypher += u'\n' + line
except EOFError as e:
continue
query = inspector.set(
description=u'| {0} |*{{cnt_to}})'.format(cypher[1:].replace(u'\n', u'').replace(u'{', u'{{').replace(u'}', u'}}')),
cypher=cypher[1:],
var_to=var_to
)
use_from = False
# Inspect or list
if m.group(u'list') is None:
query.inspect(use_from, m.group(u'propsq'))
else:
query.list(use_from, m.group(u'propsq'))
print(u'')
except BaseException as e:
print(u'ERROR: {0}\n'.format(e))
if (m.group(u'var_to') is not None) and (query.index == (len(inspector)-1)):
inspector.pop() # Drop newly added manual query
print(u'')
sys.exit(0)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment