Last active
September 14, 2018 09:11
-
-
Save dennisverspuij/785fc7128792a2bb939e56b21c67b98b to your computer and use it in GitHub Desktop.
Python 2/3 utility for interactively unraveling structure from a Neo4j graph database.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
# -*- coding: utf-8 -*- | |
''' | |
Python 2/3 utility for interactively unraveling structure from a Neo4j graph database. | |
First adjust the DRIVER statement for the database you wish to explore. Then run python neoexplore.py. | |
The script will first scan the database for unique combinations of labels, and offer you queries to explore these. | |
E.g. >0 (A, B)*123 presents a query for exploring 123 nodes with both labels A and B by typing >0. | |
First the possible properties are listed, e.g. .[someprop]*67 means 67 out of 123 nodes have property someprop. | |
The square brackets emphasize the property does not always exist. | |
Second possible outgoing relations are listed, e.g. <>5 >10* [somereltype]*50 (C, D)*20 means 10 out of the 123 | |
nodes together have 50 outgoing relations of type somereltype to 20 distinct nodes with both labels C and D. | |
Again the square brackets emphasize the relation does not always exist. | |
To explore the target (C, D) nodes you can type >5. Or if you wish to explore the source (A, B) nodes that do | |
have these somereltype relations to (C, D) nodes type <5. | |
To iterate source or target nodes and print their property values type <5= or >5= respectively. | |
You can additionally filter for having or not-having certain properties by adding .someprop or !someprop sequences | |
after the query number. E.g. >5.someprop!otherprop will list only those that do have someprop but not otherprop. | |
Special query >5.= lists nodes that have at least one property, and >5=! lists nodes that have no properties at all. | |
Besides using queries offered by the script you can add and run manual queries as well. First type the subject | |
variable of the query, e.g. n, and then type your query without RETURN/ORDER clause followed by an extra newline, | |
e.g. MATCH (n) WHERE id(n) < 10. | |
''' | |
from __future__ import absolute_import, division, print_function | |
import sys | |
import readline | |
import re | |
from collections import OrderedDict | |
from hashlib import sha256 as hashfunc | |
from neo4j.v1 import GraphDatabase, basic_auth | |
# Settings: ADJUST ME! | |
DRIVER = GraphDatabase.driver(uri=u'bolt://HOST_OR_IP:7687', auth=basic_auth(u'USER', u'PASSWORD')) | |
# Utils | |
read_stdin = input if vars(__builtins__).get('raw_input') is None else lambda prompt: raw_input(prompt).decode(sys.stdin.encoding) | |
# Model | |
class Query: | |
VARFORMAT = u'_{var}' # We format our own variables like this, avoid using vars like these in manual queries! | |
def __init__(self, inspector, index, hash, description, cypher, var_to, cnt_to=0, cnt_rel=0, var_from=None, cnt_from=0, parent=None): | |
self.inspector = inspector # Neo4jInspector bound to | |
self.index = index # index in collections this Query is used in | |
self.hash = hash # hash of this query and its parent | |
self.description = description # description (unicode) | |
self.cypher = cypher # cypher query (unicode) | |
self.var_to = var_to # variable name of the target subject in the cypher query (unicode) | |
self.cnt_to = cnt_to # last nr matching target nodes (int), 0 if unknown yet | |
self.cnt_rel = cnt_rel # last nr of relations (int), 0 if unknown or not applicable | |
self.var_from = var_from # variable name of the originating subject in the cypher query (unicode), 0 if not applicable | |
self.cnt_from = cnt_from # last nr matching originating nodes (int), 0 if unknown or not applicable | |
self.parent = parent # parent query (Query) | |
def var(self, alias): | |
return self.VARFORMAT.format(var=u'{0}{1}'.format(alias, self.index)) | |
def get_cypher(self): | |
return (u'' if self.parent is None else (self.parent.get_cypher() + u'\n')) + \ | |
self.cypher # u'{0} /*#1*/'.format(self.cypher, self.index) | |
def get_description(self, full=True, with_index=True): | |
description = (u'' if (not full) or self.parent is None else (self.parent.get_description(True, False) + u' ')) + \ | |
self.description.format(cnt_from=self.cnt_from, cnt_rel=self.cnt_rel, cnt_to=self.cnt_to) | |
if with_index: | |
return u'{0:<5} {1}'.format(u'{0}>{1}'.format(u' ' if self.var_from is None else u'<', self.index), description) | |
else: | |
return description | |
@staticmethod | |
def _where_props(subjectvar, propsq): | |
propsq = { m.group(2).strip(): (m.group(1) == u'.') for m in re.compile(u'([.!])([^.!]*)', re.UNICODE).finditer(propsq) } | |
where = [] | |
for key, has_or_not in propsq.items(): | |
if key != u'': | |
where.append(u'({0}EXISTS({1}.{2}))'.format(u'' if has_or_not else u'NOT ', subjectvar, key)) | |
else: | |
where = [u'properties({0}) {1} {{}}'.format(subjectvar, u'<>' if has_or_not else u'=')] | |
break | |
return (u' WHERE ' + u' AND '.join(where)) if len(where) else u'' | |
def inspect(self, use_var_from=False, propsq=u''): | |
subjectvar = self.var_from if use_var_from else self.var_to | |
if subjectvar is None: | |
raise ValueError(u'No source subject in this query') | |
where = u'WITH *' + self._where_props(subjectvar, propsq) | |
cypher = u'{0}\n{1}'.format(self.get_cypher(), where) | |
print(cypher + u'\nRETURN DISTINCT ' + subjectvar) | |
if self.var_from is None: | |
cntq = u'{0} RETURN 0 as cnt_from, 0 as cnt_rel, count(DISTINCT {1}) AS cnt_to'.format(cypher, self.var_to) | |
else: | |
cntq = u'{0} RETURN count(DISTINCT {1}) AS cnt_from, count(*) AS cnt_rel, count(DISTINCT {2}) AS cnt_to'.format(cypher, self.var_from, self.var_to) | |
try: | |
r = next(iter(self.inspector.run_query(cntq))) | |
subjectcnt = r[u'cnt_from'] if use_var_from else r[u'cnt_to'] | |
if propsq == u'': | |
self.cnt_from, self.cnt_rel, self.cnt_to = (r[u'cnt_from'], r[u'cnt_rel'], r[u'cnt_to']) | |
except StopIteration: | |
subjectcnt = 0 | |
print(u'== {0:=<77}'.format(u'{0}*{1:d} '.format(subjectvar, subjectcnt))) | |
if subjectcnt == 0: | |
return | |
cypher += '\nWITH DISTINCT ' + subjectvar | |
for key in self.inspector.propertykeys: | |
cnt = next(iter(self.inspector.run_query(u'{0} WHERE exists({1}.`{2}`) RETURN count(*) AS cnt'.format(cypher, subjectvar, key))))[u'cnt'] | |
if cnt > 0: | |
print(u' .{0}*{1:d}'.format(key if cnt == subjectcnt else u'[{0}]'.format(key), cnt)) | |
var_rel = self.var(u'r') | |
var_to = self.var(u'n') | |
relqueries = [] | |
for r in self.inspector.run_query(u'{0} MATCH ({1})-[{2}]->({3}) WITH {1}, {2}, {3} RETURN DISTINCT type({2}) AS type, labels({3}) AS labels, count(DISTINCT {1}) AS cnt_from, count(DISTINCT {2}) AS cnt_rel, count(DISTINCT {3}) AS cnt_to'.format(cypher, subjectvar, var_rel, var_to)): | |
if r[u'cnt_rel'] > 0: | |
labels = self.inspector.sort_labels(r[u'labels']) | |
relqueries.append(( | |
r[u'type'], | |
self.inspector.set( | |
description=u'{propsq}>*{{cnt_from:d}} {type}*{{cnt_rel:d}} ( {labels} )*{{cnt_to:d}}'.format( | |
propsq=propsq, type=r[u'type'] if r[u'cnt_from'] == subjectcnt else u'[{0}]'.format(r[u'type']), | |
labels=', '.join(labels) | |
), | |
cypher=u'{0} MATCH ({1})-[{2}:`{3}`]->({4}) WHERE {4}:`{5}` AND size(labels({4})) = {6}'.format( | |
where, subjectvar, var_rel, r[u'type'], var_to, u'` AND {0}:`'.format(var_to).join(labels), len(labels) | |
), | |
var_to=var_to, | |
cnt_to=r[u'cnt_to'], | |
cnt_rel=r[u'cnt_rel'], | |
var_from=subjectvar, | |
cnt_from=r[u'cnt_from'], | |
parent=self | |
) | |
)) | |
for type, relquery in sorted(relqueries, key=lambda r: (self.inspector.relationtypes[r[0]], r[0])): | |
print(relquery.get_description(False)) | |
print(u'{0:=<80}'.format(u'')) | |
def list(self, use_var_from=False, propsq=u''): | |
subjectvar = self.var_from if use_var_from else self.var_to | |
if subjectvar is None: | |
raise ValueError(u'No source subject in this query') | |
var_rel = self.var(u'r') | |
var_to = self.var(u'n') | |
cypher = u'{0}\nWITH DISTINCT {2}{1}\nOPTIONAL MATCH ({2})-[{3}]->({4})'.format(self.get_cypher(), self._where_props(subjectvar, propsq), subjectvar, var_rel, var_to) | |
print(cypher + u'\nRETURN DISTINCT ' + subjectvar) | |
cypher += u'\nRETURN DISTINCT {0}, type({1}) as type, labels({2}) as labels, count(DISTINCT {2}) as cnt_to ORDER BY id({0}), type, labels'.format(subjectvar, var_rel, var_to) | |
print(u'{0:=<80}'.format(u'')) | |
prev_id = None | |
for r in self.inspector.run_query(cypher): | |
node = r[subjectvar] | |
if node.id != prev_id: | |
if prev_id is not None: | |
try: | |
read_stdin() | |
except: | |
break | |
prev_id = node.id | |
print(u'\n({1}) {0:d}:'.format(node.id, u', '.join(self.inspector.sort_labels(node.labels)))) | |
for key, value in self.inspector.sort_properties(node.properties).items(): | |
print(u'.{0:<16s}\t{1}'.format(key, repr(value))) | |
if r[u'type'] is not None: | |
print(u'> {0} ({1})*{2}'.format(r[u'type'], u', '.join(self.inspector.sort_labels(r[u'labels'])), r[u'cnt_to'])) | |
print(u'\n{0:=<80}'.format(u'')) | |
class Neo4jInspector(): | |
def __init__(self, driver): | |
self.driver = driver # Neo4j GraphDatabase driver | |
self.reset() | |
def reset(self): | |
self.queriesByIndex = [] | |
self.queriesByHash = {} | |
self.label_uses = {} | |
print(u'Scanning labels') | |
labelsets = [[r[u'labels'], r[u'cnt']] for r in self.run_query(u'MATCH (n) RETURN DISTINCT labels(n) AS labels, count(*) AS cnt')] | |
for labels, cnt in labelsets: | |
for label in labels: | |
self.label_uses[label] = self.label_uses[label] + 1 if label in self.label_uses else 1 | |
for labels, cnt in sorted((self.sort_labels(labels), cnt) for labels, cnt in labelsets): | |
self.set( | |
description=u'( {0} )*{{cnt_to:d}}'.format(', '.join(labels)), | |
cypher=u'MATCH (n) WHERE n:`{0}` AND size(labels(n)) = {1:d}'.format(u'` AND n:`'.join(labels), len(labels)), | |
var_to=u'n', | |
cnt_to=cnt | |
) | |
print(u'Scanning relation types') | |
self.relationtypes = OrderedDict( | |
(r[u'type'], index) | |
for index, r in enumerate(self.run_query(u'MATCH ()-[r]->(n) RETURN DISTINCT type(r) as type, count(DISTINCT labels(n)) AS cnt ORDER BY cnt DESC, type ASC')) | |
) | |
print(u'Scanning property keys') | |
self.propertykeys = OrderedDict( | |
(key, index) | |
for index, (cnt, key) in enumerate(sorted( | |
(-next(iter(self.run_query(u'MATCH (n) WHERE exists(n.`{0}`) RETURN count(DISTINCT labels(n)) AS cnt'.format(r[u'propertyKey']))))[u'cnt'], r[u'propertyKey']) | |
for r in self.run_query(u'CALL db.propertyKeys()') | |
)) | |
) | |
def sort_labels(self, labels): | |
return [label for uses, label in sorted(zip([self.label_uses[label] for label in labels], labels))] | |
def sort_properties(self, properties): | |
return OrderedDict((key, value) for index, (key, value) in sorted(zip([self.propertykeys[key] for key in properties], properties.items()))) | |
def __len__(self): | |
return len(self.queriesByIndex) | |
def __iter__(self): | |
return iter(self.queriesByIndex) | |
def get(self, index): | |
return self.queriesByIndex[index] | |
def set(self, description, cypher, var_to, cnt_to=0, cnt_rel=0, var_from=None, cnt_from=0, parent=None): | |
hash = hashfunc((cypher + (u'' if parent is None else parent.hash)).encode('UTF-8')).hexdigest() | |
if hash not in self.queriesByHash: | |
query = Query(self, len(self.queriesByIndex), hash, description, cypher, var_to, cnt_to, cnt_rel, var_from, cnt_from, parent) | |
self.queriesByIndex.append(query) | |
self.queriesByHash[hash] = query | |
else: | |
query = self.queriesByHash[hash] | |
query.cnt_from, query.cnt_rel, query.cnt_to = (cnt_from, cnt_rel, cnt_to) | |
return query | |
def pop(self): | |
self.queriesByHash.pop(self.queriesByIndex.pop().hash) | |
def clear(self, till_index=0): | |
for index in range(till_index, len(self.queriesByIndex)): | |
self.pop() | |
def run_query(self, *arg, **kwarg): | |
with self.driver.session() as session: | |
return session.run(*arg, **kwarg) | |
# Main | |
inspector = Neo4jInspector(DRIVER) | |
nr_initial_queries = len(inspector) | |
query_re = re.compile(u'^(?:(?P<var_to>[`a-z][^.!=]*)|(?P<from_or_to>[<>])?(?P<nr>\d+))(?P<propsq>(?:[.!].*?)*)?(?P<list>=)?$', re.UNICODE | re.IGNORECASE) | |
do_print_queries = True | |
do_print_help = True | |
while True: | |
if do_print_queries: | |
do_print_queries = False | |
print(u'Queries (#):') | |
for index, query in enumerate(inspector): | |
if index == nr_initial_queries: | |
print(u'-----') | |
print(query.get_description()) | |
print(u'') | |
if do_print_help: | |
do_print_help = False | |
print(u'Commands:') | |
print(u'(<|>)#[(.|!)[prop]...][=]') | |
print(u' Inspect or list (=) the source (<) or target (>) nodes of given query (#),') | |
print(u' optionally filtered by having (.) or lacking (!) given properties') | |
print(u' (all or none if prop omitted).\n') | |
print(u'subject[(.|!)[prop]...][=]\\ncypher\\n\\n') | |
print(u' Inspect or list (=) the nodes of given subject from given Cypher query') | |
print(u' (without RETURN and ORDER BY clauses). The query will also be added to the') | |
print(u' list of available queries.\n') | |
print(u':') | |
print(u' List queries\n') | |
print(u'~') | |
print(u' Drop all but the initial queries for unique label combinations\n') | |
print(u'?') | |
print(u' Print this help information\n') | |
try: | |
line = read_stdin(u'>>> ').strip() | |
except EOFError as e: | |
break | |
except: | |
print(u'') | |
continue | |
if line == u'': | |
continue | |
elif line == u':': | |
do_print_queries = True | |
elif line == u'?': | |
do_print_help = True | |
elif line == u'~': | |
inspector.clear(nr_initial_queries) | |
print(u'Reset to initial queries\n') | |
else: | |
m = query_re.search(line) | |
if not m: | |
print(u'Invalid command (? = help)\n') | |
else: | |
try: | |
# Fetch query | |
if m.group(u'var_to') is None: # Defined query | |
query = inspector.get(int(m.group(u'nr'))) | |
use_from = (m.group(u'from_or_to') == u'<') | |
else: # Manual query | |
var_to = m.group(u'var_to') | |
cypher = u'' | |
try: | |
while True: | |
line = read_stdin(u'... ').strip() | |
if line == u'': | |
break | |
cypher += u'\n' + line | |
except EOFError as e: | |
continue | |
query = inspector.set( | |
description=u'| {0} |*{{cnt_to}})'.format(cypher[1:].replace(u'\n', u'').replace(u'{', u'{{').replace(u'}', u'}}')), | |
cypher=cypher[1:], | |
var_to=var_to | |
) | |
use_from = False | |
# Inspect or list | |
if m.group(u'list') is None: | |
query.inspect(use_from, m.group(u'propsq')) | |
else: | |
query.list(use_from, m.group(u'propsq')) | |
print(u'') | |
except BaseException as e: | |
print(u'ERROR: {0}\n'.format(e)) | |
if (m.group(u'var_to') is not None) and (query.index == (len(inspector)-1)): | |
inspector.pop() # Drop newly added manual query | |
print(u'') | |
sys.exit(0) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment