Skip to content

Instantly share code, notes, and snippets.

@jexp
Forked from knutwalker/README.md
Last active March 24, 2020 02:13
Show Gist options
  • Star 5 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save jexp/844cf610db4951308574 to your computer and use it in GitHub Desktop.
Save jexp/844cf610db4951308574 to your computer and use it in GitHub Desktop.
Python script to parse a git commit log into Cypher create statements for Neo4j database

What is this about?

This script parses the git log and outputs Cypher statements to create a Neo4j database of your git history.

BEGIN
create constraint on (c:Commit) assert c.sha1 is unique;
COMMIT
BEGIN
CREATE (:Commit {author_email:'foo@bar.com',date_iso_8601:'2014-05-22 20:53:05 +0200',parents:['b6393fc9d5c065fd42644caad600a9b7ac911ae2'],refs:['HEAD', 'origin/master', 'master', 'in-index'],sha1:'934cacf9fe6cd0188be642b3e609b529edaad527',subject:'Some commit message',timestamp:'1400784785'});

CREATE (:Commit {author_email:'bar@foo.com',date_iso_8601:'2014-05-22 13:22:10 +0200',parents:['7765539ff17310f2c736ee7f0a8fc5e05180e262', '2d3abe010c36214b71c9bbbcaa9f6063947068de'],sha1:'b6393fc9d5c065fd42644caad600a9b7ac911ae2',subject:'Merge pull request #2445 from foo/bar',timestamp:'1400757730'});
...
MATCH (parent:Commit {sha1:"934cacf9fe6cd0188be642b3e609b529edaad527"}), 
      (child:Commit {sha1:"b6393fc9d5c065fd42644caad600a9b7ac911ae2"}) 
CREATE (parent)<-[:PARENT]-(child);
...
COMMIT

Prerequisite

  • Requires Python 2.7

Usage

Navigate to a git repository and run:

python git2neo.py > git.sql

Import the output into a Neo4j database with the neo4j-shell.

bin/neo4j-shell -file git.sql

You can also import it into a new database.

bin/neo4j-shell -path git.db -file git.sql

Options

(git2neo.py --help)

usage: git2neo.py [-h] [-n N] [--merge] [--json] [branch]

positional arguments:
  branch           which branch to examine - defaults to HEAD

optional arguments:
  -h, --help       show this help message and exit
  -n N, --limit N  only N latest commits
  --merge          use MERGE instead of CREATE
  --json           print JSON output for the Cypher REST API.

Issues

  • can't deal with non-ASCII data (e.g. in commit messages)
  • probably many others :-)
  • numeric values should not be quoted (timestamp, etc.)
  • support the JSON format for the transactional endpoint
#!/usr/bin/env python
from __future__ import print_function
from collections import namedtuple
from itertools import tee
from operator import itemgetter
from subprocess import Popen, PIPE
import re
# One parsed `git log` record. The field order must match the placeholder
# order in GIT_LOG_FORMAT, because records are built positionally.
Commit = namedtuple(
    'Commit',
    'sha1 hash parents author_email author_name refs subject timestamp date_time',
)

# Pretty-format placeholders handed to `git log`, one per Commit field,
# joined by the literal text '%x1E' (mk_commit splits each line on '\x1E').
_LOG_PLACEHOLDERS = ('%H', '%h', '%P', '%ae', '%an', '%d', '%s', '%at', '%ai')
GIT_LOG_FORMAT = '%x1E'.join(_LOG_PLACEHOLDERS)

# sha1s for which a node statement has already been emitted.
seen_sha1s = set()
def as_list(elements):
    """Return the truthy members of *elements* as a list.

    Returns None instead of an empty list when nothing truthy remains.
    """
    kept = list(filter(None, elements))
    if not kept:
        return None
    return kept
def fix_parents(parents_string):
    """Split a whitespace-separated parents field into a list.

    Returns None when the field holds no entries.
    """
    parent_sha1s = parents_string.split()
    return as_list(parent_sha1s)
def fix_refs(refs_string):
    """Turn the refs field of a log line into a list of ref names.

    The field is stripped of surrounding whitespace, its first and last
    characters are dropped (presumably the parentheses git's `%d` puts
    around the ref names -- verify against your git version) and the
    remainder is split on ', '. Returns None when no ref names remain.
    """
    trimmed = refs_string.strip()
    inner = trimmed[1:-1]
    return as_list(inner.split(', '))
def mk_commit(commit_line):
    """Parse one '\\x1E'-separated log line into a Commit.

    Every run of non-ASCII characters is removed first. The `parents` and
    `refs` fields are then converted from strings to lists (or None when
    empty).
    """
    ascii_only = re.sub(r'[^\x00-\x7F]+', '', commit_line)
    fields = ascii_only.split('\x1E')
    raw = Commit(*fields)
    return raw._replace(
        parents=fix_parents(raw.parents),
        refs=fix_refs(raw.refs),
    )
def create_node(neo4j_op, sha1_ident, property_pairs):
    """Render one Cypher statement that creates a :Commit node.

    neo4j_op       -- Cypher keyword placed at the start of the statement.
    sha1_ident     -- accepted for call-site symmetry; not used here.
    property_pairs -- iterable of (name, value) pairs; each value is
                      rendered with its Python repr().
    """
    if not property_pairs:
        return neo4j_op + ' (:Commit);'
    rendered = []
    for name, value in property_pairs:
        rendered.append('{0}:{1!r}'.format(name, value))
    return neo4j_op + ' (:Commit {' + ','.join(rendered) + '});'
def create_rel(neo4j_op, sha1, parent):
    """Render the Cypher statement linking two commits with a PARENT relationship.

    In the emitted text the node matched on *sha1* is bound to the Cypher
    variable `parent` and the node matched on *parent* to `child`; the
    relationship points from the latter to the former.
    """
    pieces = [
        'MATCH (parent:Commit {sha1:"', sha1,
        '"}), (child:Commit {sha1:"', parent,
        '"}) ', neo4j_op,
        ' (parent)<-[:PARENT]-(child);',
    ]
    return ''.join(pieces)
def mk_node_stmts(neo4j_op, sha1_ident, **properties):
    """Yield the node-creation statement for *sha1_ident*, at most once.

    A sha1 that is already recorded in the module-level `seen_sha1s` set
    yields nothing; otherwise it is recorded and a single statement is
    yielded. Properties whose value is None are omitted, and the rest are
    emitted sorted by property name.
    """
    if sha1_ident in seen_sha1s:
        return
    seen_sha1s.add(sha1_ident)
    # .items() (rather than the Python-2-only .iteritems()) behaves the same
    # on 2.7 and keeps this function usable on Python 3.
    props = {k: v for k, v in properties.items() if v is not None}
    yield create_node(neo4j_op, sha1_ident, sorted(props.items(), key=itemgetter(0)))
def node_stmts(neo4j_op, commit_or_sha1):
    """Return the node statements for a Commit or for a bare sha1.

    An object with a `sha1` attribute contributes all of its fields as
    node properties; anything else is treated as the sha1 itself and
    contributes no properties.
    """
    is_commit = hasattr(commit_or_sha1, 'sha1')
    sha1 = commit_or_sha1.sha1 if is_commit else commit_or_sha1
    properties = commit_or_sha1._asdict() if is_commit else {}
    return mk_node_stmts(neo4j_op, sha1, **properties)
def nodes(neo4j_op, cs):
    """Yield the node statements for every commit in *cs*, in order."""
    for commit in cs:
        commit_stmts = node_stmts(neo4j_op, commit)
        for stmt in commit_stmts:
            yield stmt
def missing_parents(neo4j_op, parents):
    """Yield node statements for the given parent sha1s.

    A falsy *parents* (None or empty) yields nothing.
    """
    if not parents:
        return
    for parent_sha1 in parents:
        for stmt in node_stmts(neo4j_op, parent_sha1):
            yield stmt
def rel_stmts(neo4j_op, sha1, parents):
    """Yield one relationship statement per entry in *parents*.

    A falsy *parents* (None or empty) yields nothing.
    """
    if not parents:
        return
    for parent_sha1 in parents:
        yield create_rel(neo4j_op, sha1, parent_sha1)
def join_rel_stmts(neo4j_op, cs):
    """Yield every relationship statement for the commits in *cs*, in order.

    The statements are passed through unchanged. Iterating directly
    (instead of pairing two `tee` iterators and finishing with a bare
    `next()`) yields the same sequence, and simply yields nothing when
    there are no relationships rather than letting StopIteration escape
    from inside the generator, which PEP 479 turns into a RuntimeError.
    """
    for commit in cs:
        for stmt in rel_stmts(neo4j_op, commit.sha1, commit.parents):
            yield stmt
def rels(neo4j_op, cs):
    """Yield the statements that wire commits to their parents.

    First come node statements for the parents of every commit in *cs*,
    then the relationship statements, each prefixed with a single space.
    *cs* is traversed twice, so it must be re-iterable.
    """
    for commit in cs:
        parent_nodes = missing_parents(neo4j_op, commit.parents)
        for node_stmt in parent_nodes:
            yield node_stmt
    for rel_stmt in join_rel_stmts(neo4j_op, cs):
        yield ' ' + rel_stmt
def mk_cmd(limit, branch):
    """Build the `git log` argument list.

    limit  -- appended as `-n <limit>` unless None.
    branch -- appended as a trailing argument unless None.
    """
    cmd = ['git', 'log', '--format=format:{0}'.format(GIT_LOG_FORMAT)]
    if limit is not None:
        cmd.extend(['-n', str(limit)])
    if branch is not None:
        cmd.extend([str(branch)])
    return cmd
def commits(limit, branch):
    """Run `git log` and yield each output line with surrounding whitespace stripped.

    limit and branch are forwarded to mk_cmd(). The child's stdout pipe is
    closed and the process is waited on once iteration finishes or the
    generator is closed early, so no pipe or zombie process is left behind.
    The exit status of git is not inspected.
    """
    proc = Popen(mk_cmd(limit, branch), stdout=PIPE)
    try:
        for line in proc.stdout:
            yield line.strip()
    finally:
        proc.stdout.close()
        proc.wait()
def main(neo4j_op, limit=None, branch=None):
    """Build the list of shell statements that recreate the git history.

    neo4j_op -- Cypher keyword used for nodes and relationships.
    limit    -- forwarded to commits(); None means no limit.
    branch   -- forwarded to commits(); None leaves the choice to git.

    Returns a list of strings, empty when the log contains no commits.
    Otherwise it holds four BEGIN/COMMIT groups: node creation, derivation
    of the `date` property, index and constraint creation, and
    relationships (only emitted when there is more than one commit).
    """
    # The commits are truth-tested, measured with len() and iterated several
    # times, so they must be a real list rather than a one-shot iterator
    # (which is what map() returns on Python 3).
    cs = [mk_commit(line) for line in commits(limit, branch)]
    statements = []
    if cs:
        statements.append('BEGIN')
        statements.extend(nodes(neo4j_op, cs))
        statements.append('COMMIT')
        statements.extend([
            'BEGIN',
            'MATCH (n:Commit) SET n.date = substring(n.date_time,0,10);',
            'COMMIT',
            'BEGIN',
            'create index on :Commit(hash);',
            'create index on :Commit(author_name);',
            'create index on :Commit(author_email);',
            'create index on :Commit(date);',
            'create constraint on (c:Commit) assert c.sha1 is unique;',
            'COMMIT',
            'BEGIN',
        ])
        if len(cs) > 1:
            statements.extend(rels(neo4j_op, cs))
        statements.append('COMMIT')
    return statements
if __name__ == "__main__":
    # Command-line entry point: parse the options, build the statements and
    # print them either as plain text or as a single JSON query object.
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument(
        '-n', '--limit',
        type=int, metavar='N', default=None,
        help='only N latest commits')
    parser.add_argument(
        '--merge',
        dest='neo4j_op', action='store_const', const='MERGE', default='CREATE',
        help='use MERGE instead of CREATE')
    parser.add_argument(
        '--json',
        action='store_true',
        help='print JSON output for the Cypher REST API.')
    parser.add_argument(
        'branch',
        nargs='?', default=None,
        help='which branch to examine - defaults to HEAD')
    arguments = parser.parse_args()

    res = main(arguments.neo4j_op, arguments.limit, arguments.branch)
    if not arguments.json:
        print('\n'.join(res))
    else:
        import json
        import sys
        # All statements are folded into one space-separated query string.
        json.dump({"query": ' '.join(res)}, sys.stdout)
// Post-processing script: derives author nodes and a year/month/day tree
// from the :Commit nodes already present in the database.

// Each user is identified by a unique email address.
create constraint on (u:User) assert u.email is unique;
// create users
// One :Author:User node per distinct commit author_email; the name is only
// set when the node is first created. Every commit is linked to its author.
begin
MATCH (c:Commit)
MERGE (a:Author:User {email:c.author_email}) ON CREATE SET a.name = c.author_name
CREATE (a)-[:AUTHORED]->(c);
commit
create index on :User(name);
create constraint on (y:Year) assert y.year is unique;
// create time tree
// Reads c.date, slicing the year from characters 0-3, the month from 5-6
// and the day from 8-9, then attaches each commit to its :Day node.
begin
MATCH (c:Commit)
MERGE (y:Year {year:substring(c.date,0,4)})
MERGE (m:Month {month:substring(c.date,5,2)})-[:IN_YEAR]->(y)
MERGE (d:Day {date:c.date, day:substring(c.date,8,2)})-[:IN_MONTH]->(m)
CREATE (c)-[:ON_DAY]->(d);
commit
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment