Skip to content

Instantly share code, notes, and snippets.

@cheerfulstoic
Created January 31, 2018 14:03
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save cheerfulstoic/7d64cc4a3d87162c09562ffe283d813c to your computer and use it in GitHub Desktop.
Save cheerfulstoic/7d64cc4a3d87162c09562ffe283d813c to your computer and use it in GitHub Desktop.
Queries and R code for Paradise Papers presentation
# R setup
library(RNeo4j)
graph = startGraph("http://localhost:7474/db/data/")
# Queries:
# What types of objects are we dealing with?
MATCH (n)
RETURN labels(n), count(*)
ORDER BY count(*) DESC
// What is related, and how
CALL db.schema()
// Average connections (degree) to which different types of nodes are connected
MATCH (n) WITH labels(n) AS type, SIZE( (n)--() ) AS degree
RETURN type, MAX(degree) AS max, ROUND(AVG(degree)) AS avg, ROUND(STDEV(degree)) AS stdev
// Let's explore!
// Look at names of officers
MATCH (o:Officer)
RETURN o.name, count(*)
ORDER BY count(*) DESC
// They're linked together (probably by the person who imported the data into Neo4j)
MATCH (o:Officer {name: 'CLEMENTI LIMITED'})
RETURN *
// Find what jurisditions the officer is involved in
MATCH (o:Officer {name: 'CLEMENTI LIMITED'})-[r:OFFICER_OF]->(entity:Entity)
RETURN entity.jurisdiction_description, count(*) ORDER BY count(*) DESC
// How active has this officer's entities been
MATCH (o:Officer {name: 'CLEMENTI LIMITED'})-[r:OFFICER_OF]->(entity:Entity)
RETURN entity.incorporation_date ORDER BY entity.incorporation_date
// Let's aggregate the years
MATCH (o:Officer {name: 'CLEMENTI LIMITED'})-[r:OFFICER_OF]->(entity:Entity)
RETURN split(entity.incorporation_date, '-')[2] AS year, count(*) ORDER BY year
// That might be worth charting
query = "MATCH (o:Officer {name: 'CLEMENTI LIMITED'})-[r:OFFICER_OF]->(entity:Entity)
RETURN split(entity.incorporation_date, '-')[2] AS year, count(*) AS count ORDER BY year"
result <- cypher(graph, query)
result$year <- as.integer(result$year)
years <- seq(min(result$year), max(result$year))
counts <- sapply(years, function(year) { result[result$year == year, "count"][1] })
plot(years, counts)
// Big deal, those are basically just joins...
// Shortest distance between Rex Tillerson and the Queen
MATCH p=shortestPath((rex:Officer)-[*]-(queen:Officer))
WHERE rex.name = "Tillerson - Rex" AND queen.name = "The Duchy of Lancaster"
RETURN p
// All shortest distances between Rex Tillerson and the Queen
MATCH p=allShortestPaths((rex:Officer)-[*]-(queen:Officer))
WHERE rex.name = "Tillerson - Rex" AND queen.name = "The Duchy of Lancaster"
RETURN p
// Page Rank slide!
CALL algo.pageRank.stream(null, null ,{iterations:5, dampingFactor:0.85})
YIELD node, score
WHERE node:Entity
RETURN node.name AS name, node.jurisdiction_description AS jurisdiction, score
ORDER BY score DESC LIMIT 10;
// I prepared this query earlier...
// Using page rank algorithm to find central nodes
CALL algo.pageRank(null,null,{write:true,writeProperty:'pagerank_g'})
// Use the `pagerank_g` property which was set to find top entities by pagerank score
MATCH (e:Entity) WHERE exists(e.pagerank_g)
RETURN e.name AS entity, e.jurisdiction_description AS jurisdiction,
e.pagerank_g AS pagerank ORDER BY pagerank DESC LIMIT 15
// I could just pull a random entity:
MATCH (e:Entity) WITH e LIMIT 1
RETURN e LIMIT 1
// But I could use PageRack to jump to tet most connected...
MATCH (e:Entity)
RETURN e ORDER BY e.pagerank_g DESC LIMIT 1
// Get the top five most connected entities and find the entities that they are connected to
// Using `collect` function to return a data structure that is easy to use in your programming language of choice
MATCH (e:Entity)
WITH e, e.pagerank_g AS pagerank ORDER BY e.pagerank_g DESC LIMIT 5
MATCH (e)--(through)--(other:Entity)
WITH e.name AS entity, through.name AS through, other.name AS other, count(*) AS count ORDER BY count DESC
RETURN entity, collect({through: through, other: other, count: count})
// Find linked addresses
// Take the top most connect address and find all addresses linked through some intermediate node
MATCH (a:Address)
WITH a ORDER BY a.pagerank_g DESC LIMIT 1
MATCH (a)--(middle)--(other:Address)
WHERE (a) <> other
WITH a, middle, other ORDER BY middle.name
WITH a.address AS address, other.address AS other, labels(middle)[0] AS label, collect(middle.name) AS middle_names, count(*) AS count ORDER BY count(*) DESC
RETURN address, collect({label: label, middle_names: middle_names}), count, other ORDER BY count DESC
# Query from PowerPoint:
MATCH (o:Officer)-[r1:REGISTERED_ADDRESS]->(a:Address)<-[r2:REGISTERED_ADDRESS]-(i:Intermediary)-[r3:CONNECTED_TO|INTERMEDIARY_OF]->(e:Entity)-[:CONNECTED_TO|OFFICER_OF]-(other:Other),
(a)-[rother]-(e)
RETURN * LIMIT 1
// Let's visualize that
query = "MATCH (a:Address)
WITH a ORDER BY a.pagerank_g DESC LIMIT 1
MATCH (a)--()--(other:Address)
WHERE (a) <> other
WITH other.countries AS country, count(*) AS count ORDER BY count DESC
WHERE count > 10
RETURN country, count"
result <- cypher(graph, query)
result$country <- factor(result$country, levels=result[order(result$count), "country"])
library(ggplot2)
ggplot(aes(x=count, y=country), data=result) + geom_point()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment