cheerfulstoic/gist:7d64cc4a3d87162c09562ffe283d813c

## gistfile1.txt
# R setup
library(RNeo4j)

graph = startGraph("http://localhost:7474/db/data/")


# Queries:

# What types of objects are we dealing with?
MATCH (n)
RETURN labels(n), count(*)
ORDER BY count(*) DESC

// What is related, and how
CALL db.schema()

// Average connections (degree) to which different types of nodes are connected
MATCH (n) WITH labels(n) AS type, SIZE( (n)--() ) AS degree
RETURN type, MAX(degree) AS max, ROUND(AVG(degree)) AS avg, ROUND(STDEV(degree)) AS stdev

// Let's explore!

// Look at names of officers
MATCH (o:Officer)
RETURN o.name, count(*)
ORDER BY count(*) DESC

// They're linked together (probably by the person who imported the data into Neo4j)
MATCH (o:Officer {name: 'CLEMENTI LIMITED'})
RETURN *

// Find what jurisditions the officer is involved in
MATCH (o:Officer {name: 'CLEMENTI LIMITED'})-[r:OFFICER_OF]->(entity:Entity)
RETURN entity.jurisdiction_description, count(*) ORDER BY count(*) DESC

// How active has this officer's entities been
MATCH (o:Officer {name: 'CLEMENTI LIMITED'})-[r:OFFICER_OF]->(entity:Entity)
RETURN entity.incorporation_date ORDER BY entity.incorporation_date

// Let's aggregate the years
MATCH (o:Officer {name: 'CLEMENTI LIMITED'})-[r:OFFICER_OF]->(entity:Entity)
RETURN split(entity.incorporation_date, '-')[2] AS year, count(*) ORDER BY year

// That might be worth charting
query = "MATCH (o:Officer {name: 'CLEMENTI LIMITED'})-[r:OFFICER_OF]->(entity:Entity)
RETURN split(entity.incorporation_date, '-')[2] AS year, count(*) AS count ORDER BY year"

result <- cypher(graph, query)

result$year <- as.integer(result$year)
years <- seq(min(result$year), max(result$year))
counts <- sapply(years, function(year) { result[result$year == year, "count"][1] })

plot(years, counts)


// Big deal, those are basically just joins...

// Shortest distance between Rex Tillerson and the Queen
MATCH p=shortestPath((rex:Officer)-[*]-(queen:Officer))
WHERE rex.name = "Tillerson - Rex" AND queen.name = "The Duchy of Lancaster"
RETURN p

// All shortest distances between Rex Tillerson and the Queen
MATCH p=allShortestPaths((rex:Officer)-[*]-(queen:Officer))
WHERE rex.name = "Tillerson - Rex" AND queen.name = "The Duchy of Lancaster"
RETURN p


// Page Rank slide!
CALL algo.pageRank.stream(null, null ,{iterations:5, dampingFactor:0.85})
YIELD node, score
WHERE node:Entity
RETURN node.name AS name, node.jurisdiction_description AS jurisdiction, score
ORDER BY score DESC LIMIT 10;


// I prepared this query earlier...
// Using page rank algorithm  to find central nodes
CALL algo.pageRank(null,null,{write:true,writeProperty:'pagerank_g'})


// Use the `pagerank_g` property which was set to find top entities by pagerank score
MATCH (e:Entity) WHERE exists(e.pagerank_g)
RETURN e.name AS entity, e.jurisdiction_description AS jurisdiction,
       e.pagerank_g AS pagerank ORDER BY pagerank DESC LIMIT 15

// I could just pull a random entity:
MATCH (e:Entity) WITH e LIMIT 1
RETURN e LIMIT 1

// But I could use PageRack to jump to tet most connected...
MATCH (e:Entity)
RETURN e ORDER BY e.pagerank_g DESC LIMIT 1

// Get the top five most connected entities and find the entities that they are connected to
// Using `collect` function to return a data structure that is easy to use in your programming language of choice
MATCH (e:Entity)
WITH e, e.pagerank_g AS pagerank ORDER BY e.pagerank_g DESC LIMIT 5
MATCH (e)--(through)--(other:Entity)
WITH e.name AS entity, through.name AS through, other.name AS other, count(*) AS count ORDER BY count DESC
RETURN entity, collect({through: through, other: other, count: count})

// Find linked addresses
// Take the top most connect address and find all addresses linked through some intermediate node
MATCH (a:Address)
WITH a ORDER BY a.pagerank_g DESC LIMIT 1
MATCH (a)--(middle)--(other:Address)
WHERE (a) <> other
WITH a, middle, other ORDER BY middle.name
WITH a.address AS address, other.address AS other, labels(middle)[0] AS label, collect(middle.name) AS middle_names, count(*) AS count ORDER BY count(*) DESC
RETURN address, collect({label: label, middle_names: middle_names}), count, other ORDER BY count DESC


# Query from PowerPoint:
MATCH (o:Officer)-[r1:REGISTERED_ADDRESS]->(a:Address)<-[r2:REGISTERED_ADDRESS]-(i:Intermediary)-[r3:CONNECTED_TO|INTERMEDIARY_OF]->(e:Entity)-[:CONNECTED_TO|OFFICER_OF]-(other:Other),
(a)-[rother]-(e)
RETURN * LIMIT 1


// Let's visualize that
query = "MATCH (a:Address)
WITH a ORDER BY a.pagerank_g DESC LIMIT 1
MATCH (a)--()--(other:Address)
WHERE (a) <> other
WITH other.countries AS country, count(*) AS count ORDER BY count DESC
WHERE count > 10
RETURN country, count"

result <- cypher(graph, query)

result$country <- factor(result$country, levels=result[order(result$count), "country"])

library(ggplot2)
ggplot(aes(x=count, y=country), data=result) + geom_point()
	# R setup
	library(RNeo4j)

	graph = startGraph("http://localhost:7474/db/data/")


	# Queries:

	# What types of objects are we dealing with?
	MATCH (n)
	RETURN labels(n), count(*)
	ORDER BY count(*) DESC

	// What is related, and how
	CALL db.schema()

	// Average connections (degree) to which different types of nodes are connected
	MATCH (n) WITH labels(n) AS type, SIZE( (n)--() ) AS degree
	RETURN type, MAX(degree) AS max, ROUND(AVG(degree)) AS avg, ROUND(STDEV(degree)) AS stdev

	// Let's explore!

	// Look at names of officers
	MATCH (o:Officer)
	RETURN o.name, count(*)
	ORDER BY count(*) DESC

	// They're linked together (probably by the person who imported the data into Neo4j)
	MATCH (o:Officer {name: 'CLEMENTI LIMITED'})
	RETURN *

	// Find what jurisditions the officer is involved in
	MATCH (o:Officer {name: 'CLEMENTI LIMITED'})-[r:OFFICER_OF]->(entity:Entity)
	RETURN entity.jurisdiction_description, count() ORDER BY count() DESC

	// How active has this officer's entities been
	MATCH (o:Officer {name: 'CLEMENTI LIMITED'})-[r:OFFICER_OF]->(entity:Entity)
	RETURN entity.incorporation_date ORDER BY entity.incorporation_date

	// Let's aggregate the years
	MATCH (o:Officer {name: 'CLEMENTI LIMITED'})-[r:OFFICER_OF]->(entity:Entity)
	RETURN split(entity.incorporation_date, '-')[2] AS year, count(*) ORDER BY year

	// That might be worth charting
	query = "MATCH (o:Officer {name: 'CLEMENTI LIMITED'})-[r:OFFICER_OF]->(entity:Entity)
	RETURN split(entity.incorporation_date, '-')[2] AS year, count(*) AS count ORDER BY year"

	result <- cypher(graph, query)

	result$year <- as.integer(result$year)
	years <- seq(min(result$year), max(result$year))
	counts <- sapply(years, function(year) { result[result$year == year, "count"][1] })

	plot(years, counts)


	// Big deal, those are basically just joins...

	// Shortest distance between Rex Tillerson and the Queen
	MATCH p=shortestPath((rex:Officer)-[*]-(queen:Officer))
	WHERE rex.name = "Tillerson - Rex" AND queen.name = "The Duchy of Lancaster"
	RETURN p

	// All shortest distances between Rex Tillerson and the Queen
	MATCH p=allShortestPaths((rex:Officer)-[*]-(queen:Officer))
	WHERE rex.name = "Tillerson - Rex" AND queen.name = "The Duchy of Lancaster"
	RETURN p


	// Page Rank slide!
	CALL algo.pageRank.stream(null, null ,{iterations:5, dampingFactor:0.85})
	YIELD node, score
	WHERE node:Entity
	RETURN node.name AS name, node.jurisdiction_description AS jurisdiction, score
	ORDER BY score DESC LIMIT 10;


	// I prepared this query earlier...
	// Using page rank algorithm to find central nodes
	CALL algo.pageRank(null,null,{write:true,writeProperty:'pagerank_g'})


	// Use the `pagerank_g` property which was set to find top entities by pagerank score
	MATCH (e:Entity) WHERE exists(e.pagerank_g)
	RETURN e.name AS entity, e.jurisdiction_description AS jurisdiction,
	e.pagerank_g AS pagerank ORDER BY pagerank DESC LIMIT 15

	// I could just pull a random entity:
	MATCH (e:Entity) WITH e LIMIT 1
	RETURN e LIMIT 1

	// But I could use PageRack to jump to tet most connected...
	MATCH (e:Entity)
	RETURN e ORDER BY e.pagerank_g DESC LIMIT 1

	// Get the top five most connected entities and find the entities that they are connected to
	// Using `collect` function to return a data structure that is easy to use in your programming language of choice
	MATCH (e:Entity)
	WITH e, e.pagerank_g AS pagerank ORDER BY e.pagerank_g DESC LIMIT 5
	MATCH (e)--(through)--(other:Entity)
	WITH e.name AS entity, through.name AS through, other.name AS other, count(*) AS count ORDER BY count DESC
	RETURN entity, collect({through: through, other: other, count: count})

	// Find linked addresses
	// Take the top most connect address and find all addresses linked through some intermediate node
	MATCH (a:Address)
	WITH a ORDER BY a.pagerank_g DESC LIMIT 1
	MATCH (a)--(middle)--(other:Address)
	WHERE (a) <> other
	WITH a, middle, other ORDER BY middle.name
	WITH a.address AS address, other.address AS other, labels(middle)[0] AS label, collect(middle.name) AS middle_names, count() AS count ORDER BY count() DESC
	RETURN address, collect({label: label, middle_names: middle_names}), count, other ORDER BY count DESC


	# Query from PowerPoint:
	MATCH (o:Officer)-[r1:REGISTERED_ADDRESS]->(a:Address)<-[r2:REGISTERED_ADDRESS]-(i:Intermediary)-[r3:CONNECTED_TO\|INTERMEDIARY_OF]->(e:Entity)-[:CONNECTED_TO\|OFFICER_OF]-(other:Other),
	(a)-[rother]-(e)
	RETURN * LIMIT 1



	// Let's visualize that
	query = "MATCH (a:Address)
	WITH a ORDER BY a.pagerank_g DESC LIMIT 1
	MATCH (a)--()--(other:Address)
	WHERE (a) <> other
	WITH other.countries AS country, count(*) AS count ORDER BY count DESC
	WHERE count > 10
	RETURN country, count"

	result <- cypher(graph, query)

	result$country <- factor(result$country, levels=result[order(result$count), "country"])

	library(ggplot2)
	ggplot(aes(x=count, y=country), data=result) + geom_point()