rvanbruggen/0-schema_ops.cql

## 0-schema_ops.cql
//This offers two alternative ways of setting up Indexes and Constraints for this dataset

// Option 1: declare each index and the City uniqueness constraint by hand.
CREATE INDEX ON :StageType(name);
CREATE INDEX ON :Rider(fullname);
CREATE INDEX ON :Team(name);
CREATE INDEX ON :Stage(seq);
CREATE CONSTRAINT ON (city:City) ASSERT city.name IS UNIQUE;

// Option 2: hand the same label/property pairs to one apoc.schema.assert call
// (first map: StageType, Rider, Team, Stage; second map: City).
CALL apoc.schema.assert(
  {StageType: ["name"], Rider: ["fullname"], Team: ["name"], Stage: ["seq"]},
  {City: ["name"]}
) YIELD label, key, unique, action;


## 1 - import_tdf2016_v1.cql
// Create one StageType node per distinct name in the Type column, which is
// split on ", " so a single cell may contribute several names.
// (Local alternative: LOAD CSV WITH HEADERS FROM "file:///stages.csv" AS row)
LOAD CSV WITH HEADERS FROM "https://docs.google.com/spreadsheets/d/1kD9JwoR2MzEVyq5YV1HX3eeV2k98FJ69A10ZxQDj01Q/export?format=csv&id=1kD9JwoR2MzEVyq5YV1HX3eeV2k98FJ69A10ZxQDj01Q&gid=0" AS row
UNWIND split(row.Type, ", ") AS typeName
MERGE (:StageType {name: typeName});

// Create one Stage node per row of the stages sheet.
// (Local alternative: LOAD CSV WITH HEADERS FROM "file:///stages.csv" AS row)
LOAD CSV WITH HEADERS FROM "https://docs.google.com/spreadsheets/d/1kD9JwoR2MzEVyq5YV1HX3eeV2k98FJ69A10ZxQDj01Q/export?format=csv&id=1kD9JwoR2MzEVyq5YV1HX3eeV2k98FJ69A10ZxQDj01Q&gid=0" AS row
CREATE (:Stage {
  seq: toInt(row.Stage),
  date: row.Date,
  distance: toInt(row.Distance)
});

// Chain the stages: a stage PRECEDES the stage whose seq is exactly one higher.
MATCH (earlier:Stage), (later:Stage)
WHERE later.seq = earlier.seq + 1
MERGE (earlier)-[:PRECEDES]->(later);

// Make sure a City node exists for every From and To value in the sheet.
// (Local alternative: LOAD CSV WITH HEADERS FROM "file:///stages.csv" AS row)
LOAD CSV WITH HEADERS FROM "https://docs.google.com/spreadsheets/d/1kD9JwoR2MzEVyq5YV1HX3eeV2k98FJ69A10ZxQDj01Q/export?format=csv&id=1kD9JwoR2MzEVyq5YV1HX3eeV2k98FJ69A10ZxQDj01Q&gid=0" AS row
MERGE (:City {name: row.From})
MERGE (:City {name: row.To});

// Attach every stage to its start and finish city.
// (Local alternative: LOAD CSV WITH HEADERS FROM "file:///stages.csv" AS row)
LOAD CSV WITH HEADERS FROM "https://docs.google.com/spreadsheets/d/1kD9JwoR2MzEVyq5YV1HX3eeV2k98FJ69A10ZxQDj01Q/export?format=csv&id=1kD9JwoR2MzEVyq5YV1HX3eeV2k98FJ69A10ZxQDj01Q&gid=0" AS row
MATCH (stage:Stage {seq: toInt(row.Stage)}),
      (origin:City {name: row.From}),
      (destination:City {name: row.To})
MERGE (destination)<-[:ENDS_AT]-(stage)-[:STARTS_AT]->(origin);

// Link every stage to each of the stage types listed in its Type cell.
// (Local alternative: LOAD CSV WITH HEADERS FROM "file:///stages.csv" AS row)
LOAD CSV WITH HEADERS FROM "https://docs.google.com/spreadsheets/d/1kD9JwoR2MzEVyq5YV1HX3eeV2k98FJ69A10ZxQDj01Q/export?format=csv&id=1kD9JwoR2MzEVyq5YV1HX3eeV2k98FJ69A10ZxQDj01Q&gid=0" AS row
UNWIND split(row.Type, ", ") AS typeName
MATCH (stage:Stage {seq: toInt(row.Stage)}),
      (kind:StageType {name: typeName})
MERGE (stage)-[:HAS_TYPE]->(kind);

// Load the riders sheet: one Team per distinct team name, and one Rider per
// row attached to that team.
// (Local alternative source: "file:///riders.csv")
LOAD CSV WITH HEADERS FROM "https://docs.google.com/spreadsheets/d/1kD9JwoR2MzEVyq5YV1HX3eeV2k98FJ69A10ZxQDj01Q/export?format=csv&id=1kD9JwoR2MzEVyq5YV1HX3eeV2k98FJ69A10ZxQDj01Q&gid=1612867267" AS row
MERGE (squad:Team {name: row.Team})
MERGE (squad)<-[:PART_OF_TEAM]-(:Rider {
  id: toInt(row.ID),
  fullname: row.`Full Name`,
  firstname: row.`First Name`,
  lastname: row.`Last Name`
});

// Record the top three finishers of each stage as FINISHED relationships.
// All three riders are looked up in one MATCH, so a row only contributes
// when the stage and every one of the three names are found.
// (Local alternative: LOAD CSV WITH HEADERS FROM "file:///stages.csv" AS row)
LOAD CSV WITH HEADERS FROM "https://docs.google.com/spreadsheets/d/1kD9JwoR2MzEVyq5YV1HX3eeV2k98FJ69A10ZxQDj01Q/export?format=csv&id=1kD9JwoR2MzEVyq5YV1HX3eeV2k98FJ69A10ZxQDj01Q&gid=0" AS row
MATCH (stage:Stage {seq: toInt(row.Stage)}),
      (gold:Rider {fullname: row.Stage1st}),
      (silver:Rider {fullname: row.Stage2nd}),
      (bronze:Rider {fullname: row.Stage3rd})
MERGE (gold)-[:FINISHED {rank: 1}]->(stage)
MERGE (silver)-[:FINISHED {rank: 2}]->(stage)
MERGE (bronze)-[:FINISHED {rank: 3}]->(stage);

// Record who holds each jersey after every stage, plus the leading team.
// Everything is looked up in one MATCH, so a row only contributes when the
// stage, all four riders and the team are found.
// (Local alternative: LOAD CSV WITH HEADERS FROM "file:///stages.csv" AS row)
LOAD CSV WITH HEADERS FROM "https://docs.google.com/spreadsheets/d/1kD9JwoR2MzEVyq5YV1HX3eeV2k98FJ69A10ZxQDj01Q/export?format=csv&id=1kD9JwoR2MzEVyq5YV1HX3eeV2k98FJ69A10ZxQDj01Q&gid=0" AS row
MATCH (stage:Stage {seq: toInt(row.Stage)}),
      (yellowRider:Rider {fullname: row.Yellow}),
      (greenRider:Rider {fullname: row.Green}),
      (mountainRider:Rider {fullname: row.Mountain}),
      (youthRider:Rider {fullname: row.Youth}),
      (leadingTeam:Team {name: row.Teams})
MERGE (yellowRider)-[:HAS_JERSEY {type: "Yellow"}]->(stage)
MERGE (greenRider)-[:HAS_JERSEY {type: "Green"}]->(stage)
MERGE (mountainRider)-[:HAS_JERSEY {type: "Mountain"}]->(stage)
MERGE (youthRider)-[:HAS_JERSEY {type: "Youth"}]->(stage)
MERGE (leadingTeam)-[:LEADS_RANKING]->(stage);

## 2a - import_tdf2016_v2.cql
// Create one StageType node per distinct name in the Type column, which is
// split on ", " so a single cell may contribute several names.
// (Local alternative: LOAD CSV WITH HEADERS FROM "file:///stages.csv" AS row)
LOAD CSV WITH HEADERS FROM "https://docs.google.com/spreadsheets/d/1kD9JwoR2MzEVyq5YV1HX3eeV2k98FJ69A10ZxQDj01Q/export?format=csv&id=1kD9JwoR2MzEVyq5YV1HX3eeV2k98FJ69A10ZxQDj01Q&gid=0" AS row
UNWIND split(row.Type, ", ") AS typeName
MERGE (:StageType {name: typeName});

// Create every stage together with its podium (places 1-3) and its jersey set
// (Yellow, Green, Mountain, Youth), ready to be linked to riders later.
// (Local alternative: LOAD CSV WITH HEADERS FROM "file:///stages.csv" AS row)
LOAD CSV WITH HEADERS FROM "https://docs.google.com/spreadsheets/d/1kD9JwoR2MzEVyq5YV1HX3eeV2k98FJ69A10ZxQDj01Q/export?format=csv&id=1kD9JwoR2MzEVyq5YV1HX3eeV2k98FJ69A10ZxQDj01Q&gid=0" AS row
CREATE (stage:Stage {seq: toInt(row.Stage), date: row.Date, distance: toInt(row.Distance)})
CREATE (stage)-[:HAS_PODIUM]->(podium:Podium {name: "Stage " + row.Stage + " Podium"})
CREATE (podium)-[:HAS_PLACE]->(:StagePlace {value: 1})
CREATE (podium)-[:HAS_PLACE]->(:StagePlace {value: 2})
CREATE (podium)-[:HAS_PLACE]->(:StagePlace {value: 3})
CREATE (stage)-[:HAS_JERSEYSET]->(jerseySet:StageJerseySet {name: "Stage " + row.Stage + " Jersey Set"})
CREATE (jerseySet)-[:HAS_JERSEY]->(:StageJersey {type: "Yellow"})
CREATE (jerseySet)-[:HAS_JERSEY]->(:StageJersey {type: "Green"})
CREATE (jerseySet)-[:HAS_JERSEY]->(:StageJersey {type: "Mountain"})
CREATE (jerseySet)-[:HAS_JERSEY]->(:StageJersey {type: "Youth"});

// Chain the stages: a stage PRECEDES the stage whose seq is exactly one higher.
MATCH (earlier:Stage), (later:Stage)
WHERE later.seq = earlier.seq + 1
MERGE (earlier)-[:PRECEDES]->(later);

// Make sure a City node exists for every From and To value in the sheet.
// (Local alternative: LOAD CSV WITH HEADERS FROM "file:///stages.csv" AS row)
LOAD CSV WITH HEADERS FROM "https://docs.google.com/spreadsheets/d/1kD9JwoR2MzEVyq5YV1HX3eeV2k98FJ69A10ZxQDj01Q/export?format=csv&id=1kD9JwoR2MzEVyq5YV1HX3eeV2k98FJ69A10ZxQDj01Q&gid=0" AS row
MERGE (:City {name: row.From})
MERGE (:City {name: row.To});

// Attach every stage to its start and finish city, and link the two cities
// directly with FROM_TO.
// (Local alternative: LOAD CSV WITH HEADERS FROM "file:///stages.csv" AS row)
LOAD CSV WITH HEADERS FROM "https://docs.google.com/spreadsheets/d/1kD9JwoR2MzEVyq5YV1HX3eeV2k98FJ69A10ZxQDj01Q/export?format=csv&id=1kD9JwoR2MzEVyq5YV1HX3eeV2k98FJ69A10ZxQDj01Q&gid=0" AS row
MATCH (stage:Stage {seq: toInt(row.Stage)}),
      (origin:City {name: row.From}),
      (destination:City {name: row.To})
MERGE (destination)<-[:ENDS_AT]-(stage)-[:STARTS_AT]->(origin)
MERGE (origin)-[:FROM_TO]->(destination);

// Link every stage to each of the stage types listed in its Type cell.
// (Local alternative: LOAD CSV WITH HEADERS FROM "file:///stages.csv" AS row)
LOAD CSV WITH HEADERS FROM "https://docs.google.com/spreadsheets/d/1kD9JwoR2MzEVyq5YV1HX3eeV2k98FJ69A10ZxQDj01Q/export?format=csv&id=1kD9JwoR2MzEVyq5YV1HX3eeV2k98FJ69A10ZxQDj01Q&gid=0" AS row
UNWIND split(row.Type, ", ") AS typeName
MATCH (stage:Stage {seq: toInt(row.Stage)}),
      (kind:StageType {name: typeName})
MERGE (stage)-[:HAS_TYPE]->(kind);

// Load the riders sheet: one Team per distinct team name, and one Rider per
// row attached to that team.
// (Local alternative source: "file:///riders.csv")
LOAD CSV WITH HEADERS FROM "https://docs.google.com/spreadsheets/d/1kD9JwoR2MzEVyq5YV1HX3eeV2k98FJ69A10ZxQDj01Q/export?format=csv&id=1kD9JwoR2MzEVyq5YV1HX3eeV2k98FJ69A10ZxQDj01Q&gid=1612867267" AS row
MERGE (squad:Team {name: row.Team})
MERGE (squad)<-[:PART_OF_TEAM]-(:Rider {
  id: toInt(row.ID),
  fullname: row.`Full Name`,
  firstname: row.`First Name`,
  lastname: row.`Last Name`
});

// Put the first three finishers of each stage on that stage's podium places.
// The three steps are chained with WITH, so a row whose earlier step finds
// nothing does not reach the later steps.
// (Local alternative: LOAD CSV WITH HEADERS FROM "file:///stages.csv" AS row)
LOAD CSV WITH HEADERS FROM "https://docs.google.com/spreadsheets/d/1kD9JwoR2MzEVyq5YV1HX3eeV2k98FJ69A10ZxQDj01Q/export?format=csv&id=1kD9JwoR2MzEVyq5YV1HX3eeV2k98FJ69A10ZxQDj01Q&gid=0" AS row
MATCH (:Stage {seq: toInt(row.Stage)})--(:Podium)--(place:StagePlace {value: 1}),
      (rider:Rider {fullname: row.Stage1st})
MERGE (rider)-[:ON_PODIUM]->(place)
WITH row
MATCH (:Stage {seq: toInt(row.Stage)})--(:Podium)--(place:StagePlace {value: 2}),
      (rider:Rider {fullname: row.Stage2nd})
MERGE (rider)-[:ON_PODIUM]->(place)
WITH row
MATCH (:Stage {seq: toInt(row.Stage)})--(:Podium)--(place:StagePlace {value: 3}),
      (rider:Rider {fullname: row.Stage3rd})
MERGE (rider)-[:ON_PODIUM]->(place);


// Hand each of the four jerseys in a stage's jersey set to its holder.
// The steps run in the order Yellow, Green, Youth, Mountain and are chained
// with WITH, so a row whose earlier step finds nothing skips the later ones.
// (Local alternative: LOAD CSV WITH HEADERS FROM "file:///stages.csv" AS row)
LOAD CSV WITH HEADERS FROM "https://docs.google.com/spreadsheets/d/1kD9JwoR2MzEVyq5YV1HX3eeV2k98FJ69A10ZxQDj01Q/export?format=csv&id=1kD9JwoR2MzEVyq5YV1HX3eeV2k98FJ69A10ZxQDj01Q&gid=0" AS row
MATCH (:Stage {seq: toInt(row.Stage)})--(:StageJerseySet)--(jersey:StageJersey {type: "Yellow"}),
      (holder:Rider {fullname: row.Yellow})
MERGE (holder)-[:HAS_JERSEY]->(jersey)
WITH row
MATCH (:Stage {seq: toInt(row.Stage)})--(:StageJerseySet)--(jersey:StageJersey {type: "Green"}),
      (holder:Rider {fullname: row.Green})
MERGE (holder)-[:HAS_JERSEY]->(jersey)
WITH row
MATCH (:Stage {seq: toInt(row.Stage)})--(:StageJerseySet)--(jersey:StageJersey {type: "Youth"}),
      (holder:Rider {fullname: row.Youth})
MERGE (holder)-[:HAS_JERSEY]->(jersey)
WITH row
MATCH (:Stage {seq: toInt(row.Stage)})--(:StageJerseySet)--(jersey:StageJersey {type: "Mountain"}),
      (holder:Rider {fullname: row.Mountain})
MERGE (holder)-[:HAS_JERSEY]->(jersey);

// Promote the distinct StageJersey.type values to JerseyType nodes...
MATCH (jersey:StageJersey)
WITH DISTINCT jersey.type AS jerseyName
MERGE (:JerseyType {name: jerseyName});

// ...and point every StageJersey at the JerseyType carrying its type name.
MATCH (jersey:StageJersey), (kind:JerseyType)
WHERE kind.name = jersey.type
MERGE (jersey)-[:IS_OF_TYPE]->(kind);

// Record which team leads the team ranking after each stage.
// (Local alternative: LOAD CSV WITH HEADERS FROM "file:///stages.csv" AS row)
LOAD CSV WITH HEADERS FROM "https://docs.google.com/spreadsheets/d/1kD9JwoR2MzEVyq5YV1HX3eeV2k98FJ69A10ZxQDj01Q/export?format=csv&id=1kD9JwoR2MzEVyq5YV1HX3eeV2k98FJ69A10ZxQDj01Q&gid=0" AS row
MATCH (leader:Team {name: row.Teams}),
      (stage:Stage {seq: toInt(row.Stage)})
MERGE (leader)-[:LEADS_RANKING]->(stage);

## 2b - import_tdf2016_v2_with_apoc.cql

// Show the current timestamp; intended to mark the start or the end of the import.
CALL apoc.date.formatDefault(timestamp(),'ms');

// Run the complete v2 import script through a single APOC call.
// Variant for a script stored on the local machine:
// CALL apoc.cypher.runFile('/Users/rvanbruggen/Dropbox/Neo Technology/Demo/Tour de France 2016/load tdf2016 v2.cql') YIELD row, result;

// Variant for a script fetched over HTTP:
CALL apoc.cypher.runFile('https://gist.githubusercontent.com/rvanbruggen/c8d09f2c2fe174ebf818c344adad4fee/raw/840b9a3d92c32139b3908e78a99214894612399a/2%2520-%2520import_tdf2016_v2.cql')
YIELD row, result;

## 3 - query_tdf2016.cql
// Structure of the dataset, via the APOC meta procedures.
CALL apoc.meta.graphSample(1000);
CALL apoc.meta.graph;

// Look at the stages subgraph: stages with their direct relationships.
// Terminated with ';' like every other statement in this script, so it is not
// fused with the statement that follows when the file is run as a whole.
MATCH (n:Stage)-[r]-() RETURN n,r LIMIT 25;

// Look at the rider subgraph: riders with everything up to two hops away.
MATCH (n:Rider)-[r*..2]-() RETURN n,r LIMIT 25;

// Shortest paths between two riders, each picked by a fragment of the full name.
MATCH (src:Rider), (dst:Rider),
      route = allShortestPaths((src)-[*]-(dst))
WHERE src.fullname CONTAINS "Avermaet"
  AND dst.fullname CONTAINS "Froome"
RETURN route AS p
LIMIT 10;

// Shortest paths between two teams, each picked by a fragment of the team name.
MATCH (src:Team), (dst:Team),
      route = allShortestPaths((src)-[*]-(dst))
WHERE src.name CONTAINS "ORICA"
  AND dst.name CONTAINS "Quick"
RETURN route AS p
LIMIT 10;

// Shortest paths between one rider and one team.
MATCH (src:Rider), (dst:Team),
      route = allShortestPaths((src)-[*]-(dst))
WHERE src.fullname CONTAINS "Froome"
  AND dst.name CONTAINS "Quick"
RETURN route AS p
LIMIT 10;


// PageRank (APOC) over all Team nodes; show the ten highest-scoring teams.
// Terminated with ';' so it stays a separate statement when the script is run
// as a whole (the statement that follows starts with another MATCH).
match (t:Team)
with collect(t) as teams
call apoc.algo.pageRank(teams) YIELD node, score
return node.name, score
order by score desc
limit 10;

// Betweenness centrality (APOC) using the ON_PODIUM and HAS_JERSEY
// relationship types with direction 'BOTH'.
// Only riders with an even id are passed in.
// NOTE(review): presumably a way to halve the input and keep the run short - confirm.
match (r:Rider)
WHERE r.id %2 = 0
with collect(r) as riders
call apoc.algo.betweenness(['ON_PODIUM','HAS_JERSEY'],riders,'BOTH') YIELD node, score
with node.fullname as name, score
where score > 0
return name, score
order by score desc
limit 10;
//
// //degree of riders
// profile match (r:Rider)-[rel]-()
// return r.fullname, count(rel)
// order by count(rel) desc
// limit 10
//
// profile match (r:Rider)
// with r, size( (r)--()) as degree
// return r.fullname, degree
// order by degree desc
// limit 10
//

## 4-tdf2016graphgist.adoc

      
    Raw
  

              4-tdf2016graphgist.adoc
            
          
    The Tour de France 2016 GraphGist


Over the past couple of years, I have turned into a real cycling fan. Not just the Flanders Classics, but many of the international UCI World Tour races can grab my attention for hours on end - silly as that may be!


Now, the Tour de France 2016 is coming to an end. I have written a couple of blog posts about some graphiness in the Tour data over here:


part 1 was all about creating a dataset


part 2 was all about importing a dataset


part 3 was all about some queries on that dataset


Central to the blogpost was the ability to use a number of Neo4j Awesome Apocs for the import and queries, but of course we can do a number of things as well without them. So let’s explore


Of course we start with a Google sheet


I had to of course start from the information on the Sporza Tour website (hosted by a local TV station in Belgium), and convert that into a google sheet with all the data. Once I had that, I could add the data pretty easily with this model:


Very simple - but it’s so much nicer when you can make it interactive and load it into Neo4j. Let’s do that. Let’s load that data into this graphgist.


Let’s set up the graph with some


stagetypes: flat, mountain, timetrial, or a combination of types…


stages: 21 in total - we will connect them all together too


stage podiums: every stage has a podium associated with it


stage podium places: 1,2 and 3


stage jersey-sets:


yellow: the overall leader in the race


green: the "points" leader, which is typically the sprinters' jersey


red polka-dot: the best climber in the race


white/youth: the best under 26 year old in the race


Setting it up: step by step


Before we start: please note that this page does a bunch of queries in the background, and may take a while to load completely. Please be gentle :) …


So here we go - let’s do the initial setup:


// Initial setup in one pass over the stages sheet: the StageType nodes first,
// then every Stage with its podium (places 1-3) and its jersey set
// (Yellow, Green, Mountain, Youth).
load csv with headers from  "https://docs.google.com/spreadsheets/d/1kD9JwoR2MzEVyq5YV1HX3eeV2k98FJ69A10ZxQDj01Q/export?format=csv&id=1kD9JwoR2MzEVyq5YV1HX3eeV2k98FJ69A10ZxQDj01Q&gid=0" as csv
with csv, split(csv.Type,", ") as Realtype
unwind Realtype as Rtype
merge (st:StageType {name: Rtype})
// UNWIND emits one row per stage type, so a stage listing several types would
// reach the CREATE clauses below several times and be duplicated together with
// its podium and jersey set. DISTINCT collapses the rows back to one per stage.
with distinct csv
create (s:Stage {seq: toInt(csv.Stage), date: csv.Date, distance: toInt(csv.Distance)})-[:HAS_PODIUM]->(p:StagePodium {name:"Stage "+csv.Stage+" Podium"})-[:HAS_PLACE]->(:StagePlace {value:1})
create (p)-[:HAS_PLACE]->(:StagePlace {value:2})
create (p)-[:HAS_PLACE]->(:StagePlace {value:3})
create (s)-[:HAS_JERSEYSET]->(sjs:StageJerseySet {name:"Stage "+csv.Stage+" Jersey Set"})-[:HAS_JERSEY]->(:StageJersey {name:"Yellow"})
create (sjs)-[:HAS_JERSEY]->(:StageJersey {name:"Green"})
create (sjs)-[:HAS_JERSEY]->(:StageJersey {name:"Mountain"})
create (sjs)-[:HAS_JERSEY]->(:StageJersey {name:"Youth"});


Let’s see what we have then:


Now we can add more stuff to it.


// Cities first, then the stage-to-city links, then the stage-to-type links,
// all from one read of the stages sheet.
LOAD CSV WITH HEADERS FROM "https://docs.google.com/spreadsheets/d/1kD9JwoR2MzEVyq5YV1HX3eeV2k98FJ69A10ZxQDj01Q/export?format=csv&id=1kD9JwoR2MzEVyq5YV1HX3eeV2k98FJ69A10ZxQDj01Q&gid=0" AS row
MERGE (:City {name: row.From})
MERGE (:City {name: row.To})
WITH row
MATCH (stage:Stage {seq: toInt(row.Stage)}),
      (origin:City {name: row.From}),
      (destination:City {name: row.To})
MERGE (destination)<-[:ENDS_AT]-(stage)-[:STARTS_AT]->(origin)
MERGE (origin)-[:FROM_TO]->(destination)
WITH row
UNWIND split(row.Type, ", ") AS typeName
MATCH (typedStage:Stage {seq: toInt(row.Stage)}),
      (kind:StageType {name: typeName})
MERGE (typedStage)-[:HAS_TYPE]->(kind);


And then we can connect the different stages too:


// A stage PRECEDES the stage whose seq is exactly one higher.
MATCH (earlier:Stage), (later:Stage)
WHERE later.seq = earlier.seq + 1
MERGE (earlier)-[:PRECEDES]->(later);


Now let’s see what we have by looking at one particular stage and exploring the area around it:


// Take a single stage and return everything within two hops of it.
MATCH (stage:Stage)
WITH stage LIMIT 1
MATCH (stage)-[r*..2]-(n)
RETURN stage, r, n;


and visualize it with a subgraph:


Now we can start adding the riders and teams to the dataset. Here’s how we do that:


// Load the riders sheet: one Team per distinct team name, and one Rider per
// row attached to that team.
LOAD CSV WITH HEADERS FROM "https://docs.google.com/spreadsheets/d/1kD9JwoR2MzEVyq5YV1HX3eeV2k98FJ69A10ZxQDj01Q/export?format=csv&id=1kD9JwoR2MzEVyq5YV1HX3eeV2k98FJ69A10ZxQDj01Q&gid=1612867267" AS row
MERGE (squad:Team {name: row.Team})
MERGE (squad)<-[:PART_OF_TEAM]-(:Rider {
  id: toInt(row.ID),
  fullname: row.`Full Name`,
  firstname: row.`First Name`,
  lastname: row.`Last Name`
});


And then we can look at a couple of riders and their teams:


// Pick three teams and return them with the riders connected to them.
MATCH (t:Team)
WITH t LIMIT 3
MATCH (r:Rider)--(t)
RETURN t, r;


Which gives us this result:


Now, in the dataset/google sheet, I have also added some of the key result data for each and every one of the stages. So let’s add that to the graph as well - starting with the stage podium for every stage:


// Put the first three finishers of each stage on that stage's podium places.
// The three steps are chained with WITH, so a row whose earlier step finds
// nothing does not reach the later steps.
LOAD CSV WITH HEADERS FROM "https://docs.google.com/spreadsheets/d/1kD9JwoR2MzEVyq5YV1HX3eeV2k98FJ69A10ZxQDj01Q/export?format=csv&id=1kD9JwoR2MzEVyq5YV1HX3eeV2k98FJ69A10ZxQDj01Q&gid=0" AS row
MATCH (:Stage {seq: toInt(row.Stage)})--(:StagePodium)--(place:StagePlace {value: 1}),
      (rider:Rider {fullname: row.Stage1st})
MERGE (rider)-[:ON_PODIUM]->(place)
WITH row
MATCH (:Stage {seq: toInt(row.Stage)})--(:StagePodium)--(place:StagePlace {value: 2}),
      (rider:Rider {fullname: row.Stage2nd})
MERGE (rider)-[:ON_PODIUM]->(place)
WITH row
MATCH (:Stage {seq: toInt(row.Stage)})--(:StagePodium)--(place:StagePlace {value: 3}),
      (rider:Rider {fullname: row.Stage3rd})
MERGE (rider)-[:ON_PODIUM]->(place);


We can also add the different jerseys as they are given after every stage, and add those:


// Hand each of the four jerseys in a stage's jersey set to its holder.
// The steps run in the order Yellow, Green, Youth, Mountain and are chained
// with WITH, so a row whose earlier step finds nothing skips the later ones.
LOAD CSV WITH HEADERS FROM "https://docs.google.com/spreadsheets/d/1kD9JwoR2MzEVyq5YV1HX3eeV2k98FJ69A10ZxQDj01Q/export?format=csv&id=1kD9JwoR2MzEVyq5YV1HX3eeV2k98FJ69A10ZxQDj01Q&gid=0" AS row
MATCH (:Stage {seq: toInt(row.Stage)})--(:StageJerseySet)--(jersey:StageJersey {name: "Yellow"}),
      (holder:Rider {fullname: row.Yellow})
MERGE (holder)-[:HAS_JERSEY]->(jersey)
WITH row
MATCH (:Stage {seq: toInt(row.Stage)})--(:StageJerseySet)--(jersey:StageJersey {name: "Green"}),
      (holder:Rider {fullname: row.Green})
MERGE (holder)-[:HAS_JERSEY]->(jersey)
WITH row
MATCH (:Stage {seq: toInt(row.Stage)})--(:StageJerseySet)--(jersey:StageJersey {name: "Youth"}),
      (holder:Rider {fullname: row.Youth})
MERGE (holder)-[:HAS_JERSEY]->(jersey)
WITH row
MATCH (:Stage {seq: toInt(row.Stage)})--(:StageJerseySet)--(jersey:StageJersey {name: "Mountain"}),
      (holder:Rider {fullname: row.Mountain})
MERGE (holder)-[:HAS_JERSEY]->(jersey);


and then we can look at the podium and jersey sets for a particular stage:


// For one stage, return the paths running from a podium rider through the
// stage to a jersey-holding rider.
MATCH (s:Stage)
WITH s LIMIT 1
MATCH path = (:Rider)--(:StagePlace)--(:StagePodium)--(s)--(:StageJerseySet)--(:StageJersey)--(:Rider)
RETURN path;


And display the result:


In the Tour de France, there’s also something called a "team ranking" after every stage. So let’s add that to the graph:


// Record which team leads the team ranking after each stage.
LOAD CSV WITH HEADERS FROM "https://docs.google.com/spreadsheets/d/1kD9JwoR2MzEVyq5YV1HX3eeV2k98FJ69A10ZxQDj01Q/export?format=csv&id=1kD9JwoR2MzEVyq5YV1HX3eeV2k98FJ69A10ZxQDj01Q&gid=0" AS row
MATCH (leader:Team {name: row.Teams}),
      (stage:Stage {seq: toInt(row.Stage)})
MERGE (leader)-[:LEADS_RANKING]->(stage);


And let’s take a look at the team ranking after every stage:


// List the leading team per stage, in stage order.
MATCH (t:Team)-[:LEADS_RANKING]-(s:Stage)
RETURN DISTINCT s.date, s.seq, t.name
ORDER BY s.seq;


which gives:


Plenty of other things to do!


There are so many other things that we could look at, especially if we start using some of the awesome apocs that you can add to your own Neo4j server. Other things you can think of are ready to be explored in the console below. Enjoy!


I hope this gist was interesting for you, and that we will see each other soon.


This gist was created by Rik Van Bruggen


My Blog


the Graphistania Neo4j Graph Database Podcast


My Book


On Twitter


On LinkedIn
	//This offers two alternative ways of setting up Indexes and Constraints for this dataset

	// Option 1: declare each index and the City uniqueness constraint by hand.
	CREATE INDEX ON :StageType(name);
	CREATE INDEX ON :Rider(fullname);
	CREATE INDEX ON :Team(name);
	CREATE INDEX ON :Stage(seq);
	CREATE CONSTRAINT ON (city:City) ASSERT city.name IS UNIQUE;

	// Option 2: hand the same label/property pairs to one apoc.schema.assert call
	// (first map: StageType, Rider, Team, Stage; second map: City).
	CALL apoc.schema.assert(
	  {StageType: ["name"], Rider: ["fullname"], Team: ["name"], Stage: ["seq"]},
	  {City: ["name"]}
	) YIELD label, key, unique, action;
	// Create one StageType node per distinct name in the Type column, which is
	// split on ", " so a single cell may contribute several names.
	// (Local alternative: LOAD CSV WITH HEADERS FROM "file:///stages.csv" AS row)
	LOAD CSV WITH HEADERS FROM "https://docs.google.com/spreadsheets/d/1kD9JwoR2MzEVyq5YV1HX3eeV2k98FJ69A10ZxQDj01Q/export?format=csv&id=1kD9JwoR2MzEVyq5YV1HX3eeV2k98FJ69A10ZxQDj01Q&gid=0" AS row
	UNWIND split(row.Type, ", ") AS typeName
	MERGE (:StageType {name: typeName});

	// Create one Stage node per row of the stages sheet.
	// (Local alternative: LOAD CSV WITH HEADERS FROM "file:///stages.csv" AS row)
	LOAD CSV WITH HEADERS FROM "https://docs.google.com/spreadsheets/d/1kD9JwoR2MzEVyq5YV1HX3eeV2k98FJ69A10ZxQDj01Q/export?format=csv&id=1kD9JwoR2MzEVyq5YV1HX3eeV2k98FJ69A10ZxQDj01Q&gid=0" AS row
	CREATE (:Stage {
	  seq: toInt(row.Stage),
	  date: row.Date,
	  distance: toInt(row.Distance)
	});

	// Chain the stages: a stage PRECEDES the stage whose seq is exactly one higher.
	MATCH (earlier:Stage), (later:Stage)
	WHERE later.seq = earlier.seq + 1
	MERGE (earlier)-[:PRECEDES]->(later);

	// Make sure a City node exists for every From and To value in the sheet.
	// (Local alternative: LOAD CSV WITH HEADERS FROM "file:///stages.csv" AS row)
	LOAD CSV WITH HEADERS FROM "https://docs.google.com/spreadsheets/d/1kD9JwoR2MzEVyq5YV1HX3eeV2k98FJ69A10ZxQDj01Q/export?format=csv&id=1kD9JwoR2MzEVyq5YV1HX3eeV2k98FJ69A10ZxQDj01Q&gid=0" AS row
	MERGE (:City {name: row.From})
	MERGE (:City {name: row.To});

	// Attach every stage to its start and finish city.
	// (Local alternative: LOAD CSV WITH HEADERS FROM "file:///stages.csv" AS row)
	LOAD CSV WITH HEADERS FROM "https://docs.google.com/spreadsheets/d/1kD9JwoR2MzEVyq5YV1HX3eeV2k98FJ69A10ZxQDj01Q/export?format=csv&id=1kD9JwoR2MzEVyq5YV1HX3eeV2k98FJ69A10ZxQDj01Q&gid=0" AS row
	MATCH (stage:Stage {seq: toInt(row.Stage)}),
	      (origin:City {name: row.From}),
	      (destination:City {name: row.To})
	MERGE (destination)<-[:ENDS_AT]-(stage)-[:STARTS_AT]->(origin);

	// Link every stage to each of the stage types listed in its Type cell.
	// (Local alternative: LOAD CSV WITH HEADERS FROM "file:///stages.csv" AS row)
	LOAD CSV WITH HEADERS FROM "https://docs.google.com/spreadsheets/d/1kD9JwoR2MzEVyq5YV1HX3eeV2k98FJ69A10ZxQDj01Q/export?format=csv&id=1kD9JwoR2MzEVyq5YV1HX3eeV2k98FJ69A10ZxQDj01Q&gid=0" AS row
	UNWIND split(row.Type, ", ") AS typeName
	MATCH (stage:Stage {seq: toInt(row.Stage)}),
	      (kind:StageType {name: typeName})
	MERGE (stage)-[:HAS_TYPE]->(kind);

	// Load the riders sheet: one Team per distinct team name, and one Rider per
	// row attached to that team.
	// (Local alternative source: "file:///riders.csv")
	LOAD CSV WITH HEADERS FROM "https://docs.google.com/spreadsheets/d/1kD9JwoR2MzEVyq5YV1HX3eeV2k98FJ69A10ZxQDj01Q/export?format=csv&id=1kD9JwoR2MzEVyq5YV1HX3eeV2k98FJ69A10ZxQDj01Q&gid=1612867267" AS row
	MERGE (squad:Team {name: row.Team})
	MERGE (squad)<-[:PART_OF_TEAM]-(:Rider {
	  id: toInt(row.ID),
	  fullname: row.`Full Name`,
	  firstname: row.`First Name`,
	  lastname: row.`Last Name`
	});

	// Record the top three finishers of each stage as FINISHED relationships.
	// All three riders are looked up in one MATCH, so a row only contributes
	// when the stage and every one of the three names are found.
	// (Local alternative: LOAD CSV WITH HEADERS FROM "file:///stages.csv" AS row)
	LOAD CSV WITH HEADERS FROM "https://docs.google.com/spreadsheets/d/1kD9JwoR2MzEVyq5YV1HX3eeV2k98FJ69A10ZxQDj01Q/export?format=csv&id=1kD9JwoR2MzEVyq5YV1HX3eeV2k98FJ69A10ZxQDj01Q&gid=0" AS row
	MATCH (stage:Stage {seq: toInt(row.Stage)}),
	      (gold:Rider {fullname: row.Stage1st}),
	      (silver:Rider {fullname: row.Stage2nd}),
	      (bronze:Rider {fullname: row.Stage3rd})
	MERGE (gold)-[:FINISHED {rank: 1}]->(stage)
	MERGE (silver)-[:FINISHED {rank: 2}]->(stage)
	MERGE (bronze)-[:FINISHED {rank: 3}]->(stage);

	// Record who holds each jersey after every stage, plus the leading team.
	// Everything is looked up in one MATCH, so a row only contributes when the
	// stage, all four riders and the team are found.
	// (Local alternative: LOAD CSV WITH HEADERS FROM "file:///stages.csv" AS row)
	LOAD CSV WITH HEADERS FROM "https://docs.google.com/spreadsheets/d/1kD9JwoR2MzEVyq5YV1HX3eeV2k98FJ69A10ZxQDj01Q/export?format=csv&id=1kD9JwoR2MzEVyq5YV1HX3eeV2k98FJ69A10ZxQDj01Q&gid=0" AS row
	MATCH (stage:Stage {seq: toInt(row.Stage)}),
	      (yellowRider:Rider {fullname: row.Yellow}),
	      (greenRider:Rider {fullname: row.Green}),
	      (mountainRider:Rider {fullname: row.Mountain}),
	      (youthRider:Rider {fullname: row.Youth}),
	      (leadingTeam:Team {name: row.Teams})
	MERGE (yellowRider)-[:HAS_JERSEY {type: "Yellow"}]->(stage)
	MERGE (greenRider)-[:HAS_JERSEY {type: "Green"}]->(stage)
	MERGE (mountainRider)-[:HAS_JERSEY {type: "Mountain"}]->(stage)
	MERGE (youthRider)-[:HAS_JERSEY {type: "Youth"}]->(stage)
	MERGE (leadingTeam)-[:LEADS_RANKING]->(stage);

	// Show the current timestamp; intended to mark the start or the end of the import.
	CALL apoc.date.formatDefault(timestamp(),'ms');

	// Run the complete v2 import script through a single APOC call.
	// Variant for a script stored on the local machine:
	// CALL apoc.cypher.runFile('/Users/rvanbruggen/Dropbox/Neo Technology/Demo/Tour de France 2016/load tdf2016 v2.cql') YIELD row, result;

	// Variant for a script fetched over HTTP:
	CALL apoc.cypher.runFile('https://gist.githubusercontent.com/rvanbruggen/c8d09f2c2fe174ebf818c344adad4fee/raw/840b9a3d92c32139b3908e78a99214894612399a/2%2520-%2520import_tdf2016_v2.cql')
	YIELD row, result;
	// Structure of the dataset, via the APOC meta procedures.
	CALL apoc.meta.graphSample(1000);
	CALL apoc.meta.graph;

	// Look at the stages subgraph: stages with their direct relationships.
	// Terminated with ';' like every other statement in this script, so it is not
	// fused with the statement that follows when the file is run as a whole.
	MATCH (n:Stage)-[r]-() RETURN n,r LIMIT 25;

	// Look at the rider subgraph: riders with everything up to two hops away.
	MATCH (n:Rider)-[r*..2]-() RETURN n,r LIMIT 25;

	// Shortest paths between two riders, each picked by a fragment of the full name.
	MATCH (src:Rider), (dst:Rider),
	      route = allShortestPaths((src)-[*]-(dst))
	WHERE src.fullname CONTAINS "Avermaet"
	  AND dst.fullname CONTAINS "Froome"
	RETURN route AS p
	LIMIT 10;

	// Shortest paths between two teams, each picked by a fragment of the team name.
	MATCH (src:Team), (dst:Team),
	      route = allShortestPaths((src)-[*]-(dst))
	WHERE src.name CONTAINS "ORICA"
	  AND dst.name CONTAINS "Quick"
	RETURN route AS p
	LIMIT 10;

	// Shortest paths between one rider and one team.
	MATCH (src:Rider), (dst:Team),
	      route = allShortestPaths((src)-[*]-(dst))
	WHERE src.fullname CONTAINS "Froome"
	  AND dst.name CONTAINS "Quick"
	RETURN route AS p
	LIMIT 10;


	// PageRank (APOC) over all Team nodes; show the ten highest-scoring teams.
	// Terminated with ';' so it stays a separate statement when the script is run
	// as a whole (the statement that follows starts with another MATCH).
	match (t:Team)
	with collect(t) as teams
	call apoc.algo.pageRank(teams) YIELD node, score
	return node.name, score
	order by score desc
	limit 10;

	// Betweenness centrality (APOC) using the ON_PODIUM and HAS_JERSEY
	// relationship types with direction 'BOTH'.
	// Only riders with an even id are passed in.
	// NOTE(review): presumably a way to halve the input and keep the run short - confirm.
	match (r:Rider)
	WHERE r.id %2 = 0
	with collect(r) as riders
	call apoc.algo.betweenness(['ON_PODIUM','HAS_JERSEY'],riders,'BOTH') YIELD node, score
	with node.fullname as name, score
	where score > 0
	return name, score
	order by score desc
	limit 10;
	//
	// //degree of riders
	// profile match (r:Rider)-[rel]-()
	// return r.fullname, count(rel)
	// order by count(rel) desc
	// limit 10
	//
	// profile match (r:Rider)
	// with r, size( (r)--()) as degree
	// return r.fullname, degree
	// order by degree desc
	// limit 10
	//