Skip to content

Instantly share code, notes, and snippets.

@rvanbruggen
Last active August 25, 2016 17:00
Show Gist options
  • Star 2 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save rvanbruggen/ead7d81740b4b90c74107d5818c7e80d to your computer and use it in GitHub Desktop.
Save rvanbruggen/ead7d81740b4b90c74107d5818c7e80d to your computer and use it in GitHub Desktop.
Olympic Medallists
create index on :Country(name);
create index on :City(name);
create index on :Sport(name);
create index on :Discipline(name);
create index on :Athlete(name);
create index on :Event(name);
create index on :Year(name);
create index on :Gender(name);
create constraint on (c:Country) assert c.noc is unique;
//add the genders
create (g:Gender {name:"Men"});
create (g:Gender {name:"Women"});
//load countries
load csv with headers from
// "file:///ioccountrycodes.csv" as csv
"https://docs.google.com/a/neotechnology.com/spreadsheets/d/1l-23m3ZPTkz5HNGp-1uHCB1qqhLxq5LSrqNDqBD_T3w/export?format=csv&id=1l-23m3ZPTkz5HNGp-1uHCB1qqhLxq5LSrqNDqBD_T3w&gid=2" as csv
create (c:Country {name: csv.Country, noc: csv.NOC, iso: csv.ISOcode});
//load cities
load csv with headers from
// "file:///cities.csv" as csv
"https://docs.google.com/a/neotechnology.com/spreadsheets/d/1l-23m3ZPTkz5HNGp-1uHCB1qqhLxq5LSrqNDqBD_T3w/export?format=csv&id=1l-23m3ZPTkz5HNGp-1uHCB1qqhLxq5LSrqNDqBD_T3w&gid=2027901504 " as csv
match (c:Country {noc: csv.NOC})
merge (ci:City {name: csv.City})
merge (ci)-[:LOCATED_IN]->(c);
//load sports taxonomy
load csv with headers from
// "file:///sportstaxonomy.csv" as csv
"https://docs.google.com/a/neotechnology.com/spreadsheets/d/1l-23m3ZPTkz5HNGp-1uHCB1qqhLxq5LSrqNDqBD_T3w/export?format=csv&id=1l-23m3ZPTkz5HNGp-1uHCB1qqhLxq5LSrqNDqBD_T3w&gid=1457987568" as csv
merge (s:Sport {name: csv.Sport})
merge (s)<-[:PART_OF]-(d:Discipline {name: csv.Discipline});
//load the games
load csv with headers from
// "file:///medallists.csv" as csv
"https://docs.google.com/a/neotechnology.com/spreadsheets/d/1l-23m3ZPTkz5HNGp-1uHCB1qqhLxq5LSrqNDqBD_T3w/export?format=csv&id=1l-23m3ZPTkz5HNGp-1uHCB1qqhLxq5LSrqNDqBD_T3w&gid=0" as csv
match (c:City {name: csv.City})
merge (c)-[:HOSTS_GAMES]->(y:Year {name: csv.Edition});
//load the events at the games
cypher planner=rule
load csv with headers from
// "file:///medallists.csv" as csv
"https://docs.google.com/a/neotechnology.com/spreadsheets/d/1l-23m3ZPTkz5HNGp-1uHCB1qqhLxq5LSrqNDqBD_T3w/export?format=csv&id=1l-23m3ZPTkz5HNGp-1uHCB1qqhLxq5LSrqNDqBD_T3w&gid=0" as csv
match (y:Year {name: csv.Edition}), (d:Discipline {name: csv.Discipline})
merge (d)<-[:PART_OF]-(e:Event {name: csv.Event})-[:AT_GAMES]->(y);
//load the medallists
load csv with headers from
// "file:///medallists.csv" as csv
"https://docs.google.com/a/neotechnology.com/spreadsheets/d/1l-23m3ZPTkz5HNGp-1uHCB1qqhLxq5LSrqNDqBD_T3w/export?format=csv&id=1l-23m3ZPTkz5HNGp-1uHCB1qqhLxq5LSrqNDqBD_T3w&gid=0" as csv
match (g:Gender {name: csv.Gender})
merge (a:Athlete {name: csv.Athlete})-[:HAS_GENDER]->(g);
cypher planner=rule
load csv with headers from
// "file:///medallists.csv" as csv
"https://docs.google.com/a/neotechnology.com/spreadsheets/d/1l-23m3ZPTkz5HNGp-1uHCB1qqhLxq5LSrqNDqBD_T3w/export?format=csv&id=1l-23m3ZPTkz5HNGp-1uHCB1qqhLxq5LSrqNDqBD_T3w&gid=0" as csv
match (a:Athlete {name: csv.Athlete}), (c:Country {noc: csv.NOC})
merge (a)-[:REPRESENTS]->(c);
cypher planner=rule
using periodic commit
load csv with headers from
// "file:///medallists.csv" as csv
"https://docs.google.com/a/neotechnology.com/spreadsheets/d/1l-23m3ZPTkz5HNGp-1uHCB1qqhLxq5LSrqNDqBD_T3w/export?format=csv&id=1l-23m3ZPTkz5HNGp-1uHCB1qqhLxq5LSrqNDqBD_T3w&gid=0" as csv
match (a:Athlete {name: csv.Athlete}), (d:Discipline {name: csv.Discipline})<--(e:Event {name: csv.Event})-->(y:Year {name: csv.Edition})
create (a)-[:WINS]->(m:Medal {type: csv.Medal})-[:REWARD_FOR]->(e);
//number of sports per game
match (y:Year)<--(e:Event)-->(d:Discipline)-->(s:Sport)
with distinct y.name as game, s.name as sport
return game, count(sport)
order by game ASC
//medals table
match (y:Year {name:"1956"}),
(c:Country)<-[:REPRESENTS]-(a:Athlete)-[:WINS]->(m:Medal)-[:REWARD_FOR]->(e:Event)-[:AT_GAMES]->(y)
with y, c, count(m) as totalmedals
match (c)<-[:REPRESENTS]-(a:Athlete)-[:WINS]->(m:Medal)-[:REWARD_FOR]->(e:Event)-[:AT_GAMES]->(y)
return c.name, totalmedals, m.type, count(m)
order by totalmedals desc
limit 10;
//medals per sport per country
match (c:Country)<--(a:Athlete)-->(m:Medal)-->(e:Event)-->()-->(s:Sport),
(e)--(y:Year {name:"1956"})
return c.name, s.name, count(s) as NrOfMedals
order by NrOfMedals desc;
//athletes with medals in games hosted in their home countries
MATCH (y:Year)<--(e:Event)<--(m:Medal)<--(a:Athlete)-->(co:Country)<--(c:City)-->(y)
return y.name, c.name, co.name, m.type, count(a)
order by y.name desc
MATCH (y:Year)<--(e:Event)<--(m:Medal)<--(a:Athlete)-->(co:Country)<--(c:City)-->(y)
return y.name, c.name, co.name, m.type, count(a), collect(a.name)
order by y.name desc
//top Olympians
match (a:Athlete)
with a, size((a)-[:WINS]->()) as countmedals
where countmedals > 1
return a.name, countmedals
order by countmedals desc
limit 10
//top Olympians and their types of medals
match (a:Athlete)
with a, size((a)-[:WINS]->()) as countmedals
order by countmedals desc
limit 10
match (a)--(m:Medal)
return a.name, m.type, count(m)
order by a.name, count(m) desc;
//Olympians with medals in different games
match (a:Athlete)--(m:Medal)--(e:Event)--(y:Year)
return a.name as name, collect(y.name+": "+m.type) as Years
order by size(Years) desc
limit 10;
//Olympians with medals in different sports
match (a:Athlete)
with a, size((a)-[:WINS]->()) as countmedals
limit 100
where countmedals > 1
match (s:Sport)--(:Discipline)--(:Event)--(m:Medal)--(a)
with distinct a,s
match (a)-->(c:Country)
with a.name as Athlete, c.name as Country, collect(s.name) as Sports
where size(Sports)>1
return Athlete, Country, Sports
//Most recent Olympian with medals in different sports
match (a:Athlete)
with a, size((a)-[:WINS]->()) as countmedals
limit 100
where countmedals > 1
match (s:Sport)--(:Discipline)--(:Event)--(m:Medal)--(a)
with distinct a,s
with a, collect(s.name) as Sports
where size(Sports)>1
with a
match path=((s:Sport)<--(:Discipline)<--(e:Event)<--(:Medal)<--(a)-->(c:Country)), (e)-->(y:Year)
return path,y
order by y.name desc
limit 10;
//some paths between belgium and the London Olympics
match (c:Country {name:"Belgium"}), (y:Year {name:"2012"}),
p = allshortestpaths( (c)-[*]-(y) )
return p;

The Great Olympic Graph Guide

Rio

Note: this graphgist is really meant to be run as a "Guide" in the Neo4j Browser. You need to add a specific config param to your local server. Add browser.remote_content_hostname_whitelist=http://portal.graphgist.org as a property for the :play http://portal.graphgist.org/graph_gists/the-great-olympian-graph/graph_guide command to work.

Now, the Rio Olympics 2016 are coming to an end. I have written a couple of blog posts about some graphiness in the Olympic medal data over here:

  • part 1 was all about creating a dataset

  • part 2 was all about importing a dataset

  • part 3 was all about some queries on that dataset

Of course we start with a Google sheet

Rio

I had to of course start from two spreadsheets on The Guardian website and combine those:

This became this global spreadsheet which you can then convert into 4 .csv files really easily:

Now let’s look t how we model this.

Choosing a model for the Olympic Graph

Rio

Here’s the model that we will be setting up:

model

Now let’s look at how we import that.

Importing the Great Olympic Graph

Rio

So here we go - let’s do the initial setup:

create index on :Country(name);
create index on :City(name);
create index on :Sport(name);
create index on :Discipline(name);
create index on :Athlete(name);
create index on :Event(name);
create index on :Year(name);
create index on :Gender(name);
create constraint on (c:Country) assert c.noc is unique;

Then we start by first adding the 2 gender nodes

//add the genders
create (g1:Gender {name:"Men"}), (g2:Gender {name:"Women"});

Then we add the countries:

//load countries
load csv with headers from
// "file:///ioccountrycodes.csv" as csv
"https://docs.google.com/a/neotechnology.com/spreadsheets/d/1l-23m3ZPTkz5HNGp-1uHCB1qqhLxq5LSrqNDqBD_T3w/export?format=csv&id=1l-23m3ZPTkz5HNGp-1uHCB1qqhLxq5LSrqNDqBD_T3w&gid=2" as csv
create (c:Country {name: csv.Country, noc: csv.NOC, iso: csv.ISOcode});

Then we add the cities:

//load cities
load csv with headers from
// "file:///cities.csv" as csv
"https://docs.google.com/a/neotechnology.com/spreadsheets/d/1l-23m3ZPTkz5HNGp-1uHCB1qqhLxq5LSrqNDqBD_T3w/export?format=csv&id=1l-23m3ZPTkz5HNGp-1uHCB1qqhLxq5LSrqNDqBD_T3w&gid=2027901504 " as csv
match (c:Country {noc: csv.NOC})
merge (ci:City {name: csv.City})
merge (ci)-[:LOCATED_IN]->(c);

Then we add the sports taxonomy:

//load sports taxonomy
load csv with headers from
// "file:///sportstaxonomy.csv" as csv
"https://docs.google.com/a/neotechnology.com/spreadsheets/d/1l-23m3ZPTkz5HNGp-1uHCB1qqhLxq5LSrqNDqBD_T3w/export?format=csv&id=1l-23m3ZPTkz5HNGp-1uHCB1qqhLxq5LSrqNDqBD_T3w&gid=1457987568" as csv
merge (s:Sport {name: csv.Sport})
merge (s)<-[:PART_OF]-(d:Discipline {name: csv.Discipline});

Then we add the Olympic Games / Years to the graph:

//load the games
load csv with headers from
// "file:///medallists.csv" as csv
"https://docs.google.com/a/neotechnology.com/spreadsheets/d/1l-23m3ZPTkz5HNGp-1uHCB1qqhLxq5LSrqNDqBD_T3w/export?format=csv&id=1l-23m3ZPTkz5HNGp-1uHCB1qqhLxq5LSrqNDqBD_T3w&gid=0" as csv
match (c:City {name: csv.City})
merge (c)-[:HOSTS_GAMES]->(y:Year {name: csv.Edition});

Then we add the events to the games:

//load the events at the games
cypher planner=rule
load csv with headers from
// "file:///medallists.csv" as csv
"https://docs.google.com/a/neotechnology.com/spreadsheets/d/1l-23m3ZPTkz5HNGp-1uHCB1qqhLxq5LSrqNDqBD_T3w/export?format=csv&id=1l-23m3ZPTkz5HNGp-1uHCB1qqhLxq5LSrqNDqBD_T3w&gid=0" as csv
match (y:Year {name: csv.Edition}), (d:Discipline {name: csv.Discipline})
merge (d)<-[:PART_OF]-(e:Event {name: csv.Event})-[:AT_GAMES]->(y);

Then we add the athletes and connect them to their genders:

//load the medallists
load csv with headers from
// "file:///medallists.csv" as csv
"https://docs.google.com/a/neotechnology.com/spreadsheets/d/1l-23m3ZPTkz5HNGp-1uHCB1qqhLxq5LSrqNDqBD_T3w/export?format=csv&id=1l-23m3ZPTkz5HNGp-1uHCB1qqhLxq5LSrqNDqBD_T3w&gid=0" as csv
match (g:Gender {name: csv.Gender})
merge (a:Athlete {name: csv.Athlete})-[:HAS_GENDER]->(g);

And connect the athletes to their countries:

cypher planner=rule
load csv with headers from
// "file:///medallists.csv" as csv
"https://docs.google.com/a/neotechnology.com/spreadsheets/d/1l-23m3ZPTkz5HNGp-1uHCB1qqhLxq5LSrqNDqBD_T3w/export?format=csv&id=1l-23m3ZPTkz5HNGp-1uHCB1qqhLxq5LSrqNDqBD_T3w&gid=0" as csv
match (a:Athlete {name: csv.Athlete}), (c:Country {noc: csv.NOC})
merge (a)-[:REPRESENTS]->(c);

And then finally we connect the athletes and events to a medal:

cypher planner=rule
using periodic commit
load csv with headers from
// "file:///medallists.csv" as csv
"https://docs.google.com/a/neotechnology.com/spreadsheets/d/1l-23m3ZPTkz5HNGp-1uHCB1qqhLxq5LSrqNDqBD_T3w/export?format=csv&id=1l-23m3ZPTkz5HNGp-1uHCB1qqhLxq5LSrqNDqBD_T3w&gid=0" as csv
match (a:Athlete {name: csv.Athlete}), (d:Discipline {name: csv.Discipline})<--(e:Event {name: csv.Event})-->(y:Year {name: csv.Edition})
create (a)-[:WINS]->(m:Medal {type: csv.Medal})-[:REWARD_FOR]->(e);

Querying the Great Olympic Graph

Rio

Look at the sports per game:

//number of sports per game
match (y:Year)<--(e:Event)-->(d:Discipline)-->(s:Sport)
with distinct y.name as game, s.name as sport
return game, count(sport)
order by game ASC

Look at the Medals table:

//medals table
match (y:Year {name:"1956"}),
(c:Country)<-[:REPRESENTS]-(a:Athlete)-[:WINS]->(m:Medal)-[:REWARD_FOR]->(e:Event)-[:AT_GAMES]->(y)
with y, c, count(m) as totalmedals
match (c)<-[:REPRESENTS]-(a:Athlete)-[:WINS]->(m:Medal)-[:REWARD_FOR]->(e:Event)-[:AT_GAMES]->(y)
return c.name, totalmedals, m.type, count(m)
order by totalmedals desc
limit 10;

Look at the Medals per sport per country:

//medals per sport per country
match (c:Country)<--(a:Athlete)-->(m:Medal)-->(e:Event)-->()-->(s:Sport),
(e)--(y:Year {name:"1956"})
return c.name, s.name, count(s) as NrOfMedals
order by NrOfMedals desc;

Look at athletes with medals in games hosted in their home countries:

//athletes with medals in games hosted in their home countries
MATCH (y:Year)<--(e:Event)<--(m:Medal)<--(a:Athlete)-->(co:Country)<--(c:City)-->(y)
return y.name, c.name, co.name, m.type, count(a)
order by y.name desc

And collect the names:

MATCH (y:Year)<--(e:Event)<--(m:Medal)<--(a:Athlete)-->(co:Country)<--(c:City)-->(y)
return y.name, c.name, co.name, m.type, count(a), collect(a.name)
order by y.name desc

Look at the top Olympians:

//top Olympians
match (a:Athlete)
with a, size((a)-[:WINS]->()) as countmedals
where countmedals > 1
return a.name, countmedals
order by countmedals desc
limit 10

Top Olympians and their types of medals:

//top Olympians and their types of medals
match (a:Athlete)
with a, size((a)-[:WINS]->()) as countmedals
order by countmedals desc
limit 10
match (a)--(m:Medal)
return a.name, m.type, count(m)
order by a.name, count(m) desc;

Olympians with medals in different games:

//Olympians with medals in different games
match (a:Athlete)--(m:Medal)--(e:Event)--(y:Year)
return a.name as name, collect(y.name+": "+m.type) as Years
order by size(Years) desc
limit 10;

Olympians with medals in different sports:

//Olympians with medals in different sports
match (a:Athlete)
with a, size((a)-[:WINS]->()) as countmedals
limit 100
where countmedals > 1
match (s:Sport)--(:Discipline)--(:Event)--(m:Medal)--(a)
with distinct a,s
match (a)-->(c:Country)
with a.name as Athlete, c.name as Country, collect(s.name) as Sports
where size(Sports)>1
return Athlete, Country, Sports

Most recent Olympian with medals in different sports:

//Most recent Olympian with medals in different sports
match (a:Athlete)
with a, size((a)-[:WINS]->()) as countmedals
limit 100
where countmedals > 1
match (s:Sport)--(:Discipline)--(:Event)--(m:Medal)--(a)
with distinct a,s
with a, collect(s.name) as Sports
where size(Sports)>1
with a
match path=((s:Sport)<--(:Discipline)<--(e:Event)<--(:Medal)<--(a)-->(c:Country)), (e)-->(y:Year)
return path,y
order by y.name desc
limit 10;

To wrap it up: some paths between belgium and the London Olympics:

//some paths between belgium and the London Olympics
match (c:Country {name:"Belgium"}), (y:Year {name:"2012"}),
p = allshortestpaths( (c)-[*]-(y) )
return p;

Plenty of other Olympic things to do!

Rio

There are so many other things that we could look at, especially if we start using some of the awesome apocs that you can add to your own Neo4j server. Enjoy!

I hope this gist was interesting for you, and that we will see each other soon.

This gist was created by Rik Van Bruggen

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment