Skip to content

Instantly share code, notes, and snippets.

@rvanbruggen
Last active January 17, 2016 19:44
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save rvanbruggen/371e06726a58843a2938 to your computer and use it in GitHub Desktop.
Save rvanbruggen/371e06726a58843a2938 to your computer and use it in GitHub Desktop.
Creating and Querying the Graphblog-Graph
//Bruggen blog feed
//https://docs.google.com/a/neotechnology.com/spreadsheets/d/1LAQarqQ-id74-zxV6R4SdG7mCq_24xACXO5WNOP-2_w/export?format=csv&id=1LAQarqQ-id74-zxV6R4SdG7mCq_24xACXO5WNOP-2_w&gid=0
create index on :Blog(name);
create constraint on (p:Page) assert p.url is unique;
create (b:Blog {name:"Bruggen", url:"http://blog.bruggen.com"});
create (n:Blog {name:"Neo4j Blog", url:"http://neo4j.com/blog"});
create (n:Blog {name:"JEXP Blog", url:"http://jexp.de/blog/"});
create (n:Blog {name:"Armbruster-IT Blog", url:"http://blog.armbruster-it.de/"});
create (n:Blog {name:"Max De Marzi's Blog", url:"http://maxdemarzi.com/"});
create (n:Blog {name:"Will Lyon's Blog", url:"http://lyonwj.com/"});
create (n:Blog {name:"Mark Needham's Blog", url:"http://markhneedham.com/blog"});
//create the Bruggen blog entries
load csv with headers from "https://docs.google.com/a/neotechnology.com/spreadsheets/d/1LAQarqQ-id74-zxV6R4SdG7mCq_24xACXO5WNOP-2_w/export?format=csv&id=1LAQarqQ-id74-zxV6R4SdG7mCq_24xACXO5WNOP-2_w&gid=0" as csv
match (b:Blog {name:"Bruggen", url:"http://blog.bruggen.com"})
create (p:Page {url: csv.URL, title: csv.Title, created: csv.Date})-[:PART_OF]->(b);
//create the link graph
load csv with headers from "https://docs.google.com/a/neotechnology.com/spreadsheets/d/1LAQarqQ-id74-zxV6R4SdG7mCq_24xACXO5WNOP-2_w/export?format=csv&id=1LAQarqQ-id74-zxV6R4SdG7mCq_24xACXO5WNOP-2_w&gid=0" as csv
with csv.URL as URL, csv.Links as row
unwind row as listoflinks
with URL, [l in split(listoflinks,"****") | trim(l)] as links
unwind range(0,size(links)-2) as idx
MERGE (l:Page {url:links[idx]})
WITH l, URL
MATCH (p:Page {url: URL})
MERGE (p)-[:LINKS_TO]->(l);
//remove the redundant links
//linking to pages with same url (eg. archive pages, label pages...)
match (b:Blog {name:"Bruggen"})<-[:PART_OF]-(p1:Page)-[:LINKS_TO]->(p2:Page)
where p2.url starts with "http://blog.bruggen.com"
and not ((b)<-[:PART_OF]-(p2))
detach delete p2;
//linking to other posts of the same blog
match (p1:Page)-[:PART_OF]->(b:Blog {name:"Bruggen"})<-[:PART_OF]-(p2:Page),
(p1)-[lt:LINKS_TO]-(p2)
delete lt;
//linking to itself
match (p1:Page)-[:PART_OF]->(b:Blog {name:"Bruggen"}),
(p1)-[lt:LINKS_TO]-(p1)
delete lt;
//linking to the blog provider (Blogger)
match (p:Page)
where p.url contains "//www.blogger.com"
detach delete p;
//Neo4j blog feed
//https://docs.google.com/a/neotechnology.com/spreadsheets/d/1LAQarqQ-id74-zxV6R4SdG7mCq_24xACXO5WNOP-2_w/export?format=csv&id=1LAQarqQ-id74-zxV6R4SdG7mCq_24xACXO5WNOP-2_w&gid=999553051
//create the blog entries
load csv with headers from "https://docs.google.com/a/neotechnology.com/spreadsheets/d/1LAQarqQ-id74-zxV6R4SdG7mCq_24xACXO5WNOP-2_w/export?format=csv&id=1LAQarqQ-id74-zxV6R4SdG7mCq_24xACXO5WNOP-2_w&gid=999553051" as csv
merge (p:Page {url: csv.URL})
on match set p.title=csv.Title, p.created=csv.Date
on create set p.title=csv.Title, p.created=csv.Date
with p
match (b:Blog {name:"Neo4j Blog", url:"http://neo4j.com/blog"})
create (p)-[:PART_OF]->(b);
//create the link graph
load csv with headers from "https://docs.google.com/a/neotechnology.com/spreadsheets/d/1LAQarqQ-id74-zxV6R4SdG7mCq_24xACXO5WNOP-2_w/export?format=csv&id=1LAQarqQ-id74-zxV6R4SdG7mCq_24xACXO5WNOP-2_w&gid=999553051" as csv
with csv.URL as URL, csv.Links as row
unwind row as linklist
with URL, [l in split(linklist,"****") | trim(l)] as links
unwind range(0,size(links)-2) as idx
MERGE (l:Page {url:links[idx]})
WITH l, URL
MATCH (p:Page {url: URL})
CREATE (p)-[:LINKS_TO]->(l);
//remove the redundant links
match (b:Blog {name:"Neo4j Blog"})<-[:PART_OF]-(p1:Page)-[:LINKS_TO]->(p2:Page)
where p2.url starts with "http://neo4j.com/blog"
and not ((b)<-[:PART_OF]-(p2))
detach delete p2;
match (b:Blog {name:"Neo4j Blog"})<-[:PART_OF]-(p1:Page)-[:LINKS_TO]->(p2:Page)
where p2.url contains "//neo4j.com/"
and not ((b)<-[:PART_OF]-(p2))
detach delete p2;
match (p1:Page)-[:PART_OF]->(b:Blog {name:"Neo4j Blog"})<-[:PART_OF]-(p2:Page),
(p1)-[lt:LINKS_TO]-(p2)
delete lt;
match (p1:Page)-[:PART_OF]->(b:Blog {name:"Neo4j Blog"}),
(p1)-[lt:LINKS_TO]-(p1)
delete lt;
//JEXP blog feed
//https://docs.google.com/a/neotechnology.com/spreadsheets/d/1LAQarqQ-id74-zxV6R4SdG7mCq_24xACXO5WNOP-2_w/export?format=csv&id=1LAQarqQ-id74-zxV6R4SdG7mCq_24xACXO5WNOP-2_w&gid=2066848510
//create the blog entries
load csv with headers from "https://docs.google.com/a/neotechnology.com/spreadsheets/d/1LAQarqQ-id74-zxV6R4SdG7mCq_24xACXO5WNOP-2_w/export?format=csv&id=1LAQarqQ-id74-zxV6R4SdG7mCq_24xACXO5WNOP-2_w&gid=2066848510" as csv
merge (p:Page {url: csv.URL})
on match set p.title=csv.Title, p.created=csv.Date
on create set p.title=csv.Title, p.created=csv.Date
with p
match (b:Blog {name:"JEXP Blog", url:"http://jexp.de/blog/"})
create (p)-[:PART_OF]->(b);
//create the link graph
load csv with headers from "https://docs.google.com/a/neotechnology.com/spreadsheets/d/1LAQarqQ-id74-zxV6R4SdG7mCq_24xACXO5WNOP-2_w/export?format=csv&id=1LAQarqQ-id74-zxV6R4SdG7mCq_24xACXO5WNOP-2_w&gid=2066848510" as csv
with csv.URL as URL, csv.Links as row
unwind row as linklist
with URL, [l in split(linklist,"****") | trim(l)] as links
unwind range(0,size(links)-2) as idx
MERGE (l:Page {url:links[idx]})
WITH l, URL
MATCH (p:Page {url: URL})
CREATE (p)-[:LINKS_TO]->(l);
//remove the redundant links
match (b:Blog {name:"JEXP Blog"})<-[:PART_OF]-(p1:Page)-[:LINKS_TO]->(p2:Page)
where p2.url starts with "http://jexp.de/blog"
and not ((b)<-[:PART_OF]-(p2))
detach delete p2;
match (b:Blog {name:"JEXP Blog"})<-[:PART_OF]-(p1:Page)-[:LINKS_TO]->(p2:Page)
where p2.url contains "//jexp.de/"
and not ((b)<-[:PART_OF]-(p2))
detach delete p2;
match (p1:Page)-[:PART_OF]->(b:Blog {name:"JEXP Blog"})<-[:PART_OF]-(p2:Page),
(p1)-[lt:LINKS_TO]-(p2)
delete lt;
match (p1:Page)-[:PART_OF]->(b:Blog {name:"JEXP Blog"}),
(p1)-[lt:LINKS_TO]-(p1)
delete lt;
//Stefan Armbruster-IT https://docs.google.com/spreadsheets/d/1LAQarqQ-id74-zxV6R4SdG7mCq_24xACXO5WNOP-2_w/export?format=csv&id=1LAQarqQ-id74-zxV6R4SdG7mCq_24xACXO5WNOP-2_w&gid=1143698530
//create the blog entries
load csv with headers from "https://docs.google.com/spreadsheets/d/1LAQarqQ-id74-zxV6R4SdG7mCq_24xACXO5WNOP-2_w/export?format=csv&id=1LAQarqQ-id74-zxV6R4SdG7mCq_24xACXO5WNOP-2_w&gid=1143698530" as csv
merge (p:Page {url: csv.URL})
on match set p.title=csv.Title, p.created=csv.Date
on create set p.title=csv.Title, p.created=csv.Date
with p
match (b:Blog {name:"Armbruster-IT Blog", url:"http://blog.armbruster-it.de/"})
create (p)-[:PART_OF]->(b);
//create the link graph
load csv with headers from "https://docs.google.com/spreadsheets/d/1LAQarqQ-id74-zxV6R4SdG7mCq_24xACXO5WNOP-2_w/export?format=csv&id=1LAQarqQ-id74-zxV6R4SdG7mCq_24xACXO5WNOP-2_w&gid=1143698530" as csv
with csv.URL as URL, csv.Links as row
unwind row as linklist
with URL, [l in split(linklist,"****") | trim(l)] as links
unwind range(0,size(links)-2) as idx
MERGE (l:Page {url:links[idx]})
WITH l, URL
MATCH (p:Page {url: URL})
CREATE (p)-[:LINKS_TO]->(l);
//remove the redundant links
match (b:Blog {name:"Armbruster-IT Blog"})<-[:PART_OF]-(p1:Page)-[:LINKS_TO]->(p2:Page)
where p2.url starts with "http://blog.armbruster-it.de"
and not ((b)<-[:PART_OF]-(p2))
detach delete p2;
match (b:Blog {name:"Armbruster-IT Blog"})<-[:PART_OF]-(p1:Page)-[:LINKS_TO]->(p2:Page)
where p2.url contains "//blog.armbruster-it.de/"
and not ((b)<-[:PART_OF]-(p2))
detach delete p2;
match (p1:Page)-[:PART_OF]->(b:Blog {name:"Armbruster-IT Blog"})<-[:PART_OF]-(p2:Page),
(p1)-[lt:LINKS_TO]-(p2)
delete lt;
match (p1:Page)-[:PART_OF]->(b:Blog {name:"Armbruster-IT Blog"}),
(p1)-[lt:LINKS_TO]-(p1)
delete lt;
//Max De Marzi
https://docs.google.com/spreadsheets/d/1LAQarqQ-id74-zxV6R4SdG7mCq_24xACXO5WNOP-2_w/export?format=csv&id=1LAQarqQ-id74-zxV6R4SdG7mCq_24xACXO5WNOP-2_w&gid=1473003649
//create the blog entries
load csv with headers from "https://docs.google.com/spreadsheets/d/1LAQarqQ-id74-zxV6R4SdG7mCq_24xACXO5WNOP-2_w/export?format=csv&id=1LAQarqQ-id74-zxV6R4SdG7mCq_24xACXO5WNOP-2_w&gid=1473003649" as csv
merge (p:Page {url: csv.URL})
on match set p.title=csv.Title, p.created=csv.Date
on create set p.title=csv.Title, p.created=csv.Date
with p
match (b:Blog {name:"Max De Marzi's Blog", url:"http://maxdemarzi.com/"})
create (p)-[:PART_OF]->(b);
//create the link graph
load csv with headers from "https://docs.google.com/spreadsheets/d/1LAQarqQ-id74-zxV6R4SdG7mCq_24xACXO5WNOP-2_w/export?format=csv&id=1LAQarqQ-id74-zxV6R4SdG7mCq_24xACXO5WNOP-2_w&gid=1473003649" as csv
with csv.URL as URL, csv.Links as row
unwind row as linklist
with URL, [l in split(linklist,"****") | trim(l)] as links
unwind range(0,size(links)-2) as idx
MERGE (l:Page {url:links[idx]})
WITH l, URL
MATCH (p:Page {url: URL})
CREATE (p)-[:LINKS_TO]->(l);
// //remove the redundant links
// match (b:Blog {name:"Max De Marzi's Blog"})<-[:PART_OF]-(p1:Page)-[:LINKS_TO]->(p2:Page)
// where p2.url starts with "http://maxdemarzi.com"
// and not ((b)<-[:PART_OF]-(p2))
// detach delete p2;
match (b:Blog {name:"Max De Marzi's Blog"})<-[:PART_OF]-(p1:Page)-[:LINKS_TO]->(p2:Page)
where p2.url contains "//maxdemarzi.com/"
and not ((b)<-[:PART_OF]-(p2))
detach delete p2;
match (p1:Page)-[:PART_OF]->(b:Blog {name:"Max De Marzi's Blog"})<-[:PART_OF]-(p2:Page),
(p1)-[lt:LINKS_TO]-(p2)
delete lt;
match (p1:Page)-[:PART_OF]->(b:Blog {name:"Max De Marzi's Blog"}),
(p1)-[lt:LINKS_TO]-(p1)
delete lt;
//Will Lyon
https://docs.google.com/spreadsheets/d/1LAQarqQ-id74-zxV6R4SdG7mCq_24xACXO5WNOP-2_w/export?format=csv&id=1LAQarqQ-id74-zxV6R4SdG7mCq_24xACXO5WNOP-2_w&gid=1326206509
//create the blog entries
load csv with headers from "https://docs.google.com/spreadsheets/d/1LAQarqQ-id74-zxV6R4SdG7mCq_24xACXO5WNOP-2_w/export?format=csv&id=1LAQarqQ-id74-zxV6R4SdG7mCq_24xACXO5WNOP-2_w&gid=1326206509" as csv
merge (p:Page {url: csv.URL})
on match set p.title=csv.Title, p.created="Unknown"
on create set p.title=csv.Title, p.created="Unknown"
with p
match (b:Blog {name:"Will Lyon's Blog", url:"http://lyonwj.com/"})
create (p)-[:PART_OF]->(b);
//create the link graph
load csv with headers from "https://docs.google.com/spreadsheets/d/1LAQarqQ-id74-zxV6R4SdG7mCq_24xACXO5WNOP-2_w/export?format=csv&id=1LAQarqQ-id74-zxV6R4SdG7mCq_24xACXO5WNOP-2_w&gid=1326206509" as csv
with csv.URL as URL, csv.Links as row
unwind row as linklist
with URL, [l in split(linklist,"****") | trim(l)] as links
unwind range(0,size(links)-2) as idx
MERGE (l:Page {url:links[idx]})
WITH l, URL
MATCH (p:Page {url: URL})
CREATE (p)-[:LINKS_TO]->(l);
// //remove the redundant links
match (b:Blog {name:"Will Lyon's Blog"})<-[:PART_OF]-(p1:Page)-[:LINKS_TO]->(p2:Page)
where p2.url contains "//lyonwj.com/"
and not ((b)<-[:PART_OF]-(p2))
detach delete p2;
match (p1:Page)-[:PART_OF]->(b:Blog {name:"Will Lyon's Blog"})<-[:PART_OF]-(p2:Page),
(p1)-[lt:LINKS_TO]-(p2)
delete lt;
match (p1:Page)-[:PART_OF]->(b:Blog {name:"Will Lyon's Blog"}),
(p1)-[lt:LINKS_TO]-(p1)
delete lt;
//Mark Needham
https://docs.google.com/a/neotechnology.com/spreadsheets/d/1LAQarqQ-id74-zxV6R4SdG7mCq_24xACXO5WNOP-2_w/export?format=csv&id=1LAQarqQ-id74-zxV6R4SdG7mCq_24xACXO5WNOP-2_w&gid=1346442478
//create the blog entries
load csv with headers from "https://docs.google.com/a/neotechnology.com/spreadsheets/d/1LAQarqQ-id74-zxV6R4SdG7mCq_24xACXO5WNOP-2_w/export?format=csv&id=1LAQarqQ-id74-zxV6R4SdG7mCq_24xACXO5WNOP-2_w&gid=1346442478" as csv
merge (p:Page {url: csv.URL})
on match set p.title=csv.Title, p.created="Unknown"
on create set p.title=csv.Title, p.created="Unknown"
with p
match (b:Blog {name:"Mark Needham's Blog"})
create (p)-[:PART_OF]->(b);
//create the link graph
load csv with headers from "https://docs.google.com/a/neotechnology.com/spreadsheets/d/1LAQarqQ-id74-zxV6R4SdG7mCq_24xACXO5WNOP-2_w/export?format=csv&id=1LAQarqQ-id74-zxV6R4SdG7mCq_24xACXO5WNOP-2_w&gid=1346442478" as csv
with csv.URL as URL, csv.Links as row
unwind row as linklist
with URL, [l in split(linklist,"****") | trim(l)] as links
unwind range(0,size(links)-2) as idx
MERGE (l:Page {url:links[idx]})
WITH l, URL
MATCH (p:Page {url: URL})
CREATE (p)-[:LINKS_TO]->(l);
// //remove the redundant links
match (b:Blog {name:"Mark Needham's Blog"})<-[:PART_OF]-(p1:Page)-[:LINKS_TO]->(p2:Page)
where p2.url contains "//www.markhneedham.com/"
and not ((b)<-[:PART_OF]-(p2))
detach delete p2;
match (p1:Page)-[:PART_OF]->(b:Blog {name:"Mark Needham's Blog"})<-[:PART_OF]-(p2:Page),
(p1)-[lt:LINKS_TO]-(p2)
delete lt;
match (p1:Page)-[:PART_OF]->(b:Blog {name:"Mark Needham's Blog"}),
(p1)-[lt:LINKS_TO]-(p1)
delete lt;
//clear out the relative and incomplete URLs
match (p:Page)
where p.url starts with "/"
detach delete p;
match (p:Page)
where p.url starts with "#"
detach delete p;
match (p:Page)
where p.url starts with "javascript"
detach delete p;
//Query the Bloggraph
//Which pages have the most links
match (b:Blog)--(p:Page)-[r:LINKS_TO]->(p2:Page)
return b.name, p.title, count(r)
order by count(r) desc
//Which pages are being linked to most
match ()-[r:LINKS_TO]->(p:Page)
return p.url, count(r)
order by count(r) DESC
limit 10;
//links between blogposts
MATCH p=((n1:Blog)--(p1:Page)-[:LINKS_TO]-(p2:Page)--(b2:Blog))
RETURN p
//show all the paths between blogs
match (b1:Blog {name:"Bruggen"}), (b2:Blog {name:"Neo4j Blog"}),
p1 = allshortestpaths((b1)-[*]-(b2))
return p1 as paths
union
match (b1:Blog {name:"Bruggen"}),(b3:Blog {name:"JEXP Blog"}),
p2 = allshortestpaths((b1)-[*]-(b3))
return p2 as paths
// union
// match (b2:Blog {name:"Neo4j Blog"}),(b3:Blog {name:"JEXP Blog"}),
// p3 = allshortestpaths((b2)-[*]-(b3))
// return p3 as paths;
//Pagerank ftw
//1. calculate the PageRank
UNWIND range(1,50) AS round
MATCH (n:Page)
WHERE rand() < 0.1
MATCH (n:Page)-[:LINKS_TO*..10]->(m:Page)
SET m.rank = coalesce(m.rank,0) + 1
//2. Show the PageRank
match (n:Page)
where n.rank is not null
return n.url, n.rank
order by n.rank desc
limit 10;
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment