rvanbruggen/importing_graphbloggraph.cql

## importing_graphbloggraph.cql
//Bruggen blog feed
//https://docs.google.com/a/neotechnology.com/spreadsheets/d/1LAQarqQ-id74-zxV6R4SdG7mCq_24xACXO5WNOP-2_w/export?format=csv&id=1LAQarqQ-id74-zxV6R4SdG7mCq_24xACXO5WNOP-2_w&gid=0

create index on :Blog(name);
create constraint on (p:Page) assert p.url is unique;

create (b:Blog {name:"Bruggen", url:"http://blog.bruggen.com"});
create (n:Blog {name:"Neo4j Blog", url:"http://neo4j.com/blog"});
create (n:Blog {name:"JEXP Blog", url:"http://jexp.de/blog/"});
create (n:Blog {name:"Armbruster-IT Blog", url:"http://blog.armbruster-it.de/"});
create (n:Blog {name:"Max De Marzi's Blog", url:"http://maxdemarzi.com/"});
create (n:Blog {name:"Will Lyon's Blog", url:"http://lyonwj.com/"});
create (n:Blog {name:"Mark Needham's Blog", url:"http://markhneedham.com/blog"});


//create the Bruggen blog entries
load csv with headers from "https://docs.google.com/a/neotechnology.com/spreadsheets/d/1LAQarqQ-id74-zxV6R4SdG7mCq_24xACXO5WNOP-2_w/export?format=csv&id=1LAQarqQ-id74-zxV6R4SdG7mCq_24xACXO5WNOP-2_w&gid=0" as csv
match (b:Blog {name:"Bruggen", url:"http://blog.bruggen.com"})
create (p:Page {url: csv.URL, title: csv.Title, created: csv.Date})-[:PART_OF]->(b);

//create the link graph
load csv with headers from "https://docs.google.com/a/neotechnology.com/spreadsheets/d/1LAQarqQ-id74-zxV6R4SdG7mCq_24xACXO5WNOP-2_w/export?format=csv&id=1LAQarqQ-id74-zxV6R4SdG7mCq_24xACXO5WNOP-2_w&gid=0" as csv
with csv.URL as URL, csv.Links as row
unwind row as listoflinks
with URL, [l in split(listoflinks,"****") | trim(l)] as links
unwind range(0,size(links)-2) as idx
MERGE (l:Page {url:links[idx]})
WITH l, URL
MATCH (p:Page {url: URL})
MERGE (p)-[:LINKS_TO]->(l);

//remove the redundant links
//linking to pages with same url (eg. archive pages, label pages...)
match (b:Blog {name:"Bruggen"})<-[:PART_OF]-(p1:Page)-[:LINKS_TO]->(p2:Page)
where p2.url starts with "http://blog.bruggen.com"
and not ((b)<-[:PART_OF]-(p2))
detach delete p2;
//linking to other posts of the same blog
match (p1:Page)-[:PART_OF]->(b:Blog {name:"Bruggen"})<-[:PART_OF]-(p2:Page),
(p1)-[lt:LINKS_TO]-(p2)
delete lt;
//linking to itself
match (p1:Page)-[:PART_OF]->(b:Blog {name:"Bruggen"}),
(p1)-[lt:LINKS_TO]-(p1)
delete lt;
//linking to the blog provider (Blogger)
match (p:Page)
where p.url contains "//www.blogger.com"
detach delete p;


//Neo4j blog feed
//https://docs.google.com/a/neotechnology.com/spreadsheets/d/1LAQarqQ-id74-zxV6R4SdG7mCq_24xACXO5WNOP-2_w/export?format=csv&id=1LAQarqQ-id74-zxV6R4SdG7mCq_24xACXO5WNOP-2_w&gid=999553051

//create the blog entries
load csv with headers from "https://docs.google.com/a/neotechnology.com/spreadsheets/d/1LAQarqQ-id74-zxV6R4SdG7mCq_24xACXO5WNOP-2_w/export?format=csv&id=1LAQarqQ-id74-zxV6R4SdG7mCq_24xACXO5WNOP-2_w&gid=999553051" as csv
merge (p:Page {url: csv.URL})
on match set  p.title=csv.Title, p.created=csv.Date
on create set  p.title=csv.Title, p.created=csv.Date
with p
match (b:Blog {name:"Neo4j Blog", url:"http://neo4j.com/blog"})
create (p)-[:PART_OF]->(b);

//create the link graph
load csv with headers from "https://docs.google.com/a/neotechnology.com/spreadsheets/d/1LAQarqQ-id74-zxV6R4SdG7mCq_24xACXO5WNOP-2_w/export?format=csv&id=1LAQarqQ-id74-zxV6R4SdG7mCq_24xACXO5WNOP-2_w&gid=999553051" as csv
with csv.URL as URL, csv.Links as row
unwind row as linklist
with URL, [l in split(linklist,"****") | trim(l)] as links
unwind range(0,size(links)-2) as idx
MERGE (l:Page {url:links[idx]})
WITH l, URL
MATCH (p:Page {url: URL})
CREATE (p)-[:LINKS_TO]->(l);

//remove the redundant links
match (b:Blog {name:"Neo4j Blog"})<-[:PART_OF]-(p1:Page)-[:LINKS_TO]->(p2:Page)
where p2.url starts with "http://neo4j.com/blog"
and not ((b)<-[:PART_OF]-(p2))
detach delete p2;

match (b:Blog {name:"Neo4j Blog"})<-[:PART_OF]-(p1:Page)-[:LINKS_TO]->(p2:Page)
where p2.url contains "//neo4j.com/"
and not ((b)<-[:PART_OF]-(p2))
detach delete p2;

match (p1:Page)-[:PART_OF]->(b:Blog {name:"Neo4j Blog"})<-[:PART_OF]-(p2:Page),
(p1)-[lt:LINKS_TO]-(p2)
delete lt;

match (p1:Page)-[:PART_OF]->(b:Blog {name:"Neo4j Blog"}),
(p1)-[lt:LINKS_TO]-(p1)
delete lt;


//JEXP blog feed
//https://docs.google.com/a/neotechnology.com/spreadsheets/d/1LAQarqQ-id74-zxV6R4SdG7mCq_24xACXO5WNOP-2_w/export?format=csv&id=1LAQarqQ-id74-zxV6R4SdG7mCq_24xACXO5WNOP-2_w&gid=2066848510

//create the blog entries
load csv with headers from "https://docs.google.com/a/neotechnology.com/spreadsheets/d/1LAQarqQ-id74-zxV6R4SdG7mCq_24xACXO5WNOP-2_w/export?format=csv&id=1LAQarqQ-id74-zxV6R4SdG7mCq_24xACXO5WNOP-2_w&gid=2066848510" as csv
merge (p:Page {url: csv.URL})
on match set  p.title=csv.Title, p.created=csv.Date
on create set  p.title=csv.Title, p.created=csv.Date
with p
match (b:Blog {name:"JEXP Blog", url:"http://jexp.de/blog/"})
create (p)-[:PART_OF]->(b);

//create the link graph
load csv with headers from "https://docs.google.com/a/neotechnology.com/spreadsheets/d/1LAQarqQ-id74-zxV6R4SdG7mCq_24xACXO5WNOP-2_w/export?format=csv&id=1LAQarqQ-id74-zxV6R4SdG7mCq_24xACXO5WNOP-2_w&gid=2066848510" as csv
with csv.URL as URL, csv.Links as row
unwind row as linklist
with URL, [l in split(linklist,"****") | trim(l)] as links
unwind range(0,size(links)-2) as idx
MERGE (l:Page {url:links[idx]})
WITH l, URL
MATCH (p:Page {url: URL})
CREATE (p)-[:LINKS_TO]->(l);

//remove the redundant links
match (b:Blog {name:"JEXP Blog"})<-[:PART_OF]-(p1:Page)-[:LINKS_TO]->(p2:Page)
where p2.url starts with "http://jexp.de/blog"
and not ((b)<-[:PART_OF]-(p2))
detach delete p2;

match (b:Blog {name:"JEXP Blog"})<-[:PART_OF]-(p1:Page)-[:LINKS_TO]->(p2:Page)
where p2.url contains "//jexp.de/"
and not ((b)<-[:PART_OF]-(p2))
detach delete p2;

match (p1:Page)-[:PART_OF]->(b:Blog {name:"JEXP Blog"})<-[:PART_OF]-(p2:Page),
(p1)-[lt:LINKS_TO]-(p2)
delete lt;

match (p1:Page)-[:PART_OF]->(b:Blog {name:"JEXP Blog"}),
(p1)-[lt:LINKS_TO]-(p1)
delete lt;


//Stefan Armbruster-IT https://docs.google.com/spreadsheets/d/1LAQarqQ-id74-zxV6R4SdG7mCq_24xACXO5WNOP-2_w/export?format=csv&id=1LAQarqQ-id74-zxV6R4SdG7mCq_24xACXO5WNOP-2_w&gid=1143698530

//create the blog entries
load csv with headers from "https://docs.google.com/spreadsheets/d/1LAQarqQ-id74-zxV6R4SdG7mCq_24xACXO5WNOP-2_w/export?format=csv&id=1LAQarqQ-id74-zxV6R4SdG7mCq_24xACXO5WNOP-2_w&gid=1143698530" as csv
merge (p:Page {url: csv.URL})
on match set  p.title=csv.Title, p.created=csv.Date
on create set  p.title=csv.Title, p.created=csv.Date
with p
match (b:Blog {name:"Armbruster-IT Blog", url:"http://blog.armbruster-it.de/"})
create (p)-[:PART_OF]->(b);

//create the link graph
load csv with headers from "https://docs.google.com/spreadsheets/d/1LAQarqQ-id74-zxV6R4SdG7mCq_24xACXO5WNOP-2_w/export?format=csv&id=1LAQarqQ-id74-zxV6R4SdG7mCq_24xACXO5WNOP-2_w&gid=1143698530" as csv
with csv.URL as URL, csv.Links as row
unwind row as linklist
with URL, [l in split(linklist,"****") | trim(l)] as links
unwind range(0,size(links)-2) as idx
MERGE (l:Page {url:links[idx]})
WITH l, URL
MATCH (p:Page {url: URL})
CREATE (p)-[:LINKS_TO]->(l);

//remove the redundant links
match (b:Blog {name:"Armbruster-IT Blog"})<-[:PART_OF]-(p1:Page)-[:LINKS_TO]->(p2:Page)
where p2.url starts with "http://blog.armbruster-it.de"
and not ((b)<-[:PART_OF]-(p2))
detach delete p2;

match (b:Blog {name:"Armbruster-IT Blog"})<-[:PART_OF]-(p1:Page)-[:LINKS_TO]->(p2:Page)
where p2.url contains "//blog.armbruster-it.de/"
and not ((b)<-[:PART_OF]-(p2))
detach delete p2;

match (p1:Page)-[:PART_OF]->(b:Blog {name:"Armbruster-IT Blog"})<-[:PART_OF]-(p2:Page),
(p1)-[lt:LINKS_TO]-(p2)
delete lt;

match (p1:Page)-[:PART_OF]->(b:Blog {name:"Armbruster-IT Blog"}),
(p1)-[lt:LINKS_TO]-(p1)
delete lt;


//Max De Marzi
https://docs.google.com/spreadsheets/d/1LAQarqQ-id74-zxV6R4SdG7mCq_24xACXO5WNOP-2_w/export?format=csv&id=1LAQarqQ-id74-zxV6R4SdG7mCq_24xACXO5WNOP-2_w&gid=1473003649

//create the blog entries
load csv with headers from "https://docs.google.com/spreadsheets/d/1LAQarqQ-id74-zxV6R4SdG7mCq_24xACXO5WNOP-2_w/export?format=csv&id=1LAQarqQ-id74-zxV6R4SdG7mCq_24xACXO5WNOP-2_w&gid=1473003649" as csv
merge (p:Page {url: csv.URL})
on match set  p.title=csv.Title, p.created=csv.Date
on create set  p.title=csv.Title, p.created=csv.Date
with p
match (b:Blog {name:"Max De Marzi's Blog", url:"http://maxdemarzi.com/"})
create (p)-[:PART_OF]->(b);

//create the link graph
load csv with headers from "https://docs.google.com/spreadsheets/d/1LAQarqQ-id74-zxV6R4SdG7mCq_24xACXO5WNOP-2_w/export?format=csv&id=1LAQarqQ-id74-zxV6R4SdG7mCq_24xACXO5WNOP-2_w&gid=1473003649" as csv
with csv.URL as URL, csv.Links as row
unwind row as linklist
with URL, [l in split(linklist,"****") | trim(l)] as links
unwind range(0,size(links)-2) as idx
MERGE (l:Page {url:links[idx]})
WITH l, URL
MATCH (p:Page {url: URL})
CREATE (p)-[:LINKS_TO]->(l);

// //remove the redundant links
// match (b:Blog {name:"Max De Marzi's Blog"})<-[:PART_OF]-(p1:Page)-[:LINKS_TO]->(p2:Page)
// where p2.url starts with "http://maxdemarzi.com"
// and not ((b)<-[:PART_OF]-(p2))
// detach delete p2;

match (b:Blog {name:"Max De Marzi's Blog"})<-[:PART_OF]-(p1:Page)-[:LINKS_TO]->(p2:Page)
where p2.url contains "//maxdemarzi.com/"
and not ((b)<-[:PART_OF]-(p2))
detach delete p2;

match (p1:Page)-[:PART_OF]->(b:Blog {name:"Max De Marzi's Blog"})<-[:PART_OF]-(p2:Page),
(p1)-[lt:LINKS_TO]-(p2)
delete lt;

match (p1:Page)-[:PART_OF]->(b:Blog {name:"Max De Marzi's Blog"}),
(p1)-[lt:LINKS_TO]-(p1)
delete lt;


//Will Lyon
https://docs.google.com/spreadsheets/d/1LAQarqQ-id74-zxV6R4SdG7mCq_24xACXO5WNOP-2_w/export?format=csv&id=1LAQarqQ-id74-zxV6R4SdG7mCq_24xACXO5WNOP-2_w&gid=1326206509

//create the blog entries
load csv with headers from "https://docs.google.com/spreadsheets/d/1LAQarqQ-id74-zxV6R4SdG7mCq_24xACXO5WNOP-2_w/export?format=csv&id=1LAQarqQ-id74-zxV6R4SdG7mCq_24xACXO5WNOP-2_w&gid=1326206509" as csv
merge (p:Page {url: csv.URL})
on match set  p.title=csv.Title, p.created="Unknown"
on create set  p.title=csv.Title, p.created="Unknown"
with p
match (b:Blog {name:"Will Lyon's Blog", url:"http://lyonwj.com/"})
create (p)-[:PART_OF]->(b);

//create the link graph
load csv with headers from "https://docs.google.com/spreadsheets/d/1LAQarqQ-id74-zxV6R4SdG7mCq_24xACXO5WNOP-2_w/export?format=csv&id=1LAQarqQ-id74-zxV6R4SdG7mCq_24xACXO5WNOP-2_w&gid=1326206509" as csv
with csv.URL as URL, csv.Links as row
unwind row as linklist
with URL, [l in split(linklist,"****") | trim(l)] as links
unwind range(0,size(links)-2) as idx
MERGE (l:Page {url:links[idx]})
WITH l, URL
MATCH (p:Page {url: URL})
CREATE (p)-[:LINKS_TO]->(l);

// //remove the redundant links

match (b:Blog {name:"Will Lyon's Blog"})<-[:PART_OF]-(p1:Page)-[:LINKS_TO]->(p2:Page)
where p2.url contains "//lyonwj.com/"
and not ((b)<-[:PART_OF]-(p2))
detach delete p2;

match (p1:Page)-[:PART_OF]->(b:Blog {name:"Will Lyon's Blog"})<-[:PART_OF]-(p2:Page),
(p1)-[lt:LINKS_TO]-(p2)
delete lt;

match (p1:Page)-[:PART_OF]->(b:Blog {name:"Will Lyon's Blog"}),
(p1)-[lt:LINKS_TO]-(p1)
delete lt;

//Mark Needham
https://docs.google.com/a/neotechnology.com/spreadsheets/d/1LAQarqQ-id74-zxV6R4SdG7mCq_24xACXO5WNOP-2_w/export?format=csv&id=1LAQarqQ-id74-zxV6R4SdG7mCq_24xACXO5WNOP-2_w&gid=1346442478

//create the blog entries
load csv with headers from "https://docs.google.com/a/neotechnology.com/spreadsheets/d/1LAQarqQ-id74-zxV6R4SdG7mCq_24xACXO5WNOP-2_w/export?format=csv&id=1LAQarqQ-id74-zxV6R4SdG7mCq_24xACXO5WNOP-2_w&gid=1346442478" as csv
merge (p:Page {url: csv.URL})
on match set  p.title=csv.Title, p.created="Unknown"
on create set  p.title=csv.Title, p.created="Unknown"
with p
match (b:Blog {name:"Mark Needham's Blog"})
create (p)-[:PART_OF]->(b);

//create the link graph
load csv with headers from "https://docs.google.com/a/neotechnology.com/spreadsheets/d/1LAQarqQ-id74-zxV6R4SdG7mCq_24xACXO5WNOP-2_w/export?format=csv&id=1LAQarqQ-id74-zxV6R4SdG7mCq_24xACXO5WNOP-2_w&gid=1346442478" as csv
with csv.URL as URL, csv.Links as row
unwind row as linklist
with URL, [l in split(linklist,"****") | trim(l)] as links
unwind range(0,size(links)-2) as idx
MERGE (l:Page {url:links[idx]})
WITH l, URL
MATCH (p:Page {url: URL})
CREATE (p)-[:LINKS_TO]->(l);

// //remove the redundant links
match (b:Blog {name:"Mark Needham's Blog"})<-[:PART_OF]-(p1:Page)-[:LINKS_TO]->(p2:Page)
where p2.url contains "//www.markhneedham.com/"
and not ((b)<-[:PART_OF]-(p2))
detach delete p2;

match (p1:Page)-[:PART_OF]->(b:Blog {name:"Mark Needham's Blog"})<-[:PART_OF]-(p2:Page),
(p1)-[lt:LINKS_TO]-(p2)
delete lt;

match (p1:Page)-[:PART_OF]->(b:Blog {name:"Mark Needham's Blog"}),
(p1)-[lt:LINKS_TO]-(p1)
delete lt;

//clear out the relative and incomplete URLs
match (p:Page)
where p.url starts with "/"
detach delete p;

match (p:Page)
where p.url starts with "#"
detach delete p;

match (p:Page)
where p.url starts with "javascript"
detach delete p;

## query_graphbloggraph.cql
//Query the Bloggraph

//Which pages have the most links
match (b:Blog)--(p:Page)-[r:LINKS_TO]->(p2:Page)
return b.name, p.title, count(r)
order by count(r) desc

//Which pages are being linked to most
match ()-[r:LINKS_TO]->(p:Page)
return p.url, count(r)
order by count(r) DESC
limit 10;

//links between blogposts
MATCH p=((n1:Blog)--(p1:Page)-[:LINKS_TO]-(p2:Page)--(b2:Blog))
RETURN p

//show all the paths between blogs
match (b1:Blog {name:"Bruggen"}), (b2:Blog {name:"Neo4j Blog"}),
p1 = allshortestpaths((b1)-[*]-(b2))
return p1 as paths
union
match (b1:Blog {name:"Bruggen"}),(b3:Blog {name:"JEXP Blog"}),
p2 = allshortestpaths((b1)-[*]-(b3))
return p2 as paths
// union
// match (b2:Blog {name:"Neo4j Blog"}),(b3:Blog {name:"JEXP Blog"}),
// p3 = allshortestpaths((b2)-[*]-(b3))
// return p3 as paths;

 //Pagerank ftw
 //1. calculate the PageRank
 UNWIND range(1,50) AS round
 MATCH (n:Page)
 WHERE rand() < 0.1
 MATCH (n:Page)-[:LINKS_TO*..10]->(m:Page)
 SET m.rank = coalesce(m.rank,0) + 1

 //2. Show the PageRank
 match (n:Page)
 where n.rank is not null
 return n.url, n.rank
 order by n.rank desc
 limit 10;
	//Bruggen blog feed
	//https://docs.google.com/a/neotechnology.com/spreadsheets/d/1LAQarqQ-id74-zxV6R4SdG7mCq_24xACXO5WNOP-2_w/export?format=csv&id=1LAQarqQ-id74-zxV6R4SdG7mCq_24xACXO5WNOP-2_w&gid=0

	create index on :Blog(name);
	create constraint on (p:Page) assert p.url is unique;

	create (b:Blog {name:"Bruggen", url:"http://blog.bruggen.com"});
	create (n:Blog {name:"Neo4j Blog", url:"http://neo4j.com/blog"});
	create (n:Blog {name:"JEXP Blog", url:"http://jexp.de/blog/"});
	create (n:Blog {name:"Armbruster-IT Blog", url:"http://blog.armbruster-it.de/"});
	create (n:Blog {name:"Max De Marzi's Blog", url:"http://maxdemarzi.com/"});
	create (n:Blog {name:"Will Lyon's Blog", url:"http://lyonwj.com/"});
	create (n:Blog {name:"Mark Needham's Blog", url:"http://markhneedham.com/blog"});



	//create the Bruggen blog entries
	load csv with headers from "https://docs.google.com/a/neotechnology.com/spreadsheets/d/1LAQarqQ-id74-zxV6R4SdG7mCq_24xACXO5WNOP-2_w/export?format=csv&id=1LAQarqQ-id74-zxV6R4SdG7mCq_24xACXO5WNOP-2_w&gid=0" as csv
	match (b:Blog {name:"Bruggen", url:"http://blog.bruggen.com"})
	create (p:Page {url: csv.URL, title: csv.Title, created: csv.Date})-[:PART_OF]->(b);

	//create the link graph
	load csv with headers from "https://docs.google.com/a/neotechnology.com/spreadsheets/d/1LAQarqQ-id74-zxV6R4SdG7mCq_24xACXO5WNOP-2_w/export?format=csv&id=1LAQarqQ-id74-zxV6R4SdG7mCq_24xACXO5WNOP-2_w&gid=0" as csv
	with csv.URL as URL, csv.Links as row
	unwind row as listoflinks
	with URL, [l in split(listoflinks,"****") \| trim(l)] as links
	unwind range(0,size(links)-2) as idx
	MERGE (l:Page {url:links[idx]})
	WITH l, URL
	MATCH (p:Page {url: URL})
	MERGE (p)-[:LINKS_TO]->(l);

	//remove the redundant links
	//linking to pages with same url (eg. archive pages, label pages...)
	match (b:Blog {name:"Bruggen"})<-[:PART_OF]-(p1:Page)-[:LINKS_TO]->(p2:Page)
	where p2.url starts with "http://blog.bruggen.com"
	and not ((b)<-[:PART_OF]-(p2))
	detach delete p2;
	//linking to other posts of the same blog
	match (p1:Page)-[:PART_OF]->(b:Blog {name:"Bruggen"})<-[:PART_OF]-(p2:Page),
	(p1)-[lt:LINKS_TO]-(p2)
	delete lt;
	//linking to itself
	match (p1:Page)-[:PART_OF]->(b:Blog {name:"Bruggen"}),
	(p1)-[lt:LINKS_TO]-(p1)
	delete lt;
	//linking to the blog provider (Blogger)
	match (p:Page)
	where p.url contains "//www.blogger.com"
	detach delete p;


	//Neo4j blog feed
	//https://docs.google.com/a/neotechnology.com/spreadsheets/d/1LAQarqQ-id74-zxV6R4SdG7mCq_24xACXO5WNOP-2_w/export?format=csv&id=1LAQarqQ-id74-zxV6R4SdG7mCq_24xACXO5WNOP-2_w&gid=999553051

	//create the blog entries
	load csv with headers from "https://docs.google.com/a/neotechnology.com/spreadsheets/d/1LAQarqQ-id74-zxV6R4SdG7mCq_24xACXO5WNOP-2_w/export?format=csv&id=1LAQarqQ-id74-zxV6R4SdG7mCq_24xACXO5WNOP-2_w&gid=999553051" as csv
	merge (p:Page {url: csv.URL})
	on match set p.title=csv.Title, p.created=csv.Date
	on create set p.title=csv.Title, p.created=csv.Date
	with p
	match (b:Blog {name:"Neo4j Blog", url:"http://neo4j.com/blog"})
	create (p)-[:PART_OF]->(b);

	//create the link graph
	load csv with headers from "https://docs.google.com/a/neotechnology.com/spreadsheets/d/1LAQarqQ-id74-zxV6R4SdG7mCq_24xACXO5WNOP-2_w/export?format=csv&id=1LAQarqQ-id74-zxV6R4SdG7mCq_24xACXO5WNOP-2_w&gid=999553051" as csv
	with csv.URL as URL, csv.Links as row
	unwind row as linklist
	with URL, [l in split(linklist,"****") \| trim(l)] as links
	unwind range(0,size(links)-2) as idx
	MERGE (l:Page {url:links[idx]})
	WITH l, URL
	MATCH (p:Page {url: URL})
	CREATE (p)-[:LINKS_TO]->(l);

	//remove the redundant links
	match (b:Blog {name:"Neo4j Blog"})<-[:PART_OF]-(p1:Page)-[:LINKS_TO]->(p2:Page)
	where p2.url starts with "http://neo4j.com/blog"
	and not ((b)<-[:PART_OF]-(p2))
	detach delete p2;

	match (b:Blog {name:"Neo4j Blog"})<-[:PART_OF]-(p1:Page)-[:LINKS_TO]->(p2:Page)
	where p2.url contains "//neo4j.com/"
	and not ((b)<-[:PART_OF]-(p2))
	detach delete p2;

	match (p1:Page)-[:PART_OF]->(b:Blog {name:"Neo4j Blog"})<-[:PART_OF]-(p2:Page),
	(p1)-[lt:LINKS_TO]-(p2)
	delete lt;

	match (p1:Page)-[:PART_OF]->(b:Blog {name:"Neo4j Blog"}),
	(p1)-[lt:LINKS_TO]-(p1)
	delete lt;


	//JEXP blog feed
	//https://docs.google.com/a/neotechnology.com/spreadsheets/d/1LAQarqQ-id74-zxV6R4SdG7mCq_24xACXO5WNOP-2_w/export?format=csv&id=1LAQarqQ-id74-zxV6R4SdG7mCq_24xACXO5WNOP-2_w&gid=2066848510

	//create the blog entries
	load csv with headers from "https://docs.google.com/a/neotechnology.com/spreadsheets/d/1LAQarqQ-id74-zxV6R4SdG7mCq_24xACXO5WNOP-2_w/export?format=csv&id=1LAQarqQ-id74-zxV6R4SdG7mCq_24xACXO5WNOP-2_w&gid=2066848510" as csv
	merge (p:Page {url: csv.URL})
	on match set p.title=csv.Title, p.created=csv.Date
	on create set p.title=csv.Title, p.created=csv.Date
	with p
	match (b:Blog {name:"JEXP Blog", url:"http://jexp.de/blog/"})
	create (p)-[:PART_OF]->(b);

	//create the link graph
	load csv with headers from "https://docs.google.com/a/neotechnology.com/spreadsheets/d/1LAQarqQ-id74-zxV6R4SdG7mCq_24xACXO5WNOP-2_w/export?format=csv&id=1LAQarqQ-id74-zxV6R4SdG7mCq_24xACXO5WNOP-2_w&gid=2066848510" as csv
	with csv.URL as URL, csv.Links as row
	unwind row as linklist
	with URL, [l in split(linklist,"****") \| trim(l)] as links
	unwind range(0,size(links)-2) as idx
	MERGE (l:Page {url:links[idx]})
	WITH l, URL
	MATCH (p:Page {url: URL})
	CREATE (p)-[:LINKS_TO]->(l);

	//remove the redundant links
	match (b:Blog {name:"JEXP Blog"})<-[:PART_OF]-(p1:Page)-[:LINKS_TO]->(p2:Page)
	where p2.url starts with "http://jexp.de/blog"
	and not ((b)<-[:PART_OF]-(p2))
	detach delete p2;

	match (b:Blog {name:"JEXP Blog"})<-[:PART_OF]-(p1:Page)-[:LINKS_TO]->(p2:Page)
	where p2.url contains "//jexp.de/"
	and not ((b)<-[:PART_OF]-(p2))
	detach delete p2;

	match (p1:Page)-[:PART_OF]->(b:Blog {name:"JEXP Blog"})<-[:PART_OF]-(p2:Page),
	(p1)-[lt:LINKS_TO]-(p2)
	delete lt;

	match (p1:Page)-[:PART_OF]->(b:Blog {name:"JEXP Blog"}),
	(p1)-[lt:LINKS_TO]-(p1)
	delete lt;



	//Stefan Armbruster-IT https://docs.google.com/spreadsheets/d/1LAQarqQ-id74-zxV6R4SdG7mCq_24xACXO5WNOP-2_w/export?format=csv&id=1LAQarqQ-id74-zxV6R4SdG7mCq_24xACXO5WNOP-2_w&gid=1143698530

	//create the blog entries
	load csv with headers from "https://docs.google.com/spreadsheets/d/1LAQarqQ-id74-zxV6R4SdG7mCq_24xACXO5WNOP-2_w/export?format=csv&id=1LAQarqQ-id74-zxV6R4SdG7mCq_24xACXO5WNOP-2_w&gid=1143698530" as csv
	merge (p:Page {url: csv.URL})
	on match set p.title=csv.Title, p.created=csv.Date
	on create set p.title=csv.Title, p.created=csv.Date
	with p
	match (b:Blog {name:"Armbruster-IT Blog", url:"http://blog.armbruster-it.de/"})
	create (p)-[:PART_OF]->(b);

	//create the link graph
	load csv with headers from "https://docs.google.com/spreadsheets/d/1LAQarqQ-id74-zxV6R4SdG7mCq_24xACXO5WNOP-2_w/export?format=csv&id=1LAQarqQ-id74-zxV6R4SdG7mCq_24xACXO5WNOP-2_w&gid=1143698530" as csv
	with csv.URL as URL, csv.Links as row
	unwind row as linklist
	with URL, [l in split(linklist,"****") \| trim(l)] as links
	unwind range(0,size(links)-2) as idx
	MERGE (l:Page {url:links[idx]})
	WITH l, URL
	MATCH (p:Page {url: URL})
	CREATE (p)-[:LINKS_TO]->(l);

	//remove the redundant links
	match (b:Blog {name:"Armbruster-IT Blog"})<-[:PART_OF]-(p1:Page)-[:LINKS_TO]->(p2:Page)
	where p2.url starts with "http://blog.armbruster-it.de"
	and not ((b)<-[:PART_OF]-(p2))
	detach delete p2;

	match (b:Blog {name:"Armbruster-IT Blog"})<-[:PART_OF]-(p1:Page)-[:LINKS_TO]->(p2:Page)
	where p2.url contains "//blog.armbruster-it.de/"
	and not ((b)<-[:PART_OF]-(p2))
	detach delete p2;

	match (p1:Page)-[:PART_OF]->(b:Blog {name:"Armbruster-IT Blog"})<-[:PART_OF]-(p2:Page),
	(p1)-[lt:LINKS_TO]-(p2)
	delete lt;

	match (p1:Page)-[:PART_OF]->(b:Blog {name:"Armbruster-IT Blog"}),
	(p1)-[lt:LINKS_TO]-(p1)
	delete lt;



	//Max De Marzi
	https://docs.google.com/spreadsheets/d/1LAQarqQ-id74-zxV6R4SdG7mCq_24xACXO5WNOP-2_w/export?format=csv&id=1LAQarqQ-id74-zxV6R4SdG7mCq_24xACXO5WNOP-2_w&gid=1473003649

	//create the blog entries
	load csv with headers from "https://docs.google.com/spreadsheets/d/1LAQarqQ-id74-zxV6R4SdG7mCq_24xACXO5WNOP-2_w/export?format=csv&id=1LAQarqQ-id74-zxV6R4SdG7mCq_24xACXO5WNOP-2_w&gid=1473003649" as csv
	merge (p:Page {url: csv.URL})
	on match set p.title=csv.Title, p.created=csv.Date
	on create set p.title=csv.Title, p.created=csv.Date
	with p
	match (b:Blog {name:"Max De Marzi's Blog", url:"http://maxdemarzi.com/"})
	create (p)-[:PART_OF]->(b);

	//create the link graph
	load csv with headers from "https://docs.google.com/spreadsheets/d/1LAQarqQ-id74-zxV6R4SdG7mCq_24xACXO5WNOP-2_w/export?format=csv&id=1LAQarqQ-id74-zxV6R4SdG7mCq_24xACXO5WNOP-2_w&gid=1473003649" as csv
	with csv.URL as URL, csv.Links as row
	unwind row as linklist
	with URL, [l in split(linklist,"****") \| trim(l)] as links
	unwind range(0,size(links)-2) as idx
	MERGE (l:Page {url:links[idx]})
	WITH l, URL
	MATCH (p:Page {url: URL})
	CREATE (p)-[:LINKS_TO]->(l);

	// //remove the redundant links
	// match (b:Blog {name:"Max De Marzi's Blog"})<-[:PART_OF]-(p1:Page)-[:LINKS_TO]->(p2:Page)
	// where p2.url starts with "http://maxdemarzi.com"
	// and not ((b)<-[:PART_OF]-(p2))
	// detach delete p2;

	match (b:Blog {name:"Max De Marzi's Blog"})<-[:PART_OF]-(p1:Page)-[:LINKS_TO]->(p2:Page)
	where p2.url contains "//maxdemarzi.com/"
	and not ((b)<-[:PART_OF]-(p2))
	detach delete p2;

	match (p1:Page)-[:PART_OF]->(b:Blog {name:"Max De Marzi's Blog"})<-[:PART_OF]-(p2:Page),
	(p1)-[lt:LINKS_TO]-(p2)
	delete lt;

	match (p1:Page)-[:PART_OF]->(b:Blog {name:"Max De Marzi's Blog"}),
	(p1)-[lt:LINKS_TO]-(p1)
	delete lt;




	//Will Lyon
	https://docs.google.com/spreadsheets/d/1LAQarqQ-id74-zxV6R4SdG7mCq_24xACXO5WNOP-2_w/export?format=csv&id=1LAQarqQ-id74-zxV6R4SdG7mCq_24xACXO5WNOP-2_w&gid=1326206509

	//create the blog entries
	load csv with headers from "https://docs.google.com/spreadsheets/d/1LAQarqQ-id74-zxV6R4SdG7mCq_24xACXO5WNOP-2_w/export?format=csv&id=1LAQarqQ-id74-zxV6R4SdG7mCq_24xACXO5WNOP-2_w&gid=1326206509" as csv
	merge (p:Page {url: csv.URL})
	on match set p.title=csv.Title, p.created="Unknown"
	on create set p.title=csv.Title, p.created="Unknown"
	with p
	match (b:Blog {name:"Will Lyon's Blog", url:"http://lyonwj.com/"})
	create (p)-[:PART_OF]->(b);

	//create the link graph
	load csv with headers from "https://docs.google.com/spreadsheets/d/1LAQarqQ-id74-zxV6R4SdG7mCq_24xACXO5WNOP-2_w/export?format=csv&id=1LAQarqQ-id74-zxV6R4SdG7mCq_24xACXO5WNOP-2_w&gid=1326206509" as csv
	with csv.URL as URL, csv.Links as row
	unwind row as linklist
	with URL, [l in split(linklist,"****") \| trim(l)] as links
	unwind range(0,size(links)-2) as idx
	MERGE (l:Page {url:links[idx]})
	WITH l, URL
	MATCH (p:Page {url: URL})
	CREATE (p)-[:LINKS_TO]->(l);

	// //remove the redundant links

	match (b:Blog {name:"Will Lyon's Blog"})<-[:PART_OF]-(p1:Page)-[:LINKS_TO]->(p2:Page)
	where p2.url contains "//lyonwj.com/"
	and not ((b)<-[:PART_OF]-(p2))
	detach delete p2;

	match (p1:Page)-[:PART_OF]->(b:Blog {name:"Will Lyon's Blog"})<-[:PART_OF]-(p2:Page),
	(p1)-[lt:LINKS_TO]-(p2)
	delete lt;

	match (p1:Page)-[:PART_OF]->(b:Blog {name:"Will Lyon's Blog"}),
	(p1)-[lt:LINKS_TO]-(p1)
	delete lt;

	//Mark Needham
	https://docs.google.com/a/neotechnology.com/spreadsheets/d/1LAQarqQ-id74-zxV6R4SdG7mCq_24xACXO5WNOP-2_w/export?format=csv&id=1LAQarqQ-id74-zxV6R4SdG7mCq_24xACXO5WNOP-2_w&gid=1346442478

	//create the blog entries
	load csv with headers from "https://docs.google.com/a/neotechnology.com/spreadsheets/d/1LAQarqQ-id74-zxV6R4SdG7mCq_24xACXO5WNOP-2_w/export?format=csv&id=1LAQarqQ-id74-zxV6R4SdG7mCq_24xACXO5WNOP-2_w&gid=1346442478" as csv
	merge (p:Page {url: csv.URL})
	on match set p.title=csv.Title, p.created="Unknown"
	on create set p.title=csv.Title, p.created="Unknown"
	with p
	match (b:Blog {name:"Mark Needham's Blog"})
	create (p)-[:PART_OF]->(b);

	//create the link graph
	load csv with headers from "https://docs.google.com/a/neotechnology.com/spreadsheets/d/1LAQarqQ-id74-zxV6R4SdG7mCq_24xACXO5WNOP-2_w/export?format=csv&id=1LAQarqQ-id74-zxV6R4SdG7mCq_24xACXO5WNOP-2_w&gid=1346442478" as csv
	with csv.URL as URL, csv.Links as row
	unwind row as linklist
	with URL, [l in split(linklist,"****") \| trim(l)] as links
	unwind range(0,size(links)-2) as idx
	MERGE (l:Page {url:links[idx]})
	WITH l, URL
	MATCH (p:Page {url: URL})
	CREATE (p)-[:LINKS_TO]->(l);

	// //remove the redundant links
	match (b:Blog {name:"Mark Needham's Blog"})<-[:PART_OF]-(p1:Page)-[:LINKS_TO]->(p2:Page)
	where p2.url contains "//www.markhneedham.com/"
	and not ((b)<-[:PART_OF]-(p2))
	detach delete p2;

	match (p1:Page)-[:PART_OF]->(b:Blog {name:"Mark Needham's Blog"})<-[:PART_OF]-(p2:Page),
	(p1)-[lt:LINKS_TO]-(p2)
	delete lt;

	match (p1:Page)-[:PART_OF]->(b:Blog {name:"Mark Needham's Blog"}),
	(p1)-[lt:LINKS_TO]-(p1)
	delete lt;

	//clear out the relative and incomplete URLs
	match (p:Page)
	where p.url starts with "/"
	detach delete p;

	match (p:Page)
	where p.url starts with "#"
	detach delete p;

	match (p:Page)
	where p.url starts with "javascript"
	detach delete p;
	//Query the Bloggraph

	//Which pages have the most links
	match (b:Blog)--(p:Page)-[r:LINKS_TO]->(p2:Page)
	return b.name, p.title, count(r)
	order by count(r) desc

	//Which pages are being linked to most
	match ()-[r:LINKS_TO]->(p:Page)
	return p.url, count(r)
	order by count(r) DESC
	limit 10;

	//links between blogposts
	MATCH p=((n1:Blog)--(p1:Page)-[:LINKS_TO]-(p2:Page)--(b2:Blog))
	RETURN p

	//show all the paths between blogs
	match (b1:Blog {name:"Bruggen"}), (b2:Blog {name:"Neo4j Blog"}),
	p1 = allshortestpaths((b1)-[*]-(b2))
	return p1 as paths
	union
	match (b1:Blog {name:"Bruggen"}),(b3:Blog {name:"JEXP Blog"}),
	p2 = allshortestpaths((b1)-[*]-(b3))
	return p2 as paths
	// union
	// match (b2:Blog {name:"Neo4j Blog"}),(b3:Blog {name:"JEXP Blog"}),
	// p3 = allshortestpaths((b2)-[*]-(b3))
	// return p3 as paths;

	//Pagerank ftw
	//1. calculate the PageRank
	UNWIND range(1,50) AS round
	MATCH (n:Page)
	WHERE rand() < 0.1
	MATCH (n:Page)-[:LINKS_TO*..10]->(m:Page)
	SET m.rank = coalesce(m.rank,0) + 1

	//2. Show the PageRank
	match (n:Page)
	where n.rank is not null
	return n.url, n.rank
	order by n.rank desc
	limit 10;