Created
November 4, 2013 18:54
-
-
Save rvanbruggen/7307438 to your computer and use it in GitHub Desktop.
Browser History - Clickstream Analysis with Neo4j
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// This script relies on neo4j-shell-tools (the import-cypher command).
// Load the personal nodes: one :url node is created per input row, with the id shifted by 1000.
// The {id}, {name} and {type} parameters are presumably bound from the CSV columns - confirm against the input file.
import-cypher -d ; -i ./IMPORT/INPUT/PersonalNodes.csv -o ./IMPORT/OUTPUT/personalnodeout.csv CREATE (n:url {id: {id}+1000, name: {name}, type: {type}}) RETURN n.name AS name
// Index the :url properties that the later MATCH / MERGE steps look nodes up by.
CREATE INDEX ON :url(name);
CREATE INDEX ON :url(id);
CREATE INDEX ON :url(type);
// Link personal nodes: {from} and {to} receive the same +1000 offset that was applied to the node ids on creation.
import-cypher -d ; -i ./IMPORT/INPUT/PersonalRels.csv -o ./IMPORT/OUTPUT/personalrelout.csv MATCH (id1:url), (id2:url) WHERE id1.id = {from}+1000 AND id2.id = {to}+1000 CREATE (id1)-[:BROWSE_TO_PERSONAL]->(id2) RETURN id1.name, id2.name
// Load the professional nodes, merging on name.
// A url that already exists gets id2 and its type overwritten with "Both"; a new url gets id2 and the type from the input row.
import-cypher -d ; -i ./IMPORT/INPUT/ProfessionalNodes.csv -o ./IMPORT/OUTPUT/professionalnodeout.csv MERGE (n:url {name: {name}}) ON MATCH n SET n.id2 = {id}, n.type = "Both" ON CREATE n SET n.id2 = {id}, n.type = {type} RETURN n.name AS name
// id2 now exists on the professional nodes, so index it before the professional relationships are matched on it.
CREATE INDEX ON :url(id2);
// Link professional nodes: endpoints are looked up by id2 (no offset is applied here).
import-cypher -d ; -i ./IMPORT/INPUT/ProfessionalRels.csv -o ./IMPORT/OUTPUT/profrelout.csv MATCH (id1:url), (id2:url) WHERE id1.id2 = {from} AND id2.id2 = {to} CREATE (id1)-[:BROWSE_TO_PROFESSIONAL]->(id2) RETURN id1.name, id2.name
// Set the "Personal" and "Professional" labels.
// Nodes created by the personal import carry `id`; nodes touched by the
// professional import carry `id2`. A url that has both properties ends up
// with both labels.
// The presence test must be IS NOT NULL: a comparison such as `n.id <> null`
// evaluates to null (never true), so it would filter out every node and no
// label would be set.
MATCH (n:url)
WHERE n.id IS NOT NULL
SET n:Personal
RETURN count(n);
MATCH (n:url)
WHERE n.id2 IS NOT NULL
SET n:Professional
RETURN count(n);
// Name lookups on the two role labels get their own indexes.
CREATE INDEX ON :Personal(name);
CREATE INDEX ON :Professional(name);
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Total number of nodes, followed by a dump of every node (the second query has no LIMIT).
MATCH (n) RETURN count(n);
MATCH (n) RETURN n;
// Nodes labelled both Professional and Personal: the count, then a sample of 25.
MATCH (n) WHERE n:Professional AND n:Personal RETURN count(n);
MATCH (n) WHERE n:Professional:Personal RETURN n LIMIT 25;
// Nodes labelled Personal but not Professional: the count, then a sample of 25.
MATCH (n:Personal) WHERE NOT n:Professional RETURN count(n);
MATCH (n:Personal) WHERE NOT n:Professional RETURN n LIMIT 25;
// Nodes labelled Professional but not Personal: the count, then a sample of 25.
MATCH (n:Professional) WHERE NOT n:Personal RETURN count(n);
MATCH (n:Professional) WHERE NOT n:Personal RETURN n LIMIT 25;
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Top 25 site-to-site transitions: how many distinct one-hop paths lead from Site1 to Site2.
MATCH p = (origin:url)-->(destination:url)
RETURN origin.name AS Site1,
       destination.name AS Site2,
       count(DISTINCT p) AS NrOfVisits
ORDER BY NrOfVisits DESC
LIMIT 25;
// Top 25 most visited sites: incoming relationships of any type, reported with the site's labels.
MATCH ()-[visit]->(target:url)
RETURN target.name AS Site,
       count(visit) AS NrOfVisits,
       labels(target) AS VisitType
ORDER BY NrOfVisits DESC
LIMIT 25;
// Top 25 sites by number of incoming BROWSE_TO_PROFESSIONAL relationships.
MATCH ()-[visit:BROWSE_TO_PROFESSIONAL]->(target:url)
RETURN target.name AS Site,
       count(DISTINCT visit) AS NrOfProfVisits
ORDER BY NrOfProfVisits DESC
LIMIT 25;
// Top 25 sites by number of incoming BROWSE_TO_PERSONAL relationships.
MATCH ()-[visit:BROWSE_TO_PERSONAL]->(target:url)
RETURN target.name AS Site,
       count(DISTINCT visit) AS NrOfPersVisits
ORDER BY NrOfPersVisits DESC
LIMIT 25;
// Sites reached by at least one personal AND one professional visit.
// Each personal/professional pair yields a row, hence the DISTINCT in both counts.
MATCH (target:url)<-[personalVisit:BROWSE_TO_PERSONAL]-(),
      (target)<-[professionalVisit:BROWSE_TO_PROFESSIONAL]-()
RETURN target.name AS Site,
       count(DISTINCT personalVisit) AS NrOfPersVisits,
       count(DISTINCT professionalVisit) AS NrOfProfVisits
ORDER BY NrOfProfVisits DESC
LIMIT 20;
// Sites with professional visits and no incoming personal visit at all (top 100).
MATCH (target:url)<-[visit:BROWSE_TO_PROFESSIONAL]-()
WHERE NOT (target)<-[:BROWSE_TO_PERSONAL]-()
RETURN target.name AS Site,
       count(DISTINCT visit) AS NrOfProfVisits
ORDER BY NrOfProfVisits DESC
LIMIT 100;
// Sites with personal visits and no incoming professional visit at all (top 100).
MATCH (target:url)<-[pers:BROWSE_TO_PERSONAL]-()
WHERE NOT (target)<-[:BROWSE_TO_PROFESSIONAL]-()
RETURN target.name AS Site,
       count(DISTINCT pers) AS NrOfPersVisits
ORDER BY NrOfPersVisits DESC
LIMIT 100;
// Paths of 2 to 4 hops that lead from a Professional site into a Personal site.
// Variable-length matches can explode, so the result is always capped with LIMIT.
MATCH p = (personalSite:Personal)<-[*2..4]-(professionalSite:Professional)
RETURN p
LIMIT 50;
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment