Skip to content

Instantly share code, notes, and snippets.

@rvanbruggen
Created November 4, 2013 18:54
Show Gist options
  • Star 3 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save rvanbruggen/7307438 to your computer and use it in GitHub Desktop.
Save rvanbruggen/7307438 to your computer and use it in GitHub Desktop.
Browser History - Clickstream Analysis with Neo4j
// Run from neo4j-shell with the neo4j-shell-tools extension installed (it provides import-cypher).
// Personal nodes: create a :url node from PersonalNodes.csv (";"-delimited); the stored id is {id}+1000.
import-cypher -d ; -i ./IMPORT/INPUT/PersonalNodes.csv -o ./IMPORT/OUTPUT/personalnodeout.csv CREATE (page:url {id: {id} + 1000, name: {name}, type: {type}}) RETURN page.name AS name
// Schema indexes on the :url properties name, id and type
CREATE INDEX ON :url(name);
CREATE INDEX ON :url(id);
CREATE INDEX ON :url(type);
// Personal relationships: connect the two :url nodes whose id equals the CSV from/to value + 1000.
// Node variables are parenthesised in the CREATE pattern; the bare form (id1-[..]->id2) is rejected by newer Cypher parsers.
// The RETURN expressions are left exactly as-is because they name the output columns.
import-cypher -d ; -i ./IMPORT/INPUT/PersonalRels.csv -o ./IMPORT/OUTPUT/personalrelout.csv MATCH (id1:url), (id2:url) WHERE id1.id = {from} + 1000 AND id2.id = {to} + 1000 CREATE (id1)-[:BROWSE_TO_PERSONAL]->(id2) RETURN id1.name, id2.name
// Professional nodes: MERGE on name, so a :url node that already exists with that name is reused.
// A matched (existing) node gets id2 and type "Both"; a newly created node gets id2 and the CSV type.
// ON MATCH / ON CREATE are followed directly by SET; the "ON MATCH n SET" spelling is milestone-era syntax that released Cypher does not accept.
import-cypher -d ; -i ./IMPORT/INPUT/ProfessionalNodes.csv -o ./IMPORT/OUTPUT/professionalnodeout.csv MERGE (n:url {name: {name}}) ON MATCH SET n.id2 = {id}, n.type = "Both" ON CREATE SET n.id2 = {id}, n.type = {type} RETURN n.name AS name
// Index the professional id (id2), which the professional relationship import matches on
CREATE INDEX ON :url(id2);
// Professional relationships: connect the two :url nodes whose id2 equals the CSV from/to value.
// Node variables are parenthesised in the CREATE pattern; the bare form (id1-[..]->id2) is rejected by newer Cypher parsers.
// The RETURN expressions are left exactly as-is because they name the output columns.
import-cypher -d ; -i ./IMPORT/INPUT/ProfessionalRels.csv -o ./IMPORT/OUTPUT/profrelout.csv MATCH (id1:url), (id2:url) WHERE id1.id2 = {from} AND id2.id2 = {to} CREATE (id1)-[:BROWSE_TO_PROFESSIONAL]->(id2) RETURN id1.name, id2.name
// Add :Personal / :Professional labels based on which id property a :url node carries:
// the personal import sets `id`, the professional import sets `id2`.
// Presence is tested with IS NOT NULL: a comparison such as `n.id <> null` evaluates to
// null rather than true, so the WHERE clause would filter out every node.
MATCH (n:url)
WHERE n.id IS NOT NULL
SET n:Personal
RETURN count(n);
MATCH (n:url)
WHERE n.id2 IS NOT NULL
SET n:Professional
RETURN count(n);
// Index name under each of the two new labels
CREATE INDEX ON :Personal(name);
CREATE INDEX ON :Professional(name);
// Count all nodes, then return every node.
// The second query is unbounded - add a LIMIT when running it against a large graph.
// Node variables are parenthesised: the bare `match n` form is not accepted by newer Cypher parsers.
MATCH (n) RETURN count(n);
MATCH (n) RETURN n;
// Nodes that carry BOTH the :Professional and :Personal labels: count, then a sample of 25.
// Both labels are stated in the node pattern, so the two queries express the filter the same way.
MATCH (n:Professional:Personal) RETURN count(n);
MATCH (n:Professional:Personal) RETURN n LIMIT 25;
// Nodes that are :Personal and NOT :Professional: count, then a sample of 25.
// The node pattern is parenthesised; the bare `match n` form is not accepted by newer Cypher parsers.
MATCH (n:Personal) WHERE NOT n:Professional RETURN count(n);
MATCH (n:Personal) WHERE NOT n:Professional RETURN n LIMIT 25;
// Nodes that are :Professional and NOT :Personal: count, then a sample of 25.
// The node pattern is parenthesised; the bare `match n` form is not accepted by newer Cypher parsers.
MATCH (n:Professional) WHERE NOT n:Personal RETURN count(n);
MATCH (n:Professional) WHERE NOT n:Personal RETURN n LIMIT 25;
// Transitions: how many times site2 was reached directly from site1, over any relationship type (top 25)
MATCH p = (site1:url)-[hop]->(site2:url)
RETURN
  site1.name AS Site1,
  site2.name AS Site2,
  count(DISTINCT p) AS NrOfVisits
ORDER BY NrOfVisits DESC
LIMIT 25;
// Incoming relationships per site, of any type, reported together with the site's labels (top 25)
MATCH ()-[visit]->(site:url)
RETURN
  site.name AS Site,
  count(visit) AS NrOfVisits,
  labels(site) AS VisitType
ORDER BY NrOfVisits DESC
LIMIT 25;
// Incoming BROWSE_TO_PROFESSIONAL relationships per site (top 25)
MATCH ()-[prof:BROWSE_TO_PROFESSIONAL]->(site:url)
RETURN
  site.name AS Site,
  count(DISTINCT prof) AS NrOfProfVisits
ORDER BY NrOfProfVisits DESC
LIMIT 25;
// Incoming BROWSE_TO_PERSONAL relationships per site (top 25)
MATCH ()-[pers:BROWSE_TO_PERSONAL]->(site:url)
RETURN
  site.name AS Site,
  count(DISTINCT pers) AS NrOfPersVisits
ORDER BY NrOfPersVisits DESC
LIMIT 25;
// Sites reached by at least one personal AND at least one professional relationship.
// DISTINCT is required: the pattern pairs every personal relationship with every professional one.
MATCH ()-[prof:BROWSE_TO_PROFESSIONAL]->(site:url)<-[pers:BROWSE_TO_PERSONAL]-()
RETURN
  site.name AS Site,
  count(DISTINCT pers) AS NrOfPersVisits,
  count(DISTINCT prof) AS NrOfProfVisits
ORDER BY NrOfProfVisits DESC
LIMIT 20;
// Sites with incoming professional relationships and no incoming personal relationship (top 100)
MATCH (site:url)<-[prof:BROWSE_TO_PROFESSIONAL]-()
WHERE NOT (site)<-[:BROWSE_TO_PERSONAL]-()
RETURN
  site.name AS Site,
  count(DISTINCT prof) AS NrOfProfVisits
ORDER BY NrOfProfVisits DESC
LIMIT 100;
// Sites with incoming personal relationships and no incoming professional relationship (top 100)
MATCH (site:url)<-[pers:BROWSE_TO_PERSONAL]-()
WHERE NOT (site)<-[:BROWSE_TO_PROFESSIONAL]-()
RETURN
  site.name AS Site,
  count(DISTINCT pers) AS NrOfPersVisits
ORDER BY NrOfPersVisits DESC
LIMIT 100;
// Directed paths of 2 to 4 hops that start at a :Professional node and end at a :Personal node.
// Variable-length matches expand quickly, so always keep the LIMIT.
MATCH p = (home:Personal)<-[*2..4]-(work:Professional)
RETURN p
LIMIT 50;
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment