Skip to content

Instantly share code, notes, and snippets.

@rvanbruggen
Last active November 27, 2019 17:39
Show Gist options
  • Save rvanbruggen/eeec1c52f6b801679ab8af297c0f4607 to your computer and use it in GitHub Desktop.
Save rvanbruggen/eeec1c52f6b801679ab8af297c0f4607 to your computer and use it in GitHub Desktop.
Carrefour data challenge in Neo4j
I have put the dataset on a google drive for download:
* the cleaned up JSON file: https://drive.google.com/file/d/1wDNAMFk_3-H1l44ID4P6fcE6K7cvG9iX/view?usp=sharing
* the ZIP of the cleaned up JSON file: https://drive.google.com/file/d/1xn2HC5APrZuoo6Tb51T5DSHbs4RArKKg/view?usp=sharing
create index on :Ticket(id);
create index on :Ticket(datetime);
create index on :Mall(id);
create index on :Client(id);
create index on :Product(description);
create index on :TicketItem(netamount);
create index on :TicketItem(numberofunits);
create index on :TicketItem(product);
//full import of cleaned json dataset
//you can download a cleaned up version of the .json file from
//https://drive.google.com/file/d/1wDNAMFk_3-H1l44ID4P6fcE6K7cvG9iX/view?usp=sharing
//:param url => "https://doc-00-ag-docs.googleusercontent.com/docs/securesc/4ahn2tvnfqvgk3nao9aq1fmtefllc0s6/arvg6ff0upfmgi8uks55qvm1pumkm5d3/1574704800000/01872855700700874083/01872855700700874083/1wDNAMFk_3-H1l44ID4P6fcE6K7cvG9iX?e=download&h=09716495216110682026&authuser=0&nonce=3k3ejiejn3k5e&user=01872855700700874083&hash=u7nherjph4hgc7s0u6o4o9m4g51tr9me";
:param url => "file:///path/to/DelightingCustomersBDclean.json";
call apoc.periodic.iterate("
call apoc.load.json($url) yield value return value","
create (t:Ticket {id: value._id, datetime: datetime(value.date)})
merge (m:Mall {id: value.mall})
create (t)-[:TICKET_PRINTED_IN_MALL]->(m)
merge (c:Client {id: value.client})
create (c)<-[:TICKET_PURCHASED_BY_CLIENT]-(t)
with value, t
unwind value.items as item
create (t)-[:TICKET_HAS_TICKETITEM]->(ti:TicketItem {product: item.desc, netamount: item.net_am, numberofunits: item.n_unit})
merge (p:Product {description: item.desc})
create (ti)-[:TICKETITEM_HAS_PRODUCT]->(p)",
{batchSize: 10000, iterateList: true, params: {url:$url}});
//PREPARATION QUERIES FOR LATER QUERIES
//calculate total value of tickets
match (t:Ticket)-->(ti:TicketItem)
with t, sum(ti.netamount) as ticketvalue
set t.ticketamount = ticketvalue;
//create direct link between ticket and product (not through ticketitem)
call apoc.periodic.iterate("
match (t:Ticket)-->(ti:TicketItem)-->(p:Product) return t,p,ti","
create (t)-[:TICKET_HAS_PRODUCT {numberofunits: ti.numberofunits, netamount: ti.netamount}]->(p)",{batchSize:10000, parallel:false});
//clients with most tickets
match (c:Client)<-[r]-(t:Ticket)
where c.id <> "Unknown"
return c.id as Client, count(r) as NrOfTickets
order by NrOfTickets desc
limit 10
//clients that are worth most
match (c:Client)<--(t:Ticket)-->(ti:TicketItem)
where c.id <> "Unknown"
return c.id, sum(ti.netamount) as totalNetamount
order by totalNetamount desc
limit 10
//products most bought in quantity
match (p:Product)<--(ti:TicketItem)
return p.description, sum(ti.numberofunits) as totalQuantity
order by totalQuantity desc
limit 10
//products most bought in sales netamount
match (p:Product)<--(ti:TicketItem)
return p.description, sum(ti.netamount) as totalNetamount
order by totalNetamount desc
limit 10
//clients that purchase in different malls
match (m1:Mall)<--(t1:Ticket)-->(c:Client)<--(t2:Ticket)-->(m2:Mall)
where m1<>m2 and t1<>t2 and c.id <> "Unknown"
return m1,m2,t1,t2,c
limit 100
//clients that purchase same product in different malls
match path = (m1:Mall)<--(t1:Ticket)-->(c:Client)<--(t2:Ticket)-->(m2:Mall)
where m1<>m2 and t1<>t2 and c.id <> "Unknown"
with path, t1, t2
match (t1)--(ti1:TicketItem)--(p:Product)--(ti2:TicketItem)--(t2)
return path,p,ti1,ti2
limit 10
//query temporal data
match (t1:Ticket), (t2:Ticket)
where t1<>t2
return t1.id, t2.id, duration.between(t1.datetime,t2.datetime)
limit 10;
//PRODUCTCOMBO queries
//a. what productcombo's are the highest spending customers buying most?
match (c:Client)<-[:TICKET_PURCHASED_BY_CLIENT]-(t:Ticket)
where c.id <> "Unknown"
with c, sum(t.ticketamount) as totalspend
order by totalspend desc
limit 100
match
(c)<-[:TICKET_PURCHASED_BY_CLIENT]-(t:Ticket)-[:TICKET_HAS_PRODUCT]->(p1:Product),
(t)-[:TICKET_HAS_PRODUCT]-(p2:Product)
where id(p1)<id(p2)
return distinct p1.description+' with '+p2.description, count(*) as frequency
order by frequency desc limit 10;
//what productcombo's are the highest spending customers buying most - EXCEPT BAGS
match (c:Client)<-[:TICKET_PURCHASED_BY_CLIENT]-(t:Ticket)
where c.id <> "Unknown"
with c, sum(t.ticketamount) as totalspend
order by totalspend desc
limit 100
match
(c)<-[:TICKET_PURCHASED_BY_CLIENT]-(t:Ticket)-[:TICKET_HAS_PRODUCT]->(p1:Product),
(t)-[:TICKET_HAS_PRODUCT]-(p2:Product)
where id(p1)<id(p2)
and not (p1.description contains "BOLSA")
and not (p2.description contains "BOLSA")
return distinct p1.description+' with '+p2.description, count(*) as frequency
order by frequency desc limit 10
//b. PRODUCTCOMBO's based on product price - expensive products bought together
//need average pricing for product
//first look at the different prices that we have on the ticketitems, and find out that they are quite different!
MATCH (n:TicketItem)-[:TICKETITEM_HAS_PRODUCT]->(p:Product)
where n.numberofunits = 1
with p,n
limit 1000
with p.description as description, collect(n.netamount) as prices
return description, size(prices) as nrofprices
order by nrofprices desc
limit 10
//lots of different prices when we have more units of a product on a ticket
MATCH (n:TicketItem)-[:TICKETITEM_HAS_PRODUCT]->(p:Product)
where n.numberofunits > 0
with p,n
limit 1000
return p.description, collect(n.netamount/n.numberofunits)
//let's look at average and standard deviation
MATCH (n:TicketItem)-[:TICKETITEM_HAS_PRODUCT]->(p:Product)
where n.numberofunits > 0
with p,n
limit 100
return p.description, collect(n.netamount/n.numberofunits) as prices, avg(n.netamount/n.numberofunits) as avgprice, stdev(n.netamount/n.numberofunits) as stdev_price
order by stdev_price desc
limit 100
//digging in: which expensive products are being bought together
//first establish ticketitemprice
call apoc.periodic.iterate("
match (ti:TicketItem) where ti.numberofunits > 0 return ti",
"set ti.ticketitemprice = ti.netamount/ti.numberofunits",
{batchSize: 10000, parallel:true});
//then use the ticketitemprice to calculate averageprice of a product
match (ti:TicketItem)-[:TICKETITEM_HAS_PRODUCT]->(p:Product)
where exists(ti.ticketitemprice)
with p, avg(ti.ticketitemprice) as avgprice
set p.averageprice = avgprice;
//TABLE VIEW:find the product combinations and their prices that are often bought together
match(p:Product)
where exists(p.averageprice)
with p.averageprice as prices
order by p.averageprice desc
limit 1000
with min(prices) as cutoff
match (p1:Product)<--(t:Ticket), (p2:Product)<--(t)
where p1.averageprice > cutoff and p2.averageprice > cutoff and id(p1)>id(p2)
return distinct p1.description as product1, p1.averageprice as product1price, p2.description as product2, p2.averageprice as product2price, count(*) as frequency
order by frequency desc limit 10
//GRAPH VIEW:which expensive products are being bought together
match(p:Product)
where exists(p.averageprice)
with p.averageprice as prices
order by p.averageprice desc
limit 250
with min(prices) as cutoff
match (p1:Product)<--(t:Ticket), (p2:Product)<--(t)
where p1.averageprice > cutoff and p2.averageprice > cutoff and id(p1)>id(p2)
return p1, t, p2
//Jaccard similarity example
// similarity of product by ticket which might not be that relevant except for co-purchase
MATCH (p:Product)<-[:TICKET_HAS_PRODUCT]-(t:Ticket)
WITH {item:id(p), categories: collect(id(t))} as userData
WITH collect(userData) as data
CALL algo.similarity.jaccard(data, {write:true, writeRelationshipType: "SIMILAR_BY_TICKET", topK:10, degreeCutoff:10, similarityCutoff:0.5})
YIELD nodes, similarityPairs, write, writeRelationshipType, writeProperty, min, max, mean, stdDev, p25, p50, p75, p90, p95, p99, p999, p100
RETURN nodes, similarityPairs, write, writeRelationshipType, writeProperty, min, max, mean, p25, p50, p75, p90, p95, p99, p999, p100;
MATCH p=()-[r:SIMILAR_BY_TICKET]->() RETURN p LIMIT 100
match (p:Product)-[conn]-()
where p.description contains "HP 302"
return p,conn;
match (p:Product)-[conn]-()
where p.description contains "RIVERSIDE"
return p,conn;
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment