Last active
November 27, 2019 17:39
-
-
Save rvanbruggen/eeec1c52f6b801679ab8af297c0f4607 to your computer and use it in GitHub Desktop.
Carrefour data challenge in Neo4j
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
I have put the dataset on a google drive for download: | |
* the cleaned up JSON file: https://drive.google.com/file/d/1wDNAMFk_3-H1l44ID4P6fcE6K7cvG9iX/view?usp=sharing | |
* the ZIP of the cleaned up JSON file: https://drive.google.com/file/d/1xn2HC5APrZuoo6Tb51T5DSHbs4RArKKg/view?usp=sharing |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
create index on :Ticket(id); | |
create index on :Ticket(datetime); | |
create index on :Mall(id); | |
create index on :Client(id); | |
create index on :Product(description); | |
create index on :TicketItem(netamount); | |
create index on :TicketItem(numberofunits); | |
create index on :TicketItem(product); | |
//full import of cleaned json dataset | |
//you can download a cleaned up version of the .json file from | |
//https://drive.google.com/file/d/1wDNAMFk_3-H1l44ID4P6fcE6K7cvG9iX/view?usp=sharing | |
//:param url => "https://doc-00-ag-docs.googleusercontent.com/docs/securesc/4ahn2tvnfqvgk3nao9aq1fmtefllc0s6/arvg6ff0upfmgi8uks55qvm1pumkm5d3/1574704800000/01872855700700874083/01872855700700874083/1wDNAMFk_3-H1l44ID4P6fcE6K7cvG9iX?e=download&h=09716495216110682026&authuser=0&nonce=3k3ejiejn3k5e&user=01872855700700874083&hash=u7nherjph4hgc7s0u6o4o9m4g51tr9me"; | |
:param url => "file:///path/to/DelightingCustomersBDclean.json"; | |
call apoc.periodic.iterate(" | |
call apoc.load.json($url) yield value return value"," | |
create (t:Ticket {id: value._id, datetime: datetime(value.date)}) | |
merge (m:Mall {id: value.mall}) | |
create (t)-[:TICKET_PRINTED_IN_MALL]->(m) | |
merge (c:Client {id: value.client}) | |
create (c)<-[:TICKET_PURCHASED_BY_CLIENT]-(t) | |
with value, t | |
unwind value.items as item | |
create (t)-[:TICKET_HAS_TICKETITEM]->(ti:TicketItem {product: item.desc, netamount: item.net_am, numberofunits: item.n_unit}) | |
merge (p:Product {description: item.desc}) | |
create (ti)-[:TICKETITEM_HAS_PRODUCT]->(p)", | |
{batchSize: 10000, iterateList: true, params: {url:$url}}); | |
//PREPARATION QUERIES FOR LATER QUERIES | |
//calculate total value of tickets | |
match (t:Ticket)-->(ti:TicketItem) | |
with t, sum(ti.netamount) as ticketvalue | |
set t.ticketamount = ticketvalue; | |
//create direct link between ticket and product (not through ticketitem) | |
call apoc.periodic.iterate(" | |
match (t:Ticket)-->(ti:TicketItem)-->(p:Product) return t,p,ti"," | |
create (t)-[:TICKET_HAS_PRODUCT {numberofunits: ti.numberofunits, netamount: ti.netamount}]->(p)",{batchSize:10000, parallel:false}); |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
//clients with most tickets | |
match (c:Client)<-[r]-(t:Ticket) | |
where c.id <> "Unknown" | |
return c.id as Client, count(r) as NrOfTickets | |
order by NrOfTickets desc | |
limit 10 | |
//clients that are worth most | |
match (c:Client)<--(t:Ticket)-->(ti:TicketItem) | |
where c.id <> "Unknown" | |
return c.id, sum(ti.netamount) as totalNetamount | |
order by totalNetamount desc | |
limit 10 | |
//products most bought in quantity | |
match (p:Product)<--(ti:TicketItem) | |
return p.description, sum(ti.numberofunits) as totalQuantity | |
order by totalQuantity desc | |
limit 10 | |
//products most bought in sales netamount | |
match (p:Product)<--(ti:TicketItem) | |
return p.description, sum(ti.netamount) as totalNetamount | |
order by totalNetamount desc | |
limit 10 | |
//clients that purchase in different malls | |
match (m1:Mall)<--(t1:Ticket)-->(c:Client)<--(t2:Ticket)-->(m2:Mall) | |
where m1<>m2 and t1<>t2 and c.id <> "Unknown" | |
return m1,m2,t1,t2,c | |
limit 100 | |
//clients that purchase same product in different malls | |
match path = (m1:Mall)<--(t1:Ticket)-->(c:Client)<--(t2:Ticket)-->(m2:Mall) | |
where m1<>m2 and t1<>t2 and c.id <> "Unknown" | |
with path, t1, t2 | |
match (t1)--(ti1:TicketItem)--(p:Product)--(ti2:TicketItem)--(t2) | |
return path,p,ti1,ti2 | |
limit 10 | |
//query temporal data | |
match (t1:Ticket), (t2:Ticket) | |
where t1<>t2 | |
return t1.id, t2.id, duration.between(t1.datetime,t2.datetime) | |
limit 10; |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
//PRODUCTCOMBO queries | |
//a. what productcombo's are the highest spending customers buying most? | |
match (c:Client)<-[:TICKET_PURCHASED_BY_CLIENT]-(t:Ticket) | |
where c.id <> "Unknown" | |
with c, sum(t.ticketamount) as totalspend | |
order by totalspend desc | |
limit 100 | |
match | |
(c)<-[:TICKET_PURCHASED_BY_CLIENT]-(t:Ticket)-[:TICKET_HAS_PRODUCT]->(p1:Product), | |
(t)-[:TICKET_HAS_PRODUCT]-(p2:Product) | |
where id(p1)<id(p2) | |
return distinct p1.description+' with '+p2.description, count(*) as frequency | |
order by frequency desc limit 10; | |
//what productcombo's are the highest spending customers buying most - EXCEPT BAGS | |
match (c:Client)<-[:TICKET_PURCHASED_BY_CLIENT]-(t:Ticket) | |
where c.id <> "Unknown" | |
with c, sum(t.ticketamount) as totalspend | |
order by totalspend desc | |
limit 100 | |
match | |
(c)<-[:TICKET_PURCHASED_BY_CLIENT]-(t:Ticket)-[:TICKET_HAS_PRODUCT]->(p1:Product), | |
(t)-[:TICKET_HAS_PRODUCT]-(p2:Product) | |
where id(p1)<id(p2) | |
and not (p1.description contains "BOLSA") | |
and not (p2.description contains "BOLSA") | |
return distinct p1.description+' with '+p2.description, count(*) as frequency | |
order by frequency desc limit 10 | |
//b. PRODUCTCOMBO's based on product price - expensive products bought together | |
//need average pricing for product | |
//first look at the different prices that we have on the ticketitems, and find out that they are quite different! | |
MATCH (n:TicketItem)-[:TICKETITEM_HAS_PRODUCT]->(p:Product) | |
where n.numberofunits = 1 | |
with p,n | |
limit 1000 | |
with p.description as description, collect(n.netamount) as prices | |
return description, size(prices) as nrofprices | |
order by nrofprices desc | |
limit 10 | |
//lots of different prices when we have more units of a product on a ticket | |
MATCH (n:TicketItem)-[:TICKETITEM_HAS_PRODUCT]->(p:Product) | |
where n.numberofunits > 0 | |
with p,n | |
limit 1000 | |
return p.description, collect(n.netamount/n.numberofunits) | |
//let's look at average and standard deviation | |
MATCH (n:TicketItem)-[:TICKETITEM_HAS_PRODUCT]->(p:Product) | |
where n.numberofunits > 0 | |
with p,n | |
limit 100 | |
return p.description, collect(n.netamount/n.numberofunits) as prices, avg(n.netamount/n.numberofunits) as avgprice, stdev(n.netamount/n.numberofunits) as stdev_price | |
order by stdev_price desc | |
limit 100 | |
//digging in: which expensive products are being bought together | |
//first establish ticketitemprice | |
call apoc.periodic.iterate(" | |
match (ti:TicketItem) where ti.numberofunits > 0 return ti", | |
"set ti.ticketitemprice = ti.netamount/ti.numberofunits", | |
{batchSize: 10000, parallel:true}); | |
//then use the ticketitemprice to calculate averageprice of a product | |
match (ti:TicketItem)-[:TICKETITEM_HAS_PRODUCT]->(p:Product) | |
where exists(ti.ticketitemprice) | |
with p, avg(ti.ticketitemprice) as avgprice | |
set p.averageprice = avgprice; | |
//TABLE VIEW:find the product combinations and their prices that are often bought together | |
match(p:Product) | |
where exists(p.averageprice) | |
with p.averageprice as prices | |
order by p.averageprice desc | |
limit 1000 | |
with min(prices) as cutoff | |
match (p1:Product)<--(t:Ticket), (p2:Product)<--(t) | |
where p1.averageprice > cutoff and p2.averageprice > cutoff and id(p1)>id(p2) | |
return distinct p1.description as product1, p1.averageprice as product1price, p2.description as product2, p2.averageprice as product2price, count(*) as frequency | |
order by frequency desc limit 10 | |
//GRAPH VIEW:which expensive products are being bought together | |
match(p:Product) | |
where exists(p.averageprice) | |
with p.averageprice as prices | |
order by p.averageprice desc | |
limit 250 | |
with min(prices) as cutoff | |
match (p1:Product)<--(t:Ticket), (p2:Product)<--(t) | |
where p1.averageprice > cutoff and p2.averageprice > cutoff and id(p1)>id(p2) | |
return p1, t, p2 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
//Jaccard similarity example | |
// similarity of product by ticket which might not be that relevant except for co-purchase | |
MATCH (p:Product)<-[:TICKET_HAS_PRODUCT]-(t:Ticket) | |
WITH {item:id(p), categories: collect(id(t))} as userData | |
WITH collect(userData) as data | |
CALL algo.similarity.jaccard(data, {write:true, writeRelationshipType: "SIMILAR_BY_TICKET", topK:10, degreeCutoff:10, similarityCutoff:0.5}) | |
YIELD nodes, similarityPairs, write, writeRelationshipType, writeProperty, min, max, mean, stdDev, p25, p50, p75, p90, p95, p99, p999, p100 | |
RETURN nodes, similarityPairs, write, writeRelationshipType, writeProperty, min, max, mean, p25, p50, p75, p90, p95, p99, p999, p100; | |
MATCH p=()-[r:SIMILAR_BY_TICKET]->() RETURN p LIMIT 100 | |
match (p:Product)-[conn]-() | |
where p.description contains "HP 302" | |
return p,conn; | |
match (p:Product)-[conn]-() | |
where p.description contains "RIVERSIDE" | |
return p,conn; |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment