rvanbruggen/0-carrefour dataset.txt

## 0-carrefour dataset.txt
I have put the dataset on a google drive for download:
* the cleaned up JSON file: https://drive.google.com/file/d/1wDNAMFk_3-H1l44ID4P6fcE6K7cvG9iX/view?usp=sharing
* the ZIP of the cleaned up JSON file: https://drive.google.com/file/d/1xn2HC5APrZuoo6Tb51T5DSHbs4RArKKg/view?usp=sharing

## 1-import and wrangle.cql
create index on :Ticket(id);
create index on :Ticket(datetime);
create index on :Mall(id);
create index on :Client(id);
create index on :Product(description);
create index on :TicketItem(netamount);
create index on :TicketItem(numberofunits);
create index on :TicketItem(product);

//full import of cleaned json dataset
//you can download a cleaned up version of the .json file from
//https://drive.google.com/file/d/1wDNAMFk_3-H1l44ID4P6fcE6K7cvG9iX/view?usp=sharing
//:param url => "https://doc-00-ag-docs.googleusercontent.com/docs/securesc/4ahn2tvnfqvgk3nao9aq1fmtefllc0s6/arvg6ff0upfmgi8uks55qvm1pumkm5d3/1574704800000/01872855700700874083/01872855700700874083/1wDNAMFk_3-H1l44ID4P6fcE6K7cvG9iX?e=download&h=09716495216110682026&authuser=0&nonce=3k3ejiejn3k5e&user=01872855700700874083&hash=u7nherjph4hgc7s0u6o4o9m4g51tr9me";
:param url => "file:///path/to/DelightingCustomersBDclean.json";

call apoc.periodic.iterate("
  call apoc.load.json($url) yield value return value","
    create (t:Ticket {id: value._id, datetime: datetime(value.date)})
    merge (m:Mall {id: value.mall})
    create (t)-[:TICKET_PRINTED_IN_MALL]->(m)
    merge (c:Client {id: value.client})
    create (c)<-[:TICKET_PURCHASED_BY_CLIENT]-(t)
      with value, t
        unwind value.items as item
        create (t)-[:TICKET_HAS_TICKETITEM]->(ti:TicketItem {product: item.desc, netamount: item.net_am, numberofunits: item.n_unit})
        merge (p:Product {description: item.desc})
        create (ti)-[:TICKETITEM_HAS_PRODUCT]->(p)",
  {batchSize: 10000, iterateList: true, params: {url:$url}});


//PREPARATION QUERIES FOR LATER QUERIES
//calculate total value of tickets
match (t:Ticket)-->(ti:TicketItem)
with t, sum(ti.netamount) as ticketvalue
set t.ticketamount = ticketvalue;

//create direct link between ticket and product (not through ticketitem)
call apoc.periodic.iterate("
match (t:Ticket)-->(ti:TicketItem)-->(p:Product) return t,p,ti","
create (t)-[:TICKET_HAS_PRODUCT {numberofunits: ti.numberofunits, netamount: ti.netamount}]->(p)",{batchSize:10000, parallel:false});

## 2-interesting queries.cql
//clients with most tickets
match (c:Client)<-[r]-(t:Ticket)
where c.id <> "Unknown"
return c.id as Client, count(r) as NrOfTickets
order by NrOfTickets desc
limit 10

//clients that are worth most
match (c:Client)<--(t:Ticket)-->(ti:TicketItem)
where c.id <> "Unknown"
return c.id, sum(ti.netamount) as totalNetamount
order by totalNetamount desc
limit 10


//products most bought in quantity
match (p:Product)<--(ti:TicketItem)
return p.description, sum(ti.numberofunits) as totalQuantity
order by totalQuantity desc
limit 10

//products most bought in sales netamount
match (p:Product)<--(ti:TicketItem)
return p.description, sum(ti.netamount) as totalNetamount
order by totalNetamount desc
limit 10

//clients that purchase in different malls
match (m1:Mall)<--(t1:Ticket)-->(c:Client)<--(t2:Ticket)-->(m2:Mall)
where m1<>m2 and t1<>t2 and c.id <> "Unknown"
return m1,m2,t1,t2,c
limit 100

//clients that purchase same product in different malls
match path = (m1:Mall)<--(t1:Ticket)-->(c:Client)<--(t2:Ticket)-->(m2:Mall)
where m1<>m2 and t1<>t2 and c.id <> "Unknown"
with path, t1, t2
match (t1)--(ti1:TicketItem)--(p:Product)--(ti2:TicketItem)--(t2)
return path,p,ti1,ti2
limit 10

//query temporal data
match (t1:Ticket), (t2:Ticket)
where t1<>t2
return t1.id, t2.id, duration.between(t1.datetime,t2.datetime)
limit 10;

## 3-productcombo queries.cql
//PRODUCTCOMBO queries
//a. what productcombo's are the highest spending customers buying most?
match (c:Client)<-[:TICKET_PURCHASED_BY_CLIENT]-(t:Ticket)
where c.id <> "Unknown"
with c, sum(t.ticketamount) as totalspend
order by totalspend desc
limit 100
match
  (c)<-[:TICKET_PURCHASED_BY_CLIENT]-(t:Ticket)-[:TICKET_HAS_PRODUCT]->(p1:Product),
  (t)-[:TICKET_HAS_PRODUCT]-(p2:Product)
where id(p1)<id(p2)
return distinct p1.description+' with '+p2.description, count(*) as frequency
order by frequency desc limit 10;

//what productcombo's are the highest spending customers buying most - EXCEPT BAGS
match (c:Client)<-[:TICKET_PURCHASED_BY_CLIENT]-(t:Ticket)
where c.id <> "Unknown"
with c, sum(t.ticketamount) as totalspend
order by totalspend desc
limit 100
match
  (c)<-[:TICKET_PURCHASED_BY_CLIENT]-(t:Ticket)-[:TICKET_HAS_PRODUCT]->(p1:Product),
  (t)-[:TICKET_HAS_PRODUCT]-(p2:Product)
where id(p1)<id(p2)
and not (p1.description contains "BOLSA")
and not (p2.description contains "BOLSA")
return distinct p1.description+' with '+p2.description, count(*) as frequency
order by frequency desc limit 10

//b. PRODUCTCOMBO's based on product price - expensive products bought together
//need average pricing for product
//first look at the different prices that we have on the ticketitems, and find out that they are quite different!
MATCH (n:TicketItem)-[:TICKETITEM_HAS_PRODUCT]->(p:Product)
where n.numberofunits = 1
with p,n
limit 1000
with p.description as description,  collect(n.netamount) as prices
return description, size(prices) as nrofprices
order by nrofprices desc
limit 10

//lots of different prices when we have more units of a product on a ticket
MATCH (n:TicketItem)-[:TICKETITEM_HAS_PRODUCT]->(p:Product)
where n.numberofunits > 0
with p,n
limit 1000
return p.description,  collect(n.netamount/n.numberofunits)

//let's look at average and standard deviation
MATCH (n:TicketItem)-[:TICKETITEM_HAS_PRODUCT]->(p:Product)
where n.numberofunits > 0
with p,n
limit 100
return p.description,  collect(n.netamount/n.numberofunits) as prices, avg(n.netamount/n.numberofunits) as avgprice, stdev(n.netamount/n.numberofunits) as stdev_price
order by stdev_price desc
limit 100

//digging in: which expensive products are being bought together
//first establish ticketitemprice
call apoc.periodic.iterate("
match (ti:TicketItem) where ti.numberofunits > 0 return ti",
"set ti.ticketitemprice = ti.netamount/ti.numberofunits",
{batchSize: 10000, parallel:true});
//then use the ticketitemprice to calculate averageprice of a product
match (ti:TicketItem)-[:TICKETITEM_HAS_PRODUCT]->(p:Product)
where exists(ti.ticketitemprice)
with p, avg(ti.ticketitemprice) as avgprice
set p.averageprice = avgprice;
//TABLE VIEW:find the product combinations and their prices that are often bought together
match(p:Product)
where exists(p.averageprice)
with p.averageprice as prices
order by p.averageprice desc
limit 1000
with min(prices) as cutoff
match (p1:Product)<--(t:Ticket), (p2:Product)<--(t)
where p1.averageprice > cutoff and p2.averageprice > cutoff and id(p1)>id(p2)
return distinct p1.description as product1, p1.averageprice as product1price, p2.description as product2, p2.averageprice as product2price, count(*) as frequency
order by frequency desc limit 10

//GRAPH VIEW:which expensive products are being bought together
match(p:Product)
where exists(p.averageprice)
with p.averageprice as prices
order by p.averageprice desc
limit 250
with min(prices) as cutoff
match (p1:Product)<--(t:Ticket), (p2:Product)<--(t)
where p1.averageprice > cutoff and p2.averageprice > cutoff and id(p1)>id(p2)
return p1, t, p2

## 4-algo queries.cql
//Jaccard similarity example
// similarity of product by ticket which might not be that relevant except for co-purchase
MATCH (p:Product)<-[:TICKET_HAS_PRODUCT]-(t:Ticket)
WITH {item:id(p), categories: collect(id(t))} as userData
WITH collect(userData) as data
CALL algo.similarity.jaccard(data, {write:true, writeRelationshipType: "SIMILAR_BY_TICKET", topK:10, degreeCutoff:10, similarityCutoff:0.5})
YIELD nodes, similarityPairs, write, writeRelationshipType, writeProperty, min, max, mean, stdDev, p25, p50, p75, p90, p95, p99, p999, p100
RETURN nodes, similarityPairs, write, writeRelationshipType, writeProperty, min, max, mean, p25, p50, p75, p90, p95, p99, p999, p100;

MATCH p=()-[r:SIMILAR_BY_TICKET]->() RETURN p LIMIT 100

match (p:Product)-[conn]-()
where p.description contains "HP 302"
return p,conn;

match (p:Product)-[conn]-()
where p.description contains "RIVERSIDE"
return p,conn;
	I have put the dataset on a google drive for download:
	* the cleaned up JSON file: https://drive.google.com/file/d/1wDNAMFk_3-H1l44ID4P6fcE6K7cvG9iX/view?usp=sharing
	* the ZIP of the cleaned up JSON file: https://drive.google.com/file/d/1xn2HC5APrZuoo6Tb51T5DSHbs4RArKKg/view?usp=sharing
	create index on :Ticket(id);
	create index on :Ticket(datetime);
	create index on :Mall(id);
	create index on :Client(id);
	create index on :Product(description);
	create index on :TicketItem(netamount);
	create index on :TicketItem(numberofunits);
	create index on :TicketItem(product);

	//full import of cleaned json dataset
	//you can download a cleaned up version of the .json file from
	//https://drive.google.com/file/d/1wDNAMFk_3-H1l44ID4P6fcE6K7cvG9iX/view?usp=sharing
	//:param url => "https://doc-00-ag-docs.googleusercontent.com/docs/securesc/4ahn2tvnfqvgk3nao9aq1fmtefllc0s6/arvg6ff0upfmgi8uks55qvm1pumkm5d3/1574704800000/01872855700700874083/01872855700700874083/1wDNAMFk_3-H1l44ID4P6fcE6K7cvG9iX?e=download&h=09716495216110682026&authuser=0&nonce=3k3ejiejn3k5e&user=01872855700700874083&hash=u7nherjph4hgc7s0u6o4o9m4g51tr9me";
	:param url => "file:///path/to/DelightingCustomersBDclean.json";

	call apoc.periodic.iterate("
	call apoc.load.json($url) yield value return value","
	create (t:Ticket {id: value._id, datetime: datetime(value.date)})
	merge (m:Mall {id: value.mall})
	create (t)-[:TICKET_PRINTED_IN_MALL]->(m)
	merge (c:Client {id: value.client})
	create (c)<-[:TICKET_PURCHASED_BY_CLIENT]-(t)
	with value, t
	unwind value.items as item
	create (t)-[:TICKET_HAS_TICKETITEM]->(ti:TicketItem {product: item.desc, netamount: item.net_am, numberofunits: item.n_unit})
	merge (p:Product {description: item.desc})
	create (ti)-[:TICKETITEM_HAS_PRODUCT]->(p)",
	{batchSize: 10000, iterateList: true, params: {url:$url}});


	//PREPARATION QUERIES FOR LATER QUERIES
	//calculate total value of tickets
	match (t:Ticket)-->(ti:TicketItem)
	with t, sum(ti.netamount) as ticketvalue
	set t.ticketamount = ticketvalue;

	//create direct link between ticket and product (not through ticketitem)
	call apoc.periodic.iterate("
	match (t:Ticket)-->(ti:TicketItem)-->(p:Product) return t,p,ti","
	create (t)-[:TICKET_HAS_PRODUCT {numberofunits: ti.numberofunits, netamount: ti.netamount}]->(p)",{batchSize:10000, parallel:false});
	//clients with most tickets
	match (c:Client)<-[r]-(t:Ticket)
	where c.id <> "Unknown"
	return c.id as Client, count(r) as NrOfTickets
	order by NrOfTickets desc
	limit 10

	//clients that are worth most
	match (c:Client)<--(t:Ticket)-->(ti:TicketItem)
	where c.id <> "Unknown"
	return c.id, sum(ti.netamount) as totalNetamount
	order by totalNetamount desc
	limit 10


	//products most bought in quantity
	match (p:Product)<--(ti:TicketItem)
	return p.description, sum(ti.numberofunits) as totalQuantity
	order by totalQuantity desc
	limit 10

	//products most bought in sales netamount
	match (p:Product)<--(ti:TicketItem)
	return p.description, sum(ti.netamount) as totalNetamount
	order by totalNetamount desc
	limit 10

	//clients that purchase in different malls
	match (m1:Mall)<--(t1:Ticket)-->(c:Client)<--(t2:Ticket)-->(m2:Mall)
	where m1<>m2 and t1<>t2 and c.id <> "Unknown"
	return m1,m2,t1,t2,c
	limit 100

	//clients that purchase same product in different malls
	match path = (m1:Mall)<--(t1:Ticket)-->(c:Client)<--(t2:Ticket)-->(m2:Mall)
	where m1<>m2 and t1<>t2 and c.id <> "Unknown"
	with path, t1, t2
	match (t1)--(ti1:TicketItem)--(p:Product)--(ti2:TicketItem)--(t2)
	return path,p,ti1,ti2
	limit 10

	//query temporal data
	match (t1:Ticket), (t2:Ticket)
	where t1<>t2
	return t1.id, t2.id, duration.between(t1.datetime,t2.datetime)
	limit 10;
	//PRODUCTCOMBO queries
	//a. what productcombo's are the highest spending customers buying most?
	match (c:Client)<-[:TICKET_PURCHASED_BY_CLIENT]-(t:Ticket)
	where c.id <> "Unknown"
	with c, sum(t.ticketamount) as totalspend
	order by totalspend desc
	limit 100
	match
	(c)<-[:TICKET_PURCHASED_BY_CLIENT]-(t:Ticket)-[:TICKET_HAS_PRODUCT]->(p1:Product),
	(t)-[:TICKET_HAS_PRODUCT]-(p2:Product)
	where id(p1)<id(p2)
	return distinct p1.description+' with '+p2.description, count(*) as frequency
	order by frequency desc limit 10;

	//what productcombo's are the highest spending customers buying most - EXCEPT BAGS
	match (c:Client)<-[:TICKET_PURCHASED_BY_CLIENT]-(t:Ticket)
	where c.id <> "Unknown"
	with c, sum(t.ticketamount) as totalspend
	order by totalspend desc
	limit 100
	match
	(c)<-[:TICKET_PURCHASED_BY_CLIENT]-(t:Ticket)-[:TICKET_HAS_PRODUCT]->(p1:Product),
	(t)-[:TICKET_HAS_PRODUCT]-(p2:Product)
	where id(p1)<id(p2)
	and not (p1.description contains "BOLSA")
	and not (p2.description contains "BOLSA")
	return distinct p1.description+' with '+p2.description, count(*) as frequency
	order by frequency desc limit 10

	//b. PRODUCTCOMBO's based on product price - expensive products bought together
	//need average pricing for product
	//first look at the different prices that we have on the ticketitems, and find out that they are quite different!
	MATCH (n:TicketItem)-[:TICKETITEM_HAS_PRODUCT]->(p:Product)
	where n.numberofunits = 1
	with p,n
	limit 1000
	with p.description as description, collect(n.netamount) as prices
	return description, size(prices) as nrofprices
	order by nrofprices desc
	limit 10

	//lots of different prices when we have more units of a product on a ticket
	MATCH (n:TicketItem)-[:TICKETITEM_HAS_PRODUCT]->(p:Product)
	where n.numberofunits > 0
	with p,n
	limit 1000
	return p.description, collect(n.netamount/n.numberofunits)

	//let's look at average and standard deviation
	MATCH (n:TicketItem)-[:TICKETITEM_HAS_PRODUCT]->(p:Product)
	where n.numberofunits > 0
	with p,n
	limit 100
	return p.description, collect(n.netamount/n.numberofunits) as prices, avg(n.netamount/n.numberofunits) as avgprice, stdev(n.netamount/n.numberofunits) as stdev_price
	order by stdev_price desc
	limit 100

	//digging in: which expensive products are being bought together
	//first establish ticketitemprice
	call apoc.periodic.iterate("
	match (ti:TicketItem) where ti.numberofunits > 0 return ti",
	"set ti.ticketitemprice = ti.netamount/ti.numberofunits",
	{batchSize: 10000, parallel:true});
	//then use the ticketitemprice to calculate averageprice of a product
	match (ti:TicketItem)-[:TICKETITEM_HAS_PRODUCT]->(p:Product)
	where exists(ti.ticketitemprice)
	with p, avg(ti.ticketitemprice) as avgprice
	set p.averageprice = avgprice;
	//TABLE VIEW:find the product combinations and their prices that are often bought together
	match(p:Product)
	where exists(p.averageprice)
	with p.averageprice as prices
	order by p.averageprice desc
	limit 1000
	with min(prices) as cutoff
	match (p1:Product)<--(t:Ticket), (p2:Product)<--(t)
	where p1.averageprice > cutoff and p2.averageprice > cutoff and id(p1)>id(p2)
	return distinct p1.description as product1, p1.averageprice as product1price, p2.description as product2, p2.averageprice as product2price, count(*) as frequency
	order by frequency desc limit 10

	//GRAPH VIEW:which expensive products are being bought together
	match(p:Product)
	where exists(p.averageprice)
	with p.averageprice as prices
	order by p.averageprice desc
	limit 250
	with min(prices) as cutoff
	match (p1:Product)<--(t:Ticket), (p2:Product)<--(t)
	where p1.averageprice > cutoff and p2.averageprice > cutoff and id(p1)>id(p2)
	return p1, t, p2
	//Jaccard similarity example
	// similarity of product by ticket which might not be that relevant except for co-purchase
	MATCH (p:Product)<-[:TICKET_HAS_PRODUCT]-(t:Ticket)
	WITH {item:id(p), categories: collect(id(t))} as userData
	WITH collect(userData) as data
	CALL algo.similarity.jaccard(data, {write:true, writeRelationshipType: "SIMILAR_BY_TICKET", topK:10, degreeCutoff:10, similarityCutoff:0.5})
	YIELD nodes, similarityPairs, write, writeRelationshipType, writeProperty, min, max, mean, stdDev, p25, p50, p75, p90, p95, p99, p999, p100
	RETURN nodes, similarityPairs, write, writeRelationshipType, writeProperty, min, max, mean, p25, p50, p75, p90, p95, p99, p999, p100;

	MATCH p=()-[r:SIMILAR_BY_TICKET]->() RETURN p LIMIT 100

	match (p:Product)-[conn]-()
	where p.description contains "HP 302"
	return p,conn;

	match (p:Product)-[conn]-()
	where p.description contains "RIVERSIDE"
	return p,conn;