carolineartz/basic_query.csv

## basic_query.csv

          
            date
             num_customers
             distinct_customers
             total_bananas
             total_revenue
             revenue_per_sale

            
              2016-01-01
               345
               287
               564
               3689
               6.54

            
              2016-01-02
               364
               299
               582
               4080
               7.01

            
              ...
               ...
               ...
               ...
               ...
               ...

## basic_query.sql
-- Daily sales summary: one output row per distinct banana_sales.date value.
SELECT
    date,
    COUNT(*)                AS num_customers,       -- number of rows for the day
    COUNT(DISTINCT user_id) AS distinct_customers,  -- unique user_id values for the day
    SUM(bananas_sold)       AS total_bananas,
    SUM(revenue)            AS total_revenue,
    AVG(revenue)            AS revenue_per_sale     -- mean revenue per row
FROM banana_sales
GROUP BY date
ORDER BY date;

## histogram.csv

          
            bucket_floor
             count

            
              0
               1054

            
              5
               465

            
              10
               233

            
              ...
               ...

## histogram.sql
-- Revenue histogram: rows are grouped into buckets 5 units wide and each
-- bucket is identified by its lower bound (0, 5, 10, ...).
SELECT
    FLOOR(revenue / 5.00) * 5 AS bucket_floor,
    COUNT(*)                  AS count
FROM banana_sales
GROUP BY FLOOR(revenue / 5.00) * 5
ORDER BY bucket_floor;

## histogram_label.csv

          
            bucket_floor
             bucket_name
             count

            
              0
               0 to 5
               1054

            
              5
               5 to 10
               465

            
              10
               10 to 15
               233

            
              ...
               ...
               ...

## histogram_label.sql
-- Revenue histogram with a human-readable label per bucket ("0 to 5", ...).
-- The inner query assigns every sale its 5-unit-wide bucket bounds; the outer
-- query counts sales per bucket and builds the label from those bounds.
-- Reads banana_sales, the same table as the unlabeled histogram, so both
-- queries report the same counts per bucket.
SELECT
    bucket_floor,
    CONCAT(bucket_floor, ' to ', bucket_ceiling) AS bucket_name,
    COUNT(*) AS count
FROM (
    SELECT
        FLOOR(revenue / 5.00) * 5     AS bucket_floor,
        FLOOR(revenue / 5.00) * 5 + 5 AS bucket_ceiling
    FROM banana_sales
) AS bucketed
-- Grouping on both bounds keeps the label expression valid under strict
-- GROUP BY rules; bucket_ceiling is always bucket_floor + 5.
GROUP BY bucket_floor, bucket_ceiling
ORDER BY bucket_floor;

## histogram_naive.sql
-- Naive histogram: one output row per exact revenue value, with the number
-- of rows that have that value. No bucketing is applied.
SELECT
    revenue,
    count(*)
FROM banana_sales
GROUP BY revenue
ORDER BY revenue;

## joint.csv

          
            wait_time_bucket
             avg_revenue

            
              0
               10.87

            
              10
               8.43

            
              20
               9.01

            
              30
               7.50

            
              ...
               ...

## joint.sql
-- Average revenue per wait-time bucket: wait_time is grouped into buckets
-- 10 units wide, each identified by its lower bound (0, 10, 20, ...).
SELECT
    FLOOR(wait_time / 10.00) * 10 AS wait_time_bucket,
    AVG(revenue)                  AS avg_revenue
FROM banana_sales
GROUP BY FLOOR(wait_time / 10.00) * 10
ORDER BY wait_time_bucket;

## joint2.sql
-- Single-row summary of how wait_time and revenue vary together over the
-- whole table: corr() result and sample covariance.
SELECT
    CORR(wait_time, revenue)       AS correlation,
    COVAR_SAMP(wait_time, revenue) AS covariance
FROM banana_sales;

## percentiles.csv

          
            date
             percentile_25
             percentile_50
             percentile_75
             avg

            
              2016-01-01
               18
               37
               75
               66

            
              2016-01-02
               19
               35
               77
               64

            
              ...
               ...
               ...
               ...
               ...

## percentiles.sql
-- Per-day wait-time quartiles, with the mean alongside for comparison.
-- percentile_cont is used as an ordered-set aggregate (WITHIN GROUP without
-- an OVER clause), so it is evaluated once per GROUP BY group exactly like
-- avg(). Combining the windowed form (OVER (PARTITION BY date)) with
-- GROUP BY date is rejected because wait_time is not a grouped column.
SELECT
    date,
    PERCENTILE_CONT(0.25) WITHIN GROUP (ORDER BY wait_time ASC) AS percentile_25,
    PERCENTILE_CONT(0.50) WITHIN GROUP (ORDER BY wait_time ASC) AS percentile_50,
    PERCENTILE_CONT(0.75) WITHIN GROUP (ORDER BY wait_time ASC) AS percentile_75,
    AVG(wait_time) AS avg -- for comparison
FROM banana_sales
GROUP BY date
ORDER BY date;

## percentiles2.csv

          
            date
             median

            
              2016-01-01
               37

            
              2016-01-02
               35

            
              ...
               ...

## percentiles2.sql
-- Per-day median wait time without percentile functions.
-- Approach: number each day's rows by ascending wait_time, count the rows
-- per day, then keep the row whose position is the middle of that day's list.
SELECT
    ranked.date,
    ranked.wait_time AS median
FROM (
    -- Position of every sale within its own day, ordered by wait_time.
    -- PARTITION BY must precede ORDER BY inside the window definition.
    SELECT
        date,
        wait_time,
        ROW_NUMBER() OVER (PARTITION BY date ORDER BY wait_time) AS row_num
    FROM banana_sales
) AS ranked
INNER JOIN (
    -- Number of sales per day.
    SELECT
        date,
        COUNT(*) AS total
    FROM banana_sales
    GROUP BY date
) AS daily
    ON ranked.date = daily.date
-- For simplicity, an even-length list is not averaged: the lower of the two
-- middle rows (position total / 2) is taken. Odd-length lists use the exact
-- middle row, (total + 1) / 2. Both branches divide an even number by 2, so
-- the comparison holds whether "/" is integer or decimal division.
WHERE ranked.row_num =
    CASE
        WHEN daily.total % 2 = 0 THEN daily.total / 2
        ELSE (daily.total + 1) / 2
    END
ORDER BY ranked.date;
date	num_customers	distinct_customers	total_bananas	total_revenue	revenue_per_sale
2016-01-01	345	287	564	3689	6.54
2016-01-02	364	299	582	4080	7.01
...	...	...	...	...	...
	-- Daily sales summary: one output row per distinct banana_sales.date value.
	SELECT
	    date,
	    COUNT(*)                AS num_customers,       -- number of rows for the day
	    COUNT(DISTINCT user_id) AS distinct_customers,  -- unique user_id values for the day
	    SUM(bananas_sold)       AS total_bananas,
	    SUM(revenue)            AS total_revenue,
	    AVG(revenue)            AS revenue_per_sale     -- mean revenue per row
	FROM banana_sales
	GROUP BY date
	ORDER BY date;
	-- Revenue histogram: rows are grouped into buckets 5 units wide and each
	-- bucket is identified by its lower bound (0, 5, 10, ...).
	SELECT
	    FLOOR(revenue / 5.00) * 5 AS bucket_floor,
	    COUNT(*)                  AS count
	FROM banana_sales
	GROUP BY FLOOR(revenue / 5.00) * 5
	ORDER BY bucket_floor;
bucket_floor	bucket_name	count
0	0 to 5	1054
5	5 to 10	465
10	10 to 15	233
...	...	...
	-- Revenue histogram with a human-readable label per bucket ("0 to 5", ...).
	-- The inner query assigns every sale its 5-unit-wide bucket bounds; the
	-- outer query counts sales per bucket and builds the label from the bounds.
	-- Reads banana_sales, the same table as the unlabeled histogram, so both
	-- queries report the same counts per bucket.
	SELECT
	    bucket_floor,
	    CONCAT(bucket_floor, ' to ', bucket_ceiling) AS bucket_name,
	    COUNT(*) AS count
	FROM (
	    SELECT
	        FLOOR(revenue / 5.00) * 5     AS bucket_floor,
	        FLOOR(revenue / 5.00) * 5 + 5 AS bucket_ceiling
	    FROM banana_sales
	) AS bucketed
	-- Grouping on both bounds keeps the label expression valid under strict
	-- GROUP BY rules; bucket_ceiling is always bucket_floor + 5.
	GROUP BY bucket_floor, bucket_ceiling
	ORDER BY bucket_floor;
	-- Naive histogram: one output row per exact revenue value, with the number
	-- of rows that have that value. No bucketing is applied.
	SELECT
	    revenue,
	    count(*)
	FROM banana_sales
	GROUP BY revenue
	ORDER BY revenue;
	-- Average revenue per wait-time bucket: wait_time is grouped into buckets
	-- 10 units wide, each identified by its lower bound (0, 10, 20, ...).
	SELECT
	    FLOOR(wait_time / 10.00) * 10 AS wait_time_bucket,
	    AVG(revenue)                  AS avg_revenue
	FROM banana_sales
	GROUP BY FLOOR(wait_time / 10.00) * 10
	ORDER BY wait_time_bucket;
	-- Single-row summary of how wait_time and revenue vary together over the
	-- whole table: corr() result and sample covariance.
	SELECT
	    CORR(wait_time, revenue)       AS correlation,
	    COVAR_SAMP(wait_time, revenue) AS covariance
	FROM banana_sales;
date	percentile_25	percentile_50	percentile_75	avg
2016-01-01	18	37	75	66
2016-01-02	19	35	77	64
...	...	...	...	...
	-- Per-day wait-time quartiles, with the mean alongside for comparison.
	-- percentile_cont is used as an ordered-set aggregate (WITHIN GROUP without
	-- an OVER clause), so it is evaluated once per GROUP BY group exactly like
	-- avg(). Combining the windowed form (OVER (PARTITION BY date)) with
	-- GROUP BY date is rejected because wait_time is not a grouped column.
	SELECT
	    date,
	    PERCENTILE_CONT(0.25) WITHIN GROUP (ORDER BY wait_time ASC) AS percentile_25,
	    PERCENTILE_CONT(0.50) WITHIN GROUP (ORDER BY wait_time ASC) AS percentile_50,
	    PERCENTILE_CONT(0.75) WITHIN GROUP (ORDER BY wait_time ASC) AS percentile_75,
	    AVG(wait_time) AS avg -- for comparison
	FROM banana_sales
	GROUP BY date
	ORDER BY date;