Skip to content

Instantly share code, notes, and snippets.

@carolineartz
Forked from mdeland/basic_query.csv
Created March 5, 2017 03:13
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save carolineartz/5f1fe3316fbc33fdd6a5a55b9bdfa8f7 to your computer and use it in GitHub Desktop.
Save carolineartz/5f1fe3316fbc33fdd6a5a55b9bdfa8f7 to your computer and use it in GitHub Desktop.
Summarizing Data in SQL
date num_customers distinct_customers total_bananas total_revenue revenue_per_sale
2016-01-01 345 287 564 3689 6.54
2016-01-02 364 299 582 4080 7.01
... ... ... ... ... ...
SELECT
date,
count(*) as num_customers,
count(distinct user_id) as distinct_customers,
sum(bananas_sold) as total_bananas,
sum(revenue) as total_revenue,
avg(revenue) as revenue_per_sale
FROM banana_sales
GROUP BY date
ORDER BY date;
bucket_floor count
0 1054
5 465
10 233
... ...
select
floor(revenue/5.00)*5 as bucket_floor,
count(*) as count
from banana_sales
group by 1
order by 1;
bucket_floor bucket_name count
0 0 to 5 1054
5 5 to 10 465
10 10 to 15 233
... ... ...
select
bucket_floor,
CONCAT(bucket_floor, ' to ', bucket_ceiling) as bucket_name,
count(*) as count
from (
select
floor(revenue/5.00)*5 as bucket_floor,
floor(revenue/5.00)*5 + 5 as bucket_ceiling
from web_sessions_table
) a
group by 1, 2
order by 1;
select
revenue,
count(*)
from banana_sales
group by revenue
order by revenue;
wait_time_bucket avg_revenue
0 10.87
10 8.43
20 9.01
30 7.50
... ...
select
floor(wait_time/10.00)*10 as wait_time_bucket,
avg(revenue) as avg_revenue
from banana_sales
group by 1
order by 1;
select
corr(wait_time, revenue) as correlation,
covar_samp(wait_time, revenue) as covariance
from banana_sales;
date percentile_25 percentile_50 percentile_75 avg
2016-01-01 18 37 75 66
2016-01-02 19 35 77 64
... ... ... ... ...
SELECT
date,
percentile_cont (0.25) WITHIN GROUP
(ORDER BY wait_time ASC) OVER(PARTITION BY date) as percentile_25,
percentile_cont (0.50) WITHIN GROUP
(ORDER BY wait_time ASC) OVER(PARTITION BY date) as percentile_50,
percentile_cont (0.75) WITHIN GROUP
(ORDER BY wait_time ASC) OVER(PARTITION BY date) as percentile_75,
avg(wait_time) as avg -- for comparison
FROM banana_sales
GROUP BY date
ORDER BY date;
date median
2016-01-01 37
2016-01-02 35
... ...
SELECT
t1.date,
t1.wait_time as median
FROM (
SELECT
date,
wait_time,
ROW_NUMBER() OVER(ORDER BY wait_time PARTITION BY date) as row_num
FROM banana_sales
) t
JOIN (
SELECT
date,
count(*) as total
FROM banana_sales
GROUP BY date
) t2
ON
t1.date = t2.date
-- for simplicity, we take a simple solution when the list has an even length, to just choose one value
WHERE t1.row_num =
CASE when t2.total % 2 = 0
THEN t2.total / 2
ELSE (t2.total + 1) / 2
END;
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment