Skip to content

Instantly share code, notes, and snippets.

@daigotanaka
Last active July 5, 2019 02:04
Show Gist options
  • Save daigotanaka/3907b11126114a5c74b84f89d0041e19 to your computer and use it in GitHub Desktop.
Save daigotanaka/3907b11126114a5c74b84f89d0041e19 to your computer and use it in GitHub Desktop.
SQL histogram template
/* Histogram template
Edit Config and Input sections at the top of this query.
Config:
- bin_size: Width of the bin
- min_bin
- max_bin
Input:
- c: The column of interest
- t: Table
Output:
- bin: The text label of the bin. For the out of range (min_bin, max_bin), '<=' and '>' is added
- n: Count
- bin_num: Numeric bin labels convenient for sorting bin correctly in business intelligence tools
*/
with
/* Config */
config as (
select
10 as bin_size,
10 as min_bin,
100 as max_bin
),
/* Inputs */
t as (
select
-- Set the column of interest in the next line
<your_column>
as c
-- Set the table of interest in the next line
from <your_table>
-- Any filters?
where 1=1
and date_diff(current_date, registered_on, day) > (select max_bin from config)
),
/* Don't touch below */
-- Create a number sequence from 0~999
seed as (
select v from (
select 0 as v union all select 1 union all select 2 union all select 3 union all select 4 union all select 5
union all select 6 union all select 7 union all select 8 union all select 9)
),
n999 as (
select (s0.v + 10 * s1.v + 100 * s2.v) as v
from seed as s0 cross join seed as s1 cross join seed as s2
),
-- Create bins in the range min_bin ~ max_bin
bins as (
select bin,
0 as n
from (select v * (select bin_size from config) as bin from n999)
where (select min_bin from config) < bin and bin <= (select max_bin from config)
order by 1
),
-- Count instances by bin
-- Also generate the sum for the out of range values
counts as (
-- Equal or less than min_bin
select
(select min_bin from config) as bin,
sum(case when c < (select min_bin from config) then 1 else 0 end) as n
from t
-- Greater than max_bin
union all(
select
(select max_bin from config) + 1 as bin,
sum(case when c >= (select max_bin from config) then 1 else 0 end) as n
from t)
-- Everything in between
union all (
select
(floor(c / (select bin_size from config)) + 1) * (select bin_size from config) as bin,
count(*) as n
from t
where (select min_bin from config) <= c and c < (select max_bin from config)
group by 1)
order by 1
),
with_zero_bins as (
select
coalesce(b.bin, c.bin) as bin,
coalesce(b.n, 0) + coalesce(c.n, 0) as n
from bins as b
full join counts as c
on b.bin=c.bin
),
histogram as (
select
bin as bin_num,
case
when bin = (select min_bin from config) then concat('<', cast((select min_bin from config) as string))
when bin > (select max_bin from config) then concat ('>=', cast((select max_bin from config) as string))
else cast(bin as string)
end as bin,
n
from with_zero_bins
)
-- Check sum
-- select count(*) as n from t union all select sum(n) from histogram
select
bin,
n,
-- Use bin_num to sort bins
bin_num
from histogram
order by bin_num
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment