evan-burke/postgres bulk percentiles calculation

## postgres bulk percentiles calculation

This will return the percentiles in order.

-- #1
select unnest(
percentile_cont(
	(select array_agg(s) from generate_series(0, 1, 0.2) as s)
) WITHIN GROUP (ORDER BY SIZE))
from mytable


Made easier using LATERAL, and this one gives the (fractional) percentile too:

-- #2
select percentile, value
from generate_series(0, 1, 0.2) as percentile,
lateral (select percentile_cont(percentile) WITHIN GROUP (ORDER BY SIZE) as value from mytable) values

The lateral approach is a lot slower, though. Calculation time scales linearly based on the number of percentiles you want to calculate - percentile_cont() is being called for each item in generate_series, while the version above just runs it once.


This is the best way I've found to use the first method while returning the input percentile - though this is pretty hacky, as it more or less makes up a column to join on.

-- #3
select pct, value
from (select row_number() OVER () as rownum, value
	from (select unnest(
		percentile_cont(
			(select array_agg(s) from generate_series(0, 1, 0.2) as s)
		) WITHIN GROUP (ORDER BY SIZE)) as value
		from mytable) s2
	) s3
join (select row_number() OVER () as rownum, pct
	from (select generate_series(0, 1, 0.2) as pct) s) p
	on p.rownum = s3.rownum


ntile() looks a bit better but getting lower/upper bounds is a bit clumsy, and values here are not quite the same as percentile_cont. (Might be the same as percentile_disc().)
This is about half as fast as query #3 on my dataset.

-- #4
select ((nt-1)/5::float) as pctile,  min(size)
from (select size, ntile(6) over (order by size) as nt
   from mytable
 ) as dt
group by nt
order by nt asc


This is the approach I selected. A lot cleaner than #3 above.

SELECT unnest(
  (select array_agg(fraction) from generate_series(0, 1, 0.2) AS fraction)
) as percentile,
unnest(
  (select percentile_cont((select array_agg(s) from generate_series(0, 1, 0.2) as s)) WITHIN GROUP (ORDER BY SIZE) FROM mytable)
) as value;


-- Or with manually defined percentiles (tested on pg 9.6)
select unnest(
	(ARRAY[0.5, 0.75, 0.9, 0.95, 0.97, 0.98, 0.99, 1])
) as percentile,
unnest(
	(select percentile_cont(ARRAY[0.5, 0.75, 0.9, 0.95, 0.97, 0.98, 0.99, 1])
		WITHIN GROUP (ORDER BY value) FROM mytable
	)
) as value

	This will return the percentiles in order.

	-- #1
	select unnest(
	percentile_cont(
	(select array_agg(s) from generate_series(0, 1, 0.2) as s)
	) WITHIN GROUP (ORDER BY SIZE))
	from mytable


	Made easier using LATERAL, and this one gives the (fractional) percentile too:

	-- #2
	select percentile, value
	from generate_series(0, 1, 0.2) as percentile,
	lateral (select percentile_cont(percentile) WITHIN GROUP (ORDER BY SIZE) as value from mytable) values

	The lateral approach is a lot slower, though. Calculation time scales linearly based on the number of percentiles you want to calculate - percentile_cont() is being called for each item in generate_series, while the version above just runs it once.


	This is the best way I've found to use the first method while returning the input percentile - though this is pretty hacky, as it more or less makes up a column to join on.

	-- #3
	select pct, value
	from (select row_number() OVER () as rownum, value
	from (select unnest(
	percentile_cont(
	(select array_agg(s) from generate_series(0, 1, 0.2) as s)
	) WITHIN GROUP (ORDER BY SIZE)) as value
	from mytable) s2
	) s3
	join (select row_number() OVER () as rownum, pct
	from (select generate_series(0, 1, 0.2) as pct) s) p
	on p.rownum = s3.rownum


	ntile() looks a bit better but getting lower/upper bounds is a bit clumsy, and values here are not quite the same as percentile_cont. (Might be the same as percentile_disc().)
	This is about half as fast as query #3 on my dataset.

	-- #4
	select ((nt-1)/5::float) as pctile, min(size)
	from (select size, ntile(6) over (order by size) as nt
	from mytable
	) as dt
	group by nt
	order by nt asc


	This is the approach I selected. A lot cleaner than #3 above.

	SELECT unnest(
	(select array_agg(fraction) from generate_series(0, 1, 0.2) AS fraction)
	) as percentile,
	unnest(
	(select percentile_cont((select array_agg(s) from generate_series(0, 1, 0.2) as s)) WITHIN GROUP (ORDER BY SIZE) FROM mytable)
	) as value;


	-- Or with manually defined percentiles (tested on pg 9.6)
	select unnest(
	(ARRAY[0.5, 0.75, 0.9, 0.95, 0.97, 0.98, 0.99, 1])
	) as percentile,
	unnest(
	(select percentile_cont(ARRAY[0.5, 0.75, 0.9, 0.95, 0.97, 0.98, 0.99, 1])
	WITHIN GROUP (ORDER BY value) FROM mytable
	)
	) as value