Skip to content

Instantly share code, notes, and snippets.

@ajw0100
Last active July 1, 2020 23:28
Show Gist options
  • Star 2 You must be signed in to star a gist
  • Fork 2 You must be signed in to fork a gist
  • Save ajw0100/955882add24932a343e9 to your computer and use it in GitHub Desktop.
Save ajw0100/955882add24932a343e9 to your computer and use it in GitHub Desktop.
The Z-Order Curve and Amazon Redshift's Interleaved Sort Keys (https://chartio.com/blog/understanding-interleaved-sort-keys-in-amazon-redshift-part-1/)
with
years(year_id, year) as (
select i, 'Year ' || i
from generate_series(0, 3) i
),
regions(region_id, region) as (
select i, 'Region ' || i
from generate_series(0, 3) i
),
customers(customer_id, customer) as (
select i, 'Customer ' || i
from generate_series(0, 3) i
),
products(product_id, product) as (
select i, 'Product ' || i
from generate_series(0, 3) i
),
base as (
select
*,
year_id::bit(2)::varchar as year_id_bit,
region_id::bit(2)::varchar as region_id_bit,
customer_id::bit(2)::varchar as customer_id_bit,
product_id::bit(2)::varchar as product_id_bit
from years, regions, customers, products
),
compound as (
select
(row_number() over (
order by year_id, region_id, customer_id, product_id
)
-1)/16 as block,
year_id,
region_id,
customer_id,
product_id,
year,
region,
customer,
product
from base
order by
year_id,
region_id,
customer_id,
product_id
),
compound_zone_map as (
select
block,
min(year_id) min_year,
max(year_id) max_year,
min(region_id) min_region,
max(region_id) max_region,
min(customer_id) min_customer,
max(customer_id) max_customer,
min(product_id) min_product,
max(product_id) max_product
from compound
group by block
),
compound_zone_map_selectivity as (
select
(avg(year_cnt)::numeric/(select count(*) from compound_zone_map)::numeric)
as year_selectivity,
(avg(region_cnt)::numeric/(select count(*) from compound_zone_map)::numeric)
as region_selectivity,
(avg(customer_cnt)::numeric/(select count(*) from compound_zone_map)::numeric)
as customer_selectivity,
(avg(product_cnt)::numeric/(select count(*) from compound_zone_map)::numeric)
as product_selectivity,
((avg(year_cnt)::numeric/(select count(*) from compound_zone_map)::numeric)
+ (avg(region_cnt)::numeric/(select count(*) from compound_zone_map)::numeric)
+ (avg(customer_cnt)::numeric/(select count(*) from compound_zone_map)::numeric)
+ (avg(product_cnt)::numeric/(select count(*) from compound_zone_map)::numeric))
/ 4 as average_selectivity
from generate_series(0, 3) i
left join lateral (
select count(*) as year_cnt
from compound_zone_map
where i between min_year and max_year
) y on 1=1
left join lateral (
select count(*) as region_cnt
from compound_zone_map
where i between min_region and max_region
) r on 1=1
left join lateral (
select count(*) as customer_cnt
from compound_zone_map
where i between min_customer and max_customer
) c on 1=1
left join lateral (
select count(*) as product_cnt
from compound_zone_map
where i between min_product and max_product
) p on 1=1
),
-- There's probably a smarter bit-twiddling way to do the interleaving
interleaved as (
select
((row_number() over (
order by substr(year_id_bit, 1, 1)
|| substr(region_id_bit, 1, 1)
|| substr(customer_id_bit, 1, 1)
|| substr(product_id_bit, 1, 1)
|| substr(year_id_bit, 2, 1)
|| substr(region_id_bit, 2, 1)
|| substr(customer_id_bit, 2, 1)
|| substr(product_id_bit, 2, 1)
)
-1)/16
) as block,
(substr(year_id_bit, 1, 1)
|| substr(region_id_bit, 1, 1)
|| substr(customer_id_bit, 1, 1)
|| substr(product_id_bit, 1, 1)
|| substr(year_id_bit, 2, 1)
|| substr(region_id_bit, 2, 1)
|| substr(customer_id_bit, 2, 1)
|| substr(product_id_bit, 2, 1)
) as interleaved_id,
year_id,
region_id,
customer_id,
product_id,
year,
region,
customer,
product
from base
order by interleaved_id asc
),
interleaved_zone_map as (
select
block,
min(year_id) min_year,
max(year_id) max_year,
min(region_id) min_region,
max(region_id) max_region,
min(customer_id) min_customer,
max(customer_id) max_customer,
min(product_id) min_product,
max(product_id) max_product
from interleaved
group by block
),
interleaved_zone_map_selectivity as (
select
(avg(year_cnt)::numeric/(select count(*) from interleaved_zone_map)::numeric)
as year_selectivity,
(avg(region_cnt)::numeric/(select count(*) from interleaved_zone_map)::numeric)
as region_selectivity,
(avg(customer_cnt)::numeric/(select count(*) from interleaved_zone_map)::numeric)
as customer_selectivity,
(avg(product_cnt)::numeric/(select count(*) from interleaved_zone_map)::numeric)
as product_selectivity,
((avg(year_cnt)::numeric/(select count(*) from interleaved_zone_map)::numeric)
+ (avg(region_cnt)::numeric/(select count(*) from interleaved_zone_map)::numeric)
+ (avg(customer_cnt)::numeric/(select count(*) from interleaved_zone_map)::numeric)
+ (avg(product_cnt)::numeric/(select count(*) from interleaved_zone_map)::numeric))
/ 4 as average_selectivity
from generate_series(0, 3) i
left join lateral (
select count(*) as year_cnt
from interleaved_zone_map
where i between min_year and max_year
) y on 1=1
left join lateral (
select count(*) as region_cnt
from interleaved_zone_map
where i between min_region and max_region
) r on 1=1
left join lateral (
select count(*) as customer_cnt
from interleaved_zone_map
where i between min_customer and max_customer
) c on 1=1
left join lateral (
select count(*) as product_cnt
from interleaved_zone_map
where i between min_product and max_product
) p on 1=1
)
/*---------------------------------------------------------------------
Take a look at the resulting sort for each sort style
----------------------------------------------------------------------*/
select *
from compound
order by
year_id,
region_id,
customer_id,
product_id
-- select *
-- from interleaved
-- order by interleaved_id
/*---------------------------------------------------------------------
Take a look at the resulting zone maps for each sort style
----------------------------------------------------------------------*/
-- select *
-- from compound_zone_map
-- order by block
-- select *
-- from interleaved_zone_map
-- order by block
/*---------------------------------------------------------------------
Spot check how well each sort style's zone map facilitates pruning
------------------------------------------------------------------*/
/*-----------------------------------------
1 of 4 sort key predicates
----------------------------------------*/
/*--------------------------
Compound: 4 blocks
Interleaved: 8 blocks
-------------------------*/
-- select count(*)
-- from compound_zone_map
-- where 1 between min_year and max_year
-- select count(*)
-- from interleaved_zone_map
-- where 1 between min_year and max_year
/*--------------------------
Compound: 16 blocks
Interleaved: 8 blocks
-------------------------*/
-- select count(*)
-- from compound_zone_map
-- where 2 between min_product and max_product
-- select count(*)
-- from interleaved_zone_map
-- where 2 between min_product and max_product
/*-----------------------------------------
2 of 4 sort key predicates
----------------------------------------*/
/*--------------------------
Compound: 4 blocks
Interleaved: 4 blocks
-------------------------*/
-- select count(*)
-- from compound_zone_map
-- where 0 between min_year and max_year
-- and 3 between min_customer and max_customer
-- select count(*)
-- from interleaved_zone_map
-- where 0 between min_year and max_year
-- and 3 between min_customer and max_customer
/*--------------------------
Compound: 16 blocks
Interleaved: 4 blocks
-------------------------*/
-- select count(*)
-- from compound_zone_map
-- where 3 between min_customer and max_customer
-- and 2 between min_product and max_product
-- select count(*)
-- from interleaved_zone_map
-- where 3 between min_customer and max_customer
-- and 2 between min_product and max_product
/*-----------------------------------------
3 of 4 sort key predicates
----------------------------------------*/
/*--------------------------
Compound: 4 blocks
Interleaved: 2 blocks
-------------------------*/
-- select count(*)
-- from compound_zone_map
-- where 3 between min_year and max_year
-- and 1 between min_customer and max_customer
-- and 0 between min_product and max_product
-- select count(*)
-- from interleaved_zone_map
-- where 3 between min_year and max_year
-- and 1 between min_customer and max_customer
-- and 0 between min_product and max_product
/*--------------------------
Compound: 4 blocks
Interleaved: 2 blocks
-------------------------*/
-- select count(*)
-- from compound_zone_map
-- where 1 between min_region and max_region
-- and 0 between min_customer and max_customer
-- and 2 between min_product and max_product
-- select count(*)
-- from interleaved_zone_map
-- where 1 between min_region and max_region
-- and 0 between min_customer and max_customer
-- and 2 between min_product and max_product
/*-----------------------------------------
4 of 4 sort key predicates
----------------------------------------*/
/*--------------------------
Compound: 1 block
Interleaved: 1 block
-------------------------*/
-- select count(*)
-- from compound_zone_map
-- where 0 between min_year and max_year
-- and 0 between min_region and max_region
-- and 3 between min_customer and max_customer
-- and 2 between min_product and max_product
-- select count(*)
-- from interleaved_zone_map
-- where 0 between min_year and max_year
-- and 0 between min_region and max_region
-- and 3 between min_customer and max_customer
-- and 2 between min_product and max_product
/*---------------------------------------------------------------------
Ideally, the block selectivity of a where clause can be calculated as
follows with the interleaved sort style:
N^((S-P)/S)
where
N = total # of blocks
S = # of columns in the sort key
P = # of sort key columns specified in the where clause
When comparing the average selectivity of the interleaved sort style to
that of the compound sort style we can see that the interleaved sort
style is preferable for ad hoc multidimensional analysis
----------------------------------------------------------------------*/
-- select
-- 'compound' as style,
-- (select count(*) from compound_zone_map) total,
-- (select average_selectivity from compound_zone_map_selectivity)
-- as one_pred_selectivity,
-- ceil(
-- ((select average_selectivity from compound_zone_map_selectivity)::numeric)
-- * (select count(*) from compound_zone_map)::numeric
-- ) as one_pred_cnt,
-- pow((select average_selectivity from compound_zone_map_selectivity), 2)
-- as two_pred_selectivity,
-- ceil(
-- pow((select average_selectivity from compound_zone_map_selectivity), 2)
-- * (select count(*) from compound_zone_map)::numeric
-- ) as two_pred_cnt,
-- pow((select average_selectivity from compound_zone_map_selectivity), 3)
-- as three_pred_selectivity,
-- ceil(
-- pow((select average_selectivity from compound_zone_map_selectivity), 3)
-- * (select count(*) from compound_zone_map)::numeric
-- ) as three_pred_cnt,
-- pow((select average_selectivity from compound_zone_map_selectivity), 4)
-- as four_pred_selectivity,
-- ceil(
-- pow((select average_selectivity from compound_zone_map_selectivity), 4)
-- * (select count(*) from compound_zone_map)::numeric
-- ) as four_pred_cnt
-- from compound_zone_map_selectivity
-- union
-- select
-- 'interleaved' as style,
-- (select count(*) from interleaved_zone_map) total,
-- (select average_selectivity from interleaved_zone_map_selectivity)
-- as one_pred_selectivity,
-- ceil(
-- ((select average_selectivity from interleaved_zone_map_selectivity)::numeric)
-- * (select count(*) from interleaved_zone_map)::numeric
-- ) as one_pred_cnt,
-- pow((select average_selectivity from interleaved_zone_map_selectivity), 2)
-- as two_pred_selectivity,
-- ceil(
-- pow((select average_selectivity from interleaved_zone_map_selectivity), 2)
-- * (select count(*) from interleaved_zone_map)::numeric
-- ) as two_pred_cnt,
-- pow((select average_selectivity from interleaved_zone_map_selectivity), 3)
-- as three_pred_selectivity,
-- ceil(
-- pow((select average_selectivity from interleaved_zone_map_selectivity), 3)
-- * (select count(*) from interleaved_zone_map)::numeric
-- ) as three_pred_cnt,
-- pow((select average_selectivity from interleaved_zone_map_selectivity), 4)
-- as four_pred_selectivity,
-- ceil(
-- pow((select average_selectivity from interleaved_zone_map_selectivity), 4)
-- * (select count(*) from interleaved_zone_map)::numeric
-- ) as four_pred_cnt
-- from interleaved_zone_map_selectivity
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment