Skip to content

Instantly share code, notes, and snippets.

@leonpanokarren
Last active October 12, 2020 22:52
Show Gist options
  • Save leonpanokarren/1e6512747cd7c191d497cf8588b96a55 to your computer and use it in GitHub Desktop.
Save leonpanokarren/1e6512747cd7c191d497cf8588b96a55 to your computer and use it in GitHub Desktop.
/*
Following queries perform the tests described here -
* I run tests with 3 probability distributions, in order -
- Log Normal
- Normal
- Uniform
1. I pick random samples (10MM) from the distribution (so I know quantiles apriori)
2. For 3 compression factors - DEFAULT, 500, 800, for the above 3 distributions, I build three digests
a) scaled_up_digest = MERGE("distribution left of median" weighted 20X, SCALE_TDIGEST("distribution right of median" weighted 1X, 20.0))
b) scaled_down_digest = MERGE(SCALE_TDIGEST("distribution left of median" weighted 20X, 0.05), "distribution right of median" weighted 1X)
c) unweighted digest = distribution
3. List P1, P50, P99 from 2 - a), b), c) against "apriori quantiles"
*/
SET SESSION experimental_functions_enabled = True;
WITH random_vector AS (
SELECT CAST(idx AS DOUBLE) AS idx
FROM (
SELECT FLATTEN(TRANSFORM(SEQUENCE(1, 10000), x -> TRANSFORM(SEQUENCE(1, 1000), y -> RANDOM()))) AS seq
)
CROSS JOIN UNNEST(seq) AS _(idx)
),
log_normal AS (
SELECT
idx,
idx <= EXP(12) AS le_median
FROM (
SELECT
EXP(INVERSE_NORMAL_CDF(12, 5, idx)) AS idx
FROM random_vector
)
),
normal AS (
SELECT
idx,
idx <= 1E5 AS le_median
FROM (
SELECT
INVERSE_NORMAL_CDF(1E5, 1E2, idx) AS idx
FROM random_vector
)
),
uniform AS (
-- uniform between 100K and 2M
SELECT
idx,
idx <= ((2000000 - 100000) / 2.0) AS le_median
FROM (
SELECT
(2000000 - 100000) * idx + 100000 AS idx
FROM random_vector
)
)
SELECT
distribution,
compression_factor,
TRANSFORM(
ARRAY[0.01, 0.5, 0.99],
x -> CASE distribution
WHEN 'log_normal'
THEN EXP(INVERSE_NORMAL_CDF(12, 5, x))
WHEN 'normal'
THEN INVERSE_NORMAL_CDF(1E5, 1E2, x)
WHEN 'uniform'
THEN (2000000 - 100000) * x + 100000
END
) AS apriori_quantiles,
VALUES_AT_QUANTILES(scaled_up_digest, ARRAY[0.01, 0.5, 0.99]) AS scaled_up_quantiles,
VALUES_AT_QUANTILES(scaled_down_digest, ARRAY[0.01, 0.5, 0.99]) AS scaled_down_quantiles,
VALUES_AT_QUANTILES(unscaled_digest, ARRAY[0.01, 0.5, 0.99]) AS unscaled_quantiles
FROM (
SELECT
'uniform' AS distribution,
MERGE(scaled_up_digest_default_cf) AS scaled_up_digest_default_cf,
MERGE(scaled_down_digest_default_cf) AS scaled_down_digest_default_cf,
MERGE(unscaled_digest_default_cf) AS unscaled_digest_default_cf,
MERGE(scaled_up_digest_500_cf) AS scaled_up_digest_500_cf,
MERGE(scaled_down_digest_500_cf) AS scaled_down_digest_500_cf,
MERGE(unscaled_digest_500_cf) AS unscaled_digest_500_cf,
MERGE(scaled_up_digest_800_cf) AS scaled_up_digest_800_cf,
MERGE(scaled_down_digest_800_cf) AS scaled_down_digest_800_cf,
MERGE(unscaled_digest_800_cf) AS unscaled_digest_800_cf
FROM (
SELECT
TDIGEST_AGG(idx, 20) FILTER(WHERE le_median) AS half_weighted_up_default_cf,
TDIGEST_AGG(idx, 1) FILTER(WHERE NOT le_median) AS half_weighted_down_default_cf,
TDIGEST_AGG(idx, 1) AS unweighted_digest_default_cf,
TDIGEST_AGG(idx, 20, 500) FILTER(WHERE le_median) AS half_weighted_up_500_cf,
TDIGEST_AGG(idx, 1, 500) FILTER(WHERE NOT le_median) AS half_weighted_down_500_cf,
TDIGEST_AGG(idx, 1, 500) AS unweighted_digest_500_cf,
TDIGEST_AGG(idx, 20, 800) FILTER(WHERE le_median) AS half_weighted_up_800_cf,
TDIGEST_AGG(idx, 1, 800) FILTER(WHERE NOT le_median) AS half_weighted_down_800_cf,
TDIGEST_AGG(idx, 1, 800) AS unweighted_digest_800_cf
FROM uniform
) T
CROSS JOIN UNNEST(
ARRAY[
half_weighted_up_default_cf,
SCALE_TDIGEST(half_weighted_down_default_cf, 20.0),
NULL
],
ARRAY[
SCALE_TDIGEST(half_weighted_up_default_cf, 0.05),
half_weighted_down_default_cf,
NULL
],
ARRAY[
NULL,
NULL,
unweighted_digest_default_cf
],
ARRAY[
half_weighted_up_500_cf,
SCALE_TDIGEST(half_weighted_down_500_cf, 20.0),
NULL
],
ARRAY[
SCALE_TDIGEST(half_weighted_up_500_cf, 0.05),
half_weighted_down_500_cf,
NULL
],
ARRAY[
NULL,
NULL,
unweighted_digest_500_cf
],
ARRAY[
half_weighted_up_800_cf,
SCALE_TDIGEST(half_weighted_down_800_cf, 20.0),
NULL
],
ARRAY[
SCALE_TDIGEST(half_weighted_up_800_cf, 0.05),
half_weighted_down_800_cf,
NULL
],
ARRAY[
NULL,
NULL,
unweighted_digest_800_cf
]
) AS _(
scaled_up_digest_default_cf,
scaled_down_digest_default_cf,
unscaled_digest_default_cf,
scaled_up_digest_500_cf,
scaled_down_digest_500_cf,
unscaled_digest_500_cf,
scaled_up_digest_800_cf,
scaled_down_digest_800_cf,
unscaled_digest_800_cf
)
) CROSS JOIN UNNEST(
ARRAY[
'DEFAULT',
'500',
'800'
],
ARRAY[
scaled_up_digest_default_cf,
scaled_up_digest_500_cf,
scaled_up_digest_800_cf
],
ARRAY[
scaled_down_digest_default_cf,
scaled_down_digest_500_cf,
scaled_down_digest_800_cf
],
ARRAY[
unscaled_digest_default_cf,
unscaled_digest_500_cf,
unscaled_digest_800_cf
]
) AS _(
compression_factor,
scaled_up_digest,
scaled_down_digest,
unscaled_digest
)\G
-[ RECORD 1 ]---------+----------------------------------------------------------------
distribution | log_normal
compression_factor | DEFAULT
apriori_quantiles | [1.4452186569362502, 162754.79141900392, 1.8328798900228962E10]
scaled_up_quantiles | [1.5359592204064665, 307554.85457878053, 2.0796682865726635E10]
scaled_down_quantiles | [1.5359592204064665, 249218.0610434275, 2.0796682865726635E10]
unscaled_quantiles | [1.515353159434415, 249718.7921916679, 2.1399015944893288E10]
-[ RECORD 2 ]---------+----------------------------------------------------------------
distribution | log_normal
compression_factor | 500
apriori_quantiles | [1.4452186569362502, 162754.79141900392, 1.8328798900228962E10]
scaled_up_quantiles | [1.4585852160385713, 167986.47184101312, 1.8294047407103294E10]
scaled_down_quantiles | [1.4585852160385713, 166057.02483024818, 1.82940474071033E10]
unscaled_quantiles | [1.4584148172412785, 165228.93909224155, 1.8318858520169685E10]
-[ RECORD 3 ]---------+----------------------------------------------------------------
distribution | log_normal
compression_factor | 800
apriori_quantiles | [1.4452186569362502, 162754.79141900392, 1.8328798900228962E10]
scaled_up_quantiles | [1.454231729843197, 163825.55677102658, 1.818698820822231E10]
scaled_down_quantiles | [1.4542317298431973, 162945.1544370374, 1.818698820822231E10]
unscaled_quantiles | [1.4544817842480005, 163928.87357066962, 1.822910421989535E10]
-[ RECORD 1 ]---------+------------------------------------------------------------
distribution | normal
compression_factor | DEFAULT
apriori_quantiles | [99767.36521259592, 100000.0, 100232.63478740408]
scaled_up_quantiles | [99766.08650306136, 100000.05677218233, 100233.77473874774]
scaled_down_quantiles | [99766.08650306136, 100000.00193136233, 100233.77473874774]
unscaled_quantiles | [99766.85487211181, 99999.93926257321, 100233.95187430267]
-[ RECORD 2 ]---------+------------------------------------------------------------
distribution | normal
compression_factor | 500
apriori_quantiles | [99767.36521259592, 100000.0, 100232.63478740408]
scaled_up_quantiles | [99767.22764265713, 99999.98083959654, 100232.87933125815]
scaled_down_quantiles | [99767.22764265716, 99999.97985982196, 100232.87933125815]
unscaled_quantiles | [99767.23443965145, 99999.98473728393, 100232.87091707803]
-[ RECORD 3 ]---------+------------------------------------------------------------
distribution | normal
compression_factor | 800
apriori_quantiles | [99767.36521259592, 100000.0, 100232.63478740408]
scaled_up_quantiles | [99767.23880577942, 99999.97996331369, 100232.84768207217]
scaled_down_quantiles | [99767.2388057794, 99999.98012490512, 100232.84768207218]
unscaled_quantiles | [99767.24062037654, 99999.97565511118, 100232.87977874378]
-[ RECORD 1 ]---------+-------------------------------------------------------------
distribution | uniform
compression_factor | DEFAULT
apriori_quantiles | [119000.0, 1050000.0, 1981000.0]
scaled_up_quantiles | [119164.85849159476, 1049955.7801500722, 1980776.9472385834]
scaled_down_quantiles | [119164.85849159476, 1049923.267660422, 1980776.9472385834]
unscaled_quantiles | [119063.47189594447, 1050608.2550718652, 1980741.0190435227]
-[ RECORD 2 ]---------+-------------------------------------------------------------
distribution | uniform
compression_factor | 500
apriori_quantiles | [119000.0, 1050000.0, 1981000.0]
scaled_up_quantiles | [118972.85646847171, 1049570.6637134296, 1980910.3718745543]
scaled_down_quantiles | [118972.85646847173, 1049580.3177275367, 1980910.3718745538]
unscaled_quantiles | [118961.91274724042, 1049689.2777592111, 1980911.0140026675]
-[ RECORD 3 ]---------+-------------------------------------------------------------
distribution | uniform
compression_factor | 800
apriori_quantiles | [119000.0, 1050000.0, 1981000.0]
scaled_up_quantiles | [118962.0543740767, 1049542.7212385887, 1980913.7779269682]
scaled_down_quantiles | [118962.05437407669, 1049529.021825261, 1980913.7779269682]
unscaled_quantiles | [118960.52340109413, 1049543.653287954, 1980919.5487901]
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment