Last active
October 12, 2020 22:52
-
-
Save leonpanokarren/1e6512747cd7c191d497cf8588b96a55 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/* | |
Following queries perform the tests described here - | |
* I run tests with 3 probability distributions, in order - | |
- Log Normal | |
- Normal | |
- Uniform | |
1. I pick random samples (10MM) from the distribution (so I know quantiles apriori) | |
2. For 3 compression factors - DEFAULT, 500, 800, for the above 3 distributions, I build three digests | |
a) scaled_up_digest = MERGE("distribution left of median" weighted 20X, SCALE_TDIGEST("distribution right of median" weighted 1X, 20.0)) | |
b) scaled_down_digest = MERGE(SCALE_TDIGEST("distribution left of median" weighted 20X, 0.05), "distribution right of median" weighted 1X) | |
c) unweighted digest = distribution | |
3. List P1, P50, P99 from 2 - a), b), c) against "apriori quantiles" | |
*/ | |
SET SESSION experimental_functions_enabled = True; | |
WITH random_vector AS ( | |
SELECT CAST(idx AS DOUBLE) AS idx | |
FROM ( | |
SELECT FLATTEN(TRANSFORM(SEQUENCE(1, 10000), x -> TRANSFORM(SEQUENCE(1, 1000), y -> RANDOM()))) AS seq | |
) | |
CROSS JOIN UNNEST(seq) AS _(idx) | |
), | |
log_normal AS ( | |
SELECT | |
idx, | |
idx <= EXP(12) AS le_median | |
FROM ( | |
SELECT | |
EXP(INVERSE_NORMAL_CDF(12, 5, idx)) AS idx | |
FROM random_vector | |
) | |
), | |
normal AS ( | |
SELECT | |
idx, | |
idx <= 1E5 AS le_median | |
FROM ( | |
SELECT | |
INVERSE_NORMAL_CDF(1E5, 1E2, idx) AS idx | |
FROM random_vector | |
) | |
), | |
uniform AS ( | |
-- uniform between 100K and 2M | |
SELECT | |
idx, | |
idx <= ((2000000 - 100000) / 2.0) AS le_median | |
FROM ( | |
SELECT | |
(2000000 - 100000) * idx + 100000 AS idx | |
FROM random_vector | |
) | |
) | |
SELECT | |
distribution, | |
compression_factor, | |
TRANSFORM( | |
ARRAY[0.01, 0.5, 0.99], | |
x -> CASE distribution | |
WHEN 'log_normal' | |
THEN EXP(INVERSE_NORMAL_CDF(12, 5, x)) | |
WHEN 'normal' | |
THEN INVERSE_NORMAL_CDF(1E5, 1E2, x) | |
WHEN 'uniform' | |
THEN (2000000 - 100000) * x + 100000 | |
END | |
) AS apriori_quantiles, | |
VALUES_AT_QUANTILES(scaled_up_digest, ARRAY[0.01, 0.5, 0.99]) AS scaled_up_quantiles, | |
VALUES_AT_QUANTILES(scaled_down_digest, ARRAY[0.01, 0.5, 0.99]) AS scaled_down_quantiles, | |
VALUES_AT_QUANTILES(unscaled_digest, ARRAY[0.01, 0.5, 0.99]) AS unscaled_quantiles | |
FROM ( | |
SELECT | |
'uniform' AS distribution, | |
MERGE(scaled_up_digest_default_cf) AS scaled_up_digest_default_cf, | |
MERGE(scaled_down_digest_default_cf) AS scaled_down_digest_default_cf, | |
MERGE(unscaled_digest_default_cf) AS unscaled_digest_default_cf, | |
MERGE(scaled_up_digest_500_cf) AS scaled_up_digest_500_cf, | |
MERGE(scaled_down_digest_500_cf) AS scaled_down_digest_500_cf, | |
MERGE(unscaled_digest_500_cf) AS unscaled_digest_500_cf, | |
MERGE(scaled_up_digest_800_cf) AS scaled_up_digest_800_cf, | |
MERGE(scaled_down_digest_800_cf) AS scaled_down_digest_800_cf, | |
MERGE(unscaled_digest_800_cf) AS unscaled_digest_800_cf | |
FROM ( | |
SELECT | |
TDIGEST_AGG(idx, 20) FILTER(WHERE le_median) AS half_weighted_up_default_cf, | |
TDIGEST_AGG(idx, 1) FILTER(WHERE NOT le_median) AS half_weighted_down_default_cf, | |
TDIGEST_AGG(idx, 1) AS unweighted_digest_default_cf, | |
TDIGEST_AGG(idx, 20, 500) FILTER(WHERE le_median) AS half_weighted_up_500_cf, | |
TDIGEST_AGG(idx, 1, 500) FILTER(WHERE NOT le_median) AS half_weighted_down_500_cf, | |
TDIGEST_AGG(idx, 1, 500) AS unweighted_digest_500_cf, | |
TDIGEST_AGG(idx, 20, 800) FILTER(WHERE le_median) AS half_weighted_up_800_cf, | |
TDIGEST_AGG(idx, 1, 800) FILTER(WHERE NOT le_median) AS half_weighted_down_800_cf, | |
TDIGEST_AGG(idx, 1, 800) AS unweighted_digest_800_cf | |
FROM uniform | |
) T | |
CROSS JOIN UNNEST( | |
ARRAY[ | |
half_weighted_up_default_cf, | |
SCALE_TDIGEST(half_weighted_down_default_cf, 20.0), | |
NULL | |
], | |
ARRAY[ | |
SCALE_TDIGEST(half_weighted_up_default_cf, 0.05), | |
half_weighted_down_default_cf, | |
NULL | |
], | |
ARRAY[ | |
NULL, | |
NULL, | |
unweighted_digest_default_cf | |
], | |
ARRAY[ | |
half_weighted_up_500_cf, | |
SCALE_TDIGEST(half_weighted_down_500_cf, 20.0), | |
NULL | |
], | |
ARRAY[ | |
SCALE_TDIGEST(half_weighted_up_500_cf, 0.05), | |
half_weighted_down_500_cf, | |
NULL | |
], | |
ARRAY[ | |
NULL, | |
NULL, | |
unweighted_digest_500_cf | |
], | |
ARRAY[ | |
half_weighted_up_800_cf, | |
SCALE_TDIGEST(half_weighted_down_800_cf, 20.0), | |
NULL | |
], | |
ARRAY[ | |
SCALE_TDIGEST(half_weighted_up_800_cf, 0.05), | |
half_weighted_down_800_cf, | |
NULL | |
], | |
ARRAY[ | |
NULL, | |
NULL, | |
unweighted_digest_800_cf | |
] | |
) AS _( | |
scaled_up_digest_default_cf, | |
scaled_down_digest_default_cf, | |
unscaled_digest_default_cf, | |
scaled_up_digest_500_cf, | |
scaled_down_digest_500_cf, | |
unscaled_digest_500_cf, | |
scaled_up_digest_800_cf, | |
scaled_down_digest_800_cf, | |
unscaled_digest_800_cf | |
) | |
) CROSS JOIN UNNEST( | |
ARRAY[ | |
'DEFAULT', | |
'500', | |
'800' | |
], | |
ARRAY[ | |
scaled_up_digest_default_cf, | |
scaled_up_digest_500_cf, | |
scaled_up_digest_800_cf | |
], | |
ARRAY[ | |
scaled_down_digest_default_cf, | |
scaled_down_digest_500_cf, | |
scaled_down_digest_800_cf | |
], | |
ARRAY[ | |
unscaled_digest_default_cf, | |
unscaled_digest_500_cf, | |
unscaled_digest_800_cf | |
] | |
) AS _( | |
compression_factor, | |
scaled_up_digest, | |
scaled_down_digest, | |
unscaled_digest | |
)\G | |
-[ RECORD 1 ]---------+---------------------------------------------------------------- | |
distribution | log_normal | |
compression_factor | DEFAULT | |
apriori_quantiles | [1.4452186569362502, 162754.79141900392, 1.8328798900228962E10] | |
scaled_up_quantiles | [1.5359592204064665, 307554.85457878053, 2.0796682865726635E10] | |
scaled_down_quantiles | [1.5359592204064665, 249218.0610434275, 2.0796682865726635E10] | |
unscaled_quantiles | [1.515353159434415, 249718.7921916679, 2.1399015944893288E10] | |
-[ RECORD 2 ]---------+---------------------------------------------------------------- | |
distribution | log_normal | |
compression_factor | 500 | |
apriori_quantiles | [1.4452186569362502, 162754.79141900392, 1.8328798900228962E10] | |
scaled_up_quantiles | [1.4585852160385713, 167986.47184101312, 1.8294047407103294E10] | |
scaled_down_quantiles | [1.4585852160385713, 166057.02483024818, 1.82940474071033E10] | |
unscaled_quantiles | [1.4584148172412785, 165228.93909224155, 1.8318858520169685E10] | |
-[ RECORD 3 ]---------+---------------------------------------------------------------- | |
distribution | log_normal | |
compression_factor | 800 | |
apriori_quantiles | [1.4452186569362502, 162754.79141900392, 1.8328798900228962E10] | |
scaled_up_quantiles | [1.454231729843197, 163825.55677102658, 1.818698820822231E10] | |
scaled_down_quantiles | [1.4542317298431973, 162945.1544370374, 1.818698820822231E10] | |
unscaled_quantiles | [1.4544817842480005, 163928.87357066962, 1.822910421989535E10] | |
-[ RECORD 1 ]---------+------------------------------------------------------------ | |
distribution | normal | |
compression_factor | DEFAULT | |
apriori_quantiles | [99767.36521259592, 100000.0, 100232.63478740408] | |
scaled_up_quantiles | [99766.08650306136, 100000.05677218233, 100233.77473874774] | |
scaled_down_quantiles | [99766.08650306136, 100000.00193136233, 100233.77473874774] | |
unscaled_quantiles | [99766.85487211181, 99999.93926257321, 100233.95187430267] | |
-[ RECORD 2 ]---------+------------------------------------------------------------ | |
distribution | normal | |
compression_factor | 500 | |
apriori_quantiles | [99767.36521259592, 100000.0, 100232.63478740408] | |
scaled_up_quantiles | [99767.22764265713, 99999.98083959654, 100232.87933125815] | |
scaled_down_quantiles | [99767.22764265716, 99999.97985982196, 100232.87933125815] | |
unscaled_quantiles | [99767.23443965145, 99999.98473728393, 100232.87091707803] | |
-[ RECORD 3 ]---------+------------------------------------------------------------ | |
distribution | normal | |
compression_factor | 800 | |
apriori_quantiles | [99767.36521259592, 100000.0, 100232.63478740408] | |
scaled_up_quantiles | [99767.23880577942, 99999.97996331369, 100232.84768207217] | |
scaled_down_quantiles | [99767.2388057794, 99999.98012490512, 100232.84768207218] | |
unscaled_quantiles | [99767.24062037654, 99999.97565511118, 100232.87977874378] | |
-[ RECORD 1 ]---------+------------------------------------------------------------- | |
distribution | uniform | |
compression_factor | DEFAULT | |
apriori_quantiles | [119000.0, 1050000.0, 1981000.0] | |
scaled_up_quantiles | [119164.85849159476, 1049955.7801500722, 1980776.9472385834] | |
scaled_down_quantiles | [119164.85849159476, 1049923.267660422, 1980776.9472385834] | |
unscaled_quantiles | [119063.47189594447, 1050608.2550718652, 1980741.0190435227] | |
-[ RECORD 2 ]---------+------------------------------------------------------------- | |
distribution | uniform | |
compression_factor | 500 | |
apriori_quantiles | [119000.0, 1050000.0, 1981000.0] | |
scaled_up_quantiles | [118972.85646847171, 1049570.6637134296, 1980910.3718745543] | |
scaled_down_quantiles | [118972.85646847173, 1049580.3177275367, 1980910.3718745538] | |
unscaled_quantiles | [118961.91274724042, 1049689.2777592111, 1980911.0140026675] | |
-[ RECORD 3 ]---------+------------------------------------------------------------- | |
distribution | uniform | |
compression_factor | 800 | |
apriori_quantiles | [119000.0, 1050000.0, 1981000.0] | |
scaled_up_quantiles | [118962.0543740767, 1049542.7212385887, 1980913.7779269682] | |
scaled_down_quantiles | [118962.05437407669, 1049529.021825261, 1980913.7779269682] | |
unscaled_quantiles | [118960.52340109413, 1049543.653287954, 1980919.5487901] |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment