Created
February 19, 2021 11:12
-
-
Save eugeneks/d8752452eeadfbb5001d986d6d47c778 to your computer and use it in GitHub Desktop.
несколько скриптов для тестирования различных особенностей поведения EXASOL
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
-- здесь используеться схема PUB3318 для сохранения view. Вам нужно заменить на имя своей схемы, которую вы получите при регистрации https://www.exasol.com/test-it-now/ | |
-- шаг 0. Что у нас с основнвми таблицами | |
OPEN SCHEMA RETAIL; | |
SELECT COUNT(*) FROM SALES | |
; | |
--322 186 828 | |
SELECT COUNT(*) FROM SALES_POSITIONS | |
--3 381 550 778 | |
; | |
SELECT COUNT(*) FROM MARKETS | |
--1 002 | |
; | |
SELECT COUNT(*) FROM CITIES | |
--10 736 | |
; | |
DESCRIBE SALES; | |
DESCRIBE SALES_POSITIONS; | |
DESCRIBE MARKETS; | |
DESCRIBE CITIES; | |
SELECT COUNT(*), IPROC() FROM SALES GROUP BY 2 ; | |
SELECT * FROM SALES limit 10 | |
; | |
SELECT COUNT(*), VALUE2PROC(MARKET_ID) FROM SALES GROUP BY 2 ; | |
SELECT COUNT(*), VALUE2PROC(EMPLOYEE_ID) FROM SALES GROUP BY 2 ; | |
SELECT CURRENT_SESSION --полученный в данном запросе id сессии нужно вставиь в первый запрос скрипта 20200215_user_group_script_profile.sql вместо SESSION_ID = 1691764873478995969 | |
-- шаг 2. Простой запрос | |
-- включаем профилирование запросов | |
alter session set profile='ON'; | |
-- отключаем кеширование результатов запросов | |
alter session set query_cache='OFF'; | |
SELECT | |
CITY_NAME | |
, SUM(AMOUNT) AS AMOUNT | |
FROM | |
SALES AS s | |
JOIN | |
SALES_POSITIONS AS sp | |
ON | |
s.SALES_ID = sp.SALES_ID | |
JOIN | |
MARKETS AS m | |
ON | |
s.MARKET_ID = m.MARKET_ID | |
JOIN | |
CITIES AS c | |
ON | |
m.CITY_ID = c.CITY_ID | |
GROUP BY | |
1 | |
; | |
-- шаг 3. А что будет если запрос запускается первый раз? | |
DROP LOCAL INDEX ON SALES(SALES_ID); | |
DROP LOCAL INDEX ON SALES_POSITIONS(SALES_ID); | |
-- шаг 4. Включаем кеширование | |
alter session set query_cache='ON'; | |
SELECT | |
CITY_NAME | |
, SUM(AMOUNT) AS AMOUNT | |
FROM | |
SALES AS s | |
JOIN | |
SALES_POSITIONS AS sp | |
ON | |
s.SALES_ID = sp.SALES_ID | |
JOIN | |
MARKETS AS m | |
ON | |
s.MARKET_ID = m.MARKET_ID | |
JOIN | |
CITIES AS c | |
ON | |
m.CITY_ID = c.CITY_ID | |
GROUP BY | |
1 | |
; | |
-- шаг 5. Собираем в отчет табло | |
-- шаг 6. Немного статистики и UDF | |
CREATE OR REPLACE VIEW PUB3318.base_stats_by_city AS | |
WITH agg AS ( | |
SELECT | |
EXTRACT(HOUR FROM SALES_TIMESTAMP) AS h | |
, CITY_NAME | |
, SUM(AMOUNT) AS AMOUNT | |
FROM | |
SALES AS s | |
JOIN | |
SALES_POSITIONS AS sp | |
ON | |
s.SALES_ID = sp.SALES_ID | |
JOIN | |
MARKETS AS m | |
ON | |
s.MARKET_ID = m.MARKET_ID | |
JOIN | |
CITIES AS c | |
ON | |
m.CITY_ID = c.CITY_ID | |
GROUP BY | |
local.h | |
, CITY_NAME | |
) | |
SELECT | |
CITY_NAME | |
, AVG(AMOUNT) AS mean | |
, STDDEV_SAMP(AMOUNT) AS std | |
, VAR_SAMP(AMOUNT) AS s | |
, COUNT(*) AS nobs | |
FROM | |
agg | |
GROUP BY | |
CITY_NAME | |
; | |
--/ | |
CREATE OR REPLACE PYTHON3 SCALAR SCRIPT PUB3318.ttest_ind_from_stats(mean1 DOUBLE, std1 DOUBLE, nobs1 DECIMAL, mean2 DOUBLE, std2 DOUBLE, nobs2 DECIMAL, equal_var BOOLEAN) | |
EMITS (statistic DOUBLE, pvalue DOUBLE) AS | |
from scipy.stats import ttest_ind_from_stats | |
def run(ctx): | |
t=ttest_ind_from_stats(mean1=ctx.mean1, std1=ctx.std1, nobs1=ctx.nobs1, | |
mean2=ctx.mean2, std2=ctx.std2, nobs2=ctx.nobs2, equal_var = ctx.equal_var) | |
ctx.emit(t[0].item(), t[1].item()) | |
/ | |
SELECT | |
t.* | |
, CONCAT(CITY_NAME1, '_vs_' , CITY_NAME2) as test | |
, PUB3318.ttest_ind_from_stats(mean1, std1, nobs1, mean2, std2, nobs2, FALSE) | |
FROM | |
( | |
SELECT | |
CITY_NAME AS CITY_NAME1 | |
, mean AS mean1 | |
, std AS std1 | |
, nobs AS nobs1 | |
, LAG(CITY_NAME, 1) OVER (ORDER BY CITY_NAME) AS CITY_NAME2 | |
, LAG(mean, 1) OVER (ORDER BY CITY_NAME) AS mean2 | |
, LAG(std, 1) OVER (ORDER BY CITY_NAME) AS std2 | |
, LAG(nobs, 1) OVER (ORDER BY CITY_NAME) AS nobs2 | |
FROM | |
PUB3318.base_stats_by_city | |
WHERE | |
CITY_NAME in ('Heilbronn', 'Hildesheim') | |
) AS t | |
WHERE | |
CITY_NAME2 IS NOT NULL | |
; | |
--https://www.evanmiller.org/ab-testing/t-test.html#!1442468.666667/1138101.005901/24;1402354.458333/987668.752685/24@95 | |
CREATE OR REPLACE VIEW PUB3318.stats_by_city AS | |
WITH c1 AS ( | |
SELECT | |
CITY_NAME AS CITY_NAME1 | |
, mean AS mean1 | |
, std AS std1 | |
, nobs AS nobs1 | |
, s/nobs AS s1 | |
FROM | |
PUB3318.base_stats_by_city | |
ORDER BY | |
CITY_NAME | |
) | |
, c2 AS ( | |
SELECT | |
CITY_NAME1 AS CITY_NAME2 | |
, mean1 AS mean2 | |
, std1 AS std2 | |
, nobs1 AS nobs2 | |
, s1 AS s2 | |
FROM c1 | |
) | |
SELECT | |
CITY_NAME1 | |
, mean1 | |
, std1 | |
, nobs1 | |
, CITY_NAME2 | |
, mean2 | |
, std2 | |
, nobs2 | |
, CONCAT(CITY_NAME1, '_vs_' , CITY_NAME2) as test | |
, SQRT(s1+s2) AS se | |
, mean1 - mean2 AS diff_mean | |
, (mean1 - mean2)/CAST(GREATEST(mean1, mean2) AS DOUBLE) AS uplift | |
, 4*SQRT(s1+s2)/CAST(SQRT(nobs1 + nobs2 - 2) AS DOUBLE) AS mde | |
, (mean1 - mean2) / CAST(SQRT(s1+s2) AS double) AS t_statistic | |
, PUB3318.ttest_ind_from_stats(mean1, std1, nobs1, mean2, std2, nobs2, FALSE) | |
FROM | |
c1 | |
FULL JOIN | |
c2 | |
ON | |
c1.CITY_NAME1 != c2.CITY_NAME2 | |
; | |
SELECT | |
* | |
FROM | |
PUB3318.stats_by_city | |
WHERE | |
CITY_NAME1 = 'Heilbronn' | |
AND CITY_NAME2 = 'Hildesheim' | |
-- шаг 7. Перцентильный анализ | |
CREATE OR REPLACE VIEW PUB3318.percentile_base AS | |
SELECT | |
EXTRACT(YEAR FROM SALES_TIMESTAMP) AS year_ | |
, m.MARKET_ID as dim | |
, SUM(AMOUNT) AS AMOUNT | |
, COUNT(DISTINCT s.SALES_ID) AS sales_cnt | |
FROM | |
SALES AS s | |
JOIN | |
SALES_POSITIONS AS sp | |
ON | |
s.SALES_ID = sp.SALES_ID | |
JOIN | |
MARKETS AS m | |
ON | |
s.MARKET_ID = m.MARKET_ID | |
JOIN | |
CITIES AS c | |
ON | |
m.CITY_ID = c.CITY_ID | |
GROUP BY | |
1,2 | |
; | |
WITH ranked_2016 AS ( | |
SELECT | |
dim | |
, AMOUNT | |
, RANK() OVER ( ORDER BY AMOUNT DESC) AS rnk_2016 | |
, COUNT(DISTINCT dim) OVER() AS cnt_2016 | |
, sales_cnt | |
FROM | |
PUB3318.percentile_base WHERE year_ = '2016' | |
) | |
, ranked_2015 AS ( | |
SELECT | |
dim | |
, AMOUNT | |
, sales_cnt | |
, RANK() OVER ( ORDER BY AMOUNT DESC) AS rnk_2015 | |
, COUNT(DISTINCT dim) OVER() AS cnt_2015 | |
FROM | |
PUB3318.percentile_base WHERE year_ = '2015' | |
) | |
SELECT | |
COALESCE(ranked_2016.dim, ranked_2015.dim) AS dim | |
, COALESCE(ranked_2016.AMOUNT, 0) AS AMOUNT_2016 | |
, COALESCE(ranked_2015.AMOUNT, 0) AS AMOUNT_2015 | |
, COALESCE(ranked_2016.sales_cnt,0) AS sales_cnt_2016 | |
, COALESCE(ranked_2015.sales_cnt,0) AS sales_cnt_2015 | |
, COALESCE(1- rnk_2016/cnt_2016, 0) AS prcentile_2016 | |
, COALESCE(1- rnk_2015/cnt_2015, 0) AS prcentile_2015 | |
FROM | |
ranked_2016 | |
FULL JOIN | |
ranked_2015 | |
ON | |
ranked_2016.dim = ranked_2015.dim | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment