Skip to content

Instantly share code, notes, and snippets.

@eugeneks
Created February 19, 2021 11:12
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save eugeneks/d8752452eeadfbb5001d986d6d47c778 to your computer and use it in GitHub Desktop.
Save eugeneks/d8752452eeadfbb5001d986d6d47c778 to your computer and use it in GitHub Desktop.
несколько скриптов для тестирования различных особенностей поведения EXASOL
-- здесь используеться схема PUB3318 для сохранения view. Вам нужно заменить на имя своей схемы, которую вы получите при регистрации https://www.exasol.com/test-it-now/
-- шаг 0. Что у нас с основнвми таблицами
OPEN SCHEMA RETAIL;
SELECT COUNT(*) FROM SALES
;
--322 186 828
SELECT COUNT(*) FROM SALES_POSITIONS
--3 381 550 778
;
SELECT COUNT(*) FROM MARKETS
--1 002
;
SELECT COUNT(*) FROM CITIES
--10 736
;
DESCRIBE SALES;
DESCRIBE SALES_POSITIONS;
DESCRIBE MARKETS;
DESCRIBE CITIES;
SELECT COUNT(*), IPROC() FROM SALES GROUP BY 2 ;
SELECT * FROM SALES limit 10
;
SELECT COUNT(*), VALUE2PROC(MARKET_ID) FROM SALES GROUP BY 2 ;
SELECT COUNT(*), VALUE2PROC(EMPLOYEE_ID) FROM SALES GROUP BY 2 ;
SELECT CURRENT_SESSION --полученный в данном запросе id сессии нужно вставиь в первый запрос скрипта 20200215_user_group_script_profile.sql вместо SESSION_ID = 1691764873478995969
-- шаг 2. Простой запрос
-- включаем профилирование запросов
alter session set profile='ON';
-- отключаем кеширование результатов запросов
alter session set query_cache='OFF';
SELECT
CITY_NAME
, SUM(AMOUNT) AS AMOUNT
FROM
SALES AS s
JOIN
SALES_POSITIONS AS sp
ON
s.SALES_ID = sp.SALES_ID
JOIN
MARKETS AS m
ON
s.MARKET_ID = m.MARKET_ID
JOIN
CITIES AS c
ON
m.CITY_ID = c.CITY_ID
GROUP BY
1
;
-- шаг 3. А что будет если запрос запускается первый раз?
DROP LOCAL INDEX ON SALES(SALES_ID);
DROP LOCAL INDEX ON SALES_POSITIONS(SALES_ID);
-- шаг 4. Включаем кеширование
alter session set query_cache='ON';
SELECT
CITY_NAME
, SUM(AMOUNT) AS AMOUNT
FROM
SALES AS s
JOIN
SALES_POSITIONS AS sp
ON
s.SALES_ID = sp.SALES_ID
JOIN
MARKETS AS m
ON
s.MARKET_ID = m.MARKET_ID
JOIN
CITIES AS c
ON
m.CITY_ID = c.CITY_ID
GROUP BY
1
;
-- шаг 5. Собираем в отчет табло
-- шаг 6. Немного статистики и UDF
CREATE OR REPLACE VIEW PUB3318.base_stats_by_city AS
WITH agg AS (
SELECT
EXTRACT(HOUR FROM SALES_TIMESTAMP) AS h
, CITY_NAME
, SUM(AMOUNT) AS AMOUNT
FROM
SALES AS s
JOIN
SALES_POSITIONS AS sp
ON
s.SALES_ID = sp.SALES_ID
JOIN
MARKETS AS m
ON
s.MARKET_ID = m.MARKET_ID
JOIN
CITIES AS c
ON
m.CITY_ID = c.CITY_ID
GROUP BY
local.h
, CITY_NAME
)
SELECT
CITY_NAME
, AVG(AMOUNT) AS mean
, STDDEV_SAMP(AMOUNT) AS std
, VAR_SAMP(AMOUNT) AS s
, COUNT(*) AS nobs
FROM
agg
GROUP BY
CITY_NAME
;
--/
CREATE OR REPLACE PYTHON3 SCALAR SCRIPT PUB3318.ttest_ind_from_stats(mean1 DOUBLE, std1 DOUBLE, nobs1 DECIMAL, mean2 DOUBLE, std2 DOUBLE, nobs2 DECIMAL, equal_var BOOLEAN)
EMITS (statistic DOUBLE, pvalue DOUBLE) AS
from scipy.stats import ttest_ind_from_stats
def run(ctx):
t=ttest_ind_from_stats(mean1=ctx.mean1, std1=ctx.std1, nobs1=ctx.nobs1,
mean2=ctx.mean2, std2=ctx.std2, nobs2=ctx.nobs2, equal_var = ctx.equal_var)
ctx.emit(t[0].item(), t[1].item())
/
SELECT
t.*
, CONCAT(CITY_NAME1, '_vs_' , CITY_NAME2) as test
, PUB3318.ttest_ind_from_stats(mean1, std1, nobs1, mean2, std2, nobs2, FALSE)
FROM
(
SELECT
CITY_NAME AS CITY_NAME1
, mean AS mean1
, std AS std1
, nobs AS nobs1
, LAG(CITY_NAME, 1) OVER (ORDER BY CITY_NAME) AS CITY_NAME2
, LAG(mean, 1) OVER (ORDER BY CITY_NAME) AS mean2
, LAG(std, 1) OVER (ORDER BY CITY_NAME) AS std2
, LAG(nobs, 1) OVER (ORDER BY CITY_NAME) AS nobs2
FROM
PUB3318.base_stats_by_city
WHERE
CITY_NAME in ('Heilbronn', 'Hildesheim')
) AS t
WHERE
CITY_NAME2 IS NOT NULL
;
--https://www.evanmiller.org/ab-testing/t-test.html#!1442468.666667/1138101.005901/24;1402354.458333/987668.752685/24@95
CREATE OR REPLACE VIEW PUB3318.stats_by_city AS
WITH c1 AS (
SELECT
CITY_NAME AS CITY_NAME1
, mean AS mean1
, std AS std1
, nobs AS nobs1
, s/nobs AS s1
FROM
PUB3318.base_stats_by_city
ORDER BY
CITY_NAME
)
, c2 AS (
SELECT
CITY_NAME1 AS CITY_NAME2
, mean1 AS mean2
, std1 AS std2
, nobs1 AS nobs2
, s1 AS s2
FROM c1
)
SELECT
CITY_NAME1
, mean1
, std1
, nobs1
, CITY_NAME2
, mean2
, std2
, nobs2
, CONCAT(CITY_NAME1, '_vs_' , CITY_NAME2) as test
, SQRT(s1+s2) AS se
, mean1 - mean2 AS diff_mean
, (mean1 - mean2)/CAST(GREATEST(mean1, mean2) AS DOUBLE) AS uplift
, 4*SQRT(s1+s2)/CAST(SQRT(nobs1 + nobs2 - 2) AS DOUBLE) AS mde
, (mean1 - mean2) / CAST(SQRT(s1+s2) AS double) AS t_statistic
, PUB3318.ttest_ind_from_stats(mean1, std1, nobs1, mean2, std2, nobs2, FALSE)
FROM
c1
FULL JOIN
c2
ON
c1.CITY_NAME1 != c2.CITY_NAME2
;
SELECT
*
FROM
PUB3318.stats_by_city
WHERE
CITY_NAME1 = 'Heilbronn'
AND CITY_NAME2 = 'Hildesheim'
-- шаг 7. Перцентильный анализ
CREATE OR REPLACE VIEW PUB3318.percentile_base AS
SELECT
EXTRACT(YEAR FROM SALES_TIMESTAMP) AS year_
, m.MARKET_ID as dim
, SUM(AMOUNT) AS AMOUNT
, COUNT(DISTINCT s.SALES_ID) AS sales_cnt
FROM
SALES AS s
JOIN
SALES_POSITIONS AS sp
ON
s.SALES_ID = sp.SALES_ID
JOIN
MARKETS AS m
ON
s.MARKET_ID = m.MARKET_ID
JOIN
CITIES AS c
ON
m.CITY_ID = c.CITY_ID
GROUP BY
1,2
;
WITH ranked_2016 AS (
SELECT
dim
, AMOUNT
, RANK() OVER ( ORDER BY AMOUNT DESC) AS rnk_2016
, COUNT(DISTINCT dim) OVER() AS cnt_2016
, sales_cnt
FROM
PUB3318.percentile_base WHERE year_ = '2016'
)
, ranked_2015 AS (
SELECT
dim
, AMOUNT
, sales_cnt
, RANK() OVER ( ORDER BY AMOUNT DESC) AS rnk_2015
, COUNT(DISTINCT dim) OVER() AS cnt_2015
FROM
PUB3318.percentile_base WHERE year_ = '2015'
)
SELECT
COALESCE(ranked_2016.dim, ranked_2015.dim) AS dim
, COALESCE(ranked_2016.AMOUNT, 0) AS AMOUNT_2016
, COALESCE(ranked_2015.AMOUNT, 0) AS AMOUNT_2015
, COALESCE(ranked_2016.sales_cnt,0) AS sales_cnt_2016
, COALESCE(ranked_2015.sales_cnt,0) AS sales_cnt_2015
, COALESCE(1- rnk_2016/cnt_2016, 0) AS prcentile_2016
, COALESCE(1- rnk_2015/cnt_2015, 0) AS prcentile_2015
FROM
ranked_2016
FULL JOIN
ranked_2015
ON
ranked_2016.dim = ranked_2015.dim
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment