|
-- 1. Get the data |
|
|
|
CREATE TABLE src_lineorder |
|
( |
|
LO_ORDERKEY UInt32, |
|
LO_LINENUMBER UInt8, |
|
LO_CUSTKEY UInt32, |
|
LO_PARTKEY UInt32, |
|
LO_SUPPKEY UInt32, |
|
LO_ORDERDATE Date, |
|
LO_ORDERPRIORITY LowCardinality(String), |
|
LO_SHIPPRIORITY UInt8, |
|
LO_QUANTITY UInt8, |
|
LO_EXTENDEDPRICE UInt32, |
|
LO_ORDTOTALPRICE UInt32, |
|
LO_DISCOUNT UInt8, |
|
LO_REVENUE UInt32, |
|
LO_SUPPLYCOST UInt32, |
|
LO_TAX UInt8, |
|
LO_COMMITDATE Date, |
|
LO_SHIPMODE LowCardinality(String) |
|
) |
|
ENGINE = S3('https://storage.yandexcloud.net/otus-dwh/dbgen/lineorder.tbl', 'CSV') |
|
; |
|
|
|
select * from src_lineorder limit 200 ; |
|
|
|
-- 2. Clickhouse internal storage |
|
|
|
CREATE TABLE lineorder_internal |
|
engine = MergeTree() |
|
ORDER BY |
|
( |
|
LO_ORDERDATE |
|
, LO_ORDERKEY |
|
) |
|
PARTITION BY (toYear(LO_ORDERDATE)) |
|
AS SELECT * FROM src_lineorder |
|
; |
|
|
|
select count(*) from lineorder_internal ; |
|
select * from lineorder_internal limit 200 ; |
|
|
|
-- 3. Checkout storage size |
|
|
|
SELECT table, |
|
formatReadableSize(sum(bytes)) as size, |
|
min(min_date) as min_date, |
|
max(max_date) as max_date |
|
FROM system.parts |
|
WHERE active |
|
AND ("table" ilike '%lineorder%') |
|
GROUP BY "table" |
|
ORDER BY sum(bytes) DESC |
|
; |
|
|
|
-- 4. Clickhouse 1-year data |
|
|
|
CREATE TABLE lineorder_clickhouse |
|
engine = MergeTree() |
|
ORDER BY |
|
( |
|
LO_ORDERDATE |
|
, LO_ORDERKEY |
|
) |
|
AS SELECT * FROM lineorder_internal |
|
WHERE LO_ORDERDATE BETWEEN '1992-01-01' AND '1992-12-31' |
|
; |
|
|
|
select count(*) from lineorder_clickhouse ; |
|
|
|
-- 5. Unload to JSON |
|
|
|
DROP TABLE lineorder_json ; |
|
CREATE TABLE lineorder_json |
|
( |
|
LO_ORDERKEY UInt32, |
|
LO_LINENUMBER UInt8, |
|
LO_CUSTKEY UInt32, |
|
LO_PARTKEY UInt32, |
|
LO_SUPPKEY UInt32, |
|
LO_ORDERDATE Date, |
|
LO_ORDERPRIORITY LowCardinality(String), |
|
LO_SHIPPRIORITY UInt8, |
|
LO_QUANTITY UInt8, |
|
LO_EXTENDEDPRICE UInt32, |
|
LO_ORDTOTALPRICE UInt32, |
|
LO_DISCOUNT UInt8, |
|
LO_REVENUE UInt32, |
|
LO_SUPPLYCOST UInt32, |
|
LO_TAX UInt8, |
|
LO_COMMITDATE Date, |
|
LO_SHIPMODE LowCardinality(String) |
|
) |
|
ENGINE = S3('https://storage.yandexcloud.net/otus-dwh/file_formats/json/', |
|
'aws_access_key_id', |
|
'aws_secret_access_key', |
|
'JSONEachRow' |
|
) |
|
; |
|
|
|
INSERT INTO lineorder_json |
|
SELECT * FROM lineorder_clickhouse ; |
|
|
|
SELECT * FROM lineorder_json LIMIT 200 ; |
|
|
|
-- 6. Unload to CSV |
|
DROP TABLE lineorder_csv ; |
|
CREATE TABLE lineorder_csv |
|
( |
|
LO_ORDERKEY UInt32, |
|
LO_LINENUMBER UInt8, |
|
LO_CUSTKEY UInt32, |
|
LO_PARTKEY UInt32, |
|
LO_SUPPKEY UInt32, |
|
LO_ORDERDATE Date, |
|
LO_ORDERPRIORITY LowCardinality(String), |
|
LO_SHIPPRIORITY UInt8, |
|
LO_QUANTITY UInt8, |
|
LO_EXTENDEDPRICE UInt32, |
|
LO_ORDTOTALPRICE UInt32, |
|
LO_DISCOUNT UInt8, |
|
LO_REVENUE UInt32, |
|
LO_SUPPLYCOST UInt32, |
|
LO_TAX UInt8, |
|
LO_COMMITDATE Date, |
|
LO_SHIPMODE LowCardinality(String) |
|
) |
|
ENGINE = S3('https://storage.yandexcloud.net/otus-dwh/file_formats/csv/', |
|
'aws_access_key_id', |
|
'aws_secret_access_key', |
|
'CSV' |
|
) |
|
; |
|
|
|
INSERT INTO lineorder_csv |
|
SELECT * FROM lineorder_clickhouse ; |
|
|
|
-- 7. Add compression |
|
|
|
DROP TABLE lineorder_csv_compressed ; |
|
CREATE TABLE lineorder_csv_compressed |
|
( |
|
LO_ORDERKEY UInt32, |
|
LO_LINENUMBER UInt8, |
|
LO_CUSTKEY UInt32, |
|
LO_PARTKEY UInt32, |
|
LO_SUPPKEY UInt32, |
|
LO_ORDERDATE Date, |
|
LO_ORDERPRIORITY LowCardinality(String), |
|
LO_SHIPPRIORITY UInt8, |
|
LO_QUANTITY UInt8, |
|
LO_EXTENDEDPRICE UInt32, |
|
LO_ORDTOTALPRICE UInt32, |
|
LO_DISCOUNT UInt8, |
|
LO_REVENUE UInt32, |
|
LO_SUPPLYCOST UInt32, |
|
LO_TAX UInt8, |
|
LO_COMMITDATE Date, |
|
LO_SHIPMODE LowCardinality(String) |
|
) |
|
ENGINE = S3('https://storage.yandexcloud.net/otus-dwh/file_formats/csv_compressed/', |
|
'aws_access_key_id', |
|
'aws_secret_access_key', |
|
'CSV', |
|
'zstd' |
|
) |
|
; |
|
|
|
INSERT INTO lineorder_csv_compressed |
|
SELECT * FROM lineorder_clickhouse ; |
|
|
|
select count(*) from lineorder_csv_compressed ; |
|
|
|
-- 8. Unload to Parquet |
|
|
|
DROP TABLE lineorder_parquet ; |
|
CREATE TABLE lineorder_parquet |
|
( |
|
LO_ORDERKEY UInt32, |
|
LO_LINENUMBER UInt8, |
|
LO_CUSTKEY UInt32, |
|
LO_PARTKEY UInt32, |
|
LO_SUPPKEY UInt32, |
|
LO_ORDERDATE Date, |
|
LO_ORDERPRIORITY LowCardinality(String), |
|
LO_SHIPPRIORITY UInt8, |
|
LO_QUANTITY UInt8, |
|
LO_EXTENDEDPRICE UInt32, |
|
LO_ORDTOTALPRICE UInt32, |
|
LO_DISCOUNT UInt8, |
|
LO_REVENUE UInt32, |
|
LO_SUPPLYCOST UInt32, |
|
LO_TAX UInt8, |
|
LO_COMMITDATE Date, |
|
LO_SHIPMODE LowCardinality(String) |
|
) |
|
ENGINE = S3('https://storage.yandexcloud.net/otus-dwh/file_formats/parquet/', |
|
'aws_access_key_id', |
|
'aws_secret_access_key', |
|
'Parquet' |
|
) |
|
; |
|
|
|
INSERT INTO lineorder_parquet |
|
SELECT * FROM lineorder_clickhouse ; |
|
|
|
-- 9. Parquet compressed |
|
|
|
DROP TABLE lineorder_parquet_compressed ; |
|
CREATE TABLE lineorder_parquet_compressed |
|
( |
|
LO_ORDERKEY UInt32, |
|
LO_LINENUMBER UInt8, |
|
LO_CUSTKEY UInt32, |
|
LO_PARTKEY UInt32, |
|
LO_SUPPKEY UInt32, |
|
LO_ORDERDATE Date, |
|
LO_ORDERPRIORITY LowCardinality(String), |
|
LO_SHIPPRIORITY UInt8, |
|
LO_QUANTITY UInt8, |
|
LO_EXTENDEDPRICE UInt32, |
|
LO_ORDTOTALPRICE UInt32, |
|
LO_DISCOUNT UInt8, |
|
LO_REVENUE UInt32, |
|
LO_SUPPLYCOST UInt32, |
|
LO_TAX UInt8, |
|
LO_COMMITDATE Date, |
|
LO_SHIPMODE LowCardinality(String) |
|
) |
|
ENGINE = S3('https://storage.yandexcloud.net/otus-dwh/file_formats/parquet_compressed/', |
|
'aws_access_key_id', |
|
'aws_secret_access_key', |
|
'Parquet', |
|
'zstd' |
|
) |
|
; |
|
|
|
INSERT INTO lineorder_parquet_compressed |
|
SELECT * FROM lineorder_clickhouse ; |
|
|
|
-- 10. Unload to ORC |
|
|
|
DROP TABLE lineorder_orc ; |
|
CREATE TABLE lineorder_orc |
|
( |
|
LO_ORDERKEY UInt32, |
|
LO_LINENUMBER UInt8, |
|
LO_CUSTKEY UInt32, |
|
LO_PARTKEY UInt32, |
|
LO_SUPPKEY UInt32, |
|
LO_ORDERDATE Date, |
|
LO_ORDERPRIORITY LowCardinality(String), |
|
LO_SHIPPRIORITY UInt8, |
|
LO_QUANTITY UInt8, |
|
LO_EXTENDEDPRICE UInt32, |
|
LO_ORDTOTALPRICE UInt32, |
|
LO_DISCOUNT UInt8, |
|
LO_REVENUE UInt32, |
|
LO_SUPPLYCOST UInt32, |
|
LO_TAX UInt8, |
|
LO_COMMITDATE Date, |
|
LO_SHIPMODE LowCardinality(String) |
|
) |
|
ENGINE = S3('https://storage.yandexcloud.net/otus-dwh/file_formats/orc/', |
|
'aws_access_key_id', |
|
'aws_secret_access_key', |
|
'ORC' |
|
) |
|
; |
|
|
|
INSERT INTO lineorder_orc |
|
SELECT * FROM lineorder_clickhouse ; |
|
|
|
-- 11. Unload to ORC compressed |
|
|
|
DROP TABLE lineorder_orc_compressed ; |
|
CREATE TABLE lineorder_orc_compressed |
|
( |
|
LO_ORDERKEY UInt32, |
|
LO_LINENUMBER UInt8, |
|
LO_CUSTKEY UInt32, |
|
LO_PARTKEY UInt32, |
|
LO_SUPPKEY UInt32, |
|
LO_ORDERDATE Date, |
|
LO_ORDERPRIORITY LowCardinality(String), |
|
LO_SHIPPRIORITY UInt8, |
|
LO_QUANTITY UInt8, |
|
LO_EXTENDEDPRICE UInt32, |
|
LO_ORDTOTALPRICE UInt32, |
|
LO_DISCOUNT UInt8, |
|
LO_REVENUE UInt32, |
|
LO_SUPPLYCOST UInt32, |
|
LO_TAX UInt8, |
|
LO_COMMITDATE Date, |
|
LO_SHIPMODE LowCardinality(String) |
|
) |
|
ENGINE = S3('https://storage.yandexcloud.net/otus-dwh/file_formats/orc_compressed/', |
|
'aws_access_key_id', |
|
'aws_secret_access_key', |
|
'ORC', |
|
'zstd' |
|
) |
|
; |
|
|
|
INSERT INTO lineorder_orc_compressed |
|
SELECT * FROM lineorder_clickhouse ; |
|
|
|
-- 12. Benchmark query latency |
|
|
|
-- CSV |
|
SELECT |
|
date_trunc('month', LO_ORDERDATE) AS mnth |
|
, count(DISTINCT LO_ORDERKEY) AS num_orders |
|
, count(DISTINCT LO_CUSTKEY) AS num_customer |
|
, sum(LO_REVENUE) AS revenue |
|
FROM |
|
lineorder_csv |
|
WHERE LO_ORDERDATE BETWEEN '1992-12-01' AND '1992-12-31' |
|
GROUP BY |
|
date_trunc('month', LO_ORDERDATE) |
|
ORDER BY date_trunc('month', LO_ORDERDATE) |
|
; |
|
|
|
-- JSON |
|
SELECT |
|
date_trunc('month', LO_ORDERDATE) AS mnth |
|
, count(DISTINCT LO_ORDERKEY) AS num_orders |
|
, count(DISTINCT LO_CUSTKEY) AS num_customer |
|
, sum(LO_REVENUE) AS revenue |
|
FROM |
|
lineorder_json |
|
WHERE LO_ORDERDATE BETWEEN '1992-12-01' AND '1992-12-31' |
|
GROUP BY |
|
date_trunc('month', LO_ORDERDATE) |
|
ORDER BY date_trunc('month', LO_ORDERDATE) |
|
; |
|
|
|
-- CSV compressed |
|
SELECT |
|
date_trunc('month', LO_ORDERDATE) AS mnth |
|
, count(DISTINCT LO_ORDERKEY) AS num_orders |
|
, count(DISTINCT LO_CUSTKEY) AS num_customer |
|
, sum(LO_REVENUE) AS revenue |
|
FROM |
|
lineorder_csv_compressed |
|
WHERE LO_ORDERDATE BETWEEN '1992-12-01' AND '1992-12-31' |
|
GROUP BY |
|
date_trunc('month', LO_ORDERDATE) |
|
ORDER BY date_trunc('month', LO_ORDERDATE) |
|
; |
|
|
|
-- Parquet |
|
SELECT |
|
date_trunc('month', LO_ORDERDATE) AS mnth |
|
, count(DISTINCT LO_ORDERKEY) AS num_orders |
|
, count(DISTINCT LO_CUSTKEY) AS num_customer |
|
, sum(LO_REVENUE) AS revenue |
|
FROM |
|
lineorder_parquet |
|
WHERE LO_ORDERDATE BETWEEN '1992-12-01' AND '1992-12-31' |
|
GROUP BY |
|
date_trunc('month', LO_ORDERDATE) |
|
ORDER BY date_trunc('month', LO_ORDERDATE) |
|
; |
|
|
|
|
|
-- ORC |
|
EXPLAIN |
|
SELECT |
|
date_trunc('month', LO_ORDERDATE) AS mnth |
|
, count(DISTINCT LO_ORDERKEY) AS num_orders |
|
, count(DISTINCT LO_CUSTKEY) AS num_customer |
|
, sum(LO_REVENUE) AS revenue |
|
FROM |
|
lineorder_orc |
|
WHERE LO_ORDERDATE BETWEEN '1992-12-01' AND '1992-12-31' |
|
GROUP BY |
|
date_trunc('month', LO_ORDERDATE) |
|
ORDER BY date_trunc('month', LO_ORDERDATE) |
|
; |
|
|
|
-- Parquet compressed |
|
SELECT |
|
date_trunc('month', LO_ORDERDATE) AS mnth |
|
, count(DISTINCT LO_ORDERKEY) AS num_orders |
|
, count(DISTINCT LO_CUSTKEY) AS num_customer |
|
, sum(LO_REVENUE) AS revenue |
|
FROM |
|
lineorder_parquet_compressed |
|
WHERE LO_ORDERDATE BETWEEN '1992-12-01' AND '1992-12-31' |
|
GROUP BY |
|
date_trunc('month', LO_ORDERDATE) |
|
ORDER BY date_trunc('month', LO_ORDERDATE) |
|
; |
|
|
|
-- ORC compressed |
|
SELECT |
|
date_trunc('month', LO_ORDERDATE) AS mnth |
|
, count(DISTINCT LO_ORDERKEY) AS num_orders |
|
, count(DISTINCT LO_CUSTKEY) AS num_customer |
|
, sum(LO_REVENUE) AS revenue |
|
FROM |
|
lineorder_orc_compressed |
|
WHERE LO_ORDERDATE BETWEEN '1992-12-01' AND '1992-12-31' |
|
GROUP BY |
|
date_trunc('month', LO_ORDERDATE) |
|
ORDER BY date_trunc('month', LO_ORDERDATE) |
|
; |
|
|