Skip to content

Instantly share code, notes, and snippets.

@den-crane
Created October 16, 2020 17:28
Show Gist options
  • Save den-crane/42bf77285b53762f11d88d73a46179de to your computer and use it in GitHub Desktop.
Save den-crane/42bf77285b53762f11d88d73a46179de to your computer and use it in GitHub Desktop.
SAMPLING
https://stackoverflow.com/questions/64369622/using-sample-in-clickhouse-seems-to-read-all-rows-and-more-bytes-is-this-expect
CREATE TABLE table_one
( timestamp UInt64,
transaction_id UInt64,
banner_id UInt16,
value UInt32
)
ENGINE = MergeTree()
PARTITION BY toYYYYMMDD(toDateTime(timestamp))
ORDER BY (banner_id, toStartOfHour(toDateTime(timestamp)), cityHash64(transaction_id))
SAMPLE BY cityHash64(transaction_id)
SETTINGS index_granularity = 8192
insert into table_one select 1602809234+intDiv(number,100000), number, number%991, toUInt32(rand())
from numbers(10000000000);
select banner_id, sum(value), count(value), max(value)
from table_one
group by banner_id format Null;
0 rows in set. Elapsed: 11.490 sec. Processed 10.00 billion rows, 60.00 GB (870.30 million rows/s., 5.22 GB/s.)
select banner_id, sum(value), count(value), max(value)
from table_one SAMPLE 0.01
group by banner_id format Null;
0 rows in set. Elapsed: 1.316 sec. Processed 452.67 million rows, 6.34 GB (343.85 million rows/s., 4.81 GB/s.)
select banner_id, sum(value), count(value), max(value)
from table_one
WHERE banner_id = 42
group by banner_id format Null;
0 rows in set. Elapsed: 0.020 sec. Processed 10.30 million rows, 61.78 MB (514.37 million rows/s., 3.09 GB/s.)
select banner_id, sum(value), count(value), max(value)
from table_one SAMPLE 0.01
WHERE banner_id = 42
group by banner_id format Null;
0 rows in set. Elapsed: 0.008 sec. Processed 696.32 thousand rows, 9.75 MB (92.49 million rows/s., 1.29 GB/s.)
select banner_id, toStartOfHour(toDateTime(timestamp)) hr, sum(value), count(value), max(value)
from table_one
group by banner_id, hr format Null;
0 rows in set. Elapsed: 36.660 sec. Processed 10.00 billion rows, 140.00 GB (272.77 million rows/s., 3.82 GB/s.)
select banner_id, toStartOfHour(toDateTime(timestamp)) hr, sum(value), count(value), max(value)
from table_one SAMPLE 0.01
group by banner_id, hr format Null;
0 rows in set. Elapsed: 3.741 sec. Processed 452.67 million rows, 9.96 GB (121.00 million rows/s., 2.66 GB/s.)
select count()
from table_one
where value = 666 format Null;
1 rows in set. Elapsed: 6.056 sec. Processed 10.00 billion rows, 40.00 GB (1.65 billion rows/s., 6.61 GB/s.)
select count()
from table_one SAMPLE 0.01
where value = 666 format Null;
1 rows in set. Elapsed: 1.214 sec. Processed 452.67 million rows, 5.43 GB (372.88 million rows/s., 4.47 GB/s.)
---------
CREATE TABLE table_one
( timestamp UInt64,
transaction_id UInt64,
banner_id UInt16,
value UInt32
)
ENGINE = MergeTree()
PARTITION BY toYYYYMMDD(toDateTime(timestamp))
ORDER BY (banner_id, timestamp, cityHash64(transaction_id))
SAMPLE BY cityHash64(transaction_id)
SETTINGS index_granularity = 8192
insert into table_one select 1602809234+intDiv(number,100000), number, number%991, toUInt32(rand())
from numbers(10000000000);
select banner_id, sum(value), count(value), max(value)
from table_one
group by banner_id format Null;
0 rows in set. Elapsed: 11.196 sec. Processed 10.00 billion rows, 60.00 GB (893.15 million rows/s., 5.36 GB/s.)
select banner_id, sum(value), count(value), max(value)
from table_one SAMPLE 0.01
group by banner_id format Null;
0 rows in set. Elapsed: 24.378 sec. Processed 10.00 billion rows, 140.00 GB (410.21 million rows/s., 5.74 GB/s.)
select banner_id, sum(value), count(value), max(value)
from table_one
WHERE banner_id = 42
group by banner_id format Null;
0 rows in set. Elapsed: 0.022 sec. Processed 10.27 million rows, 61.64 MB (459.28 million rows/s., 2.76 GB/s.)
select banner_id, sum(value), count(value), max(value)
from table_one SAMPLE 0.01
WHERE banner_id = 42
group by banner_id format Null;
0 rows in set. Elapsed: 0.037 sec. Processed 10.27 million rows, 143.82 MB (275.16 million rows/s., 3.85 GB/s.)
select banner_id, toStartOfHour(toDateTime(timestamp)) hr, sum(value), count(value), max(value)
from table_one
group by banner_id, hr format Null;
0 rows in set. Elapsed: 21.663 sec. Processed 10.00 billion rows, 140.00 GB (461.62 million rows/s., 6.46 GB/s.)
select banner_id, toStartOfHour(toDateTime(timestamp)) hr, sum(value), count(value), max(value)
from table_one SAMPLE 0.01
group by banner_id, hr format Null;
0 rows in set. Elapsed: 26.697 sec. Processed 10.00 billion rows, 220.00 GB (374.57 million rows/s., 8.24 GB/s.)
select count()
from table_one
where value = 666 format Null;
0 rows in set. Elapsed: 7.679 sec. Processed 10.00 billion rows, 40.00 GB (1.30 billion rows/s., 5.21 GB/s.)
select count()
from table_one SAMPLE 0.01
where value = 666 format Null;
0 rows in set. Elapsed: 21.668 sec. Processed 10.00 billion rows, 120.00 GB (461.51 million rows/s., 5.54 GB/s.)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment