Skip to content

Instantly share code, notes, and snippets.

@den-crane
Last active February 9, 2023 08:33
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save den-crane/6a7fc9501b8d496c0e8cb9bba2d30133 to your computer and use it in GitHub Desktop.
Save den-crane/6a7fc9501b8d496c0e8cb9bba2d30133 to your computer and use it in GitHub Desktop.
Clickhouse GIN

Plain Table VS Table with Projection VS Table with GIN

try to insert 100 000 000 rows fails with OOM

set allow_experimental_inverted_index=1;

CREATE TABLE bench(c_int Int64, c_str varchar(255), c_float Float64) 
Engine=ReplacingMergeTree partition by c_int%10 order by c_int as
select toInt64(cityHash64(number)), 
  lower(hex(MD5(toString(number)))), 
  number/cityHash64(number)*10000000 
from numbers_mt(100000000);

0 rows in set. Elapsed: 36.980 sec. Processed 100.65 million rows, 805.21 MB (2.72 million rows/s., 21.77 MB/s.)


CREATE TABLE bench_p(c_int Int64, c_str varchar(255), c_float Float64,
   projection p1 (select c_int, c_str order by c_str)) 
Engine=ReplacingMergeTree partition by c_int%10 order by c_int as
select toInt64(cityHash64(number)), 
  lower(hex(MD5(toString(number)))), 
  number/cityHash64(number)*10000000 
from numbers_mt(100000000);

0 rows in set. Elapsed: 64.473 sec. Processed 200.65 million rows, 6.51 GB (3.11 million rows/s., 100.90 MB/s.)


CREATE TABLE bench_i(c_int Int64, c_str varchar(255), c_float Float64,
INDEX af (c_str) TYPE inverted() GRANULARITY 1) 
Engine=ReplacingMergeTree partition by c_int%10 order by c_int as
select toInt64(cityHash64(number)), 
  lower(hex(MD5(toString(number)))), 
  number/cityHash64(number)*10000000 
from numbers_mt(100000000);

0 rows in set. Elapsed: 594.427 sec. Processed 24.11 million rows, 192.91 MB (40.57 thousand rows/s., 324.54 KB/s.)
Received exception from server (version 22.13.1):
Code: 241. DB::Exception: Received from localhost:9000. DB::Exception: 
Memory limit (total) exceeded: would use 56.57 GiB (attempt to allocate chunk of 5094480 bytes), maximum: 56.53 GiB. 
OvercommitTracker decision: Query was selected to stop by OvercommitTracker.. (MEMORY_LIMIT_EXCEEDED)


DROP TABLE bench
0 rows in set. Elapsed: 1.718 sec.

DROP TABLE bench_p
0 rows in set. Elapsed: 0.001 sec.

DROP TABLE bench_i
Error on processing query: Code: 32. DB::Exception: Attempt to read after eof: while receiving packet from localhost:9000. (ATTEMPT_TO_READ_AFTER_EOF) (version 22.13.1.2671)
Connecting to localhost:9000 as user default.
Code: 210. DB::NetException: Connection refused (localhost:9000). (NETWORK_ERROR)

seems OOM

/etc/init.d/clickhouse-server restart

seems OOM again

rm /var/lib/clickhouse/metadata/default/bench_i.sql

try to insert 10 000 000 rows: insert is slower 64 times, Clickhouse used 25 GB RAM.

set allow_experimental_inverted_index=1;

CREATE TABLE bench(c_int Int64, c_str varchar(255), c_float Float64) 
Engine=ReplacingMergeTree partition by c_int%10 order by c_int as
select toInt64(cityHash64(number)), 
  lower(hex(MD5(toString(number)))), 
  number/cityHash64(number)*10000000 
from numbers_mt(10000000);

0 rows in set. Elapsed: 3.539 sec. Processed 10.48 million rows, 83.88 MB (2.96 million rows/s., 23.70 MB/s.)


CREATE TABLE bench_p(c_int Int64, c_str varchar(255), c_float Float64,
   projection p1 (select c_int, c_str order by c_str)) 
Engine=ReplacingMergeTree partition by c_int%10 order by c_int as
select toInt64(cityHash64(number)), 
  lower(hex(MD5(toString(number)))), 
  number/cityHash64(number)*10000000 
from numbers_mt(10000000);

0 rows in set. Elapsed: 5.912 sec. Processed 20.48 million rows, 653.88 MB (3.46 million rows/s., 110.60 MB/s.)


CREATE TABLE bench_i(c_int Int64, c_str varchar(255), c_float Float64,
INDEX af (c_str) TYPE inverted() GRANULARITY 1
) 
Engine=ReplacingMergeTree partition by c_int%10 order by c_int as
select toInt64(cityHash64(number)), 
  lower(hex(MD5(toString(number)))), 
  number/cityHash64(number)*10000000 
from numbers_mt(10000000);

0 rows in set. Elapsed: 226.779 sec. Processed 10.48 million rows, 83.88 MB (46.23 thousand rows/s., 369.86 KB/s.)

OK

select * from bench where c_str = lower(hex(MD5(toString(999999))));
┌───────────────c_int─┬─c_str────────────────────────────┬──────────────────c_float─┐
│ 3311836950280227934 │ 52c69e3a57331081823331c4e69d3f2e │ 0.0000030194693006109073 │
└─────────────────────┴──────────────────────────────────┴──────────────────────────┘
1 row in set. Elapsed: 0.074 sec. Processed 10.00 million rows, 410.10 MB (135.31 million rows/s., 5.55 GB/s.)


select * from bench_p where c_int in (
   select c_int from bench_p where c_str = lower(hex(MD5(toString(999999)))))
and c_str = lower(hex(MD5(toString(999999))));
┌───────────────c_int─┬─c_str────────────────────────────┬──────────────────c_float─┐
│ 3311836950280227934 │ 52c69e3a57331081823331c4e69d3f2e │ 0.0000030194693006109073 │
└─────────────────────┴──────────────────────────────────┴──────────────────────────┘
1 row in set. Elapsed: 0.021 sec. Processed 1.19 million rows, 48.86 MB (56.97 million rows/s., 2.34 GB/s.)


select * from bench_i where c_str = lower(hex(MD5(toString(999999))));
┌───────────────c_int─┬─c_str────────────────────────────┬──────────────────c_float─┐
│ 3311836950280227934 │ 52c69e3a57331081823331c4e69d3f2e │ 0.0000030194693006109073 │
└─────────────────────┴──────────────────────────────────┴──────────────────────────┘
1 row in set. Elapsed: 0.004 sec. Processed 8.19 thousand rows, 466.94 KB (2.03 million rows/s., 115.98 MB/s.)

Fail

select count() from bench where c_str < lower(hex(MD5(toString(999999))));

1 row in set. Elapsed: 0.074 sec. Processed 10.00 million rows, 410.10 MB (135.31 million rows/s., 5.55 GB/s.)


select count() from bench_p where c_int in (
   select c_int from bench_p where c_str < lower(hex(MD5(toString(999999)))))
and c_str < lower(hex(MD5(toString(999999))));

1 row in set. Elapsed: 0.705 sec. Processed 7.78 million rows, 381.34 MB (11.03 million rows/s., 540.55 MB/s.)


select count() from bench_i where c_str < lower(hex(MD5(toString(999999))));
┌─count()─┐
│ 3233584 │
└─────────┘
1 row in set. Elapsed: 0.075 sec. Processed 10.00 million rows, 410.00 MB (133.77 million rows/s., 5.48 GB/s.)

OK

select count() from bench where c_str in ('52c69e3a57331081823331c4e69d3f2e' , 'A', 'B', 'C', 'D', 'E', 'F');
┌─count()─┐
│       1 │
└─────────┘
1 row in set. Elapsed: 0.065 sec. Processed 10.00 million rows, 410.05 MB (154.13 million rows/s., 6.32 GB/s.)


select count() from bench_p where c_int in (
   select c_int from bench_p where c_str in ('52c69e3a57331081823331c4e69d3f2e' , 'A', 'B', 'C', 'D', 'E', 'F'))
and c_str in ('52c69e3a57331081823331c4e69d3f2e' , 'A', 'B', 'C', 'D', 'E', 'F');
┌─count()─┐
│       1 │
└─────────┘
1 row in set. Elapsed: 0.029 sec. Processed 2.03 million rows, 83.39 MB (69.81 million rows/s., 2.87 GB/s.)


select count() from bench_i where c_str in ('52c69e3a57331081823331c4e69d3f2e' , 'A', 'B', 'C', 'D', 'E', 'F');
┌─count()─┐
│       1 │
└─────────┘
1 row in set. Elapsed: 0.006 sec. Processed 8.19 thousand rows, 335.87 KB (1.35 million rows/s., 55.48 MB/s.)

OK

select count() from bench where c_str ='52c69e3a57331081823331c4e69d3f2e'  or c_str='A' or c_str= 'B';
┌─count()─┐
│       1 │
└─────────┘
1 row in set. Elapsed: 0.061 sec. Processed 10.00 million rows, 410.05 MB (165.13 million rows/s., 6.77 GB/s.)


select count() from bench_p where c_int in (
   select c_int from bench_p where c_str = '52c69e3a57331081823331c4e69d3f2e'  or c_str='A' or c_str= 'B')
and c_str = '52c69e3a57331081823331c4e69d3f2e'  or c_str='A' or c_str= 'B';
┌─count()─┐
│       1 │
└─────────┘
1 row in set. Elapsed: 0.047 sec. Processed 3.24 million rows, 146.01 MB (68.76 million rows/s., 3.09 GB/s.)


select count() from bench_i where c_str = '52c69e3a57331081823331c4e69d3f2e'  or c_str='A' or c_str= 'B';
┌─count()─┐
│       1 │
└─────────┘
1 row in set. Elapsed: 0.004 sec. Processed 8.19 thousand rows, 335.87 KB (1.93 million rows/s., 79.20 MB/s.)

Fail / index is not uses if there is no value in the index ???? Or it's about spaces/empty string?

select count() from bench_i where c_str ='';
┌─count()─┐
│       0 │
└─────────┘
1 row in set. Elapsed: 0.078 sec. Processed 10.00 million rows, 410.00 MB (127.70 million rows/s., 5.24 GB/s.)


select count() from bench_i where c_str ='           ';
┌─count()─┐
│       0 │
└─────────┘
1 row in set. Elapsed: 0.061 sec. Processed 10.00 million rows, 410.09 MB (164.80 million rows/s., 6.76 GB/s.)


select count() from bench_i where c_str ='A';
┌─count()─┐
│       0 │
└─────────┘
1 row in set. Elapsed: 0.004 sec.

QPS worse 40 times

clickhouse-benchmark -c 3 -i 1000 <<< 'insert into bench select toInt64(cityHash64(number)), lower(hex(MD5(toString(number)))), number/cityHash64(number)*10000000 from numbers(1000)'

localhost:9000, queries 1000, QPS: 303.071, RPS: 303070.946, MiB/s: 2.312, result RPS: 0.000, result MiB/s: 0.000.


clickhouse-benchmark -c 3 -i 1000 <<< 'insert into bench_p select toInt64(cityHash64(number)), lower(hex(MD5(toString(number)))), number/cityHash64(number)*10000000 from numbers(1000)'

localhost:9000, queries 1000, QPS: 181.182, RPS: 362363.706, MiB/s: 11.231, result RPS: 0.000, result MiB/s: 0.000.


clickhouse-benchmark -c 3 -i 1000 <<< 'insert into bench_i select toInt64(cityHash64(number)), lower(hex(MD5(toString(number)))), number/cityHash64(number)*10000000 from numbers(1000)'

localhost:9000, queries 513, QPS: 8.482, RPS: 8481.658, MiB/s: 0.065, result RPS: 0.000, result MiB/s: 0.000.

optimize

optimize table bench final;
0 rows in set. Elapsed: 2.451 sec.

optimize table bench_p final;
0 rows in set. Elapsed: 2.909 sec.

optimize table bench_i final;
0 rows in set. Elapsed: 256.493 sec.

size

select active, table, count(), sum(rows) from system.parts where table like 'bench%' group by 1,2 order by 2;
┌─active─┬─table───┬─count()─┬─sum(rows)─┐
│      1 │ bench   │      1910000000 │
│      1 │ bench_i │      1910000000 │
│      1 │ bench_p │      1910000000 │
└────────┴─────────┴─────────┴───────────┘

du -sh bench*/
310M	bench/
5.1G	bench_i/
545M	bench_p/
@den-crane
Copy link
Author

Fail / index is not uses if there is no value in the index ???? Or it's about spaces/empty string?

All granules are skipped when search term is not found in inverted index. That's why it doesn't say how many rows are processed.

I see. When i created this gist, I have not realized yet that it's a full-text search index.

@chenlujjj
Copy link

What does the "GIN" means in title ?

Its Generalized Inverted Index, here is the PR - ClickHouse/ClickHouse#38667

Got it, thanks.

@ucasfl
Copy link

ucasfl commented Feb 9, 2023

QPS worse 40 times

How about the query QPS?

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment