Last active
July 2, 2021 01:45
-
-
Save bbhavsar/006df9c40b4b0528e297fac29824ceb4 to your computer and use it in GitHub Desktop.
Performance test with Bloom filter support in Apache Kudu
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Environment: | |
CDP 7.1.5 | |
6 nodes (Dell PowerEdge R430, 20c/40t Xeon e5-2630 v4 @ 2.2Ghz, 128GB Ram, 4-2TB disks) | |
1) generate big table (260M) with all random data | |
2) copy big table to parquet | |
3) generate small table with top 1000 and bottom 1000 keys off big one | |
4) generate small table with top 1000 and bottom 1000 of non-key field off big one | |
5) compute stats for all tables | |
6) select big kudu based on half of small (filter by some int field mod 2), joining on key | |
7) select big kudu based on half of small (filter by some int field mod 2), joining on non-key | |
8) select big parquet based on half of small (filter by some int field mod 2), joining on key | |
9) select big parquet based on half of small (filter by some int field mod 2), joining on non-key | |
10) update big kudu based on all small | |
1) generate big table (1B) with all random data | |
================================================================================================================= | |
CREATE TABLE IF NOT EXISTS default.test_kudu_bloom_filter_bigtable_kudu ( | |
key1 BIGINT, | |
key2 STRING, | |
some_string1 STRING, | |
some_string2 STRING, | |
some_string3 STRING, | |
some_int1 INT, | |
some_int2 INT, | |
some_int3 INT, | |
some_timestamp1 TIMESTAMP, | |
some_timestamp2 TIMESTAMP, | |
some_timestamp3 TIMESTAMP, | |
PRIMARY KEY(key1,key2) | |
) | |
PARTITION BY HASH (key1) PARTITIONS 20 | |
STORED AS KUDU; | |
sudo -u kudu kudu perf loadgen master-1 -num_rows_per_thread=130000000 -num_threads=2 -show_first_n_errors=10 -string_len=20 -table_name=impala::default.test_kudu_bloom_filter_bigtable_kudu -use_random=true | |
select count(*), min(key1), max(key1), min(some_int1), max(some_int1) from default.test_kudu_bloom_filter_bigtable_kudu; | |
count(*) min(key1) max(key1) min(some_int1) max(some_int1) | |
260301048 23622505005 9223372013232270802 17 2147483630 | |
Note: I rebalanced the cluster to be sure a balanced layout after load | |
sudo -u kudu kudu cluster rebalance ve0219.halxg.cloudera.com -report_only | |
sudo -u kudu kudu cluster rebalance ve0219.halxg.cloudera.com | |
2) copy big table to parquet | |
================================================================================================================= | |
CREATE TABLE default.test_kudu_bloom_filter_bigtable_parquet ( | |
key1 BIGINT, | |
key2 STRING, | |
some_string1 STRING, | |
some_string2 STRING, | |
some_string3 STRING, | |
some_int1 INT, | |
some_int2 INT, | |
some_int3 INT, | |
some_timestamp1 TIMESTAMP, | |
some_timestamp2 TIMESTAMP, | |
some_timestamp3 TIMESTAMP | |
) | |
STORED AS PARQUET; | |
INSERT OVERWRITE default.test_kudu_bloom_filter_bigtable_parquet SELECT * FROM default.test_kudu_bloom_filter_bigtable_kudu; | |
select count(*), min(key1), max(key1), min(some_int1), max(some_int1) from default.test_kudu_bloom_filter_bigtable_parquet; | |
3) generate small table with top 1000 and bottom 1000 keys off big one | |
================================================================================================================= | |
CREATE TABLE default.test_kudu_bloom_filter_smaltable_pickedbykey ( | |
key1 BIGINT, | |
key2 STRING, | |
some_string1 STRING, | |
some_string2 STRING, | |
some_string3 STRING, | |
some_int1 INT, | |
some_int2 INT, | |
some_int3 INT, | |
some_timestamp1 TIMESTAMP, | |
some_timestamp2 TIMESTAMP, | |
some_timestamp3 TIMESTAMP | |
) | |
STORED AS PARQUET; | |
INSERT INTO default.test_kudu_bloom_filter_smaltable_pickedbykey | |
SELECT bigtable.* FROM default.test_kudu_bloom_filter_bigtable_kudu bigtable | |
INNER JOIN ( | |
SELECT key1, key2 from default.test_kudu_bloom_filter_bigtable_kudu ORDER BY key1, key2 limit 1000 | |
) topkeys | |
ON bigtable.key1 = topkeys.key1 AND bigtable.key2 = topkeys.key2; | |
NOTE: I noticed a bunch of Kudu table 'impala::default.test_kudu_bloom_filter_bigtable_kudu' column 'some_timestamp1' contains an out of range timestamp. The valid date range is 1400-01-01..9999-12-31. (1 of 12977300 similar) | |
INSERT INTO default.test_kudu_bloom_filter_smaltable_pickedbykey | |
SELECT bigtable.* FROM default.test_kudu_bloom_filter_bigtable_kudu bigtable | |
INNER JOIN ( | |
SELECT key1, key2 from default.test_kudu_bloom_filter_bigtable_kudu ORDER BY key1 DESC, key2 DESC limit 1000 | |
) bottomkeys | |
ON bigtable.key1 = bottomkeys.key1 AND bigtable.key2 = bottomkeys.key2; | |
select count(*), min(key1), max(key1), min(some_int1), max(some_int1) from default.test_kudu_bloom_filter_smaltable_pickedbykey; | |
4) generate small table with top 1000 and bottom 1000 of non-key field off big one | |
================================================================================================================= | |
CREATE TABLE default.test_kudu_bloom_filter_smaltable_pickedbynonkey ( | |
key1 BIGINT, | |
key2 STRING, | |
some_string1 STRING, | |
some_string2 STRING, | |
some_string3 STRING, | |
some_int1 INT, | |
some_int2 INT, | |
some_int3 INT, | |
some_timestamp1 TIMESTAMP, | |
some_timestamp2 TIMESTAMP, | |
some_timestamp3 TIMESTAMP | |
) | |
STORED AS PARQUET; | |
INSERT INTO default.test_kudu_bloom_filter_smaltable_pickedbynonkey | |
SELECT bigtable.* FROM default.test_kudu_bloom_filter_bigtable_kudu bigtable | |
INNER JOIN ( | |
SELECT key1, key2 from default.test_kudu_bloom_filter_bigtable_kudu ORDER BY some_int1, some_int2 limit 1000 | |
) topkeys | |
ON bigtable.key1 = topkeys.key1 AND bigtable.key2 = topkeys.key2; | |
INSERT INTO default.test_kudu_bloom_filter_smaltable_pickedbynonkey | |
SELECT bigtable.* FROM default.test_kudu_bloom_filter_bigtable_kudu bigtable | |
INNER JOIN ( | |
SELECT key1, key2 from default.test_kudu_bloom_filter_bigtable_kudu ORDER BY some_int1 DESC, some_int2 DESC limit 1000 | |
) bottomkeys | |
ON bigtable.key1 = bottomkeys.key1 AND bigtable.key2 = bottomkeys.key2; | |
select count(*), min(key1), max(key1), min(some_int1), max(some_int1) from default.test_kudu_bloom_filter_smaltable_pickedbynonkey; | |
count(*) min(key1) max(key1) min(some_int1) max(some_int1) | |
5) compute stats for all tables | |
================================================================================================================= | |
COMPUTE STATS default.test_kudu_bloom_filter_bigtable_kudu; | |
COMPUTE STATS default.test_kudu_bloom_filter_bigtable_parquet; | |
COMPUTE STATS default.test_kudu_bloom_filter_smaltable_pickedbykey; | |
COMPUTE STATS default.test_kudu_bloom_filter_smaltable_pickedbynonkey; | |
6) select big kudu based on half of small (filter by some int field mod 2), joining on key | |
================================================================================================================= | |
SET ENABLED_RUNTIME_FILTER_TYPES = MIN_MAX; | |
SELECT count(*), sum(bigtable.some_int1), avg(bigtable.some_int2), max(bigtable.some_string1) | |
FROM default.test_kudu_bloom_filter_bigtable_kudu bigtable | |
INNER JOIN default.test_kudu_bloom_filter_smaltable_pickedbykey smalltable | |
ON bigtable.key1 = smalltable.key1 AND bigtable.key2 = smalltable.key2 | |
WHERE smalltable.some_int1%2=true; | |
Operator #Hosts #Inst Avg Time Max Time #Rows Est. #Rows Peak Mem Est. Peak Mem Detail | |
00:SCAN KUDU 5 5 1s962ms 2s519ms 259.65M 260.00M 14.08 MB 9.38 MB default.test_kudu_bloom_filter_bigtable_kudu bigtable | |
SET ENABLED_RUNTIME_FILTER_TYPES = ALL; | |
SELECT count(*), sum(bigtable.some_int1), avg(bigtable.some_int2), max(bigtable.some_string1) | |
FROM default.test_kudu_bloom_filter_bigtable_kudu bigtable | |
INNER JOIN default.test_kudu_bloom_filter_smaltable_pickedbykey smalltable | |
ON bigtable.key1 = smalltable.key1 AND bigtable.key2 = smalltable.key2 | |
WHERE smalltable.some_int1%2=true; | |
Operator #Hosts #Inst Avg Time Max Time #Rows Est. #Rows Peak Mem Est. Peak Mem Detail | |
00:SCAN KUDU 5 5 424.072ms 438.111ms 1.03K 260.00M 310.00 KB 9.38 MB default.test_kudu_bloom_filter_bigtable_kudu bigtable | |
TIMEFORMAT=%R | |
for i in {1..10}; do | |
{ time impala-shell -i ve0220.halxg.cloudera.com --quiet -q " | |
SET ENABLED_RUNTIME_FILTER_TYPES = MIN_MAX; | |
SELECT count(*), sum(bigtable.some_int1), avg(bigtable.some_int2), max(bigtable.some_string1) | |
FROM default.test_kudu_bloom_filter_bigtable_kudu bigtable | |
INNER JOIN default.test_kudu_bloom_filter_smaltable_pickedbykey smalltable | |
ON bigtable.key1 = smalltable.key1 AND bigtable.key2 = smalltable.key2 | |
WHERE smalltable.some_int1%2=true; | |
" >/dev/null; } 2>&1 | tail -n1 | |
done | |
6.299 | |
6.197 | |
6.200 | |
6.223 | |
6.209 | |
6.079 | |
5.884 | |
6.199 | |
5.984 | |
6.095 | |
TIMEFORMAT=%R | |
for i in {1..10}; do | |
{ time impala-shell -i ve0220.halxg.cloudera.com --quiet -q " | |
SET ENABLED_RUNTIME_FILTER_TYPES = ALL; | |
SELECT count(*), sum(bigtable.some_int1), avg(bigtable.some_int2), max(bigtable.some_string1) | |
FROM default.test_kudu_bloom_filter_bigtable_kudu bigtable | |
INNER JOIN default.test_kudu_bloom_filter_smaltable_pickedbykey smalltable | |
ON bigtable.key1 = smalltable.key1 AND bigtable.key2 = smalltable.key2 | |
WHERE smalltable.some_int1%2=true; | |
" >/dev/null; } 2>&1 | tail -n1 | |
done | |
1.118 | |
1.070 | |
1.079 | |
1.176 | |
1.773 | |
1.072 | |
1.767 | |
1.170 | |
1.065 | |
1.069 | |
7) select big kudu based on half of small (filter by some int field mod 2), joining on non-key | |
================================================================================================================= | |
SET ENABLED_RUNTIME_FILTER_TYPES = MIN_MAX; | |
SELECT count(*), sum(bigtable.some_int1), avg(bigtable.some_int2), max(bigtable.some_string1) | |
FROM default.test_kudu_bloom_filter_bigtable_kudu bigtable | |
INNER JOIN default.test_kudu_bloom_filter_smaltable_pickedbynonkey smalltable | |
ON bigtable.some_int1 = smalltable.some_int1 AND bigtable.some_int2 = smalltable.some_int2 | |
WHERE smalltable.some_int1%2=true; | |
Operator #Hosts #Inst Avg Time Max Time #Rows Est. #Rows Peak Mem Est. Peak Mem Detail | |
00:SCAN KUDU 5 5 1s456ms 1s676ms 127.70M 26.00M 3.50 MB 5.62 MB default.test_kudu_bloom_filter_bigtable_kudu bigtable | |
SET ENABLED_RUNTIME_FILTER_TYPES = ALL; | |
SELECT count(*), sum(bigtable.some_int1), avg(bigtable.some_int2), max(bigtable.some_string1) | |
FROM default.test_kudu_bloom_filter_bigtable_kudu bigtable | |
INNER JOIN default.test_kudu_bloom_filter_smaltable_pickedbynonkey smalltable | |
ON bigtable.some_int1 = smalltable.some_int1 AND bigtable.some_int2 = smalltable.some_int2 | |
WHERE smalltable.some_int1%2=true; | |
Operator #Hosts #Inst Avg Time Max Time #Rows Est. #Rows Peak Mem Est. Peak Mem Detail | |
00:SCAN KUDU 5 5 413.417ms 423.044ms 973 26.00M 182.00 KB 5.62 MB default.test_kudu_bloom_filter_bigtable_kudu bigtable | |
TIMEFORMAT=%R | |
for i in {1..10}; do | |
{ time impala-shell -i ve0220.halxg.cloudera.com --quiet -q " | |
SET ENABLED_RUNTIME_FILTER_TYPES = MIN_MAX; | |
SELECT count(*), sum(bigtable.some_int1), avg(bigtable.some_int2), max(bigtable.some_string1) | |
FROM default.test_kudu_bloom_filter_bigtable_kudu bigtable | |
INNER JOIN default.test_kudu_bloom_filter_smaltable_pickedbynonkey smalltable | |
ON bigtable.some_int1 = smalltable.some_int1 AND bigtable.some_int2 = smalltable.some_int2 | |
WHERE smalltable.some_int1%2=true; | |
" >/dev/null; } 2>&1 | tail -n1 | |
done | |
3.411 | |
3.281 | |
3.278 | |
3.184 | |
3.266 | |
3.280 | |
3.348 | |
3.385 | |
3.295 | |
3.275 | |
TIMEFORMAT=%R | |
for i in {1..10}; do | |
{ time impala-shell -i ve0220.halxg.cloudera.com --quiet -q " | |
SET ENABLED_RUNTIME_FILTER_TYPES = ALL; | |
SELECT count(*), sum(bigtable.some_int1), avg(bigtable.some_int2), max(bigtable.some_string1) | |
FROM default.test_kudu_bloom_filter_bigtable_kudu bigtable | |
INNER JOIN default.test_kudu_bloom_filter_smaltable_pickedbynonkey smalltable | |
ON bigtable.some_int1 = smalltable.some_int1 AND bigtable.some_int2 = smalltable.some_int2 | |
WHERE smalltable.some_int1%2=true; | |
" >/dev/null; } 2>&1 | tail -n1 | |
done | |
1.081 | |
0.988 | |
1.096 | |
1.091 | |
1.073 | |
0.988 | |
0.987 | |
0.988 | |
0.964 | |
0.967 | |
8) select big parquet based on half of small (filter by some int field mod 2), joining on key | |
================================================================================================================= | |
SELECT count(*), sum(bigtable.some_int1), avg(bigtable.some_int2), max(bigtable.some_string1) | |
FROM default.test_kudu_bloom_filter_bigtable_parquet bigtable | |
INNER JOIN default.test_kudu_bloom_filter_smaltable_pickedbykey smalltable | |
ON bigtable.key1 = smalltable.key1 AND bigtable.key2 = smalltable.key2 | |
WHERE smalltable.some_int1%2=true; | |
TIMEFORMAT=%R | |
for i in {1..10}; do | |
{ time impala-shell -i ve0220.halxg.cloudera.com --quiet -q " | |
SET ENABLED_RUNTIME_FILTER_TYPES = ALL; | |
SELECT count(*), sum(bigtable.some_int1), avg(bigtable.some_int2), max(bigtable.some_string1) | |
FROM default.test_kudu_bloom_filter_bigtable_parquet bigtable | |
INNER JOIN default.test_kudu_bloom_filter_smaltable_pickedbykey smalltable | |
ON bigtable.key1 = smalltable.key1 AND bigtable.key2 = smalltable.key2 | |
WHERE smalltable.some_int1%2=true; | |
" >/dev/null; } 2>&1 | tail -n1 | |
done | |
1.777 | |
1.674 | |
1.580 | |
1.685 | |
1.691 | |
1.684 | |
1.568 | |
1.592 | |
1.565 | |
1.663 | |
9) select big parquet based on half of small (filter by some int field mod 2), joining on non-key | |
================================================================================================================= | |
SELECT count(*), sum(bigtable.some_int1), avg(bigtable.some_int2), max(bigtable.some_string1) | |
FROM default.test_kudu_bloom_filter_bigtable_parquet bigtable | |
INNER JOIN default.test_kudu_bloom_filter_smaltable_pickedbynonkey smalltable | |
ON bigtable.some_int1 = smalltable.some_int1 AND bigtable.some_int2 = smalltable.some_int2 | |
WHERE smalltable.some_int1%2=true; | |
TIMEFORMAT=%R | |
for i in {1..10}; do | |
{ time impala-shell -i ve0220.halxg.cloudera.com --quiet -q " | |
SET ENABLED_RUNTIME_FILTER_TYPES = ALL; | |
SELECT count(*), sum(bigtable.some_int1), avg(bigtable.some_int2), max(bigtable.some_string1) | |
FROM default.test_kudu_bloom_filter_bigtable_parquet bigtable | |
INNER JOIN default.test_kudu_bloom_filter_smaltable_pickedbynonkey smalltable | |
ON bigtable.some_int1 = smalltable.some_int1 AND bigtable.some_int2 = smalltable.some_int2 | |
WHERE smalltable.some_int1%2=true; | |
" >/dev/null; } 2>&1 | tail -n1 | |
done | |
1.171 | |
1.273 | |
1.285 | |
1.175 | |
1.177 | |
1.179 | |
1.181 | |
1.189 | |
1.172 | |
1.177 | |
10) update big based on all small - kudu | |
================================================================================================================= | |
UPSERT INTO default.test_kudu_bloom_filter_bigtable_kudu | |
SELECT | |
smalltable.key1, | |
smalltable.key2, | |
smalltable.some_string1, | |
smalltable.some_string2, | |
smalltable.some_string3, | |
CAST(smalltable.some_int1 + IFNULL(bigtable.some_int1, 0) AS INT), | |
CAST(smalltable.some_int2 + IFNULL(bigtable.some_int2, 0) AS INT), | |
CAST(smalltable.some_int3 + IFNULL(bigtable.some_int3, 0) AS INT), | |
smalltable.some_timestamp1, | |
smalltable.some_timestamp2, | |
smalltable.some_timestamp3 | |
FROM default.test_kudu_bloom_filter_smaltable_pickedbykey smalltable | |
LEFT JOIN default.test_kudu_bloom_filter_bigtable_kudu bigtable ON bigtable.key1 = smalltable.key1 AND bigtable.key2 = smalltable.key2 | |
WHERE smalltable.some_int1%2=true; | |
TIMEFORMAT=%R | |
for i in {1..10}; do | |
{ time impala-shell -i ve0220.halxg.cloudera.com --quiet -q " | |
SET ENABLED_RUNTIME_FILTER_TYPES = MIN_MAX; | |
UPSERT INTO default.test_kudu_bloom_filter_bigtable_kudu | |
SELECT | |
smalltable.key1, | |
smalltable.key2, | |
smalltable.some_string1, | |
smalltable.some_string2, | |
smalltable.some_string3, | |
CAST(smalltable.some_int1 + IFNULL(bigtable.some_int1, 0) AS INT), | |
CAST(smalltable.some_int2 + IFNULL(bigtable.some_int2, 0) AS INT), | |
CAST(smalltable.some_int3 + IFNULL(bigtable.some_int3, 0) AS INT), | |
smalltable.some_timestamp1, | |
smalltable.some_timestamp2, | |
smalltable.some_timestamp3 | |
FROM default.test_kudu_bloom_filter_smaltable_pickedbykey smalltable | |
LEFT JOIN default.test_kudu_bloom_filter_bigtable_kudu bigtable ON bigtable.key1 = smalltable.key1 AND bigtable.key2 = smalltable.key2 | |
WHERE smalltable.some_int1%2=true; | |
" >/dev/null; } 2>&1 | tail -n1 | |
done | |
19.865 | |
19.348 | |
19.863 | |
18.831 | |
19.349 | |
19.370 | |
19.365 | |
18.884 | |
18.869 | |
18.852 | |
TIMEFORMAT=%R | |
for i in {1..10}; do | |
{ time impala-shell -i ve0220.halxg.cloudera.com --quiet -q " | |
SET ENABLED_RUNTIME_FILTER_TYPES = ALL; | |
UPSERT INTO default.test_kudu_bloom_filter_bigtable_kudu | |
SELECT | |
smalltable.key1, | |
smalltable.key2, | |
smalltable.some_string1, | |
smalltable.some_string2, | |
smalltable.some_string3, | |
CAST(smalltable.some_int1 + IFNULL(bigtable.some_int1, 0) AS INT), | |
CAST(smalltable.some_int2 + IFNULL(bigtable.some_int2, 0) AS INT), | |
CAST(smalltable.some_int3 + IFNULL(bigtable.some_int3, 0) AS INT), | |
smalltable.some_timestamp1, | |
smalltable.some_timestamp2, | |
smalltable.some_timestamp3 | |
FROM default.test_kudu_bloom_filter_smaltable_pickedbykey smalltable | |
LEFT JOIN default.test_kudu_bloom_filter_bigtable_kudu bigtable ON bigtable.key1 = smalltable.key1 AND bigtable.key2 = smalltable.key2 | |
WHERE smalltable.some_int1%2=true; | |
" >/dev/null; } 2>&1 | tail -n1 | |
done | |
1.361 | |
1.196 | |
1.190 | |
1.300 | |
1.179 | |
1.186 | |
1.180 | |
1.203 | |
1.199 | |
1.190 | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment