Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save bbhavsar/006df9c40b4b0528e297fac29824ceb4 to your computer and use it in GitHub Desktop.
Save bbhavsar/006df9c40b4b0528e297fac29824ceb4 to your computer and use it in GitHub Desktop.
Performance test with Bloom filter support in Apache Kudu
Environment:
CDP 7.1.5
6 nodes (Dell PowerEdge R430, 20c/40t Xeon e5-2630 v4 @ 2.2Ghz, 128GB Ram, 4-2TB disks)
1) generate big table (260M) with all random data
2) copy big table to parquet
3) generate small table with top 1000 and bottom 1000 keys off big one
4) generate small table with top 1000 and bottom 1000 of non-key field off big one
5) compute stats for all tables
6) select big kudu based on half of small (filter by some int field mod 2), joining on key
7) select big kudu based on half of small (filter by some int field mod 2), joining on non-key
8) select big parquet based on half of small (filter by some int field mod 2), joining on key
9) select big parquet based on half of small (filter by some int field mod 2), joining on non-key
10) update big kudu based on all small
1) generate big table (1B) with all random data
=================================================================================================================
CREATE TABLE IF NOT EXISTS default.test_kudu_bloom_filter_bigtable_kudu (
key1 BIGINT,
key2 STRING,
some_string1 STRING,
some_string2 STRING,
some_string3 STRING,
some_int1 INT,
some_int2 INT,
some_int3 INT,
some_timestamp1 TIMESTAMP,
some_timestamp2 TIMESTAMP,
some_timestamp3 TIMESTAMP,
PRIMARY KEY(key1,key2)
)
PARTITION BY HASH (key1) PARTITIONS 20
STORED AS KUDU;
sudo -u kudu kudu perf loadgen master-1 -num_rows_per_thread=130000000 -num_threads=2 -show_first_n_errors=10 -string_len=20 -table_name=impala::default.test_kudu_bloom_filter_bigtable_kudu -use_random=true
select count(*), min(key1), max(key1), min(some_int1), max(some_int1) from default.test_kudu_bloom_filter_bigtable_kudu;
count(*) min(key1) max(key1) min(some_int1) max(some_int1)
260301048 23622505005 9223372013232270802 17 2147483630
Note: I rebalanced the cluster to be sure a balanced layout after load
sudo -u kudu kudu cluster rebalance ve0219.halxg.cloudera.com -report_only
sudo -u kudu kudu cluster rebalance ve0219.halxg.cloudera.com
2) copy big table to parquet
=================================================================================================================
CREATE TABLE default.test_kudu_bloom_filter_bigtable_parquet (
key1 BIGINT,
key2 STRING,
some_string1 STRING,
some_string2 STRING,
some_string3 STRING,
some_int1 INT,
some_int2 INT,
some_int3 INT,
some_timestamp1 TIMESTAMP,
some_timestamp2 TIMESTAMP,
some_timestamp3 TIMESTAMP
)
STORED AS PARQUET;
INSERT OVERWRITE default.test_kudu_bloom_filter_bigtable_parquet SELECT * FROM default.test_kudu_bloom_filter_bigtable_kudu;
select count(*), min(key1), max(key1), min(some_int1), max(some_int1) from default.test_kudu_bloom_filter_bigtable_parquet;
3) generate small table with top 1000 and bottom 1000 keys off big one
=================================================================================================================
CREATE TABLE default.test_kudu_bloom_filter_smaltable_pickedbykey (
key1 BIGINT,
key2 STRING,
some_string1 STRING,
some_string2 STRING,
some_string3 STRING,
some_int1 INT,
some_int2 INT,
some_int3 INT,
some_timestamp1 TIMESTAMP,
some_timestamp2 TIMESTAMP,
some_timestamp3 TIMESTAMP
)
STORED AS PARQUET;
INSERT INTO default.test_kudu_bloom_filter_smaltable_pickedbykey
SELECT bigtable.* FROM default.test_kudu_bloom_filter_bigtable_kudu bigtable
INNER JOIN (
SELECT key1, key2 from default.test_kudu_bloom_filter_bigtable_kudu ORDER BY key1, key2 limit 1000
) topkeys
ON bigtable.key1 = topkeys.key1 AND bigtable.key2 = topkeys.key2;
NOTE: I noticed a bunch of Kudu table 'impala::default.test_kudu_bloom_filter_bigtable_kudu' column 'some_timestamp1' contains an out of range timestamp. The valid date range is 1400-01-01..9999-12-31. (1 of 12977300 similar)
INSERT INTO default.test_kudu_bloom_filter_smaltable_pickedbykey
SELECT bigtable.* FROM default.test_kudu_bloom_filter_bigtable_kudu bigtable
INNER JOIN (
SELECT key1, key2 from default.test_kudu_bloom_filter_bigtable_kudu ORDER BY key1 DESC, key2 DESC limit 1000
) bottomkeys
ON bigtable.key1 = bottomkeys.key1 AND bigtable.key2 = bottomkeys.key2;
select count(*), min(key1), max(key1), min(some_int1), max(some_int1) from default.test_kudu_bloom_filter_smaltable_pickedbykey;
4) generate small table with top 1000 and bottom 1000 of non-key field off big one
=================================================================================================================
CREATE TABLE default.test_kudu_bloom_filter_smaltable_pickedbynonkey (
key1 BIGINT,
key2 STRING,
some_string1 STRING,
some_string2 STRING,
some_string3 STRING,
some_int1 INT,
some_int2 INT,
some_int3 INT,
some_timestamp1 TIMESTAMP,
some_timestamp2 TIMESTAMP,
some_timestamp3 TIMESTAMP
)
STORED AS PARQUET;
INSERT INTO default.test_kudu_bloom_filter_smaltable_pickedbynonkey
SELECT bigtable.* FROM default.test_kudu_bloom_filter_bigtable_kudu bigtable
INNER JOIN (
SELECT key1, key2 from default.test_kudu_bloom_filter_bigtable_kudu ORDER BY some_int1, some_int2 limit 1000
) topkeys
ON bigtable.key1 = topkeys.key1 AND bigtable.key2 = topkeys.key2;
INSERT INTO default.test_kudu_bloom_filter_smaltable_pickedbynonkey
SELECT bigtable.* FROM default.test_kudu_bloom_filter_bigtable_kudu bigtable
INNER JOIN (
SELECT key1, key2 from default.test_kudu_bloom_filter_bigtable_kudu ORDER BY some_int1 DESC, some_int2 DESC limit 1000
) bottomkeys
ON bigtable.key1 = bottomkeys.key1 AND bigtable.key2 = bottomkeys.key2;
select count(*), min(key1), max(key1), min(some_int1), max(some_int1) from default.test_kudu_bloom_filter_smaltable_pickedbynonkey;
count(*) min(key1) max(key1) min(some_int1) max(some_int1)
5) compute stats for all tables
=================================================================================================================
COMPUTE STATS default.test_kudu_bloom_filter_bigtable_kudu;
COMPUTE STATS default.test_kudu_bloom_filter_bigtable_parquet;
COMPUTE STATS default.test_kudu_bloom_filter_smaltable_pickedbykey;
COMPUTE STATS default.test_kudu_bloom_filter_smaltable_pickedbynonkey;
6) select big kudu based on half of small (filter by some int field mod 2), joining on key
=================================================================================================================
SET ENABLED_RUNTIME_FILTER_TYPES = MIN_MAX;
SELECT count(*), sum(bigtable.some_int1), avg(bigtable.some_int2), max(bigtable.some_string1)
FROM default.test_kudu_bloom_filter_bigtable_kudu bigtable
INNER JOIN default.test_kudu_bloom_filter_smaltable_pickedbykey smalltable
ON bigtable.key1 = smalltable.key1 AND bigtable.key2 = smalltable.key2
WHERE smalltable.some_int1%2=true;
Operator #Hosts #Inst Avg Time Max Time #Rows Est. #Rows Peak Mem Est. Peak Mem Detail
00:SCAN KUDU 5 5 1s962ms 2s519ms 259.65M 260.00M 14.08 MB 9.38 MB default.test_kudu_bloom_filter_bigtable_kudu bigtable
SET ENABLED_RUNTIME_FILTER_TYPES = ALL;
SELECT count(*), sum(bigtable.some_int1), avg(bigtable.some_int2), max(bigtable.some_string1)
FROM default.test_kudu_bloom_filter_bigtable_kudu bigtable
INNER JOIN default.test_kudu_bloom_filter_smaltable_pickedbykey smalltable
ON bigtable.key1 = smalltable.key1 AND bigtable.key2 = smalltable.key2
WHERE smalltable.some_int1%2=true;
Operator #Hosts #Inst Avg Time Max Time #Rows Est. #Rows Peak Mem Est. Peak Mem Detail
00:SCAN KUDU 5 5 424.072ms 438.111ms 1.03K 260.00M 310.00 KB 9.38 MB default.test_kudu_bloom_filter_bigtable_kudu bigtable
TIMEFORMAT=%R
for i in {1..10}; do
{ time impala-shell -i ve0220.halxg.cloudera.com --quiet -q "
SET ENABLED_RUNTIME_FILTER_TYPES = MIN_MAX;
SELECT count(*), sum(bigtable.some_int1), avg(bigtable.some_int2), max(bigtable.some_string1)
FROM default.test_kudu_bloom_filter_bigtable_kudu bigtable
INNER JOIN default.test_kudu_bloom_filter_smaltable_pickedbykey smalltable
ON bigtable.key1 = smalltable.key1 AND bigtable.key2 = smalltable.key2
WHERE smalltable.some_int1%2=true;
" >/dev/null; } 2>&1 | tail -n1
done
6.299
6.197
6.200
6.223
6.209
6.079
5.884
6.199
5.984
6.095
TIMEFORMAT=%R
for i in {1..10}; do
{ time impala-shell -i ve0220.halxg.cloudera.com --quiet -q "
SET ENABLED_RUNTIME_FILTER_TYPES = ALL;
SELECT count(*), sum(bigtable.some_int1), avg(bigtable.some_int2), max(bigtable.some_string1)
FROM default.test_kudu_bloom_filter_bigtable_kudu bigtable
INNER JOIN default.test_kudu_bloom_filter_smaltable_pickedbykey smalltable
ON bigtable.key1 = smalltable.key1 AND bigtable.key2 = smalltable.key2
WHERE smalltable.some_int1%2=true;
" >/dev/null; } 2>&1 | tail -n1
done
1.118
1.070
1.079
1.176
1.773
1.072
1.767
1.170
1.065
1.069
7) select big kudu based on half of small (filter by some int field mod 2), joining on non-key
=================================================================================================================
SET ENABLED_RUNTIME_FILTER_TYPES = MIN_MAX;
SELECT count(*), sum(bigtable.some_int1), avg(bigtable.some_int2), max(bigtable.some_string1)
FROM default.test_kudu_bloom_filter_bigtable_kudu bigtable
INNER JOIN default.test_kudu_bloom_filter_smaltable_pickedbynonkey smalltable
ON bigtable.some_int1 = smalltable.some_int1 AND bigtable.some_int2 = smalltable.some_int2
WHERE smalltable.some_int1%2=true;
Operator #Hosts #Inst Avg Time Max Time #Rows Est. #Rows Peak Mem Est. Peak Mem Detail
00:SCAN KUDU 5 5 1s456ms 1s676ms 127.70M 26.00M 3.50 MB 5.62 MB default.test_kudu_bloom_filter_bigtable_kudu bigtable
SET ENABLED_RUNTIME_FILTER_TYPES = ALL;
SELECT count(*), sum(bigtable.some_int1), avg(bigtable.some_int2), max(bigtable.some_string1)
FROM default.test_kudu_bloom_filter_bigtable_kudu bigtable
INNER JOIN default.test_kudu_bloom_filter_smaltable_pickedbynonkey smalltable
ON bigtable.some_int1 = smalltable.some_int1 AND bigtable.some_int2 = smalltable.some_int2
WHERE smalltable.some_int1%2=true;
Operator #Hosts #Inst Avg Time Max Time #Rows Est. #Rows Peak Mem Est. Peak Mem Detail
00:SCAN KUDU 5 5 413.417ms 423.044ms 973 26.00M 182.00 KB 5.62 MB default.test_kudu_bloom_filter_bigtable_kudu bigtable
TIMEFORMAT=%R
for i in {1..10}; do
{ time impala-shell -i ve0220.halxg.cloudera.com --quiet -q "
SET ENABLED_RUNTIME_FILTER_TYPES = MIN_MAX;
SELECT count(*), sum(bigtable.some_int1), avg(bigtable.some_int2), max(bigtable.some_string1)
FROM default.test_kudu_bloom_filter_bigtable_kudu bigtable
INNER JOIN default.test_kudu_bloom_filter_smaltable_pickedbynonkey smalltable
ON bigtable.some_int1 = smalltable.some_int1 AND bigtable.some_int2 = smalltable.some_int2
WHERE smalltable.some_int1%2=true;
" >/dev/null; } 2>&1 | tail -n1
done
3.411
3.281
3.278
3.184
3.266
3.280
3.348
3.385
3.295
3.275
TIMEFORMAT=%R
for i in {1..10}; do
{ time impala-shell -i ve0220.halxg.cloudera.com --quiet -q "
SET ENABLED_RUNTIME_FILTER_TYPES = ALL;
SELECT count(*), sum(bigtable.some_int1), avg(bigtable.some_int2), max(bigtable.some_string1)
FROM default.test_kudu_bloom_filter_bigtable_kudu bigtable
INNER JOIN default.test_kudu_bloom_filter_smaltable_pickedbynonkey smalltable
ON bigtable.some_int1 = smalltable.some_int1 AND bigtable.some_int2 = smalltable.some_int2
WHERE smalltable.some_int1%2=true;
" >/dev/null; } 2>&1 | tail -n1
done
1.081
0.988
1.096
1.091
1.073
0.988
0.987
0.988
0.964
0.967
8) select big parquet based on half of small (filter by some int field mod 2), joining on key
=================================================================================================================
SELECT count(*), sum(bigtable.some_int1), avg(bigtable.some_int2), max(bigtable.some_string1)
FROM default.test_kudu_bloom_filter_bigtable_parquet bigtable
INNER JOIN default.test_kudu_bloom_filter_smaltable_pickedbykey smalltable
ON bigtable.key1 = smalltable.key1 AND bigtable.key2 = smalltable.key2
WHERE smalltable.some_int1%2=true;
TIMEFORMAT=%R
for i in {1..10}; do
{ time impala-shell -i ve0220.halxg.cloudera.com --quiet -q "
SET ENABLED_RUNTIME_FILTER_TYPES = ALL;
SELECT count(*), sum(bigtable.some_int1), avg(bigtable.some_int2), max(bigtable.some_string1)
FROM default.test_kudu_bloom_filter_bigtable_parquet bigtable
INNER JOIN default.test_kudu_bloom_filter_smaltable_pickedbykey smalltable
ON bigtable.key1 = smalltable.key1 AND bigtable.key2 = smalltable.key2
WHERE smalltable.some_int1%2=true;
" >/dev/null; } 2>&1 | tail -n1
done
1.777
1.674
1.580
1.685
1.691
1.684
1.568
1.592
1.565
1.663
9) select big parquet based on half of small (filter by some int field mod 2), joining on non-key
=================================================================================================================
SELECT count(*), sum(bigtable.some_int1), avg(bigtable.some_int2), max(bigtable.some_string1)
FROM default.test_kudu_bloom_filter_bigtable_parquet bigtable
INNER JOIN default.test_kudu_bloom_filter_smaltable_pickedbynonkey smalltable
ON bigtable.some_int1 = smalltable.some_int1 AND bigtable.some_int2 = smalltable.some_int2
WHERE smalltable.some_int1%2=true;
TIMEFORMAT=%R
for i in {1..10}; do
{ time impala-shell -i ve0220.halxg.cloudera.com --quiet -q "
SET ENABLED_RUNTIME_FILTER_TYPES = ALL;
SELECT count(*), sum(bigtable.some_int1), avg(bigtable.some_int2), max(bigtable.some_string1)
FROM default.test_kudu_bloom_filter_bigtable_parquet bigtable
INNER JOIN default.test_kudu_bloom_filter_smaltable_pickedbynonkey smalltable
ON bigtable.some_int1 = smalltable.some_int1 AND bigtable.some_int2 = smalltable.some_int2
WHERE smalltable.some_int1%2=true;
" >/dev/null; } 2>&1 | tail -n1
done
1.171
1.273
1.285
1.175
1.177
1.179
1.181
1.189
1.172
1.177
10) update big based on all small - kudu
=================================================================================================================
UPSERT INTO default.test_kudu_bloom_filter_bigtable_kudu
SELECT
smalltable.key1,
smalltable.key2,
smalltable.some_string1,
smalltable.some_string2,
smalltable.some_string3,
CAST(smalltable.some_int1 + IFNULL(bigtable.some_int1, 0) AS INT),
CAST(smalltable.some_int2 + IFNULL(bigtable.some_int2, 0) AS INT),
CAST(smalltable.some_int3 + IFNULL(bigtable.some_int3, 0) AS INT),
smalltable.some_timestamp1,
smalltable.some_timestamp2,
smalltable.some_timestamp3
FROM default.test_kudu_bloom_filter_smaltable_pickedbykey smalltable
LEFT JOIN default.test_kudu_bloom_filter_bigtable_kudu bigtable ON bigtable.key1 = smalltable.key1 AND bigtable.key2 = smalltable.key2
WHERE smalltable.some_int1%2=true;
TIMEFORMAT=%R
for i in {1..10}; do
{ time impala-shell -i ve0220.halxg.cloudera.com --quiet -q "
SET ENABLED_RUNTIME_FILTER_TYPES = MIN_MAX;
UPSERT INTO default.test_kudu_bloom_filter_bigtable_kudu
SELECT
smalltable.key1,
smalltable.key2,
smalltable.some_string1,
smalltable.some_string2,
smalltable.some_string3,
CAST(smalltable.some_int1 + IFNULL(bigtable.some_int1, 0) AS INT),
CAST(smalltable.some_int2 + IFNULL(bigtable.some_int2, 0) AS INT),
CAST(smalltable.some_int3 + IFNULL(bigtable.some_int3, 0) AS INT),
smalltable.some_timestamp1,
smalltable.some_timestamp2,
smalltable.some_timestamp3
FROM default.test_kudu_bloom_filter_smaltable_pickedbykey smalltable
LEFT JOIN default.test_kudu_bloom_filter_bigtable_kudu bigtable ON bigtable.key1 = smalltable.key1 AND bigtable.key2 = smalltable.key2
WHERE smalltable.some_int1%2=true;
" >/dev/null; } 2>&1 | tail -n1
done
19.865
19.348
19.863
18.831
19.349
19.370
19.365
18.884
18.869
18.852
TIMEFORMAT=%R
for i in {1..10}; do
{ time impala-shell -i ve0220.halxg.cloudera.com --quiet -q "
SET ENABLED_RUNTIME_FILTER_TYPES = ALL;
UPSERT INTO default.test_kudu_bloom_filter_bigtable_kudu
SELECT
smalltable.key1,
smalltable.key2,
smalltable.some_string1,
smalltable.some_string2,
smalltable.some_string3,
CAST(smalltable.some_int1 + IFNULL(bigtable.some_int1, 0) AS INT),
CAST(smalltable.some_int2 + IFNULL(bigtable.some_int2, 0) AS INT),
CAST(smalltable.some_int3 + IFNULL(bigtable.some_int3, 0) AS INT),
smalltable.some_timestamp1,
smalltable.some_timestamp2,
smalltable.some_timestamp3
FROM default.test_kudu_bloom_filter_smaltable_pickedbykey smalltable
LEFT JOIN default.test_kudu_bloom_filter_bigtable_kudu bigtable ON bigtable.key1 = smalltable.key1 AND bigtable.key2 = smalltable.key2
WHERE smalltable.some_int1%2=true;
" >/dev/null; } 2>&1 | tail -n1
done
1.361
1.196
1.190
1.300
1.179
1.186
1.180
1.203
1.199
1.190
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment