Benchmark different codecs in a ClickHouse table
// I am trying to benchmark different ways to store stack traces in a ClickHouse
// table with different codecs, and to measure the resulting compression.
// The way it works is:
// 1. Drop/create the table with the selected codecs.
// 2. Insert Array(UInt32) (or Array(UUID)) rows that mimic a valid stack trace,
//    similar to pprof's list of LocationIDs. minCallStackDepth and
//    maxCallStackDepth define the height range of the call stack, and
//    maxLocationCount defines the LocationID cardinality. The default of 10_000
//    might be OK-ish, since a mid-sized application will probably not have more
//    than 10_000 unique functions/LocationIDs. This is an assumption, though.
// 3. Optimize the table and run the stats query to see the column stats. The most
//    important number is the ratio; for example, with CODEC(T64, ZSTD) on
//    Array(UInt32) I get ~2.3x compression on my local machine.
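//
// Other codec combinations are easy to compare: only the CODEC(...) clause in the
// DDL below needs to change. For example (all standard ClickHouse codecs):
//   CODEC(ZSTD)        -- general-purpose compression only
//   CODEC(Delta, ZSTD) -- delta-encode the UInt32 values before compressing
//   CODEC(T64, LZ4)    -- same transpose trick, faster but weaker compressor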
// This is how I ran ClickHouse on my M1:
// docker run \
//   --platform linux/amd64 \
//   -ti \
//   --ulimit nofile=262144:262144 \
//   --volume=$HOME/clickhouse-db:/var/lib/clickhouse \
//   -p 8123:8123 \
//   -p 9000:9000 \
//   clickhouse/clickhouse-server
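//
// Once the container is up, the connection can be sanity-checked with something
// like (container name/id is whatever docker assigned):
// docker exec -it <container> clickhouse-client --query "SELECT version()"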
package main
import (
	"context"
	"fmt"
	"log"
	"math/rand"
	"time"

	"github.com/ClickHouse/clickhouse-go/v2"
)
const tableName = `t31`
const minCallStackDepth = 1
const maxCallStackDepth = 100
const maxLocationCount = 10_000
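// Back-of-the-envelope with these defaults: stack depth averages ~50 entries per
// row, at 4 bytes per UInt32 that is ~200 bytes of raw data per stack, so the
// 100_000 rows inserted in main hold roughly 20 MB uncompressed (ignoring the
// array offsets column).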
var ddl = fmt.Sprintf("CREATE TABLE %s (A Array(UInt32) CODEC(T64, ZSTD)) Engine=MergeTree ORDER BY A;", tableName)
// Row type for reading the A column back out; note that main below declares its
// own result slice for the stats query, so this package-level one is unused.
var result []struct {
	A []uint32 `ch:"A"`
}
var statsSql = fmt.Sprintf(`SELECT
    database,
    table,
    column,
    type,
    sum(rows) AS rows,
    sum(column_data_compressed_bytes) AS compressed_bytes,
    formatReadableSize(compressed_bytes) AS compressed,
    formatReadableSize(sum(column_data_uncompressed_bytes)) AS uncompressed,
    sum(column_data_uncompressed_bytes) / compressed_bytes AS ratio,
    any(compression_codec) AS codec
FROM system.parts_columns AS pc
LEFT JOIN system.columns AS c
    ON (pc.database = c.database) AND (c.table = pc.table) AND (c.name = pc.column)
WHERE (database LIKE '%%') AND (table LIKE '%s%%') AND active
GROUP BY
    database,
    table,
    column,
    type
ORDER BY ratio DESC`, tableName)
func randInt(min int, max int) int {
	return min + rand.Intn(max-min)
}
// getRandomSlice builds a pseudo call stack of LocationIDs. Seeding with the row
// index i makes the generated data deterministic across runs.
func getRandomSlice(i int64) []uint32 {
	// todo: use generics?
	rand.Seed(i)
	slice := make([]uint32, randInt(minCallStackDepth, maxCallStackDepth))
	for j := 0; j < len(slice); j++ {
		slice[j] = uint32(rand.Intn(maxLocationCount))
	}
	return slice
}
// var result []struct {
//     A []uuid.UUID `ch:"A"`
// }
// func getRandomUUIDSlice() []uuid.UUID {
//     rand.Seed(1)
//     slice := make([]uuid.UUID, randInt(minCallStackDepth, maxCallStackDepth))
//     for i := 0; i < len(slice); i++ {
//         slice[i] = uuid.New()
//     }
//     return slice
// }
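// (This Array(UUID) variant would additionally need github.com/google/uuid in
// the import list; uuid.New comes from that package.)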
func main() {
	var (
		ctx       = context.Background()
		conn, err = clickhouse.Open(&clickhouse.Options{
			Addr: []string{"127.0.0.1:9000"},
			Auth: clickhouse.Auth{
				Database: "default",
				Username: "default",
				Password: "",
			},
			// Debug: true,
			DialTimeout:     time.Second,
			MaxOpenConns:    10,
			MaxIdleConns:    5,
			ConnMaxLifetime: time.Hour,
		})
	)
	if err != nil {
		log.Fatal(err)
	}
	if err := conn.Exec(ctx, fmt.Sprintf("DROP TABLE IF EXISTS %s SYNC", tableName)); err != nil {
		log.Fatal(err)
	}
	if err := conn.Exec(ctx, ddl); err != nil {
		log.Fatal(err)
	}
	batch, err := conn.PrepareBatch(ctx, fmt.Sprintf("INSERT INTO %s", tableName))
	if err != nil {
		log.Fatal(err)
	}
	// Append 100_000 synthetic stack traces to the batch, then send in one go.
	for i := 0; i < 100_000; i++ {
		err := batch.Append(getRandomSlice(int64(i)))
		if err != nil {
			log.Fatal(err)
		}
	}
	err = batch.Send()
	if err != nil {
		log.Fatal(err)
	}
	// Optimize the table (merge parts) so the stats below reflect the final
	// on-disk compression.
	if err = conn.Exec(ctx, fmt.Sprintf("OPTIMIZE TABLE %s FINAL", tableName)); err != nil {
		log.Fatal(err)
	}
	// Retrieve the per-column compression stats.
	var result []struct {
		Database        string  `ch:"database"`
		Table           string  `ch:"table"`
		Column          string  `ch:"column"`
		Type            string  `ch:"type"`
		Rows            uint64  `ch:"rows"`
		CompressedBytes uint64  `ch:"compressed_bytes"`
		Compressed      string  `ch:"compressed"`
		Uncompressed    string  `ch:"uncompressed"`
		Ratio           float64 `ch:"ratio"`
		Codec           string  `ch:"codec"`
	}
	if err = conn.Select(ctx, &result, statsSql); err != nil {
		log.Fatal(err)
	}
	log.Println(result)
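	// A readable line per column instead of the raw struct dump; a small sketch
	// using the field names defined above:
	for _, r := range result {
		log.Printf("%s.%s (%s): rows=%d %s -> %s ratio=%.2f codec=%s",
			r.Table, r.Column, r.Type, r.Rows, r.Uncompressed, r.Compressed, r.Ratio, r.Codec)
	}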
}