Skip to content

Instantly share code, notes, and snippets.

@sundy-li
Last active November 4, 2020 13:19
Show Gist options
  • Save sundy-li/fff3b1d81beeb81de0a98dfd0beab71d to your computer and use it in GitHub Desktop.
Save sundy-li/fff3b1d81beeb81de0a98dfd0beab71d to your computer and use it in GitHub Desktop.
gen_tool.go
{
"instance_id" : 1,
"data_dir" : "/data1/clickhouse",
"log_dir": "/data/logs/clickhouse-server",
"tcp_port": 9000,
"http_port": 8123,
"interserver_http_port": 9009,
"max_concurrent_queries": 500,
"user" : "default",
"password": "test_password",
"logs": [
[
{ "ip": "xxxx235" , "shard" : "shard01"},
{ "ip": "xxxx236" , "shard" : "shard01"},
{ "ip": "xxxx66" , "shard" : "shard01"}
],
[
{ "ip": "xxxx237" , "shard" : "shard02"},
{ "ip": "xxxx238" , "shard" : "shard02"},
{ "ip": "xxxx65" , "shard" : "shard02"}
]
],
"zookeepers": [
{
"ip": "xxxx",
"port": 2181
},
{
"ip": "xxxx",
"port": 2181
},
{
"ip": "xxxx",
"port": 2181
},
{
"ip": "xxxx",
"port": 2181
}
]
}
<?xml version="1.0"?>
<!--
NOTE: User and query level settings are set up in "users.xml" file.
-->
<yandex>
<logger>
<!-- Possible levels: https://github.com/pocoproject/poco/blob/develop/Foundation/include/Poco/Logger.h#L105 -->
<level>debug</level>
<log>{{.In.data_dir}}/log/clickhouse-server.log</log>
<errorlog>{{.In.data_dir}}/log/clickhouse-server.err.log</errorlog>
<size>1000M</size>
<count>10</count>
<!-- <console>1</console> --> <!-- Default behavior is autodetection (log to console if not daemon mode and is tty) -->
</logger>
<!--display_name>production</display_name--> <!-- It is the name that will be shown in the client -->
<http_port>{{.In.http_port}}</http_port>
<tcp_port>{{.In.tcp_port}}</tcp_port>
<!-- Default root page on http[s] server. For example load UI from https://tabix.io/ when opening http://localhost:8123 -->
<http_server_default_response><![CDATA[<html ng-app="SMI2"><head><base href="http://ui.tabix.io/"></head><body><div ui-view="" class="content-ui"></div><script src="http://loader.tabix.io/master.js"></script></body></html>]]></http_server_default_response>
<!-- Port for communication between replicas. Used for data exchange. -->
<interserver_http_port>{{.In.interserver_http_port}}</interserver_http_port>
<interserver_http_host>{{.D.ip_addr}}</interserver_http_host>
<!-- Listen specified host. use :: (wildcard IPv6 address), if you want to accept connections both with IPv4 and IPv6 from everywhere. -->
<listen_host>0.0.0.0</listen_host>
<max_connections>4096</max_connections>
<keep_alive_timeout>3</keep_alive_timeout>
<!-- Maximum number of concurrent queries. -->
<max_concurrent_queries>{{.In.max_concurrent_queries}}</max_concurrent_queries>
<!-- Set limit on number of open files (default: maximum). This setting makes sense on Mac OS X because getrlimit() fails to retrieve
correct maximum value. -->
<max_open_files>262144</max_open_files>
<!-- Size of cache of uncompressed blocks of data, used in tables of MergeTree family.
In bytes. Cache is single for server. Memory is allocated only on demand.
Cache is used when 'use_uncompressed_cache' user setting turned on (off by default).
Uncompressed cache is advantageous only for very short queries and in rare cases.
-->
<uncompressed_cache_size>8589934592</uncompressed_cache_size>
<!-- Approximate size of mark cache, used in tables of MergeTree family.
In bytes. Cache is single for server. Memory is allocated only on demand.
You should not lower this value.
-->
<mark_cache_size>5368709120</mark_cache_size>
<max_table_size_to_drop>0</max_table_size_to_drop>
<max_partition_size_to_drop>0</max_partition_size_to_drop>
<!-- Path to data directory, with trailing slash. -->
<path>{{.In.data_dir}}/data/</path>
<!-- Path to temporary data for processing hard queries, with trailing slash. -->
<tmp_path>{{.In.data_dir}}/data/tmp/</tmp_path>
<!-- Directory with user provided files that are accessible by 'file' table function, with trailing slash. -->
<user_files_path>{{.In.data_dir}}/data/user_files/</user_files_path>
<!-- Path to configuration file with users, access rights, profiles of settings, quotas. -->
<users_config>users.xml</users_config>
<!-- Default profile of settings. -->
<default_profile>default</default_profile>
<!-- System profile of settings. These settings are used by internal processes (Buffer storage, Distributed DDL worker and so on). -->
<!-- <system_profile>default</system_profile> -->
<!-- Default database. -->
<default_database>default</default_database>
<!-- Server time zone could be set here.
Time zone is used when converting between String and DateTime types,
when printing DateTime in text formats and parsing DateTime from text,
it is used in date and time related functions, if specific time zone was not passed as an argument.
Time zone is specified as identifier from IANA time zone database, like UTC or Africa/Abidjan.
If not specified, system time zone at server startup is used.
Please note, that server could display time zone alias instead of specified name.
Example: W-SU is an alias for Europe/Moscow and Zulu is an alias for UTC.
-->
<timezone>Asia/Shanghai</timezone>
<!-- Perform mlockall after startup to lower first queries latency
and to prevent clickhouse executable from being paged out under high IO load.
Enabling this option is recommended but will lead to increased startup time for up to a few seconds.
-->
<mlock_executable>true</mlock_executable>
<!-- Configuration of clusters that could be used in Distributed tables.
https://clickhouse.yandex/docs/en/table_engines/distributed/
-->
<remote_servers incl="clickhouse_remote_servers" >
</remote_servers>
<!-- If an element has an 'incl' attribute, the corresponding substitution from another file will be used as its value.
By default, path to file with substitutions is /etc/metrika.xml. It could be changed in config in 'include_from' element.
Values for substitutions are specified in /yandex/name_of_substitution elements in that file.
-->
<!-- ZooKeeper is used to store metadata about replicas, when using Replicated tables.
Optional. If you don't use replicated tables, you could omit that.
See https://clickhouse.yandex/docs/en/table_engines/replication/
-->
<zookeeper incl="zookeeper-servers" optional="true" />
<!-- Substitutions for parameters of replicated tables.
Optional. If you don't use replicated tables, you could omit that.
See https://clickhouse.yandex/docs/en/table_engines/replication/#creating-replicated-tables
-->
<macros incl="macros" optional="true" />
<!-- Reloading interval for embedded dictionaries, in seconds. Default: 3600. -->
<builtin_dictionaries_reload_interval>3600</builtin_dictionaries_reload_interval>
<!-- Maximum session timeout, in seconds. Default: 3600. -->
<max_session_timeout>3600</max_session_timeout>
<!-- Default session timeout, in seconds. Default: 60. -->
<default_session_timeout>60</default_session_timeout>
<!-- Query log. Used only for queries with setting log_queries = 1. -->
<query_log>
<!-- Table to insert data into. If the table does not exist, it will be created.
When query log structure is changed after system update,
then old table will be renamed and new table will be created automatically.
-->
<database>system</database>
<table>query_log</table>
<!--
PARTITION BY expr https://clickhouse.yandex/docs/en/table_engines/custom_partitioning_key/
Example:
event_date
toMonday(event_date)
toYYYYMM(event_date)
toStartOfHour(event_time)
-->
<partition_by>toYYYYMM(event_date)</partition_by>
<!-- Interval of flushing data. -->
<flush_interval_milliseconds>7500</flush_interval_milliseconds>
</query_log>
<!-- Trace log. Stores stack traces collected by query profilers.
See query_profiler_real_time_period_ns and query_profiler_cpu_time_period_ns settings. -->
<trace_log>
<database>system</database>
<table>trace_log</table>
<partition_by>toYYYYMM(event_date)</partition_by>
<flush_interval_milliseconds>7500</flush_interval_milliseconds>
</trace_log>
<!-- Query thread log. Has information about all threads participated in query execution.
Used only for queries with setting log_query_threads = 1. -->
<query_thread_log>
<database>system</database>
<table>query_thread_log</table>
<partition_by>toYYYYMM(event_date)</partition_by>
<flush_interval_milliseconds>7500</flush_interval_milliseconds>
</query_thread_log>
<!-- Configuration of external dictionaries. See:
https://clickhouse.yandex/docs/en/dicts/external_dicts/
-->
<dictionaries_config>*_dictionary.xml</dictionaries_config>
<!-- Uncomment if you want data to be compressed 30-100% better.
Don't do that if you just started using ClickHouse.
-->
<compression incl="clickhouse_compression">
<!--
<!- - Set of variants. Checked in order. Last matching case wins. If nothing matches, lz4 will be used. - ->
<case>
<!- - Conditions. All must be satisfied. Some conditions may be omitted. - ->
<min_part_size>10000000000</min_part_size> <!- - Min part size in bytes. - ->
<min_part_size_ratio>0.01</min_part_size_ratio> <!- - Min size of part relative to whole table size. - ->
<!- - What compression method to use. - ->
<method>zstd</method>
</case>
-->
</compression>
<!-- Allow to execute distributed DDL queries (CREATE, DROP, ALTER, RENAME) on cluster.
Works only if ZooKeeper is enabled. Comment it if such functionality isn't required. -->
<distributed_ddl>
<!-- Path in ZooKeeper to queue with DDL queries -->
<path>/clickhouse/task_queue/ddl</path>
<!-- Settings from this profile will be used to execute DDL queries -->
<!-- <profile>default</profile> -->
</distributed_ddl>
<!-- Directory in <clickhouse-path> containing schema files for various input formats.
The directory will be created if it doesn't exist.
-->
<format_schema_path>{{.In.data_dir}}/format_schemas</format_schema_path>
<include_from>{{.In.data_dir}}/conf/metrika.xml</include_from>
</yandex>
// Automatically generates configuration files from templates.
/**
usage:
./gen_tool -data=config.json -d ip_addr=${local_ip} config.xml.tmpl
**/
package main
import (
"bytes"
"encoding/json"
"flag"
"fmt"
"go/format"
"io/ioutil"
"os"
"os/exec"
"path/filepath"
"strings"
"text/template"
)
// Ext is the extension a template path must carry when no explicit output
// path is given; parsePath strips it to derive the output file name.
const Ext = ".tmpl"
// pathSpec pairs a template file with the file its rendered output is
// written to.
type pathSpec struct {
	in  string // path of the template that is read
	out string // path of the file that is generated
}

// String renders the spec as "<in> → <out>".
func (spec *pathSpec) String() string {
	const arrow = " → "
	return spec.in + arrow + spec.out
}

// IsGoFile reports whether the output path carries the ".go" extension.
func (spec *pathSpec) IsGoFile() bool {
	ext := filepath.Ext(spec.out)
	return ext == ".go"
}
// parsePath splits a command-line template argument into its input and
// output paths.
//
// An argument of the form "IN=OUT" is split at the first '='. Otherwise the
// argument must end in Ext (the program exits with an error if it does not)
// and the output path is the argument with that extension removed.
func parsePath(path string) (string, string) {
	if eq := strings.IndexByte(path, '='); eq != -1 {
		return path[:eq], path[eq+1:]
	}
	if ext := filepath.Ext(path); ext != Ext {
		errExit("template file '%s' must have .tmpl extension", path)
	}
	return path, strings.TrimSuffix(path, Ext)
}
// data is the value handed to every template when it is executed.
type data struct {
// In is the decoded JSON document read from the file given with -data.
In interface{}
// D collects the NAME=VALUE pairs given with -d.
D listValue
// Env is populated in main from os.Environ.
Env listValue
}
// errExit formats a message, writes it to standard error followed by a
// newline, and terminates the program with exit status 1.
func errExit(format string, a ...interface{}) {
	msg := fmt.Sprintf(format, a...)
	fmt.Fprintln(os.Stderr, msg)
	os.Exit(1)
}
// listValue is a set of NAME=VALUE pairs. Its String and Set methods let it
// be registered with flag.Var, so it can be filled from the command line.
type listValue map[string]string

// String returns the pairs as "k=v" items joined by ", ". The order follows
// map iteration and is therefore unspecified.
func (l listValue) String() string {
	res := make([]string, 0, len(l))
	for k, v := range l {
		res = append(res, fmt.Sprintf("%s=%s", k, v))
	}
	return strings.Join(res, ", ")
}

// Set parses a comma-separated list of NAME=VALUE items and stores each pair.
// It stops at, and returns, the first malformed item.
func (l listValue) Set(v string) error {
	for _, item := range strings.Split(v, ",") {
		if err := l.setKV(item); err != nil {
			return err
		}
	}
	return nil
}

// setKV stores a single NAME=VALUE item. An empty item is ignored; an item
// without '=' is an error.
func (l listValue) setKV(v string) error {
	if v == "" {
		return nil
	}
	// Split at the first '=' only, so that the value itself may contain '='
	// (for example "dsn=host=db1").
	nv := strings.SplitN(v, "=", 2)
	if len(nv) != 2 {
		return fmt.Errorf("expected NAME=VALUE, got %s", v)
	}
	l[nv[0]] = nv[1]
	return nil
}
// main parses the command line, loads the JSON data file and renders every
// template named as a positional argument.
//
// -data names the JSON data file (required); it is exposed to templates as .In.
// -d adds NAME=VALUE pairs, exposed as .D.
// -i formats generated .go files with goimports instead of go/format.
// The process environment is exposed to templates as .Env.
func main() {
	var (
		dataArg = flag.String("data", "", "input JSON data")
		gi      = flag.Bool("i", false, "run goimports")
		in      = &data{D: make(listValue), Env: make(listValue)}
	)
	// could extract from file : cat ~/.bashrc | awk 'ORS=" "{print $0}'
	flag.Var(&in.D, "d", "-d NAME=VALUE")
	// os.Environ returns "NAME=VALUE" entries, so each one is split at the
	// first '=' rather than being used whole as a lookup key.
	for _, kv := range os.Environ() {
		if eq := strings.IndexByte(kv, '='); eq > 0 {
			in.Env[kv[:eq]] = kv[eq+1:]
		}
	}
	flag.Parse()
	if *dataArg == "" {
		errExit("data option is required")
	}
	if *gi {
		if _, err := exec.LookPath("goimports"); err != nil {
			errExit("failed to find goimports: %s", err.Error())
		}
		formatter = formatSource
	} else {
		formatter = format.Source
	}
	paths := flag.Args()
	if len(paths) == 0 {
		errExit("no tmpl files specified")
	}
	specs := make([]pathSpec, len(paths))
	for i, p := range paths {
		// Distinct names: the original shadowed the outer `in` here.
		src, dst := parsePath(p)
		specs[i] = pathSpec{in: src, out: dst}
	}
	in.In = readData(*dataArg)
	process(in, specs)
}
// mustReadAll returns the contents of the file at path. If the file cannot
// be read, the error is reported through errExit, which ends the program.
func mustReadAll(path string) []byte {
	data, err := ioutil.ReadFile(path)
	if err != nil {
		// Pass the text as an argument, not as the format string, so a '%'
		// in the path or message is printed literally.
		errExit("%s", err.Error())
	}
	return data
}
func readData(path string) interface{} {
data := mustReadAll(path)
var v interface{}
if err := json.Unmarshal(StripComments(data), &v); err != nil {
errExit("invalid JSON data: %s", err.Error())
}
return v
}
// fileMode returns the mode bits of the file at path. If the file cannot be
// stat'ed, the error is reported through errExit, which ends the program.
func fileMode(path string) os.FileMode {
	stat, err := os.Stat(path)
	if err != nil {
		// Pass the text as an argument, not as the format string, so a '%'
		// in the path or message is printed literally.
		errExit("%s", err.Error())
	}
	return stat.Mode()
}
// funcs is the function map installed on every template parsed by process;
// the keys are the names used inside template text.
var funcs = template.FuncMap{
"lower": strings.ToLower,
"upper": strings.ToUpper,
// The name "inc" is what the function will be called in the template text.
// NOTE(review): inc takes an int, while numbers decoded by readData into
// interface{} are presumably float64 — confirm it is only used on range indexes.
"inc": func(i int) int {
return i + 1
},
}
// process renders each template in specs with data and writes the result to
// the spec's output path, using the template file's mode for the new file.
//
// Output files ending in ".go" get a "Code generated" preamble and are passed
// through the package-level formatter before being written. Any failure
// (reading, parsing, executing, formatting or writing) ends the program
// through errExit.
func process(data interface{}, specs []pathSpec) {
	for _, spec := range specs {
		t, err := template.New("gen").Funcs(funcs).Parse(string(mustReadAll(spec.in)))
		if err != nil {
			errExit("error processing template '%s': %s", spec.in, err.Error())
		}

		var buf bytes.Buffer
		if spec.IsGoFile() {
			// preamble
			fmt.Fprintf(&buf, "// Code generated by %s.\n// DO NOT EDIT.\n", spec.in)
			fmt.Fprintln(&buf)
		}
		if err = t.Execute(&buf, data); err != nil {
			errExit("error executing template '%s': %s", spec.in, err.Error())
		}

		generated := buf.Bytes()
		if spec.IsGoFile() {
			generated, err = formatter(generated)
			if err != nil {
				errExit("error formatting '%s': %s", spec.in, err.Error())
			}
		}

		// The write error used to be dropped, so a failed write looked like
		// success; report it like every other failure.
		if err = ioutil.WriteFile(spec.out, generated, fileMode(spec.in)); err != nil {
			errExit("error writing '%s': %s", spec.out, err.Error())
		}
	}
}
// formatter is the function process applies to generated Go source. main
// sets it to formatSource when -i is given and to format.Source otherwise.
var (
formatter func([]byte) ([]byte, error)
)
// formatSource pipes in through the external "goimports" command and returns
// what the command writes to standard output.
//
// If goimports exits with a failure status, the returned error carries its
// standard error output; if the command could not be run at all, the error
// carries the underlying cause.
func formatSource(in []byte) ([]byte, error) {
	cmd := exec.Command("goimports")
	cmd.Stdin = bytes.NewReader(in)
	out, err := cmd.Output()
	if err != nil {
		if ee, ok := err.(*exec.ExitError); ok {
			return nil, fmt.Errorf("error running goimports: %s", string(ee.Stderr))
		}
		// Not an exit-status failure (e.g. the binary is missing): stdout is
		// empty here, so report the error itself.
		return nil, fmt.Errorf("error running goimports: %s", err.Error())
	}
	return out, nil
}
// StripComments returns raw with "//" line comments and "/* */" block
// comments removed, so that commented JSON can be handed to encoding/json.
//
// Text inside string literals (delimited by '"' or '\'') is copied verbatim,
// including backslash escapes and comment-like sequences. The newline that
// ends a line comment is kept. If the input ends inside a string, after a
// dangling backslash, or inside an unterminated block comment, raw is
// returned unchanged.
func StripComments(raw []byte) []byte {
	var (
		quote   byte // opening quote byte while inside a string, 0 otherwise
		esc     bool // previous byte was a backslash inside a string
		comment bool // previous byte was a '/' outside a string
	)
	buf := bytes.Buffer{}
	for i := 0; i < len(raw); i++ {
		b := raw[i]
		if comment {
			switch b {
			case '/':
				comment = false
				j := bytes.IndexByte(raw[i+1:], '\n')
				if j == -1 {
					i = len(raw)
				} else {
					i += j // keep new line
				}
			case '*':
				j := bytes.Index(raw[i+1:], []byte("*/"))
				if j == -1 {
					i = len(raw)
				} else {
					i += j + 2
					comment = false
				}
			}
			continue
		}
		if esc {
			// Escaped byte: copy it as is. It was previously dropped together
			// with the backslash, which corrupted strings such as "a\"b".
			esc = false
			buf.WriteByte(b)
			continue
		}
		if quote != 0 {
			switch b {
			case '\\':
				esc = true
			case quote:
				// Only the byte that opened the string closes it, so an
				// apostrophe inside "..." no longer ends the string.
				quote = 0
			}
			buf.WriteByte(b)
			continue
		}
		switch b {
		case '"', '\'':
			quote = b
		case '/':
			comment = true
			continue
		}
		buf.WriteByte(b)
	}
	if quote != 0 || esc || comment {
		// unexpected state, so return raw bytes
		return raw
	}
	return buf.Bytes()
}
<yandex>
{{ $ip_addr := .D.ip_addr }}
{{ $user := .In.user }}
{{ $password := .In.password }}
{{ $port := .In.tcp_port }}
<clickhouse_remote_servers>
<logs>
{{range $index, $shards := .In.logs}}
<shard>
<weight>1</weight>
<internal_replication>true</internal_replication>
{{range $index2, $shard := $shards}}
<replica>
<host>{{$shard.ip}}</host>
<port>{{ $port}}</port>
<user>{{ $user}}</user>
<password>{{ $password}}</password>
</replica>
{{end}}
</shard>
{{end}}
</logs>
<ck_all>
{{range $index, $shards := .In.logs}}
{{range $index2, $shard := $shards}}
<shard>
<weight>1</weight>
<internal_replication>true</internal_replication>
<replica>
<host>{{$shard.ip}}</host>
<port>{{ $port}}</port>
<user>{{ $user}}</user>
<password>{{ $password}}</password>
</replica>
</shard>
{{end}}
{{end}}
</ck_all>
</clickhouse_remote_servers>
<!-- This section should be generated automatically by an updated script. TODO: resolve the problem of clusters sharing a shard. -->
{{range $index, $shards := .In.logs}}
{{range $index2, $shard := $shards}}
{{if eq $ip_addr $shard.ip }}
<macros>
<layer>logs</layer>
<shard>{{ $shard.shard}}</shard>
<replica>{{ $ip_addr }}</replica>
</macros>
{{end}}
{{end}}
{{end}}
<!-- ZK -->
<zookeeper-servers>
{{range $index, $element := .In.zookeepers}}
<node index="{{$index}}">
<host>{{$element.ip}}</host>
<port>{{$element.port}}</port>
</node>
{{end}}
</zookeeper-servers>
<clickhouse_compression>
<case>
<min_part_size>10000000000</min_part_size>
<min_part_size_ratio>0.01</min_part_size_ratio>
<method>lz4</method>
</case>
</clickhouse_compression>
</yandex>
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment