Skip to content

Instantly share code, notes, and snippets.

@prune998
Created October 2, 2019 11:45
Show Gist options
  • Save prune998/7976be90e04b9f61a04976860b124c73 to your computer and use it in GitHub Desktop.
Save prune998/7976be90e04b9f61a04976860b124c73 to your computer and use it in GitHub Desktop.
BanzaiCloud Kafka Operator Manifest
apiVersion: kafka.banzaicloud.io/v1beta1
kind: KafkaCluster
metadata:
labels:
controller-tools.k8s.io: "1.0"
kafka_cr: kf-kafka
name: kf-kafka
namespace: alerting
spec:
headlessServiceEnabled: false
zkAddresses:
- "zk-zookeeper:2181"
rackAwareness:
labels:
- "failure-domain.beta.kubernetes.io/region"
- "failure-domain.beta.kubernetes.io/zone"
oneBrokerPerNode: false
clusterImage: "private-docker-repo/kafka:2.3.0.7"
#clusterWideConfig: |
# background.threads=10
rollingUpgradeConfig:
failureThreshold: 1
readOnlyConfig: |
offsets.topic.replication.factor=2
default.replication.factor=2
transaction.state.log.min.isr=1
log.dirs=/kafka-logs/data
delete.topic.enable=true
num.partitions=32
auto.create.topics.enable=false
transaction.state.log.replication.factor=2
brokerConfigGroups:
# Specify desired group name (eg., 'default_group')
default_group:
# all the brokerConfig settings are available here
serviceAccountName: "kf-kafka"
kafkaJvmPerfOpts: "-server -XX:+UseG1GC -XX:MaxGCPauseMillis=20 -XX:InitiatingHeapOccupancyPercent=35 -XX:+ExplicitGCInvokesConcurrent -Djava.awt.headless=true -Dsun.net.inetaddr.ttl=60 -Dcom.sun.management.jmxremote -Dcom.sun.management.jmxremote.authenticate=false -Dcom.sun.management.jmxremote.ssl=false -Djava.rmi.server.hostname=${HOSTNAME} -Dcom.sun.management.jmxremote.rmi.port=9099"
storageConfigs:
- mountPath: "/kafka-logs"
pvcSpec:
accessModes:
- ReadWriteOnce
storageClassName: ssd
resources:
requests:
storage: 30Gi
brokers:
- id: 0
brokerConfigGroup: "default_group"
brokerConfig:
resourceRequirements:
limits:
memory: "3Gi"
requests:
cpu: "0.3"
memory: "512Mi"
- id: 1
brokerConfigGroup: "default_group"
brokerConfig:
resourceRequirements:
limits:
memory: "3Gi"
requests:
cpu: "0.3"
memory: "512Mi"
- id: 2
brokerConfigGroup: "default_group"
brokerConfig:
resourceRequirements:
limits:
memory: "3Gi"
requests:
cpu: "0.3"
memory: "512Mi"
listenersConfig:
internalListeners:
- type: "plaintext"
name: "plaintext"
containerPort: 9092
usedForInnerBrokerCommunication: true
cruiseControlConfig:
image: "solsson/kafka-cruise-control:latest"
serviceAccountName: "kf-kafka"
config: |
# Copyright 2017 LinkedIn Corp. Licensed under the BSD 2-Clause License (the "License"). See License in the project root for license information.
#
# This is an example property file for Kafka Cruise Control. See KafkaCruiseControlConfig for more details.
# Configuration for the metadata client.
# =======================================
# The maximum interval in milliseconds between two metadata refreshes.
#metadata.max.age.ms=300000
# Client id for the Cruise Control. It is used for the metadata client.
#client.id=kafka-cruise-control
# The size of TCP send buffer bytes for the metadata client.
#send.buffer.bytes=131072
# The size of TCP receive buffer size for the metadata client.
#receive.buffer.bytes=131072
# The time to wait before disconnect an idle TCP connection.
#connections.max.idle.ms=540000
# The time to wait before reconnect to a given host.
#reconnect.backoff.ms=50
# The time to wait for a response from a host after sending a request.
#request.timeout.ms=30000
# Configurations for the load monitor
# =======================================
# The number of metric fetcher thread to fetch metrics for the Kafka cluster
num.metric.fetchers=1
# The metric sampler class
metric.sampler.class=com.linkedin.kafka.cruisecontrol.monitor.sampling.CruiseControlMetricsReporterSampler
# Configurations for CruiseControlMetricsReporterSampler
metric.reporter.topic.pattern=__CruiseControlMetrics
# The sample store class name
sample.store.class=com.linkedin.kafka.cruisecontrol.monitor.sampling.KafkaSampleStore
# The config for the Kafka sample store to save the partition metric samples
partition.metric.sample.store.topic=__KafkaCruiseControlPartitionMetricSamples
# The config for the Kafka sample store to save the model training samples
broker.metric.sample.store.topic=__KafkaCruiseControlModelTrainingSamples
# The replication factor of Kafka metric sample store topic
sample.store.topic.replication.factor=2
# The config for the number of Kafka sample store consumer threads
num.sample.loading.threads=8
# The partition assignor class for the metric samplers
metric.sampler.partition.assignor.class=com.linkedin.kafka.cruisecontrol.monitor.sampling.DefaultMetricSamplerPartitionAssignor
# The metric sampling interval in milliseconds
metric.sampling.interval.ms=120000
# The partition metrics window size in milliseconds
partition.metrics.window.ms=300000
# The number of partition metric windows to keep in memory
num.partition.metrics.windows=1
# The minimum partition metric samples required for a partition in each window
min.samples.per.partition.metrics.window=1
# The broker metrics window size in milliseconds
broker.metrics.window.ms=300000
# The number of broker metric windows to keep in memory
num.broker.metrics.windows=20
# The minimum broker metric samples required for a partition in each window
min.samples.per.broker.metrics.window=1
# The configuration for the BrokerCapacityConfigFileResolver (supports JBOD and non-JBOD broker capacities)
capacity.config.file=config/capacity.json
#capacity.config.file=config/capacityJBOD.json
# Configurations for the analyzer
# =======================================
# The list of goals to optimize the Kafka cluster for with pre-computed proposals
default.goals=com.linkedin.kafka.cruisecontrol.analyzer.goals.ReplicaCapacityGoal,com.linkedin.kafka.cruisecontrol.analyzer.goals.DiskCapacityGoal,com.linkedin.kafka.cruisecontrol.analyzer.goals.NetworkInboundCapacityGoal,com.linkedin.kafka.cruisecontrol.analyzer.goals.NetworkOutboundCapacityGoal,com.linkedin.kafka.cruisecontrol.analyzer.goals.CpuCapacityGoal,com.linkedin.kafka.cruisecontrol.analyzer.goals.ReplicaDistributionGoal,com.linkedin.kafka.cruisecontrol.analyzer.goals.PotentialNwOutGoal,com.linkedin.kafka.cruisecontrol.analyzer.goals.DiskUsageDistributionGoal,com.linkedin.kafka.cruisecontrol.analyzer.goals.NetworkInboundUsageDistributionGoal,com.linkedin.kafka.cruisecontrol.analyzer.goals.NetworkOutboundUsageDistributionGoal,com.linkedin.kafka.cruisecontrol.analyzer.goals.CpuUsageDistributionGoal,com.linkedin.kafka.cruisecontrol.analyzer.goals.TopicReplicaDistributionGoal,com.linkedin.kafka.cruisecontrol.analyzer.goals.LeaderBytesInDistributionGoal
# The list of supported goals
goals=com.linkedin.kafka.cruisecontrol.analyzer.goals.ReplicaCapacityGoal,com.linkedin.kafka.cruisecontrol.analyzer.goals.DiskCapacityGoal,com.linkedin.kafka.cruisecontrol.analyzer.goals.NetworkInboundCapacityGoal,com.linkedin.kafka.cruisecontrol.analyzer.goals.NetworkOutboundCapacityGoal,com.linkedin.kafka.cruisecontrol.analyzer.goals.CpuCapacityGoal,com.linkedin.kafka.cruisecontrol.analyzer.goals.ReplicaDistributionGoal,com.linkedin.kafka.cruisecontrol.analyzer.goals.PotentialNwOutGoal,com.linkedin.kafka.cruisecontrol.analyzer.goals.DiskUsageDistributionGoal,com.linkedin.kafka.cruisecontrol.analyzer.goals.NetworkInboundUsageDistributionGoal,com.linkedin.kafka.cruisecontrol.analyzer.goals.NetworkOutboundUsageDistributionGoal,com.linkedin.kafka.cruisecontrol.analyzer.goals.CpuUsageDistributionGoal,com.linkedin.kafka.cruisecontrol.analyzer.goals.TopicReplicaDistributionGoal,com.linkedin.kafka.cruisecontrol.analyzer.goals.LeaderBytesInDistributionGoal,com.linkedin.kafka.cruisecontrol.analyzer.kafkaassigner.KafkaAssignerDiskUsageDistributionGoal,com.linkedin.kafka.cruisecontrol.analyzer.goals.PreferredLeaderElectionGoal
# The list of supported hard goals
hard.goals=com.linkedin.kafka.cruisecontrol.analyzer.goals.ReplicaCapacityGoal,com.linkedin.kafka.cruisecontrol.analyzer.goals.DiskCapacityGoal,com.linkedin.kafka.cruisecontrol.analyzer.goals.NetworkInboundCapacityGoal,com.linkedin.kafka.cruisecontrol.analyzer.goals.NetworkOutboundCapacityGoal,com.linkedin.kafka.cruisecontrol.analyzer.goals.CpuCapacityGoal
# The minimum percentage of well monitored partitions out of all the partitions
min.monitored.partition.percentage=0.95
# The balance threshold for CPU
cpu.balance.threshold=1.1
# The balance threshold for disk
disk.balance.threshold=1.1
# The balance threshold for network inbound utilization
network.inbound.balance.threshold=1.1
# The balance threshold for network outbound utilization
network.outbound.balance.threshold=1.1
# The balance threshold for the replica count
replica.count.balance.threshold=1.1
# The capacity threshold for CPU in percentage
cpu.capacity.threshold=0.8
# The capacity threshold for disk in percentage
disk.capacity.threshold=0.8
# The capacity threshold for network inbound utilization in percentage
network.inbound.capacity.threshold=0.8
# The capacity threshold for network outbound utilization in percentage
network.outbound.capacity.threshold=0.8
# The threshold to define the cluster to be in a low CPU utilization state
cpu.low.utilization.threshold=0.0
# The threshold to define the cluster to be in a low disk utilization state
disk.low.utilization.threshold=0.0
# The threshold to define the cluster to be in a low network inbound utilization state
network.inbound.low.utilization.threshold=0.0
# The threshold to define the cluster to be in a low disk utilization state
network.outbound.low.utilization.threshold=0.0
# The metric anomaly percentile upper threshold
metric.anomaly.percentile.upper.threshold=90.0
# The metric anomaly percentile lower threshold
metric.anomaly.percentile.lower.threshold=10.0
# How often should the cached proposal be expired and recalculated if necessary
proposal.expiration.ms=60000
# The maximum number of replicas that can reside on a broker at any given time.
max.replicas.per.broker=10000
# The number of threads to use for proposal candidate precomputing.
num.proposal.precompute.threads=1
# the topics that should be excluded from the partition movement.
#topics.excluded.from.partition.movement
# Configurations for the executor
# =======================================
# The max number of partitions to move in/out on a given broker at a given time.
num.concurrent.partition.movements.per.broker=10
# The interval between two execution progress checks.
execution.progress.check.interval.ms=10000
# Configurations for anomaly detector
# =======================================
# The goal violation notifier class
anomaly.notifier.class=com.linkedin.kafka.cruisecontrol.detector.notifier.SelfHealingNotifier
# The metric anomaly finder class
metric.anomaly.finder.class=com.linkedin.kafka.cruisecontrol.detector.KafkaMetricAnomalyFinder
# The anomaly detection interval
anomaly.detection.interval.ms=10000
# The goal violation to detect.
anomaly.detection.goals=com.linkedin.kafka.cruisecontrol.analyzer.goals.ReplicaCapacityGoal,com.linkedin.kafka.cruisecontrol.analyzer.goals.DiskCapacityGoal,com.linkedin.kafka.cruisecontrol.analyzer.goals.NetworkInboundCapacityGoal,com.linkedin.kafka.cruisecontrol.analyzer.goals.NetworkOutboundCapacityGoal,com.linkedin.kafka.cruisecontrol.analyzer.goals.CpuCapacityGoal
# The interested metrics for metric anomaly analyzer.
metric.anomaly.analyzer.metrics=BROKER_PRODUCE_LOCAL_TIME_MS_MAX,BROKER_PRODUCE_LOCAL_TIME_MS_MEAN,BROKER_CONSUMER_FETCH_LOCAL_TIME_MS_MAX,BROKER_CONSUMER_FETCH_LOCAL_TIME_MS_MEAN,BROKER_FOLLOWER_FETCH_LOCAL_TIME_MS_MAX,BROKER_FOLLOWER_FETCH_LOCAL_TIME_MS_MEAN,BROKER_LOG_FLUSH_TIME_MS_MAX,BROKER_LOG_FLUSH_TIME_MS_MEAN
## Adjust accordingly if your metrics reporter is an older version and does not produce these metrics.
#metric.anomaly.analyzer.metrics=BROKER_PRODUCE_LOCAL_TIME_MS_50TH,BROKER_PRODUCE_LOCAL_TIME_MS_999TH,BROKER_CONSUMER_FETCH_LOCAL_TIME_MS_50TH,BROKER_CONSUMER_FETCH_LOCAL_TIME_MS_999TH,BROKER_FOLLOWER_FETCH_LOCAL_TIME_MS_50TH,BROKER_FOLLOWER_FETCH_LOCAL_TIME_MS_999TH,BROKER_LOG_FLUSH_TIME_MS_50TH,BROKER_LOG_FLUSH_TIME_MS_999TH
# The zk path to store failed broker information.
failed.brokers.zk.path=/CruiseControlBrokerList
# Topic config provider class
topic.config.provider.class=com.linkedin.kafka.cruisecontrol.config.KafkaTopicConfigProvider
# The cluster configurations for the KafkaTopicConfigProvider
cluster.configs.file=config/clusterConfigs.json
# The maximum time in milliseconds to store the response and access details of a completed user task.
completed.user.task.retention.time.ms=21600000
# The maximum time in milliseconds to retain the demotion history of brokers.
demotion.history.retention.time.ms=86400000
# The maximum number of completed user tasks for which the response and access details will be cached.
max.cached.completed.user.tasks=100
# The maximum number of user tasks for concurrently running in async endpoints across all users.
max.active.user.tasks=5
# Enable self healing for all anomaly detectors, unless the particular anomaly detector is explicitly disabled
self.healing.enabled=true
# Enable self healing for broker failure detector
#self.healing.broker.failure.enabled=true
# Enable self healing for goal violation detector
#self.healing.goal.violation.enabled=true
# Enable self healing for metric anomaly detector
#self.healing.metric.anomaly.enabled=true
# configurations for the webserver
# ================================
# HTTP listen port
webserver.http.port=9090
# HTTP listen address
webserver.http.address=0.0.0.0
# Whether CORS support is enabled for API or not
webserver.http.cors.enabled=false
# Value for Access-Control-Allow-Origin
webserver.http.cors.origin=http://localhost:8080/
# Value for Access-Control-Request-Method
webserver.http.cors.allowmethods=OPTIONS,GET,POST
# Headers that should be exposed to the Browser (Webapp)
# This is a special header that is used by the
# User Tasks subsystem and should be explicitly
# Enabled when CORS mode is used as part of the
# Admin Interface
webserver.http.cors.exposeheaders=User-Task-ID
# REST API default prefix
# (dont forget the ending *)
webserver.api.urlprefix=/kafkacruisecontrol/*
# Location where the Cruise Control frontend is deployed
webserver.ui.diskpath=./cruise-control-ui/dist/
# URL path prefix for UI
# (dont forget the ending *)
webserver.ui.urlprefix=/*
# Time After which request is converted to Async
webserver.request.maxBlockTimeMs=10000
# Default Session Expiry Period
webserver.session.maxExpiryTimeMs=60000
# Session cookie path
webserver.session.path=/
# Server Access Logs
webserver.accesslog.enabled=true
# Location of HTTP Request Logs
webserver.accesslog.path=access.log
# HTTP Request Log retention days
webserver.accesslog.retention.days=14
capacityConfig: |
{
"brokerCapacities":[
{
"brokerId": "-1",
"capacity": {
"DISK": "200000",
"CPU": "100",
"NW_IN": "50000",
"NW_OUT": "50000"
},
"doc": "This is the default capacity. Capacity unit used for disk is in MB, cpu is in percentage, network throughput is in KB."
}
]
}
clusterConfigs: |
{
"min.insync.replicas": 2
}
monitoringConfig:
# jmxImage describes the used prometheus jmx exporter agent container
jmxImage: "banzaicloud/jmx-javaagent:0.12.0"
# pathToJar describes the path to the jar file in the given image
pathToJar: "/opt/jmx_exporter/jmx_prometheus_javaagent-0.12.0.jar"
# kafkaJMXExporterConfig describes jmx exporter config for Kafka
kafkaJMXExporterConfig: |
lowercaseOutputName: true
rules:
# Special cases and very specific rules
- pattern : kafka.server<type=(.+), name=(.+), clientId=(.+), topic=(.+), partition=(.*)><>Value
name: kafka_server_$1_$2
type: GAUGE
labels:
clientId: "$3"
topic: "$4"
partition: "$5"
- pattern : kafka.server<type=(.+), name=(.+), clientId=(.+), brokerHost=(.+), brokerPort=(.+)><>Value
name: kafka_server_$1_$2
type: GAUGE
labels:
clientId: "$3"
broker: "$4:$5"
# Generic per-second counters with 0-2 key/value pairs
- pattern: kafka.(\w+)<type=(.+), name=(.+)PerSec\w*, (.+)=(.+), (.+)=(.+)><>Count
name: kafka_$1_$2_$3_total
type: COUNTER
labels:
"$4": "$5"
"$6": "$7"
- pattern: kafka.(\w+)<type=(.+), name=(.+)PerSec\w*, (.+)=(.+)><>Count
name: kafka_$1_$2_$3_total
type: COUNTER
labels:
"$4": "$5"
type: COUNTER
- pattern: kafka.(\w+)<type=(.+), name=(.+)PerSec\w*><>Count
name: kafka_$1_$2_$3_total
type: COUNTER
# Generic gauges with 0-2 key/value pairs
- pattern: kafka.(\w+)<type=(.+), name=(.+), (.+)=(.+), (.+)=(.+)><>Value
name: kafka_$1_$2_$3
type: GAUGE
labels:
"$4": "$5"
"$6": "$7"
- pattern: kafka.(\w+)<type=(.+), name=(.+), (.+)=(.+)><>Value
name: kafka_$1_$2_$3
type: GAUGE
labels:
"$4": "$5"
- pattern: kafka.(\w+)<type=(.+), name=(.+)><>Value
name: kafka_$1_$2_$3
type: GAUGE
# Emulate Prometheus 'Summary' metrics for the exported 'Histogram's.
#
# Note that these are missing the '_sum' metric!
- pattern: kafka.(\w+)<type=(.+), name=(.+), (.+)=(.+), (.+)=(.+)><>Count
name: kafka_$1_$2_$3_count
type: COUNTER
labels:
"$4": "$5"
"$6": "$7"
- pattern: kafka.(\w+)<type=(.+), name=(.+), (.+)=(.*), (.+)=(.+)><>(\d+)thPercentile
name: kafka_$1_$2_$3
type: GAUGE
labels:
"$4": "$5"
"$6": "$7"
quantile: "0.$8"
- pattern: kafka.(\w+)<type=(.+), name=(.+), (.+)=(.+)><>Count
name: kafka_$1_$2_$3_count
type: COUNTER
labels:
"$4": "$5"
- pattern: kafka.(\w+)<type=(.+), name=(.+), (.+)=(.*)><>(\d+)thPercentile
name: kafka_$1_$2_$3
type: GAUGE
labels:
"$4": "$5"
quantile: "0.$6"
- pattern: kafka.(\w+)<type=(.+), name=(.+)><>Count
name: kafka_$1_$2_$3_count
type: COUNTER
- pattern: kafka.(\w+)<type=(.+), name=(.+)><>(\d+)thPercentile
name: kafka_$1_$2_$3
type: GAUGE
labels:
quantile: "0.$4"
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment