# Source: GitHub gist fabiog1901/chaos.yaml (last active Nov 25, 2020).
# The lines below are the gist's manifest content.
#################
# NETWORK CHAOS #
#################
---
# Injects 60ms of latency on traffic sent from the Seattle pods (plus the
# jumpbox) to the New York pods.
apiVersion: chaos-mesh.org/v1alpha1
kind: NetworkChaos
metadata:
  name: delay-uswest-useast
  labels:
    app: cockroachdb
spec:
  # Chaos action and its parameter.
  action: delay
  delay:
    latency: '60ms'
  # Only packets going from the source pods to the target pods are affected.
  direction: to
  # Source side: every listed pod in the "default" namespace.
  mode: all
  selector:
    pods:
      default:
        - roach-seattle-1
        - roach-seattle-2
        - roach-seattle-3
        - jumpbox
  # Destination side: every listed pod in the "default" namespace.
  target:
    mode: all
    selector:
      pods:
        default:
          - roach-newyork-1
          - roach-newyork-2
          - roach-newyork-3
---
# Injects 180ms of latency on traffic sent from the London pods (plus the
# jumpbox) to the Seattle pods.
apiVersion: chaos-mesh.org/v1alpha1
kind: NetworkChaos
metadata:
  name: delay-uswest-euwest
  labels:
    app: cockroachdb
spec:
  # Chaos action and its parameter.
  action: delay
  delay:
    latency: '180ms'
  # Only packets going from the source pods to the target pods are affected.
  direction: to
  # Source side: every listed pod in the "default" namespace.
  mode: all
  selector:
    pods:
      default:
        - roach-london-1
        - roach-london-2
        - roach-london-3
        - jumpbox
  # Destination side: every listed pod in the "default" namespace.
  target:
    mode: all
    selector:
      pods:
        default:
          - roach-seattle-1
          - roach-seattle-2
          - roach-seattle-3
---
# Injects 120ms of latency on traffic sent from the New York pods (plus the
# jumpbox) to the London pods.
apiVersion: chaos-mesh.org/v1alpha1
kind: NetworkChaos
metadata:
  name: delay-useast-euwest
  labels:
    app: cockroachdb
spec:
  # Chaos action and its parameter.
  action: delay
  delay:
    latency: '120ms'
  # Only packets going from the source pods to the target pods are affected.
  direction: to
  # Source side: every listed pod in the "default" namespace.
  mode: all
  selector:
    pods:
      default:
        - roach-newyork-1
        - roach-newyork-2
        - roach-newyork-3
        - jumpbox
  # Destination side: every listed pod in the "default" namespace.
  target:
    mode: all
    selector:
      pods:
        default:
          - roach-london-1
          - roach-london-2
          - roach-london-3
############
# SERVICES #
############
---
# us-west2: NodePort entry point for the CockroachDB pods labelled
# region=us-west2.
apiVersion: v1
kind: Service
metadata:
  name: us-west2
  labels:
    app: cockroachdb
spec:
  type: NodePort
  # Both labels must match, so only this region's database pods are selected.
  selector:
    app: cockroachdb
    region: us-west2
  ports:
    # SQL client port
    - name: grpc
      nodePort: 31257
      port: 26257
      targetPort: 26257
    # Admin UI
    - name: http
      nodePort: 31080
      port: 8080
      targetPort: 8080
---
# us-east4: NodePort entry point for the CockroachDB pods labelled
# region=us-east4.
apiVersion: v1
kind: Service
metadata:
  name: us-east4
  labels:
    app: cockroachdb
spec:
  type: NodePort
  # Both labels must match, so only this region's database pods are selected.
  selector:
    app: cockroachdb
    region: us-east4
  ports:
    # SQL client port
    - name: grpc
      nodePort: 31258
      port: 26257
      targetPort: 26257
    # Admin UI
    - name: http
      nodePort: 31180
      port: 8080
      targetPort: 8080
---
# eu-west2: NodePort entry point for the CockroachDB pods labelled
# region=eu-west2.
apiVersion: v1
kind: Service
metadata:
  name: eu-west2
  labels:
    app: cockroachdb
spec:
  type: NodePort
  # Both labels must match, so only this region's database pods are selected.
  selector:
    app: cockroachdb
    region: eu-west2
  ports:
    # SQL client port
    - name: grpc
      nodePort: 31259
      port: 26257
      targetPort: 26257
    # Admin UI
    - name: http
      nodePort: 31280
      port: 8080
      targetPort: 8080
---
# Intra-node service: headless (clusterIP: None) Service that gives pods using
# `subdomain: cockroachdb` their <hostname>.cockroachdb DNS names.
apiVersion: v1
kind: Service
metadata:
  name: cockroachdb
  labels:
    app: cockroachdb
  annotations:
    # Annotation form of publishNotReadyAddresses (set below as well).
    service.alpha.kubernetes.io/tolerate-unready-endpoints: 'true'
    # Scrape hints for annotation-driven Prometheus discovery.
    prometheus.io/scrape: 'true'
    prometheus.io/path: '_status/vars'
    prometheus.io/port: '8080'
spec:
  clusterIP: None
  # Publish addresses of pods that are not Ready yet.
  publishNotReadyAddresses: true
  selector:
    app: cockroachdb
  ports:
    - name: grpc
      port: 26257
      targetPort: 26257
    - name: http
      port: 8080
      targetPort: 8080
##############
# PODS + PVC #
##############
---
# roach-seattle-1: CockroachDB node advertised as region=us-west2, zone=a.
apiVersion: v1
kind: Pod
metadata:
  name: roach-seattle-1
  labels:
    app: cockroachdb
    region: us-west2
spec:
  # hostname + subdomain give the pod the DNS name roach-seattle-1.cockroachdb,
  # which the --join list below relies on.
  hostname: roach-seattle-1
  subdomain: cockroachdb
  terminationGracePeriodSeconds: 60
  volumes:
    - name: datadir
      persistentVolumeClaim:
        claimName: roach-seattle-1-data
  containers:
    - name: roach-seattle-1
      image: cockroachdb/cockroach:latest
      imagePullPolicy: IfNotPresent
      ports:
        - name: grpc
          containerPort: 26257
        - name: http
          containerPort: 8080
      env:
        - name: COCKROACH_CHANNEL
          value: kubernetes-insecure
        # CPU limit of the container, exported as GOMAXPROCS.
        - name: GOMAXPROCS
          valueFrom:
            resourceFieldRef:
              resource: limits.cpu
              divisor: '1'
        # Memory limit in MiB; the start command derives cache sizes from it.
        - name: MEMORY_LIMIT_MIB
          valueFrom:
            resourceFieldRef:
              resource: limits.memory
              divisor: '1Mi'
      volumeMounts:
        - name: datadir
          mountPath: /cockroach/cockroach-data
      livenessProbe:
        httpGet:
          path: '/health'
          port: http
        initialDelaySeconds: 30
        periodSeconds: 5
      readinessProbe:
        httpGet:
          path: '/health?ready=1'
          port: http
        initialDelaySeconds: 10
        periodSeconds: 5
        failureThreshold: 2
      # The folded scalar is one bash command line; --cache and
      # --max-sql-memory each get a quarter of MEMORY_LIMIT_MIB.
      command:
        - '/bin/bash'
        - '-ecx'
        - >-
          exec /cockroach/cockroach start
          --logtostderr
          --insecure
          --advertise-host $(hostname -f)
          --http-addr 0.0.0.0
          --join roach-seattle-1.cockroachdb,roach-newyork-1.cockroachdb,roach-london-1.cockroachdb
          --cache $(expr $MEMORY_LIMIT_MIB / 4)MiB
          --max-sql-memory $(expr $MEMORY_LIMIT_MIB / 4)MiB
          --locality=region=us-west2,zone=a
---
# Data volume claimed by pod roach-seattle-1 (mounted at
# /cockroach/cockroach-data).
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: roach-seattle-1-data
  labels:
    app: cockroachdb
spec:
  # The claim is mounted by exactly one pod, so ReadWriteOnce is sufficient.
  # ReadWriteMany is refused by many dynamic provisioners (block-storage or
  # node-local backed classes), which leaves the claim Pending.
  accessModes:
    - ReadWriteOnce
  volumeMode: Filesystem
  storageClassName: standard
  resources:
    requests:
      storage: 1Gi
---
# roach-seattle-2: CockroachDB node advertised as region=us-west2, zone=b.
apiVersion: v1
kind: Pod
metadata:
  name: roach-seattle-2
  labels:
    app: cockroachdb
    region: us-west2
spec:
  # hostname + subdomain give the pod the DNS name roach-seattle-2.cockroachdb.
  hostname: roach-seattle-2
  subdomain: cockroachdb
  terminationGracePeriodSeconds: 60
  volumes:
    - name: datadir
      persistentVolumeClaim:
        claimName: roach-seattle-2-data
  containers:
    - name: roach-seattle-2
      image: cockroachdb/cockroach:latest
      imagePullPolicy: IfNotPresent
      ports:
        - name: grpc
          containerPort: 26257
        - name: http
          containerPort: 8080
      env:
        - name: COCKROACH_CHANNEL
          value: kubernetes-insecure
        # CPU limit of the container, exported as GOMAXPROCS.
        - name: GOMAXPROCS
          valueFrom:
            resourceFieldRef:
              resource: limits.cpu
              divisor: '1'
        # Memory limit in MiB; the start command derives cache sizes from it.
        - name: MEMORY_LIMIT_MIB
          valueFrom:
            resourceFieldRef:
              resource: limits.memory
              divisor: '1Mi'
      volumeMounts:
        - name: datadir
          mountPath: /cockroach/cockroach-data
      livenessProbe:
        httpGet:
          path: '/health'
          port: http
        initialDelaySeconds: 30
        periodSeconds: 5
      readinessProbe:
        httpGet:
          path: '/health?ready=1'
          port: http
        initialDelaySeconds: 10
        periodSeconds: 5
        failureThreshold: 2
      # The folded scalar is one bash command line; --cache and
      # --max-sql-memory each get a quarter of MEMORY_LIMIT_MIB.
      command:
        - '/bin/bash'
        - '-ecx'
        - >-
          exec /cockroach/cockroach start
          --logtostderr
          --insecure
          --advertise-host $(hostname -f)
          --http-addr 0.0.0.0
          --join roach-seattle-1.cockroachdb,roach-newyork-1.cockroachdb,roach-london-1.cockroachdb
          --cache $(expr $MEMORY_LIMIT_MIB / 4)MiB
          --max-sql-memory $(expr $MEMORY_LIMIT_MIB / 4)MiB
          --locality=region=us-west2,zone=b
---
# Data volume claimed by pod roach-seattle-2 (mounted at
# /cockroach/cockroach-data).
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: roach-seattle-2-data
  labels:
    app: cockroachdb
spec:
  # The claim is mounted by exactly one pod, so ReadWriteOnce is sufficient.
  # ReadWriteMany is refused by many dynamic provisioners (block-storage or
  # node-local backed classes), which leaves the claim Pending.
  accessModes:
    - ReadWriteOnce
  volumeMode: Filesystem
  storageClassName: standard
  resources:
    requests:
      storage: 1Gi
---
# roach-seattle-3: CockroachDB node advertised as region=us-west2, zone=c.
apiVersion: v1
kind: Pod
metadata:
  name: roach-seattle-3
  labels:
    app: cockroachdb
    region: us-west2
spec:
  # hostname + subdomain give the pod the DNS name roach-seattle-3.cockroachdb.
  hostname: roach-seattle-3
  subdomain: cockroachdb
  terminationGracePeriodSeconds: 60
  volumes:
    - name: datadir
      persistentVolumeClaim:
        claimName: roach-seattle-3-data
  containers:
    - name: roach-seattle-3
      image: cockroachdb/cockroach:latest
      imagePullPolicy: IfNotPresent
      ports:
        - name: grpc
          containerPort: 26257
        - name: http
          containerPort: 8080
      env:
        - name: COCKROACH_CHANNEL
          value: kubernetes-insecure
        # CPU limit of the container, exported as GOMAXPROCS.
        - name: GOMAXPROCS
          valueFrom:
            resourceFieldRef:
              resource: limits.cpu
              divisor: '1'
        # Memory limit in MiB; the start command derives cache sizes from it.
        - name: MEMORY_LIMIT_MIB
          valueFrom:
            resourceFieldRef:
              resource: limits.memory
              divisor: '1Mi'
      volumeMounts:
        - name: datadir
          mountPath: /cockroach/cockroach-data
      livenessProbe:
        httpGet:
          path: '/health'
          port: http
        initialDelaySeconds: 30
        periodSeconds: 5
      readinessProbe:
        httpGet:
          path: '/health?ready=1'
          port: http
        initialDelaySeconds: 10
        periodSeconds: 5
        failureThreshold: 2
      # The folded scalar is one bash command line; --cache and
      # --max-sql-memory each get a quarter of MEMORY_LIMIT_MIB.
      command:
        - '/bin/bash'
        - '-ecx'
        - >-
          exec /cockroach/cockroach start
          --logtostderr
          --insecure
          --advertise-host $(hostname -f)
          --http-addr 0.0.0.0
          --join roach-seattle-1.cockroachdb,roach-newyork-1.cockroachdb,roach-london-1.cockroachdb
          --cache $(expr $MEMORY_LIMIT_MIB / 4)MiB
          --max-sql-memory $(expr $MEMORY_LIMIT_MIB / 4)MiB
          --locality=region=us-west2,zone=c
---
# Data volume claimed by pod roach-seattle-3 (mounted at
# /cockroach/cockroach-data).
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: roach-seattle-3-data
  labels:
    app: cockroachdb
spec:
  # The claim is mounted by exactly one pod, so ReadWriteOnce is sufficient.
  # ReadWriteMany is refused by many dynamic provisioners (block-storage or
  # node-local backed classes), which leaves the claim Pending.
  accessModes:
    - ReadWriteOnce
  volumeMode: Filesystem
  storageClassName: standard
  resources:
    requests:
      storage: 1Gi
---
# roach-newyork-1: CockroachDB node advertised as region=us-east4, zone=a.
apiVersion: v1
kind: Pod
metadata:
  name: roach-newyork-1
  labels:
    app: cockroachdb
    region: us-east4
spec:
  # hostname + subdomain give the pod the DNS name roach-newyork-1.cockroachdb,
  # which the --join list below relies on.
  hostname: roach-newyork-1
  subdomain: cockroachdb
  terminationGracePeriodSeconds: 60
  volumes:
    - name: datadir
      persistentVolumeClaim:
        claimName: roach-newyork-1-data
  containers:
    - name: roach-newyork-1
      image: cockroachdb/cockroach:latest
      imagePullPolicy: IfNotPresent
      ports:
        - name: grpc
          containerPort: 26257
        - name: http
          containerPort: 8080
      env:
        - name: COCKROACH_CHANNEL
          value: kubernetes-insecure
        # CPU limit of the container, exported as GOMAXPROCS.
        - name: GOMAXPROCS
          valueFrom:
            resourceFieldRef:
              resource: limits.cpu
              divisor: '1'
        # Memory limit in MiB; the start command derives cache sizes from it.
        - name: MEMORY_LIMIT_MIB
          valueFrom:
            resourceFieldRef:
              resource: limits.memory
              divisor: '1Mi'
      volumeMounts:
        - name: datadir
          mountPath: /cockroach/cockroach-data
      livenessProbe:
        httpGet:
          path: '/health'
          port: http
        initialDelaySeconds: 30
        periodSeconds: 5
      readinessProbe:
        httpGet:
          path: '/health?ready=1'
          port: http
        initialDelaySeconds: 10
        periodSeconds: 5
        failureThreshold: 2
      # The folded scalar is one bash command line; --cache and
      # --max-sql-memory each get a quarter of MEMORY_LIMIT_MIB.
      command:
        - '/bin/bash'
        - '-ecx'
        - >-
          exec /cockroach/cockroach start
          --logtostderr
          --insecure
          --advertise-host $(hostname -f)
          --http-addr 0.0.0.0
          --join roach-seattle-1.cockroachdb,roach-newyork-1.cockroachdb,roach-london-1.cockroachdb
          --cache $(expr $MEMORY_LIMIT_MIB / 4)MiB
          --max-sql-memory $(expr $MEMORY_LIMIT_MIB / 4)MiB
          --locality=region=us-east4,zone=a
---
# Data volume claimed by pod roach-newyork-1 (mounted at
# /cockroach/cockroach-data).
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: roach-newyork-1-data
  labels:
    app: cockroachdb
spec:
  # The claim is mounted by exactly one pod, so ReadWriteOnce is sufficient.
  # ReadWriteMany is refused by many dynamic provisioners (block-storage or
  # node-local backed classes), which leaves the claim Pending.
  accessModes:
    - ReadWriteOnce
  volumeMode: Filesystem
  storageClassName: standard
  resources:
    requests:
      storage: 1Gi
---
# roach-newyork-2: CockroachDB node advertised as region=us-east4, zone=b.
apiVersion: v1
kind: Pod
metadata:
  name: roach-newyork-2
  labels:
    app: cockroachdb
    region: us-east4
spec:
  # hostname + subdomain give the pod the DNS name roach-newyork-2.cockroachdb.
  hostname: roach-newyork-2
  subdomain: cockroachdb
  terminationGracePeriodSeconds: 60
  volumes:
    - name: datadir
      persistentVolumeClaim:
        claimName: roach-newyork-2-data
  containers:
    - name: roach-newyork-2
      image: cockroachdb/cockroach:latest
      imagePullPolicy: IfNotPresent
      ports:
        - name: grpc
          containerPort: 26257
        - name: http
          containerPort: 8080
      env:
        - name: COCKROACH_CHANNEL
          value: kubernetes-insecure
        # CPU limit of the container, exported as GOMAXPROCS.
        - name: GOMAXPROCS
          valueFrom:
            resourceFieldRef:
              resource: limits.cpu
              divisor: '1'
        # Memory limit in MiB; the start command derives cache sizes from it.
        - name: MEMORY_LIMIT_MIB
          valueFrom:
            resourceFieldRef:
              resource: limits.memory
              divisor: '1Mi'
      volumeMounts:
        - name: datadir
          mountPath: /cockroach/cockroach-data
      livenessProbe:
        httpGet:
          path: '/health'
          port: http
        initialDelaySeconds: 30
        periodSeconds: 5
      readinessProbe:
        httpGet:
          path: '/health?ready=1'
          port: http
        initialDelaySeconds: 10
        periodSeconds: 5
        failureThreshold: 2
      # The folded scalar is one bash command line; --cache and
      # --max-sql-memory each get a quarter of MEMORY_LIMIT_MIB.
      command:
        - '/bin/bash'
        - '-ecx'
        - >-
          exec /cockroach/cockroach start
          --logtostderr
          --insecure
          --advertise-host $(hostname -f)
          --http-addr 0.0.0.0
          --join roach-seattle-1.cockroachdb,roach-newyork-1.cockroachdb,roach-london-1.cockroachdb
          --cache $(expr $MEMORY_LIMIT_MIB / 4)MiB
          --max-sql-memory $(expr $MEMORY_LIMIT_MIB / 4)MiB
          --locality=region=us-east4,zone=b
---
# Data volume claimed by pod roach-newyork-2 (mounted at
# /cockroach/cockroach-data).
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: roach-newyork-2-data
  labels:
    app: cockroachdb
spec:
  # The claim is mounted by exactly one pod, so ReadWriteOnce is sufficient.
  # ReadWriteMany is refused by many dynamic provisioners (block-storage or
  # node-local backed classes), which leaves the claim Pending.
  accessModes:
    - ReadWriteOnce
  volumeMode: Filesystem
  storageClassName: standard
  resources:
    requests:
      storage: 1Gi
---
# roach-newyork-3: CockroachDB node advertised as region=us-east4, zone=c.
apiVersion: v1
kind: Pod
metadata:
  name: roach-newyork-3
  labels:
    app: cockroachdb
    region: us-east4
spec:
  # hostname + subdomain give the pod the DNS name roach-newyork-3.cockroachdb.
  hostname: roach-newyork-3
  subdomain: cockroachdb
  terminationGracePeriodSeconds: 60
  volumes:
    - name: datadir
      persistentVolumeClaim:
        claimName: roach-newyork-3-data
  containers:
    - name: roach-newyork-3
      image: cockroachdb/cockroach:latest
      imagePullPolicy: IfNotPresent
      ports:
        - name: grpc
          containerPort: 26257
        - name: http
          containerPort: 8080
      env:
        - name: COCKROACH_CHANNEL
          value: kubernetes-insecure
        # CPU limit of the container, exported as GOMAXPROCS.
        - name: GOMAXPROCS
          valueFrom:
            resourceFieldRef:
              resource: limits.cpu
              divisor: '1'
        # Memory limit in MiB; the start command derives cache sizes from it.
        - name: MEMORY_LIMIT_MIB
          valueFrom:
            resourceFieldRef:
              resource: limits.memory
              divisor: '1Mi'
      volumeMounts:
        - name: datadir
          mountPath: /cockroach/cockroach-data
      livenessProbe:
        httpGet:
          path: '/health'
          port: http
        initialDelaySeconds: 30
        periodSeconds: 5
      readinessProbe:
        httpGet:
          path: '/health?ready=1'
          port: http
        initialDelaySeconds: 10
        periodSeconds: 5
        failureThreshold: 2
      # The folded scalar is one bash command line; --cache and
      # --max-sql-memory each get a quarter of MEMORY_LIMIT_MIB.
      command:
        - '/bin/bash'
        - '-ecx'
        - >-
          exec /cockroach/cockroach start
          --logtostderr
          --insecure
          --advertise-host $(hostname -f)
          --http-addr 0.0.0.0
          --join roach-seattle-1.cockroachdb,roach-newyork-1.cockroachdb,roach-london-1.cockroachdb
          --cache $(expr $MEMORY_LIMIT_MIB / 4)MiB
          --max-sql-memory $(expr $MEMORY_LIMIT_MIB / 4)MiB
          --locality=region=us-east4,zone=c
---
# Data volume claimed by pod roach-newyork-3 (mounted at
# /cockroach/cockroach-data).
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: roach-newyork-3-data
  labels:
    app: cockroachdb
spec:
  # The claim is mounted by exactly one pod, so ReadWriteOnce is sufficient.
  # ReadWriteMany is refused by many dynamic provisioners (block-storage or
  # node-local backed classes), which leaves the claim Pending.
  accessModes:
    - ReadWriteOnce
  volumeMode: Filesystem
  storageClassName: standard
  resources:
    requests:
      storage: 1Gi
---
# roach-london-1: CockroachDB node advertised as region=eu-west2, zone=a.
apiVersion: v1
kind: Pod
metadata:
  name: roach-london-1
  labels:
    app: cockroachdb
    region: eu-west2
spec:
  # hostname + subdomain give the pod the DNS name roach-london-1.cockroachdb,
  # which the --join list below relies on.
  hostname: roach-london-1
  subdomain: cockroachdb
  terminationGracePeriodSeconds: 60
  volumes:
    - name: datadir
      persistentVolumeClaim:
        claimName: roach-london-1-data
  containers:
    - name: roach-london-1
      image: cockroachdb/cockroach:latest
      imagePullPolicy: IfNotPresent
      ports:
        - name: grpc
          containerPort: 26257
        - name: http
          containerPort: 8080
      env:
        - name: COCKROACH_CHANNEL
          value: kubernetes-insecure
        # CPU limit of the container, exported as GOMAXPROCS.
        - name: GOMAXPROCS
          valueFrom:
            resourceFieldRef:
              resource: limits.cpu
              divisor: '1'
        # Memory limit in MiB; the start command derives cache sizes from it.
        - name: MEMORY_LIMIT_MIB
          valueFrom:
            resourceFieldRef:
              resource: limits.memory
              divisor: '1Mi'
      volumeMounts:
        - name: datadir
          mountPath: /cockroach/cockroach-data
      livenessProbe:
        httpGet:
          path: '/health'
          port: http
        initialDelaySeconds: 30
        periodSeconds: 5
      readinessProbe:
        httpGet:
          path: '/health?ready=1'
          port: http
        initialDelaySeconds: 10
        periodSeconds: 5
        failureThreshold: 2
      # The folded scalar is one bash command line; --cache and
      # --max-sql-memory each get a quarter of MEMORY_LIMIT_MIB.
      command:
        - '/bin/bash'
        - '-ecx'
        - >-
          exec /cockroach/cockroach start
          --logtostderr
          --insecure
          --advertise-host $(hostname -f)
          --http-addr 0.0.0.0
          --join roach-seattle-1.cockroachdb,roach-newyork-1.cockroachdb,roach-london-1.cockroachdb
          --cache $(expr $MEMORY_LIMIT_MIB / 4)MiB
          --max-sql-memory $(expr $MEMORY_LIMIT_MIB / 4)MiB
          --locality=region=eu-west2,zone=a
---
# Data volume claimed by pod roach-london-1 (mounted at
# /cockroach/cockroach-data).
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: roach-london-1-data
  labels:
    app: cockroachdb
spec:
  # The claim is mounted by exactly one pod, so ReadWriteOnce is sufficient.
  # ReadWriteMany is refused by many dynamic provisioners (block-storage or
  # node-local backed classes), which leaves the claim Pending.
  accessModes:
    - ReadWriteOnce
  volumeMode: Filesystem
  storageClassName: standard
  resources:
    requests:
      storage: 1Gi
---
# roach-london-2: CockroachDB node advertised as region=eu-west2, zone=b.
apiVersion: v1
kind: Pod
metadata:
  name: roach-london-2
  labels:
    app: cockroachdb
    region: eu-west2
spec:
  # hostname + subdomain give the pod the DNS name roach-london-2.cockroachdb.
  hostname: roach-london-2
  subdomain: cockroachdb
  terminationGracePeriodSeconds: 60
  volumes:
    - name: datadir
      persistentVolumeClaim:
        claimName: roach-london-2-data
  containers:
    - name: roach-london-2
      image: cockroachdb/cockroach:latest
      imagePullPolicy: IfNotPresent
      ports:
        - name: grpc
          containerPort: 26257
        - name: http
          containerPort: 8080
      env:
        - name: COCKROACH_CHANNEL
          value: kubernetes-insecure
        # CPU limit of the container, exported as GOMAXPROCS.
        - name: GOMAXPROCS
          valueFrom:
            resourceFieldRef:
              resource: limits.cpu
              divisor: '1'
        # Memory limit in MiB; the start command derives cache sizes from it.
        - name: MEMORY_LIMIT_MIB
          valueFrom:
            resourceFieldRef:
              resource: limits.memory
              divisor: '1Mi'
      volumeMounts:
        - name: datadir
          mountPath: /cockroach/cockroach-data
      livenessProbe:
        httpGet:
          path: '/health'
          port: http
        initialDelaySeconds: 30
        periodSeconds: 5
      readinessProbe:
        httpGet:
          path: '/health?ready=1'
          port: http
        initialDelaySeconds: 10
        periodSeconds: 5
        failureThreshold: 2
      # The folded scalar is one bash command line; --cache and
      # --max-sql-memory each get a quarter of MEMORY_LIMIT_MIB.
      command:
        - '/bin/bash'
        - '-ecx'
        - >-
          exec /cockroach/cockroach start
          --logtostderr
          --insecure
          --advertise-host $(hostname -f)
          --http-addr 0.0.0.0
          --join roach-seattle-1.cockroachdb,roach-newyork-1.cockroachdb,roach-london-1.cockroachdb
          --cache $(expr $MEMORY_LIMIT_MIB / 4)MiB
          --max-sql-memory $(expr $MEMORY_LIMIT_MIB / 4)MiB
          --locality=region=eu-west2,zone=b
---
# Data volume claimed by pod roach-london-2 (mounted at
# /cockroach/cockroach-data).
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: roach-london-2-data
  labels:
    app: cockroachdb
spec:
  # The claim is mounted by exactly one pod, so ReadWriteOnce is sufficient.
  # ReadWriteMany is refused by many dynamic provisioners (block-storage or
  # node-local backed classes), which leaves the claim Pending.
  accessModes:
    - ReadWriteOnce
  volumeMode: Filesystem
  storageClassName: standard
  resources:
    requests:
      storage: 1Gi
---
# roach-london-3: CockroachDB node advertised as region=eu-west2, zone=c.
apiVersion: v1
kind: Pod
metadata:
  name: roach-london-3
  labels:
    app: cockroachdb
    region: eu-west2
spec:
  # hostname + subdomain give the pod the DNS name roach-london-3.cockroachdb.
  hostname: roach-london-3
  subdomain: cockroachdb
  terminationGracePeriodSeconds: 60
  volumes:
    - name: datadir
      persistentVolumeClaim:
        claimName: roach-london-3-data
  containers:
    - name: roach-london-3
      image: cockroachdb/cockroach:latest
      imagePullPolicy: IfNotPresent
      ports:
        - name: grpc
          containerPort: 26257
        - name: http
          containerPort: 8080
      env:
        - name: COCKROACH_CHANNEL
          value: kubernetes-insecure
        # CPU limit of the container, exported as GOMAXPROCS.
        - name: GOMAXPROCS
          valueFrom:
            resourceFieldRef:
              resource: limits.cpu
              divisor: '1'
        # Memory limit in MiB; the start command derives cache sizes from it.
        - name: MEMORY_LIMIT_MIB
          valueFrom:
            resourceFieldRef:
              resource: limits.memory
              divisor: '1Mi'
      volumeMounts:
        - name: datadir
          mountPath: /cockroach/cockroach-data
      livenessProbe:
        httpGet:
          path: '/health'
          port: http
        initialDelaySeconds: 30
        periodSeconds: 5
      readinessProbe:
        httpGet:
          path: '/health?ready=1'
          port: http
        initialDelaySeconds: 10
        periodSeconds: 5
        failureThreshold: 2
      # The folded scalar is one bash command line; --cache and
      # --max-sql-memory each get a quarter of MEMORY_LIMIT_MIB.
      command:
        - '/bin/bash'
        - '-ecx'
        - >-
          exec /cockroach/cockroach start
          --logtostderr
          --insecure
          --advertise-host $(hostname -f)
          --http-addr 0.0.0.0
          --join roach-seattle-1.cockroachdb,roach-newyork-1.cockroachdb,roach-london-1.cockroachdb
          --cache $(expr $MEMORY_LIMIT_MIB / 4)MiB
          --max-sql-memory $(expr $MEMORY_LIMIT_MIB / 4)MiB
          --locality=region=eu-west2,zone=c
---
# Data volume claimed by pod roach-london-3 (mounted at
# /cockroach/cockroach-data).
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: roach-london-3-data
  labels:
    app: cockroachdb
spec:
  # The claim is mounted by exactly one pod, so ReadWriteOnce is sufficient.
  # ReadWriteMany is refused by many dynamic provisioners (block-storage or
  # node-local backed classes), which leaves the claim Pending.
  accessModes:
    - ReadWriteOnce
  volumeMode: Filesystem
  storageClassName: standard
  resources:
    requests:
      storage: 1Gi
########################
# INIT AND CONFIG JOBS #
########################
---
# One-shot Job that runs `cockroach init` against roach-seattle-1; the
# container is restarted on failure.
apiVersion: batch/v1
kind: Job
metadata:
  name: cluster-init
  labels:
    app: cockroachdb
spec:
  template:
    spec:
      restartPolicy: OnFailure
      containers:
        - name: cluster-init
          image: cockroachdb/cockroach:latest
          imagePullPolicy: IfNotPresent
          command:
            - '/cockroach/cockroach'
            - 'init'
            - '--insecure'
            - '--host=roach-seattle-1.cockroachdb'
---
# One-shot Job that runs a single SQL statement through roach-seattle-1:
# it upserts latitude/longitude rows for the three regions into
# system.locations. The container is restarted on failure.
apiVersion: batch/v1
kind: Job
metadata:
  name: cluster-sql-init
  labels:
    app: cockroachdb
spec:
  template:
    spec:
      restartPolicy: OnFailure
      containers:
        - name: cluster-sql-init
          image: cockroachdb/cockroach:latest
          imagePullPolicy: IfNotPresent
          command:
            - '/cockroach/cockroach'
            - 'sql'
            - '--insecure'
            # Connection string of the node that executes the statement.
            - '--url'
            - 'postgresql://roach-seattle-1.cockroachdb:26257/defaultdb?sslmode=disable'
            # Statement to execute.
            - '-e'
            - "UPSERT into system.locations VALUES ('region', 'us-east4', 37.478397, -76.453077), ('region', 'us-west2', 43.804133, -120.554201), ('region', 'eu-west2', 51.5073509, -0.1277583);"
#############
# MINIO #
#############
---
# NodePort Service exposing the MinIO pod's port 9000 on node port 31900.
apiVersion: v1
kind: Service
metadata:
  name: minio
  labels:
    app: minio
spec:
  type: NodePort
  selector:
    app: minio
  ports:
    # UI
    - name: http
      nodePort: 31900
      port: 9000
      targetPort: 9000
---
# MinIO server pod; object data lives under /data on the minio-data claim.
apiVersion: v1
kind: Pod
metadata:
  name: minio
  labels:
    app: minio
spec:
  hostname: minio
  volumes:
    - name: minio-data
      persistentVolumeClaim:
        claimName: minio-data
  containers:
    - name: minio
      image: minio/minio
      imagePullPolicy: IfNotPresent
      # Arguments passed to the image entrypoint: serve the /data directory.
      args:
        - 'server'
        - '/data'
      ports:
        - name: http
          containerPort: 9000
      volumeMounts:
        - name: minio-data
          mountPath: /data
---
# Data volume claimed by the minio pod (mounted at /data).
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: minio-data
  labels:
    # Labelled like the MinIO Service and Pod, so label-based operations on
    # app=cockroachdb do not touch MinIO's storage.
    app: minio
spec:
  # The claim is mounted by exactly one pod, so ReadWriteOnce is sufficient.
  # ReadWriteMany is refused by many dynamic provisioners (block-storage or
  # node-local backed classes), which leaves the claim Pending.
  accessModes:
    - ReadWriteOnce
  volumeMode: Filesystem
  storageClassName: standard
  resources:
    requests:
      storage: 1Gi
##################
# PROMETHEUS #
##################
---
# Prometheus configuration, mounted file-by-file (via subPath) into the
# Prometheus pod under /etc/prometheus/:
#   prometheus.yml         - scrape targets, rule files and Alertmanager address
#   alerts.rules.yml       - alerting rules for the CockroachDB cluster
#   aggregation.rules.yml  - recording rules (capacity ratios, latency quantiles)
apiVersion: v1
kind: ConfigMap
metadata:
  name: prometheus-config
data:
  prometheus.yml: |
    ---
    global:
      scrape_interval: 10s
      evaluation_interval: 10s
    rule_files:
      # what to alert for
      - /etc/prometheus/alerts.rules.yml
      # what metrics to collect
      - /etc/prometheus/aggregation.rules.yml
    # whom to alert
    alerting:
      alertmanagers:
        - static_configs:
            - targets:
                # Alertmanager is published by the "alertmgr" Service on 9093;
                # the "cockroachdb" Service fronts the database pods instead.
                - alertmgr:9093
    scrape_configs:
      - job_name: "cockroachdb"
        metrics_path: "/_status/vars"
        scheme: "http"
        tls_config:
          insecure_skip_verify: true
        static_configs:
          # what hosts to monitor
          - targets:
              - roach-seattle-1.cockroachdb:8080
              - roach-seattle-2.cockroachdb:8080
              - roach-seattle-3.cockroachdb:8080
              - roach-newyork-1.cockroachdb:8080
              - roach-newyork-2.cockroachdb:8080
              - roach-newyork-3.cockroachdb:8080
              - roach-london-1.cockroachdb:8080
              - roach-london-2.cockroachdb:8080
              - roach-london-3.cockroachdb:8080
            labels:
              cluster: "crdb"
  alerts.rules.yml: |
    groups:
      - name: rules/alerts.rules
        rules:
          # Alert for any instance that is unreachable for >1 minute.
          - alert: InstanceDead
            expr: up{job="cockroachdb"} == 0
            for: 1m
            annotations:
              description: '{{ $labels.instance }} for cluster {{ $labels.cluster }} has been
                down for more than 1 minute.'
              summary: Instance {{ $labels.instance }} dead
          # Alert for any instance that is not ready for a while.
          - alert: InstanceNotReady
            # This alert applies only to Kubernetes deployments and requires that you run kube-state-metrics: https://github.com/kubernetes/kube-state-metrics
            expr: kube_statefulset_status_replicas_ready{statefulset="cockroachdb"} != kube_statefulset_status_replicas{statefulset="cockroachdb"}
            for: 45m
            annotations:
              description: 'there has been an unready replica for cluster {{ $labels.cluster }}
                for more than 45 minutes.'
              summary: Instance not ready
          # Alert on instance restarts.
          - alert: InstanceRestart
            expr: resets(sys_uptime{job="cockroachdb"}[24h]) > 1
            annotations:
              description: '{{ $labels.instance }} for cluster {{ $labels.cluster }} restarted
                {{ $value }} time(s) in 24h'
              summary: Instance {{ $labels.instance }} restarted
          # Alert on flapping instances (frequent restarts).
          - alert: InstancesFlapping
            # Aggregated.
            # This alert assumes that rolling restarts or rolling upgrades leave at least 3 minutes between each node being updated or restarted.
            expr: sum by (cluster)(resets(sys_uptime{job="cockroachdb"}[5m])) > 2
            annotations:
              description: 'instances in cluster {{ $labels.cluster }} restarted
                {{ $value }} time(s) in 5m'
              summary: Instances in {{ $labels.cluster }} flapping
          # Alert on flapping instances (frequent restarts).
          - alert: InstanceFlapping
            # Un-aggregated.
            expr: resets(sys_uptime{job="cockroachdb"}[10m]) > 1
            annotations:
              description: '{{ $labels.instance }} for cluster {{ $labels.cluster }} restarted
                {{ $value }} time(s) in 10m'
              summary: Instance {{ $labels.instance }} flapping
          # Alert on version mismatch.
          # This alert is intentionally loose (4 hours) to allow for rolling upgrades.
          # This may need to be adjusted for large clusters.
          - alert: VersionMismatch
            expr: count by(cluster) (count_values by(tag, cluster) ("version", build_timestamp{job="cockroachdb"}))
              > 1
            for: 4h
            annotations:
              description: Cluster {{ $labels.cluster }} running {{ $value }} different versions
              summary: Binary version mismatch on {{ $labels.cluster }}
          # Available capacity alerts.
          - alert: StoreDiskLow
            expr: capacity_available:ratio{job="cockroachdb"} < 0.15
            annotations:
              summary: Store {{ $labels.store }} on node {{ $labels.instance }} at {{ $value
                }} available disk fraction
          - alert: ClusterDiskLow
            expr: cluster:capacity_available:ratio{job="cockroachdb"} < 0.2
            annotations:
              summary: Cluster {{ $labels.cluster }} at {{ $value }} available disk fraction
          # Unavailable ranges.
          - alert: UnavailableRanges
            expr: (sum by(instance, cluster) (ranges_unavailable{job="cockroachdb"})) > 0
            for: 10m
            annotations:
              summary: Instance {{ $labels.instance }} has {{ $value }} unavailable ranges
          # Cockroach-measured clock offset nearing limit (by default, servers kill themselves at 400ms from the mean, so alert at 300ms)
          - alert: ClockOffsetNearMax
            expr: clock_offset_meannanos{job="cockroachdb"} > 300 * 1000 * 1000
            for: 5m
            annotations:
              summary: Clock on {{ $labels.instance }} as measured by cockroach is offset by {{ $value }} nanoseconds from the cluster mean
          # Certificate expiration. Alerts are per node.
          - alert: CACertificateExpiresSoon
            expr: (security_certificate_expiration_ca{job="cockroachdb"} > 0) and (security_certificate_expiration_ca{job="cockroachdb"}
              - time()) < 86400 * 366
            labels:
              frequency: daily
            annotations:
              summary: CA certificate for {{ $labels.instance }} expires in less than a year
          - alert: ClientCACertificateExpiresSoon
            expr: (security_certificate_expiration_client_ca{job="cockroachdb"} > 0) and (security_certificate_expiration_client_ca{job="cockroachdb"}
              - time()) < 86400 * 366
            labels:
              frequency: daily
            annotations:
              summary: Client CA certificate for {{ $labels.instance }} expires in less than a year
          - alert: UICACertificateExpiresSoon
            expr: (security_certificate_expiration_ui_ca{job="cockroachdb"} > 0) and (security_certificate_expiration_ui_ca{job="cockroachdb"}
              - time()) < 86400 * 366
            labels:
              frequency: daily
            annotations:
              summary: UI CA certificate for {{ $labels.instance }} expires in less than a year
          - alert: NodeCertificateExpiresSoon
            expr: (security_certificate_expiration_node{job="cockroachdb"} > 0) and (security_certificate_expiration_node{job="cockroachdb"}
              - time()) < 86400 * 183
            labels:
              frequency: daily
            annotations:
              summary: Node certificate for {{ $labels.instance }} expires in less than six months
          - alert: NodeClientCertificateExpiresSoon
            expr: (security_certificate_expiration_node_client{job="cockroachdb"} > 0) and (security_certificate_expiration_node_client{job="cockroachdb"}
              - time()) < 86400 * 183
            labels:
              frequency: daily
            annotations:
              summary: Client certificate for {{ $labels.instance }} expires in less than six months
          - alert: UICertificateExpiresSoon
            expr: (security_certificate_expiration_ui{job="cockroachdb"} > 0) and (security_certificate_expiration_ui{job="cockroachdb"}
              - time()) < 86400 * 20
            labels:
              frequency: daily
            annotations:
              summary: UI certificate for {{ $labels.instance }} expires in less than 20 days
          # Slow Latch/Lease/Raft requests.
          - alert: SlowLatchRequest
            expr: requests_slow_latch{job="cockroachdb"} > 0
            for: 5m
            labels:
              severity: testing
            annotations:
              summary: '{{ $value }} slow latch requests on {{ $labels.instance }}'
          - alert: SlowLeaseRequest
            expr: requests_slow_lease{job="cockroachdb"} > 0
            for: 5m
            labels:
              severity: testing
            annotations:
              summary: '{{ $value }} slow lease requests on {{ $labels.instance }}'
          - alert: SlowRaftRequest
            expr: requests_slow_raft{job="cockroachdb"} > 0
            for: 5m
            labels:
              severity: testing
            annotations:
              summary: '{{ $value }} slow raft requests on {{ $labels.instance }}'
          # Getting close to open file descriptor limit.
          - alert: HighOpenFDCount
            expr: sys_fd_open{job="cockroachdb"} / sys_fd_softlimit{job="cockroachdb"} > 0.8
            for: 10m
            annotations:
              summary: 'Too many open file descriptors on {{ $labels.instance }}: {{ $value
                }} fraction used'
          # Prometheus disk getting full.
          - alert: PrometheusDiskLow
            expr: node_filesystem_free{cluster="prometheus",job="node_exporter_prometheus",mountpoint="/data"}
              / node_filesystem_size{cluster="prometheus",job="node_exporter_prometheus",mountpoint="/data"}
              < 0.2
            for: 10m
            labels:
              severity: testing
            annotations:
              summary: 'Prometheus storage is almost full: {{ $value }} fraction free'
  aggregation.rules.yml: |
    # This file contains aggregation rules, specifically:
    # "node:X" node-level aggregation of a per-store metric X
    # "cluster:X" cluster-level aggregation of a per-store or per-node metric X
    #
    # Most aggregation rules should use the "without (label1, label2, ...)" keyword
    # to keep all labels but the ones specified.
    groups:
      - name: rules/aggregation.rules
        rules:
          - record: node:capacity
            expr: sum without(store) (capacity{job="cockroachdb"})
          - record: cluster:capacity
            expr: sum without(instance) (node:capacity{job="cockroachdb"})
          - record: node:capacity_available
            expr: sum without(store) (capacity_available{job="cockroachdb"})
          - record: cluster:capacity_available
            expr: sum without(instance) (node:capacity_available{job="cockroachdb"})
          - record: capacity_available:ratio
            expr: capacity_available{job="cockroachdb"} / capacity{job="cockroachdb"}
          - record: node:capacity_available:ratio
            expr: node:capacity_available{job="cockroachdb"} / node:capacity{job="cockroachdb"}
          - record: cluster:capacity_available:ratio
            expr: cluster:capacity_available{job="cockroachdb"} / cluster:capacity{job="cockroachdb"}
          # Histogram rules: these are fairly expensive to compute live, so we precompute a few percentiles.
          - record: txn_durations_bucket:rate1m
            expr: rate(txn_durations_bucket{job="cockroachdb"}[1m])
          - record: txn_durations:rate1m:quantile_50
            expr: histogram_quantile(0.5, txn_durations_bucket:rate1m)
          - record: txn_durations:rate1m:quantile_75
            expr: histogram_quantile(0.75, txn_durations_bucket:rate1m)
          - record: txn_durations:rate1m:quantile_90
            expr: histogram_quantile(0.9, txn_durations_bucket:rate1m)
          - record: txn_durations:rate1m:quantile_95
            expr: histogram_quantile(0.95, txn_durations_bucket:rate1m)
          - record: txn_durations:rate1m:quantile_99
            expr: histogram_quantile(0.99, txn_durations_bucket:rate1m)
          - record: exec_latency_bucket:rate1m
            expr: rate(exec_latency_bucket{job="cockroachdb"}[1m])
          - record: exec_latency:rate1m:quantile_50
            expr: histogram_quantile(0.5, exec_latency_bucket:rate1m)
          - record: exec_latency:rate1m:quantile_75
            expr: histogram_quantile(0.75, exec_latency_bucket:rate1m)
          - record: exec_latency:rate1m:quantile_90
            expr: histogram_quantile(0.9, exec_latency_bucket:rate1m)
          - record: exec_latency:rate1m:quantile_95
            expr: histogram_quantile(0.95, exec_latency_bucket:rate1m)
          - record: exec_latency:rate1m:quantile_99
            expr: histogram_quantile(0.99, exec_latency_bucket:rate1m)
          - record: round_trip_latency_bucket:rate1m
            expr: rate(round_trip_latency_bucket{job="cockroachdb"}[1m])
          - record: round_trip_latency:rate1m:quantile_50
            expr: histogram_quantile(0.5, round_trip_latency_bucket:rate1m)
          - record: round_trip_latency:rate1m:quantile_75
            expr: histogram_quantile(0.75, round_trip_latency_bucket:rate1m)
          - record: round_trip_latency:rate1m:quantile_90
            expr: histogram_quantile(0.9, round_trip_latency_bucket:rate1m)
          - record: round_trip_latency:rate1m:quantile_95
            expr: histogram_quantile(0.95, round_trip_latency_bucket:rate1m)
          - record: round_trip_latency:rate1m:quantile_99
            expr: histogram_quantile(0.99, round_trip_latency_bucket:rate1m)
          - record: sql_exec_latency_bucket:rate1m
            expr: rate(sql_exec_latency_bucket{job="cockroachdb"}[1m])
          - record: sql_exec_latency:rate1m:quantile_50
            expr: histogram_quantile(0.5, sql_exec_latency_bucket:rate1m)
          - record: sql_exec_latency:rate1m:quantile_75
            expr: histogram_quantile(0.75, sql_exec_latency_bucket:rate1m)
          - record: sql_exec_latency:rate1m:quantile_90
            expr: histogram_quantile(0.9, sql_exec_latency_bucket:rate1m)
          - record: sql_exec_latency:rate1m:quantile_95
            expr: histogram_quantile(0.95, sql_exec_latency_bucket:rate1m)
          - record: sql_exec_latency:rate1m:quantile_99
            expr: histogram_quantile(0.99, sql_exec_latency_bucket:rate1m)
          - record: raft_process_logcommit_latency_bucket:rate1m
            expr: rate(raft_process_logcommit_latency_bucket{job="cockroachdb"}[1m])
          - record: raft_process_logcommit_latency:rate1m:quantile_50
            expr: histogram_quantile(0.5, raft_process_logcommit_latency_bucket:rate1m)
          - record: raft_process_logcommit_latency:rate1m:quantile_75
            expr: histogram_quantile(0.75, raft_process_logcommit_latency_bucket:rate1m)
          - record: raft_process_logcommit_latency:rate1m:quantile_90
            expr: histogram_quantile(0.9, raft_process_logcommit_latency_bucket:rate1m)
          - record: raft_process_logcommit_latency:rate1m:quantile_95
            expr: histogram_quantile(0.95, raft_process_logcommit_latency_bucket:rate1m)
          - record: raft_process_logcommit_latency:rate1m:quantile_99
            expr: histogram_quantile(0.99, raft_process_logcommit_latency_bucket:rate1m)
          - record: raft_process_commandcommit_latency_bucket:rate1m
            expr: rate(raft_process_commandcommit_latency_bucket{job="cockroachdb"}[1m])
          - record: raft_process_commandcommit_latency:rate1m:quantile_50
            expr: histogram_quantile(0.5, raft_process_commandcommit_latency_bucket:rate1m)
          - record: raft_process_commandcommit_latency:rate1m:quantile_75
            expr: histogram_quantile(0.75, raft_process_commandcommit_latency_bucket:rate1m)
          - record: raft_process_commandcommit_latency:rate1m:quantile_90
            expr: histogram_quantile(0.9, raft_process_commandcommit_latency_bucket:rate1m)
          - record: raft_process_commandcommit_latency:rate1m:quantile_95
            expr: histogram_quantile(0.95, raft_process_commandcommit_latency_bucket:rate1m)
          - record: raft_process_commandcommit_latency:rate1m:quantile_99
            expr: histogram_quantile(0.99, raft_process_commandcommit_latency_bucket:rate1m)
---
# NodePort Service exposing the Prometheus pod's port 9090 on node port 31990.
apiVersion: v1
kind: Service
metadata:
  name: prom
  labels:
    app: prom
spec:
  type: NodePort
  ports:
    # UI
    - name: http
      port: 9090
      targetPort: 9090
      nodePort: 31990
  # Selects only the Prometheus pod. A dedicated label keeps this Service from
  # load-balancing onto database pods, which do not serve port 9090.
  selector:
    app: prom
---
# Prometheus server pod. Its configuration and rule files come from the
# prometheus-config ConfigMap, one file per subPath mount.
apiVersion: v1
kind: Pod
metadata:
  name: prom
  labels:
    # Own label (not app: cockroachdb) so the pod is matched by the prom
    # Service only and is not enrolled in Services that select database pods.
    app: prom
spec:
  hostname: prom
  # subdomain: cockroachdb
  containers:
    - name: prom
      image: prom/prometheus
      imagePullPolicy: IfNotPresent
      ports:
        # Same port the prom Service targets.
        - containerPort: 9090
          name: http
      volumeMounts:
        - name: prometheus-config
          mountPath: /etc/prometheus/prometheus.yml
          subPath: prometheus.yml
        - name: prometheus-config
          mountPath: /etc/prometheus/aggregation.rules.yml
          subPath: aggregation.rules.yml
        - name: prometheus-config
          mountPath: /etc/prometheus/alerts.rules.yml
          subPath: alerts.rules.yml
  volumes:
    - name: prometheus-config
      configMap:
        name: prometheus-config
################
# ALERTMANAGER #
################
---
# NodePort Service exposing the Alertmanager pod's port 9093 on node port
# 31993.
apiVersion: v1
kind: Service
metadata:
  name: alertmgr
  labels:
    app: alertmgr
spec:
  type: NodePort
  ports:
    # UI
    - name: http
      port: 9093
      targetPort: 9093
      nodePort: 31993
  # Selects only the Alertmanager pod. A dedicated label keeps this Service
  # from load-balancing onto database pods, which do not serve port 9093.
  selector:
    app: alertmgr
---
# Alertmanager pod, started with the image's default configuration.
apiVersion: v1
kind: Pod
metadata:
  name: alertmgr
  labels:
    # Own label (not app: cockroachdb) so the pod is matched by the alertmgr
    # Service only and is not enrolled in Services that select database pods.
    app: alertmgr
spec:
  hostname: alertmgr
  # subdomain: cockroachdb
  containers:
    - name: alertmgr
      image: quay.io/prometheus/alertmanager:latest
      imagePullPolicy: IfNotPresent
      ports:
        - containerPort: 9093
          name: http
###########
# GRAFANA #
###########
---
# NodePort Service exposing the Grafana pod's port 3000 on node port 32000.
apiVersion: v1
kind: Service
metadata:
  name: grafana
  labels:
    app: grafana
spec:
  type: NodePort
  selector:
    app: grafana
  ports:
    # UI
    - name: http
      nodePort: 32000
      port: 3000
      targetPort: 3000
---
# Grafana pod; no volumes are declared, so it runs on the image's defaults.
apiVersion: v1
kind: Pod
metadata:
  name: grafana
  labels:
    app: grafana
spec:
  hostname: grafana
  containers:
    - name: grafana
      image: grafana/grafana
      imagePullPolicy: IfNotPresent
      ports:
        - name: http
          containerPort: 3000
# End of gist content (the GitHub sign-in footer is not part of the manifest).