# fabiog1901/chaos.yaml

## chaos.yaml
#################
# NETWORK CHAOS #
#################
---
# Injects 60ms of latency on traffic sent by the Seattle pods (plus the
# jumpbox) toward the New York pods.
apiVersion: chaos-mesh.org/v1alpha1
kind: NetworkChaos
metadata:
  labels:
    app: cockroachdb
  name: delay-uswest-useast
spec:
  action: delay
  delay:
    latency: '60ms'
  # 'to': the delay applies to packets going from the selector pods to the
  # target pods
  direction: to
  # source side: every listed pod is affected (mode: all)
  mode: all
  selector:
    pods:
      # key = namespace, values = pod names in that namespace
      default:
        - roach-seattle-1
        - roach-seattle-2
        - roach-seattle-3
        - jumpbox
  # destination side: all New York pods
  target:
    mode: all
    selector:
      pods:
        default:
          - roach-newyork-1
          - roach-newyork-2
          - roach-newyork-3

---
# Injects 180ms of latency on traffic sent by the London pods (plus the
# jumpbox) toward the Seattle pods. Note that, despite the resource name,
# London is the source side here and Seattle is the target.
apiVersion: chaos-mesh.org/v1alpha1
kind: NetworkChaos
metadata:
  labels:
    app: cockroachdb
  name: delay-uswest-euwest
spec:
  action: delay
  delay:
    latency: '180ms'
  # 'to': the delay applies to packets going from the selector pods to the
  # target pods
  direction: to
  # source side: every listed pod is affected (mode: all)
  mode: all
  selector:
    pods:
      # key = namespace, values = pod names in that namespace
      default:
        - roach-london-1
        - roach-london-2
        - roach-london-3
        - jumpbox
  # destination side: all Seattle pods
  target:
    mode: all
    selector:
      pods:
        default:
          - roach-seattle-1
          - roach-seattle-2
          - roach-seattle-3

---
# Injects 120ms of latency on traffic sent by the New York pods (plus the
# jumpbox) toward the London pods.
apiVersion: chaos-mesh.org/v1alpha1
kind: NetworkChaos
metadata:
  labels:
    app: cockroachdb
  name: delay-useast-euwest
spec:
  action: delay
  delay:
    latency: '120ms'
  # 'to': the delay applies to packets going from the selector pods to the
  # target pods
  direction: to
  # source side: every listed pod is affected (mode: all)
  mode: all
  selector:
    pods:
      # key = namespace, values = pod names in that namespace
      default:
        - roach-newyork-1
        - roach-newyork-2
        - roach-newyork-3
        - jumpbox
  # destination side: all London pods
  target:
    mode: all
    selector:
      pods:
        default:
          - roach-london-1
          - roach-london-2
          - roach-london-3

## crdb-k8s-cluster.yaml
############
# SERVICES #
############
---
# us-west2: NodePort Service in front of the CockroachDB pods of that region
apiVersion: v1
kind: Service
metadata:
  labels:
    app: cockroachdb
  name: us-west2
spec:
  type: NodePort
  # traffic goes to pods carrying both of these labels
  selector:
    app: cockroachdb
    region: us-west2
  ports:
    # SQL client port, exposed on node port 31257
    - name: grpc
      nodePort: 31257
      port: 26257
      targetPort: 26257
    # Admin UI, exposed on node port 31080
    - name: http
      nodePort: 31080
      port: 8080
      targetPort: 8080

---
# us-east4: NodePort Service in front of the CockroachDB pods of that region
apiVersion: v1
kind: Service
metadata:
  labels:
    app: cockroachdb
  name: us-east4
spec:
  type: NodePort
  # traffic goes to pods carrying both of these labels
  selector:
    app: cockroachdb
    region: us-east4
  ports:
    # SQL client port, exposed on node port 31258
    - name: grpc
      nodePort: 31258
      port: 26257
      targetPort: 26257
    # Admin UI, exposed on node port 31180
    - name: http
      nodePort: 31180
      port: 8080
      targetPort: 8080

---
# eu-west2: NodePort Service in front of the CockroachDB pods of that region
apiVersion: v1
kind: Service
metadata:
  labels:
    app: cockroachdb
  name: eu-west2
spec:
  type: NodePort
  # traffic goes to pods carrying both of these labels
  selector:
    app: cockroachdb
    region: eu-west2
  ports:
    # SQL client port, exposed on node port 31259
    - name: grpc
      nodePort: 31259
      port: 26257
      targetPort: 26257
    # Admin UI, exposed on node port 31280
    - name: http
      nodePort: 31280
      port: 8080
      targetPort: 8080

---
# Intra-node service: headless (clusterIP: None) Service selecting every pod
# labelled app=cockroachdb, regardless of region.
apiVersion: v1
kind: Service
metadata:
  name: cockroachdb
  annotations:
    # Prometheus scrape hints: metrics are served at _status/vars on port 8080
    prometheus.io/path: '_status/vars'
    prometheus.io/port: '8080'
    prometheus.io/scrape: 'true'
    # annotation form of publishNotReadyAddresses (set in spec below as well)
    service.alpha.kubernetes.io/tolerate-unready-endpoints: 'true'
  labels:
    app: cockroachdb
spec:
  clusterIP: None
  # publish pod addresses even before the pods pass their readiness probe
  publishNotReadyAddresses: true
  selector:
    app: cockroachdb
  ports:
    - name: grpc
      port: 26257
      targetPort: 26257
    - name: http
      port: 8080
      targetPort: 8080


##############
# PODS + PVC #
##############
---
# roach-seattle-1 -- CockroachDB node for region us-west2, zone a
apiVersion: v1
kind: Pod
metadata:
  labels:
    app: cockroachdb
    region: us-west2
  name: roach-seattle-1
spec:
  hostname: roach-seattle-1
  # same name as the headless "cockroachdb" Service; peers address this pod
  # as roach-seattle-1.cockroachdb (see --join below)
  subdomain: cockroachdb
  terminationGracePeriodSeconds: 60
  volumes:
    - name: datadir
      persistentVolumeClaim:
        claimName: roach-seattle-1-data
  containers:
    - name: roach-seattle-1
      image: cockroachdb/cockroach:latest
      imagePullPolicy: IfNotPresent
      command:
        - /bin/bash
        - '-ecx'
        # folded scalar: the lines below are joined by single spaces into the
        # one command string handed to bash
        - >-
          exec
          /cockroach/cockroach
          start
          --logtostderr
          --insecure
          --advertise-host $(hostname -f)
          --http-addr 0.0.0.0
          --join roach-seattle-1.cockroachdb,roach-newyork-1.cockroachdb,roach-london-1.cockroachdb
          --cache $(expr $MEMORY_LIMIT_MIB / 4)MiB
          --max-sql-memory $(expr $MEMORY_LIMIT_MIB / 4)MiB
          --locality=region=us-west2,zone=a
      env:
        - name: COCKROACH_CHANNEL
          value: kubernetes-insecure
        # NOTE(review): this container declares no resources.limits, so the two
        # limits.* references below presumably resolve to the node's
        # allocatable CPU/memory -- confirm that is intended
        - name: GOMAXPROCS
          valueFrom:
            resourceFieldRef:
              divisor: '1'
              resource: limits.cpu
        # memory limit in MiB; a quarter each goes to --cache and --max-sql-memory
        - name: MEMORY_LIMIT_MIB
          valueFrom:
            resourceFieldRef:
              divisor: '1Mi'
              resource: limits.memory
      ports:
        - name: grpc
          containerPort: 26257
        - name: http
          containerPort: 8080
      volumeMounts:
        - name: datadir
          mountPath: /cockroach/cockroach-data
      # liveness: HTTP GET /health on the http port, every 5s after a 30s delay
      livenessProbe:
        initialDelaySeconds: 30
        periodSeconds: 5
        httpGet:
          path: '/health'
          port: http
      # readiness: HTTP GET /health?ready=1, unready after 2 failed checks
      readinessProbe:
        initialDelaySeconds: 10
        periodSeconds: 5
        failureThreshold: 2
        httpGet:
          path: '/health?ready=1'
          port: http

---
# 1Gi data volume claimed by the roach-seattle-1 Pod.
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: roach-seattle-1-data
  labels:
    app: cockroachdb
spec:
  # Only the roach-seattle-1 Pod mounts this claim, so single-node read-write
  # access is enough. ReadWriteMany is not supported by many provisioners
  # behind a "standard" class and can leave the claim Pending.
  # NOTE: accessModes is immutable on an existing claim; delete and recreate
  # the claim for this change to take effect.
  accessModes:
    - ReadWriteOnce
  volumeMode: Filesystem
  storageClassName: standard
  resources:
    requests:
      storage: 1Gi

---
# roach-seattle-2 -- CockroachDB node for region us-west2, zone b
apiVersion: v1
kind: Pod
metadata:
  labels:
    app: cockroachdb
    region: us-west2
  name: roach-seattle-2
spec:
  hostname: roach-seattle-2
  # same name as the headless "cockroachdb" Service; peers would address this
  # pod as roach-seattle-2.cockroachdb
  subdomain: cockroachdb
  terminationGracePeriodSeconds: 60
  volumes:
    - name: datadir
      persistentVolumeClaim:
        claimName: roach-seattle-2-data
  containers:
    - name: roach-seattle-2
      image: cockroachdb/cockroach:latest
      imagePullPolicy: IfNotPresent
      command:
        - /bin/bash
        - '-ecx'
        # folded scalar: the lines below are joined by single spaces into the
        # one command string handed to bash
        - >-
          exec
          /cockroach/cockroach
          start
          --logtostderr
          --insecure
          --advertise-host $(hostname -f)
          --http-addr 0.0.0.0
          --join roach-seattle-1.cockroachdb,roach-newyork-1.cockroachdb,roach-london-1.cockroachdb
          --cache $(expr $MEMORY_LIMIT_MIB / 4)MiB
          --max-sql-memory $(expr $MEMORY_LIMIT_MIB / 4)MiB
          --locality=region=us-west2,zone=b
      env:
        - name: COCKROACH_CHANNEL
          value: kubernetes-insecure
        # NOTE(review): this container declares no resources.limits, so the two
        # limits.* references below presumably resolve to the node's
        # allocatable CPU/memory -- confirm that is intended
        - name: GOMAXPROCS
          valueFrom:
            resourceFieldRef:
              divisor: '1'
              resource: limits.cpu
        # memory limit in MiB; a quarter each goes to --cache and --max-sql-memory
        - name: MEMORY_LIMIT_MIB
          valueFrom:
            resourceFieldRef:
              divisor: '1Mi'
              resource: limits.memory
      ports:
        - name: grpc
          containerPort: 26257
        - name: http
          containerPort: 8080
      volumeMounts:
        - name: datadir
          mountPath: /cockroach/cockroach-data
      # liveness: HTTP GET /health on the http port, every 5s after a 30s delay
      livenessProbe:
        initialDelaySeconds: 30
        periodSeconds: 5
        httpGet:
          path: '/health'
          port: http
      # readiness: HTTP GET /health?ready=1, unready after 2 failed checks
      readinessProbe:
        initialDelaySeconds: 10
        periodSeconds: 5
        failureThreshold: 2
        httpGet:
          path: '/health?ready=1'
          port: http

---
# 1Gi data volume claimed by the roach-seattle-2 Pod.
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: roach-seattle-2-data
  labels:
    app: cockroachdb
spec:
  # Only the roach-seattle-2 Pod mounts this claim, so single-node read-write
  # access is enough. ReadWriteMany is not supported by many provisioners
  # behind a "standard" class and can leave the claim Pending.
  # NOTE: accessModes is immutable on an existing claim; delete and recreate
  # the claim for this change to take effect.
  accessModes:
    - ReadWriteOnce
  volumeMode: Filesystem
  storageClassName: standard
  resources:
    requests:
      storage: 1Gi

---
# roach-seattle-3 -- CockroachDB node for region us-west2, zone c
apiVersion: v1
kind: Pod
metadata:
  labels:
    app: cockroachdb
    region: us-west2
  name: roach-seattle-3
spec:
  hostname: roach-seattle-3
  # same name as the headless "cockroachdb" Service; peers would address this
  # pod as roach-seattle-3.cockroachdb
  subdomain: cockroachdb
  terminationGracePeriodSeconds: 60
  volumes:
    - name: datadir
      persistentVolumeClaim:
        claimName: roach-seattle-3-data
  containers:
    - name: roach-seattle-3
      image: cockroachdb/cockroach:latest
      imagePullPolicy: IfNotPresent
      command:
        - /bin/bash
        - '-ecx'
        # folded scalar: the lines below are joined by single spaces into the
        # one command string handed to bash
        - >-
          exec
          /cockroach/cockroach
          start
          --logtostderr
          --insecure
          --advertise-host $(hostname -f)
          --http-addr 0.0.0.0
          --join roach-seattle-1.cockroachdb,roach-newyork-1.cockroachdb,roach-london-1.cockroachdb
          --cache $(expr $MEMORY_LIMIT_MIB / 4)MiB
          --max-sql-memory $(expr $MEMORY_LIMIT_MIB / 4)MiB
          --locality=region=us-west2,zone=c
      env:
        - name: COCKROACH_CHANNEL
          value: kubernetes-insecure
        # NOTE(review): this container declares no resources.limits, so the two
        # limits.* references below presumably resolve to the node's
        # allocatable CPU/memory -- confirm that is intended
        - name: GOMAXPROCS
          valueFrom:
            resourceFieldRef:
              divisor: '1'
              resource: limits.cpu
        # memory limit in MiB; a quarter each goes to --cache and --max-sql-memory
        - name: MEMORY_LIMIT_MIB
          valueFrom:
            resourceFieldRef:
              divisor: '1Mi'
              resource: limits.memory
      ports:
        - name: grpc
          containerPort: 26257
        - name: http
          containerPort: 8080
      volumeMounts:
        - name: datadir
          mountPath: /cockroach/cockroach-data
      # liveness: HTTP GET /health on the http port, every 5s after a 30s delay
      livenessProbe:
        initialDelaySeconds: 30
        periodSeconds: 5
        httpGet:
          path: '/health'
          port: http
      # readiness: HTTP GET /health?ready=1, unready after 2 failed checks
      readinessProbe:
        initialDelaySeconds: 10
        periodSeconds: 5
        failureThreshold: 2
        httpGet:
          path: '/health?ready=1'
          port: http

---
# 1Gi data volume claimed by the roach-seattle-3 Pod.
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: roach-seattle-3-data
  labels:
    app: cockroachdb
spec:
  # Only the roach-seattle-3 Pod mounts this claim, so single-node read-write
  # access is enough. ReadWriteMany is not supported by many provisioners
  # behind a "standard" class and can leave the claim Pending.
  # NOTE: accessModes is immutable on an existing claim; delete and recreate
  # the claim for this change to take effect.
  accessModes:
    - ReadWriteOnce
  volumeMode: Filesystem
  storageClassName: standard
  resources:
    requests:
      storage: 1Gi


---
# roach-newyork-1 -- CockroachDB node for region us-east4, zone a
apiVersion: v1
kind: Pod
metadata:
  labels:
    app: cockroachdb
    region: us-east4
  name: roach-newyork-1
spec:
  hostname: roach-newyork-1
  # same name as the headless "cockroachdb" Service; peers address this pod
  # as roach-newyork-1.cockroachdb (see --join below)
  subdomain: cockroachdb
  terminationGracePeriodSeconds: 60
  volumes:
    - name: datadir
      persistentVolumeClaim:
        claimName: roach-newyork-1-data
  containers:
    - name: roach-newyork-1
      image: cockroachdb/cockroach:latest
      imagePullPolicy: IfNotPresent
      command:
        - /bin/bash
        - '-ecx'
        # folded scalar: the lines below are joined by single spaces into the
        # one command string handed to bash
        - >-
          exec
          /cockroach/cockroach
          start
          --logtostderr
          --insecure
          --advertise-host $(hostname -f)
          --http-addr 0.0.0.0
          --join roach-seattle-1.cockroachdb,roach-newyork-1.cockroachdb,roach-london-1.cockroachdb
          --cache $(expr $MEMORY_LIMIT_MIB / 4)MiB
          --max-sql-memory $(expr $MEMORY_LIMIT_MIB / 4)MiB
          --locality=region=us-east4,zone=a
      env:
        - name: COCKROACH_CHANNEL
          value: kubernetes-insecure
        # NOTE(review): this container declares no resources.limits, so the two
        # limits.* references below presumably resolve to the node's
        # allocatable CPU/memory -- confirm that is intended
        - name: GOMAXPROCS
          valueFrom:
            resourceFieldRef:
              divisor: '1'
              resource: limits.cpu
        # memory limit in MiB; a quarter each goes to --cache and --max-sql-memory
        - name: MEMORY_LIMIT_MIB
          valueFrom:
            resourceFieldRef:
              divisor: '1Mi'
              resource: limits.memory
      ports:
        - name: grpc
          containerPort: 26257
        - name: http
          containerPort: 8080
      volumeMounts:
        - name: datadir
          mountPath: /cockroach/cockroach-data
      # liveness: HTTP GET /health on the http port, every 5s after a 30s delay
      livenessProbe:
        initialDelaySeconds: 30
        periodSeconds: 5
        httpGet:
          path: '/health'
          port: http
      # readiness: HTTP GET /health?ready=1, unready after 2 failed checks
      readinessProbe:
        initialDelaySeconds: 10
        periodSeconds: 5
        failureThreshold: 2
        httpGet:
          path: '/health?ready=1'
          port: http

---
# 1Gi data volume claimed by the roach-newyork-1 Pod.
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: roach-newyork-1-data
  labels:
    app: cockroachdb
spec:
  # Only the roach-newyork-1 Pod mounts this claim, so single-node read-write
  # access is enough. ReadWriteMany is not supported by many provisioners
  # behind a "standard" class and can leave the claim Pending.
  # NOTE: accessModes is immutable on an existing claim; delete and recreate
  # the claim for this change to take effect.
  accessModes:
    - ReadWriteOnce
  volumeMode: Filesystem
  storageClassName: standard
  resources:
    requests:
      storage: 1Gi

---
# roach-newyork-2 -- CockroachDB node for region us-east4, zone b
apiVersion: v1
kind: Pod
metadata:
  labels:
    app: cockroachdb
    region: us-east4
  name: roach-newyork-2
spec:
  hostname: roach-newyork-2
  # same name as the headless "cockroachdb" Service; peers would address this
  # pod as roach-newyork-2.cockroachdb
  subdomain: cockroachdb
  terminationGracePeriodSeconds: 60
  volumes:
    - name: datadir
      persistentVolumeClaim:
        claimName: roach-newyork-2-data
  containers:
    - name: roach-newyork-2
      image: cockroachdb/cockroach:latest
      imagePullPolicy: IfNotPresent
      command:
        - /bin/bash
        - '-ecx'
        # folded scalar: the lines below are joined by single spaces into the
        # one command string handed to bash
        - >-
          exec
          /cockroach/cockroach
          start
          --logtostderr
          --insecure
          --advertise-host $(hostname -f)
          --http-addr 0.0.0.0
          --join roach-seattle-1.cockroachdb,roach-newyork-1.cockroachdb,roach-london-1.cockroachdb
          --cache $(expr $MEMORY_LIMIT_MIB / 4)MiB
          --max-sql-memory $(expr $MEMORY_LIMIT_MIB / 4)MiB
          --locality=region=us-east4,zone=b
      env:
        - name: COCKROACH_CHANNEL
          value: kubernetes-insecure
        # NOTE(review): this container declares no resources.limits, so the two
        # limits.* references below presumably resolve to the node's
        # allocatable CPU/memory -- confirm that is intended
        - name: GOMAXPROCS
          valueFrom:
            resourceFieldRef:
              divisor: '1'
              resource: limits.cpu
        # memory limit in MiB; a quarter each goes to --cache and --max-sql-memory
        - name: MEMORY_LIMIT_MIB
          valueFrom:
            resourceFieldRef:
              divisor: '1Mi'
              resource: limits.memory
      ports:
        - name: grpc
          containerPort: 26257
        - name: http
          containerPort: 8080
      volumeMounts:
        - name: datadir
          mountPath: /cockroach/cockroach-data
      # liveness: HTTP GET /health on the http port, every 5s after a 30s delay
      livenessProbe:
        initialDelaySeconds: 30
        periodSeconds: 5
        httpGet:
          path: '/health'
          port: http
      # readiness: HTTP GET /health?ready=1, unready after 2 failed checks
      readinessProbe:
        initialDelaySeconds: 10
        periodSeconds: 5
        failureThreshold: 2
        httpGet:
          path: '/health?ready=1'
          port: http

---
# 1Gi data volume claimed by the roach-newyork-2 Pod.
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: roach-newyork-2-data
  labels:
    app: cockroachdb
spec:
  # Only the roach-newyork-2 Pod mounts this claim, so single-node read-write
  # access is enough. ReadWriteMany is not supported by many provisioners
  # behind a "standard" class and can leave the claim Pending.
  # NOTE: accessModes is immutable on an existing claim; delete and recreate
  # the claim for this change to take effect.
  accessModes:
    - ReadWriteOnce
  volumeMode: Filesystem
  storageClassName: standard
  resources:
    requests:
      storage: 1Gi

---
# roach-newyork-3 -- CockroachDB node for region us-east4, zone c
apiVersion: v1
kind: Pod
metadata:
  labels:
    app: cockroachdb
    region: us-east4
  name: roach-newyork-3
spec:
  hostname: roach-newyork-3
  # same name as the headless "cockroachdb" Service; peers would address this
  # pod as roach-newyork-3.cockroachdb
  subdomain: cockroachdb
  terminationGracePeriodSeconds: 60
  volumes:
    - name: datadir
      persistentVolumeClaim:
        claimName: roach-newyork-3-data
  containers:
    - name: roach-newyork-3
      image: cockroachdb/cockroach:latest
      imagePullPolicy: IfNotPresent
      command:
        - /bin/bash
        - '-ecx'
        # folded scalar: the lines below are joined by single spaces into the
        # one command string handed to bash
        - >-
          exec
          /cockroach/cockroach
          start
          --logtostderr
          --insecure
          --advertise-host $(hostname -f)
          --http-addr 0.0.0.0
          --join roach-seattle-1.cockroachdb,roach-newyork-1.cockroachdb,roach-london-1.cockroachdb
          --cache $(expr $MEMORY_LIMIT_MIB / 4)MiB
          --max-sql-memory $(expr $MEMORY_LIMIT_MIB / 4)MiB
          --locality=region=us-east4,zone=c
      env:
        - name: COCKROACH_CHANNEL
          value: kubernetes-insecure
        # NOTE(review): this container declares no resources.limits, so the two
        # limits.* references below presumably resolve to the node's
        # allocatable CPU/memory -- confirm that is intended
        - name: GOMAXPROCS
          valueFrom:
            resourceFieldRef:
              divisor: '1'
              resource: limits.cpu
        # memory limit in MiB; a quarter each goes to --cache and --max-sql-memory
        - name: MEMORY_LIMIT_MIB
          valueFrom:
            resourceFieldRef:
              divisor: '1Mi'
              resource: limits.memory
      ports:
        - name: grpc
          containerPort: 26257
        - name: http
          containerPort: 8080
      volumeMounts:
        - name: datadir
          mountPath: /cockroach/cockroach-data
      # liveness: HTTP GET /health on the http port, every 5s after a 30s delay
      livenessProbe:
        initialDelaySeconds: 30
        periodSeconds: 5
        httpGet:
          path: '/health'
          port: http
      # readiness: HTTP GET /health?ready=1, unready after 2 failed checks
      readinessProbe:
        initialDelaySeconds: 10
        periodSeconds: 5
        failureThreshold: 2
        httpGet:
          path: '/health?ready=1'
          port: http

---
# 1Gi data volume claimed by the roach-newyork-3 Pod.
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: roach-newyork-3-data
  labels:
    app: cockroachdb
spec:
  # Only the roach-newyork-3 Pod mounts this claim, so single-node read-write
  # access is enough. ReadWriteMany is not supported by many provisioners
  # behind a "standard" class and can leave the claim Pending.
  # NOTE: accessModes is immutable on an existing claim; delete and recreate
  # the claim for this change to take effect.
  accessModes:
    - ReadWriteOnce
  volumeMode: Filesystem
  storageClassName: standard
  resources:
    requests:
      storage: 1Gi


---
# roach-london-1 -- CockroachDB node for region eu-west2, zone a
apiVersion: v1
kind: Pod
metadata:
  labels:
    app: cockroachdb
    region: eu-west2
  name: roach-london-1
spec:
  hostname: roach-london-1
  # same name as the headless "cockroachdb" Service; peers address this pod
  # as roach-london-1.cockroachdb (see --join below)
  subdomain: cockroachdb
  terminationGracePeriodSeconds: 60
  volumes:
    - name: datadir
      persistentVolumeClaim:
        claimName: roach-london-1-data
  containers:
    - name: roach-london-1
      image: cockroachdb/cockroach:latest
      imagePullPolicy: IfNotPresent
      command:
        - /bin/bash
        - '-ecx'
        # folded scalar: the lines below are joined by single spaces into the
        # one command string handed to bash
        - >-
          exec
          /cockroach/cockroach
          start
          --logtostderr
          --insecure
          --advertise-host $(hostname -f)
          --http-addr 0.0.0.0
          --join roach-seattle-1.cockroachdb,roach-newyork-1.cockroachdb,roach-london-1.cockroachdb
          --cache $(expr $MEMORY_LIMIT_MIB / 4)MiB
          --max-sql-memory $(expr $MEMORY_LIMIT_MIB / 4)MiB
          --locality=region=eu-west2,zone=a
      env:
        - name: COCKROACH_CHANNEL
          value: kubernetes-insecure
        # NOTE(review): this container declares no resources.limits, so the two
        # limits.* references below presumably resolve to the node's
        # allocatable CPU/memory -- confirm that is intended
        - name: GOMAXPROCS
          valueFrom:
            resourceFieldRef:
              divisor: '1'
              resource: limits.cpu
        # memory limit in MiB; a quarter each goes to --cache and --max-sql-memory
        - name: MEMORY_LIMIT_MIB
          valueFrom:
            resourceFieldRef:
              divisor: '1Mi'
              resource: limits.memory
      ports:
        - name: grpc
          containerPort: 26257
        - name: http
          containerPort: 8080
      volumeMounts:
        - name: datadir
          mountPath: /cockroach/cockroach-data
      # liveness: HTTP GET /health on the http port, every 5s after a 30s delay
      livenessProbe:
        initialDelaySeconds: 30
        periodSeconds: 5
        httpGet:
          path: '/health'
          port: http
      # readiness: HTTP GET /health?ready=1, unready after 2 failed checks
      readinessProbe:
        initialDelaySeconds: 10
        periodSeconds: 5
        failureThreshold: 2
        httpGet:
          path: '/health?ready=1'
          port: http

---
# 1Gi data volume claimed by the roach-london-1 Pod.
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: roach-london-1-data
  labels:
    app: cockroachdb
spec:
  # Only the roach-london-1 Pod mounts this claim, so single-node read-write
  # access is enough. ReadWriteMany is not supported by many provisioners
  # behind a "standard" class and can leave the claim Pending.
  # NOTE: accessModes is immutable on an existing claim; delete and recreate
  # the claim for this change to take effect.
  accessModes:
    - ReadWriteOnce
  volumeMode: Filesystem
  storageClassName: standard
  resources:
    requests:
      storage: 1Gi

---
# roach-london-2 -- CockroachDB node for region eu-west2, zone b
apiVersion: v1
kind: Pod
metadata:
  labels:
    app: cockroachdb
    region: eu-west2
  name: roach-london-2
spec:
  hostname: roach-london-2
  # same name as the headless "cockroachdb" Service; peers would address this
  # pod as roach-london-2.cockroachdb
  subdomain: cockroachdb
  terminationGracePeriodSeconds: 60
  volumes:
    - name: datadir
      persistentVolumeClaim:
        claimName: roach-london-2-data
  containers:
    - name: roach-london-2
      image: cockroachdb/cockroach:latest
      imagePullPolicy: IfNotPresent
      command:
        - /bin/bash
        - '-ecx'
        # folded scalar: the lines below are joined by single spaces into the
        # one command string handed to bash
        - >-
          exec
          /cockroach/cockroach
          start
          --logtostderr
          --insecure
          --advertise-host $(hostname -f)
          --http-addr 0.0.0.0
          --join roach-seattle-1.cockroachdb,roach-newyork-1.cockroachdb,roach-london-1.cockroachdb
          --cache $(expr $MEMORY_LIMIT_MIB / 4)MiB
          --max-sql-memory $(expr $MEMORY_LIMIT_MIB / 4)MiB
          --locality=region=eu-west2,zone=b
      env:
        - name: COCKROACH_CHANNEL
          value: kubernetes-insecure
        # NOTE(review): this container declares no resources.limits, so the two
        # limits.* references below presumably resolve to the node's
        # allocatable CPU/memory -- confirm that is intended
        - name: GOMAXPROCS
          valueFrom:
            resourceFieldRef:
              divisor: '1'
              resource: limits.cpu
        # memory limit in MiB; a quarter each goes to --cache and --max-sql-memory
        - name: MEMORY_LIMIT_MIB
          valueFrom:
            resourceFieldRef:
              divisor: '1Mi'
              resource: limits.memory
      ports:
        - name: grpc
          containerPort: 26257
        - name: http
          containerPort: 8080
      volumeMounts:
        - name: datadir
          mountPath: /cockroach/cockroach-data
      # liveness: HTTP GET /health on the http port, every 5s after a 30s delay
      livenessProbe:
        initialDelaySeconds: 30
        periodSeconds: 5
        httpGet:
          path: '/health'
          port: http
      # readiness: HTTP GET /health?ready=1, unready after 2 failed checks
      readinessProbe:
        initialDelaySeconds: 10
        periodSeconds: 5
        failureThreshold: 2
        httpGet:
          path: '/health?ready=1'
          port: http

---
# 1Gi data volume claimed by the roach-london-2 Pod.
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: roach-london-2-data
  labels:
    app: cockroachdb
spec:
  # Only the roach-london-2 Pod mounts this claim, so single-node read-write
  # access is enough. ReadWriteMany is not supported by many provisioners
  # behind a "standard" class and can leave the claim Pending.
  # NOTE: accessModes is immutable on an existing claim; delete and recreate
  # the claim for this change to take effect.
  accessModes:
    - ReadWriteOnce
  volumeMode: Filesystem
  storageClassName: standard
  resources:
    requests:
      storage: 1Gi

---
# roach-london-3 -- CockroachDB node for region eu-west2, zone c
apiVersion: v1
kind: Pod
metadata:
  labels:
    app: cockroachdb
    region: eu-west2
  name: roach-london-3
spec:
  hostname: roach-london-3
  # same name as the headless "cockroachdb" Service; peers would address this
  # pod as roach-london-3.cockroachdb
  subdomain: cockroachdb
  terminationGracePeriodSeconds: 60
  volumes:
    - name: datadir
      persistentVolumeClaim:
        claimName: roach-london-3-data
  containers:
    - name: roach-london-3
      image: cockroachdb/cockroach:latest
      imagePullPolicy: IfNotPresent
      command:
        - /bin/bash
        - '-ecx'
        # folded scalar: the lines below are joined by single spaces into the
        # one command string handed to bash
        - >-
          exec
          /cockroach/cockroach
          start
          --logtostderr
          --insecure
          --advertise-host $(hostname -f)
          --http-addr 0.0.0.0
          --join roach-seattle-1.cockroachdb,roach-newyork-1.cockroachdb,roach-london-1.cockroachdb
          --cache $(expr $MEMORY_LIMIT_MIB / 4)MiB
          --max-sql-memory $(expr $MEMORY_LIMIT_MIB / 4)MiB
          --locality=region=eu-west2,zone=c
      env:
        - name: COCKROACH_CHANNEL
          value: kubernetes-insecure
        # NOTE(review): this container declares no resources.limits, so the two
        # limits.* references below presumably resolve to the node's
        # allocatable CPU/memory -- confirm that is intended
        - name: GOMAXPROCS
          valueFrom:
            resourceFieldRef:
              divisor: '1'
              resource: limits.cpu
        # memory limit in MiB; a quarter each goes to --cache and --max-sql-memory
        - name: MEMORY_LIMIT_MIB
          valueFrom:
            resourceFieldRef:
              divisor: '1Mi'
              resource: limits.memory
      ports:
        - name: grpc
          containerPort: 26257
        - name: http
          containerPort: 8080
      volumeMounts:
        - name: datadir
          mountPath: /cockroach/cockroach-data
      # liveness: HTTP GET /health on the http port, every 5s after a 30s delay
      livenessProbe:
        initialDelaySeconds: 30
        periodSeconds: 5
        httpGet:
          path: '/health'
          port: http
      # readiness: HTTP GET /health?ready=1, unready after 2 failed checks
      readinessProbe:
        initialDelaySeconds: 10
        periodSeconds: 5
        failureThreshold: 2
        httpGet:
          path: '/health?ready=1'
          port: http

---
# 1Gi data volume claimed by the roach-london-3 Pod.
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: roach-london-3-data
  labels:
    app: cockroachdb
spec:
  # Only the roach-london-3 Pod mounts this claim, so single-node read-write
  # access is enough. ReadWriteMany is not supported by many provisioners
  # behind a "standard" class and can leave the claim Pending.
  # NOTE: accessModes is immutable on an existing claim; delete and recreate
  # the claim for this change to take effect.
  accessModes:
    - ReadWriteOnce
  volumeMode: Filesystem
  storageClassName: standard
  resources:
    requests:
      storage: 1Gi

########################
# INIT AND CONFIG JOBS #
########################
---
# One-shot Job: runs `cockroach init` against roach-seattle-1 and is retried
# on failure (restartPolicy: OnFailure).
apiVersion: batch/v1
kind: Job
metadata:
  labels:
    app: cockroachdb
  name: cluster-init
spec:
  template:
    spec:
      restartPolicy: OnFailure
      containers:
        - name: cluster-init
          image: cockroachdb/cockroach:latest
          imagePullPolicy: IfNotPresent
          command:
            - '/cockroach/cockroach'
            - 'init'
            - '--insecure'
            - '--host=roach-seattle-1.cockroachdb'

---
# One-shot Job: connects to roach-seattle-1 with `cockroach sql` and upserts
# one system.locations row per region (us-east4, us-west2, eu-west2).
# Retried on failure (restartPolicy: OnFailure).
apiVersion: batch/v1
kind: Job
metadata:
  labels:
    app: cockroachdb
  name: cluster-sql-init
spec:
  template:
    spec:
      restartPolicy: OnFailure
      containers:
        - name: cluster-sql-init
          image: cockroachdb/cockroach:latest
          imagePullPolicy: IfNotPresent
          command:
            - '/cockroach/cockroach'
            - 'sql'
            - '--insecure'
            # connection string for the node the statement is sent to
            - '--url'
            - 'postgresql://roach-seattle-1.cockroachdb:26257/defaultdb?sslmode=disable'
            # -e executes the single statement that follows
            - '-e'
            - "UPSERT into system.locations VALUES ('region', 'us-east4', 37.478397, -76.453077), ('region', 'us-west2', 43.804133, -120.554201), ('region', 'eu-west2', 51.5073509, -0.1277583);"


## monitoring.yaml
#############
#   MINIO   #
#############
---
# NodePort Service in front of the pod(s) labelled app=minio
apiVersion: v1
kind: Service
metadata:
  labels:
    app: minio
  name: minio
spec:
  type: NodePort
  selector:
    app: minio
  ports:
    # UI, exposed on node port 31900
    - name: http
      nodePort: 31900
      port: 9000
      targetPort: 9000

---
# Single MinIO pod: runs `server /data`, with /data backed by the minio-data
# claim.
apiVersion: v1
kind: Pod
metadata:
  labels:
    app: minio
  name: minio
spec:
  hostname: minio
  volumes:
    - name: minio-data
      persistentVolumeClaim:
        claimName: minio-data
  containers:
    - name: minio
      image: minio/minio
      imagePullPolicy: IfNotPresent
      # arguments passed to the image's entrypoint
      args:
        - server
        - /data
      ports:
        - name: http
          containerPort: 9000
      volumeMounts:
        - name: minio-data
          mountPath: /data

---
# 1Gi data volume claimed by the minio Pod.
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: minio-data
  labels:
    # NOTE(review): labelled app=cockroachdb although the minio Pod and Service
    # use app=minio -- kept as is; confirm whether this is intentional (e.g.
    # for label-based cleanup) or a copy-paste leftover
    app: cockroachdb
spec:
  # Only the minio Pod mounts this claim, so single-node read-write access is
  # enough. ReadWriteMany is not supported by many provisioners behind a
  # "standard" class and can leave the claim Pending.
  # NOTE: accessModes is immutable on an existing claim; delete and recreate
  # the claim for this change to take effect.
  accessModes:
    - ReadWriteOnce
  volumeMode: Filesystem
  storageClassName: standard
  resources:
    requests:
      storage: 1Gi

##################
#   PROMETHEUS   #
##################
---
apiVersion: v1
kind: ConfigMap
metadata:
  name: prometheus-config
data:
  prometheus.yml: |
    ---
    global:
      scrape_interval: 10s
      evaluation_interval: 10s

    rule_files:
      # what to alert for
      - /etc/prometheus/alerts.rules.yml
      # what metrics to collect
      - /etc/prometheus/aggregation.rules.yml

    # whom to alert
    alerting:
      alertmanagers:
        - static_configs:
            - targets:
                - cockroachdb:9093

    scrape_configs:
      - job_name: "cockroachdb"
        metrics_path: "/_status/vars"
        scheme: "http"
        tls_config:
          insecure_skip_verify: true
        static_configs:
          # what hosts to monitor
          - targets:
              - roach-seattle-1.cockroachdb:8080
              - roach-seattle-2.cockroachdb:8080
              - roach-seattle-3.cockroachdb:8080
              - roach-newyork-1.cockroachdb:8080
              - roach-newyork-2.cockroachdb:8080
              - roach-newyork-3.cockroachdb:8080
              - roach-london-1.cockroachdb:8080
              - roach-london-2.cockroachdb:8080
              - roach-london-3.cockroachdb:8080
            labels:
              cluster: "crdb"
  alerts.rules.yml: |
    groups:
    - name: rules/alerts.rules
      rules:
      # Alert for any instance that is unreachable for >15 minutes.
      - alert: InstanceDead
        expr: up{job="cockroachdb"} == 0
        for: 1m
        annotations:
          description: '{{ $labels.instance }} for cluster {{ $labels.cluster }} has been
            down for more than 15 minutes.'
          summary: Instance {{ $labels.instance }} dead
      # Alert for any instance that is not ready for a while.
      - alert: InstanceNotReady
        # This alert applies only to Kubernetes deployments and requires that you run kube-state-metrics: https://github.com/kubernetes/kube-state-metrics
        expr: kube_statefulset_status_replicas_ready{statefulset="cockroachdb"} != kube_statefulset_status_replicas{statefulset="cockroachdb"}
        for: 45m
        annotations:
          description: 'there has been an unready replica for cluster {{ $labels.cluster }}
            for more than 15 minutes.'
          summary: Instance not ready
      # Alert on instance restarts.
      - alert: InstanceRestart
        expr: resets(sys_uptime{job="cockroachdb"}[24h]) > 1
        annotations:
          description: '{{ $labels.instance }} for cluster {{ $labels.cluster }} restarted
            {{ $value }} time(s) in 24h'
          summary: Instance {{ $labels.instance }} restarted
      # Alert on flapping instances (frequent restarts).
      - alert: InstancesFlapping
        # Aggregated.
        # This alert assumes that rolling restarts or rolling upgrades leave at least 3 minutes between each node being updated or restarted.
        expr: sum by (cluster)(resets(sys_uptime{job="cockroachdb"}[5m])) > 2
        annotations:
          description: 'instances in cluster {{ $labels.cluster }} restarted
            {{ $value }} time(s) in 5m'
          summary: Instances in {{ $labels.cluster }} flapping
      # Alert on flapping instances (frequent restarts).
      - alert: InstanceFlapping
        # Un-aggregated.
        expr: resets(sys_uptime{job="cockroachdb"}[10m]) > 1
        annotations:
          description: '{{ $labels.instance }} for cluster {{ $labels.cluster }} restarted
            {{ $value }} time(s) in 10m'
          summary: Instance {{ $labels.instance }} flapping
      # Alert on version mismatch.
      # This alert is intentionally loose (4 hours) to allow for rolling upgrades.
      # This may need to be adjusted for large clusters.
      - alert: VersionMismatch
        expr: count by(cluster) (count_values by(tag, cluster) ("version", build_timestamp{job="cockroachdb"}))
          > 1
        for: 4h
        annotations:
          description: Cluster {{ $labels.cluster }} running {{ $value }} different versions
          summary: Binary version mismatch on {{ $labels.cluster }}
      # Available capacity alerts.
      - alert: StoreDiskLow
        expr: capacity_available:ratio{job="cockroachdb"} < 0.15
        annotations:
          summary: Store {{ $labels.store }} on node {{ $labels.instance }} at {{ $value
            }} available disk fraction
      - alert: ClusterDiskLow
        expr: cluster:capacity_available:ratio{job="cockroachdb"} < 0.2
        annotations:
          summary: Cluster {{ $labels.cluster }} at {{ $value }} available disk fraction
      # Unavailable ranges.
      - alert: UnavailableRanges
        expr: (sum by(instance, cluster) (ranges_unavailable{job="cockroachdb"})) > 0
        for: 10m
        annotations:
          summary: Instance {{ $labels.instance }} has {{ $value }} unavailable ranges
      # Cockroach-measured clock offset nearing limit (by default, servers kill themselves at 400ms from the mean, so alert at 300ms)
      - alert: ClockOffsetNearMax
        expr: clock_offset_meannanos{job="cockroachdb"} > 300 * 1000 * 1000
        for: 5m
        annotations:
          summary: Clock on {{ $labels.instance }} as measured by cockroach is offset by {{ $value }} nanoseconds from the cluster mean  # Certificate expiration. Alerts are per node.
      # Each rule below requires the expiration metric to be set (> 0) and to lie
      # less than N days (86400 seconds each) ahead of time().
      # CA certificate: N = 366.
      - alert: CACertificateExpiresSoon
        expr: >-
          (security_certificate_expiration_ca{job="cockroachdb"} > 0)
          and (security_certificate_expiration_ca{job="cockroachdb"}
          - time()) < 86400 * 366
        annotations:
          summary: 'CA certificate for {{ $labels.instance }} expires in less than a year'
        labels:
          frequency: daily
      # Client CA certificate: N = 366.
      - alert: ClientCACertificateExpiresSoon
        expr: >-
          (security_certificate_expiration_client_ca{job="cockroachdb"} > 0)
          and (security_certificate_expiration_client_ca{job="cockroachdb"}
          - time()) < 86400 * 366
        annotations:
          summary: 'Client CA certificate for {{ $labels.instance }} expires in less than a year'
        labels:
          frequency: daily
      # UI CA certificate: N = 366.
      - alert: UICACertificateExpiresSoon
        expr: >-
          (security_certificate_expiration_ui_ca{job="cockroachdb"} > 0)
          and (security_certificate_expiration_ui_ca{job="cockroachdb"}
          - time()) < 86400 * 366
        annotations:
          summary: 'UI CA certificate for {{ $labels.instance }} expires in less than a year'
        labels:
          frequency: daily
      # Node certificate: N = 183.
      - alert: NodeCertificateExpiresSoon
        expr: >-
          (security_certificate_expiration_node{job="cockroachdb"} > 0)
          and (security_certificate_expiration_node{job="cockroachdb"}
          - time()) < 86400 * 183
        annotations:
          summary: 'Node certificate for {{ $labels.instance }} expires in less than six months'
        labels:
          frequency: daily
      # Node client certificate: N = 183.
      - alert: NodeClientCertificateExpiresSoon
        expr: >-
          (security_certificate_expiration_node_client{job="cockroachdb"} > 0)
          and (security_certificate_expiration_node_client{job="cockroachdb"}
          - time()) < 86400 * 183
        annotations:
          summary: 'Client certificate for {{ $labels.instance }} expires in less than six months'
        labels:
          frequency: daily
      # UI certificate: N = 20.
      - alert: UICertificateExpiresSoon
        expr: >-
          (security_certificate_expiration_ui{job="cockroachdb"} > 0)
          and (security_certificate_expiration_ui{job="cockroachdb"}
          - time()) < 86400 * 20
        annotations:
          summary: 'UI certificate for {{ $labels.instance }} expires in less than 20 days'
        labels:
          frequency: daily
      # Slow latch / lease / raft requests: each rule fires once its counter has
      # stayed above zero for five minutes.
      - alert: SlowLatchRequest
        expr: 'requests_slow_latch{job="cockroachdb"} > 0'
        for: 5m
        annotations:
          summary: "{{ $value }} slow latch requests on {{ $labels.instance }}"
        labels:
          severity: testing
      - alert: SlowLeaseRequest
        expr: 'requests_slow_lease{job="cockroachdb"} > 0'
        for: 5m
        annotations:
          summary: "{{ $value }} slow lease requests on {{ $labels.instance }}"
        labels:
          severity: testing
      - alert: SlowRaftRequest
        expr: 'requests_slow_raft{job="cockroachdb"} > 0'
        for: 5m
        annotations:
          summary: "{{ $value }} slow raft requests on {{ $labels.instance }}"
        labels:
          severity: testing
      # Open file descriptors above 80% of the soft limit for ten minutes.
      - alert: HighOpenFDCount
        expr: 'sys_fd_open{job="cockroachdb"} / sys_fd_softlimit{job="cockroachdb"} > 0.8'
        for: 10m
        annotations:
          summary: 'Too many open file descriptors on {{ $labels.instance }}: {{ $value }} fraction used'
      # Free space on the Prometheus /data mount below 20% for ten minutes.
      - alert: PrometheusDiskLow
        expr: >-
          node_filesystem_free{cluster="prometheus",job="node_exporter_prometheus",mountpoint="/data"}
          / node_filesystem_size{cluster="prometheus",job="node_exporter_prometheus",mountpoint="/data"}
          < 0.2
        for: 10m
        annotations:
          summary: 'Prometheus storage is almost full: {{ $value }} fraction free'
        labels:
          severity: testing
  aggregation.rules.yml: |
    # Recording rules that pre-aggregate metrics. Naming scheme:
    #   "node:X"    - a per-store metric X aggregated to the node level
    #   "cluster:X" - a per-store or per-node metric X aggregated to the cluster level
    #
    # Aggregations should normally use "without (label1, label2, ...)" so that
    # every label other than the listed ones is kept.

    groups:
    - name: rules/aggregation.rules
      rules:
      # Total capacity: stores -> node -> cluster.
      - record: "node:capacity"
        expr: 'sum without(store) (capacity{job="cockroachdb"})'
      - record: "cluster:capacity"
        expr: 'sum without(instance) (node:capacity{job="cockroachdb"})'
      # Available capacity: stores -> node -> cluster.
      - record: "node:capacity_available"
        expr: 'sum without(store) (capacity_available{job="cockroachdb"})'
      - record: "cluster:capacity_available"
        expr: 'sum without(instance) (node:capacity_available{job="cockroachdb"})'
      # Available / total ratio at store, node and cluster level.
      - record: "capacity_available:ratio"
        expr: 'capacity_available{job="cockroachdb"} / capacity{job="cockroachdb"}'
      - record: "node:capacity_available:ratio"
        expr: 'node:capacity_available{job="cockroachdb"} / node:capacity{job="cockroachdb"}'
      - record: "cluster:capacity_available:ratio"
        expr: 'cluster:capacity_available{job="cockroachdb"} / cluster:capacity{job="cockroachdb"}'
      # Histogram rules: quantiles are fairly expensive to compute live, so a few
      # percentiles (50/75/90/95/99) are precomputed from a 1m bucket rate.
      # Transaction durations.
      - record: "txn_durations_bucket:rate1m"
        expr: 'rate(txn_durations_bucket{job="cockroachdb"}[1m])'
      - record: "txn_durations:rate1m:quantile_50"
        expr: 'histogram_quantile(0.5, txn_durations_bucket:rate1m)'
      - record: "txn_durations:rate1m:quantile_75"
        expr: 'histogram_quantile(0.75, txn_durations_bucket:rate1m)'
      - record: "txn_durations:rate1m:quantile_90"
        expr: 'histogram_quantile(0.9, txn_durations_bucket:rate1m)'
      - record: "txn_durations:rate1m:quantile_95"
        expr: 'histogram_quantile(0.95, txn_durations_bucket:rate1m)'
      - record: "txn_durations:rate1m:quantile_99"
        expr: 'histogram_quantile(0.99, txn_durations_bucket:rate1m)'
      # Execution latency.
      - record: "exec_latency_bucket:rate1m"
        expr: 'rate(exec_latency_bucket{job="cockroachdb"}[1m])'
      - record: "exec_latency:rate1m:quantile_50"
        expr: 'histogram_quantile(0.5, exec_latency_bucket:rate1m)'
      - record: "exec_latency:rate1m:quantile_75"
        expr: 'histogram_quantile(0.75, exec_latency_bucket:rate1m)'
      - record: "exec_latency:rate1m:quantile_90"
        expr: 'histogram_quantile(0.9, exec_latency_bucket:rate1m)'
      - record: "exec_latency:rate1m:quantile_95"
        expr: 'histogram_quantile(0.95, exec_latency_bucket:rate1m)'
      - record: "exec_latency:rate1m:quantile_99"
        expr: 'histogram_quantile(0.99, exec_latency_bucket:rate1m)'
      # Round-trip latency.
      - record: "round_trip_latency_bucket:rate1m"
        expr: 'rate(round_trip_latency_bucket{job="cockroachdb"}[1m])'
      - record: "round_trip_latency:rate1m:quantile_50"
        expr: 'histogram_quantile(0.5, round_trip_latency_bucket:rate1m)'
      - record: "round_trip_latency:rate1m:quantile_75"
        expr: 'histogram_quantile(0.75, round_trip_latency_bucket:rate1m)'
      - record: "round_trip_latency:rate1m:quantile_90"
        expr: 'histogram_quantile(0.9, round_trip_latency_bucket:rate1m)'
      - record: "round_trip_latency:rate1m:quantile_95"
        expr: 'histogram_quantile(0.95, round_trip_latency_bucket:rate1m)'
      - record: "round_trip_latency:rate1m:quantile_99"
        expr: 'histogram_quantile(0.99, round_trip_latency_bucket:rate1m)'
      # SQL execution latency.
      - record: "sql_exec_latency_bucket:rate1m"
        expr: 'rate(sql_exec_latency_bucket{job="cockroachdb"}[1m])'
      - record: "sql_exec_latency:rate1m:quantile_50"
        expr: 'histogram_quantile(0.5, sql_exec_latency_bucket:rate1m)'
      - record: "sql_exec_latency:rate1m:quantile_75"
        expr: 'histogram_quantile(0.75, sql_exec_latency_bucket:rate1m)'
      - record: "sql_exec_latency:rate1m:quantile_90"
        expr: 'histogram_quantile(0.9, sql_exec_latency_bucket:rate1m)'
      - record: "sql_exec_latency:rate1m:quantile_95"
        expr: 'histogram_quantile(0.95, sql_exec_latency_bucket:rate1m)'
      - record: "sql_exec_latency:rate1m:quantile_99"
        expr: 'histogram_quantile(0.99, sql_exec_latency_bucket:rate1m)'
      # Raft log-commit latency.
      - record: "raft_process_logcommit_latency_bucket:rate1m"
        expr: 'rate(raft_process_logcommit_latency_bucket{job="cockroachdb"}[1m])'
      - record: "raft_process_logcommit_latency:rate1m:quantile_50"
        expr: 'histogram_quantile(0.5, raft_process_logcommit_latency_bucket:rate1m)'
      - record: "raft_process_logcommit_latency:rate1m:quantile_75"
        expr: 'histogram_quantile(0.75, raft_process_logcommit_latency_bucket:rate1m)'
      - record: "raft_process_logcommit_latency:rate1m:quantile_90"
        expr: 'histogram_quantile(0.9, raft_process_logcommit_latency_bucket:rate1m)'
      - record: "raft_process_logcommit_latency:rate1m:quantile_95"
        expr: 'histogram_quantile(0.95, raft_process_logcommit_latency_bucket:rate1m)'
      - record: "raft_process_logcommit_latency:rate1m:quantile_99"
        expr: 'histogram_quantile(0.99, raft_process_logcommit_latency_bucket:rate1m)'
      # Raft command-commit latency.
      - record: "raft_process_commandcommit_latency_bucket:rate1m"
        expr: 'rate(raft_process_commandcommit_latency_bucket{job="cockroachdb"}[1m])'
      - record: "raft_process_commandcommit_latency:rate1m:quantile_50"
        expr: 'histogram_quantile(0.5, raft_process_commandcommit_latency_bucket:rate1m)'
      - record: "raft_process_commandcommit_latency:rate1m:quantile_75"
        expr: 'histogram_quantile(0.75, raft_process_commandcommit_latency_bucket:rate1m)'
      - record: "raft_process_commandcommit_latency:rate1m:quantile_90"
        expr: 'histogram_quantile(0.9, raft_process_commandcommit_latency_bucket:rate1m)'
      - record: "raft_process_commandcommit_latency:rate1m:quantile_95"
        expr: 'histogram_quantile(0.95, raft_process_commandcommit_latency_bucket:rate1m)'
      - record: "raft_process_commandcommit_latency:rate1m:quantile_99"
        expr: 'histogram_quantile(0.99, raft_process_commandcommit_latency_bucket:rate1m)'

---
# NodePort Service exposing the Prometheus web UI on node port 31990.
apiVersion: v1
kind: Service
metadata:
  name: prom
  labels:
    app: prom
spec:
  type: NodePort
  ports:
    # UI
    - name: http
      port: 9090
      targetPort: 9090
      nodePort: 31990
  # "app: cockroachdb" alone also matches every CockroachDB node pod, none of
  # which serve port 9090; the extra "component" label restricts the endpoints
  # to the Prometheus pod defined below.
  selector:
    app: cockroachdb
    component: prom

---
# Prometheus server pod; its configuration and rule files come from the
# prometheus-config ConfigMap, mounted file by file via subPath.
apiVersion: v1
kind: Pod
metadata:
  name: prom
  labels:
    # Kept so that anything selecting on app=cockroachdb still sees this pod.
    app: cockroachdb
    # Distinguishes this pod for the "prom" Service selector above.
    component: prom
spec:
  hostname: prom
  #subdomain: cockroachdb
  containers:
    - name: prom
      image: prom/prometheus
      imagePullPolicy: IfNotPresent
      ports:
        # Matches the Service targetPort (was 9000, which the Service never targets).
        - containerPort: 9090
          name: http
      volumeMounts:
        - name: prometheus-config
          mountPath: /etc/prometheus/prometheus.yml
          subPath: prometheus.yml
        - name: prometheus-config
          mountPath: /etc/prometheus/aggregation.rules.yml
          subPath: aggregation.rules.yml
        - name: prometheus-config
          mountPath: /etc/prometheus/alerts.rules.yml
          subPath: alerts.rules.yml

  volumes:
    - name: prometheus-config
      configMap:
        name: prometheus-config

################
# ALERTMANAGER #
################
---
# NodePort Service exposing the Alertmanager UI on node port 31993.
apiVersion: v1
kind: Service
metadata:
  name: alertmgr
  labels:
    app: alertmgr
spec:
  type: NodePort
  ports:
    # UI
    - name: http
      port: 9093
      targetPort: 9093
      nodePort: 31993
  # "app: cockroachdb" alone also matches every CockroachDB node pod, none of
  # which serve port 9093; the extra "component" label restricts the endpoints
  # to the Alertmanager pod defined below.
  selector:
    app: cockroachdb
    component: alertmgr
---
# Alertmanager pod.
apiVersion: v1
kind: Pod
metadata:
  name: alertmgr
  labels:
    # Kept so that anything selecting on app=cockroachdb still sees this pod.
    app: cockroachdb
    # Distinguishes this pod for the "alertmgr" Service selector above.
    component: alertmgr
spec:
  hostname: alertmgr
  #subdomain: cockroachdb
  containers:
    - name: alertmgr
      image: quay.io/prometheus/alertmanager:latest
      imagePullPolicy: IfNotPresent
      ports:
        - containerPort: 9093
          name: http

###########
# GRAFANA #
###########
---
# NodePort Service publishing the Grafana UI (port 3000) on node port 32000.
apiVersion: v1
kind: Service
metadata:
  labels:
    app: grafana
  name: grafana
spec:
  # Endpoints are the pods labelled app=grafana.
  selector:
    app: grafana
  type: NodePort
  ports:
    - name: http  # UI
      nodePort: 32000
      port: 3000
      targetPort: 3000
---
# Single Grafana pod; the label matches the selector of the "grafana" Service.
apiVersion: v1
kind: Pod
metadata:
  labels:
    app: grafana
  name: grafana
spec:
  hostname: grafana
  containers:
    - image: grafana/grafana
      imagePullPolicy: IfNotPresent
      name: grafana
      ports:
        - name: http
          containerPort: 3000
	#################
	# NETWORK CHAOS #
	#################
	---
	# Injects a 60ms delay on traffic from the Seattle pods (and jumpbox) to the New York pods.
	apiVersion: chaos-mesh.org/v1alpha1
	kind: NetworkChaos
	metadata:
	  name: delay-uswest-useast
	  labels:
	    app: cockroachdb
	spec:
	  action: delay # chaos action
	  mode: all
	  selector: # define the pods belong to dc-a
	    pods:
	      default: # namespace of the target pods
	        - roach-seattle-1
	        - roach-seattle-2
	        - roach-seattle-3
	        - jumpbox
	  delay:
	    latency: "60ms"
	  direction: to
	  target:
	    selector: # define the pods belong to dc-b and dc-c
	      pods:
	        default: # namespace of the target pods
	          - roach-newyork-1
	          - roach-newyork-2
	          - roach-newyork-3
	    mode: all

	---
	# Injects a 180ms delay on traffic from the London pods (and jumpbox) to the Seattle pods.
	apiVersion: chaos-mesh.org/v1alpha1
	kind: NetworkChaos
	metadata:
	  name: delay-uswest-euwest
	  labels:
	    app: cockroachdb
	spec:
	  action: delay # chaos action
	  mode: all
	  selector: # define the pods belong to dc-a
	    pods:
	      default: # namespace of the target pods
	        - roach-london-1
	        - roach-london-2
	        - roach-london-3
	        - jumpbox
	  delay:
	    latency: "180ms"
	  direction: to
	  target:
	    selector: # define the pods belong to dc-b and dc-c
	      pods:
	        default: # namespace of the target pods
	          - roach-seattle-1
	          - roach-seattle-2
	          - roach-seattle-3
	    mode: all

	---
	# Injects a 120ms delay on traffic from the New York pods (and jumpbox) to the London pods.
	apiVersion: chaos-mesh.org/v1alpha1
	kind: NetworkChaos
	metadata:
	  name: delay-useast-euwest
	  labels:
	    app: cockroachdb
	spec:
	  action: delay # chaos action
	  mode: all
	  selector: # define the pods belong to dc-a
	    pods:
	      default: # namespace of the target pods
	        - roach-newyork-1
	        - roach-newyork-2
	        - roach-newyork-3
	        - jumpbox
	  delay:
	    latency: "120ms"
	  direction: to
	  target:
	    selector: # define the pods belong to dc-b and dc-c
	      pods:
	        default: # namespace of the target pods
	          - roach-london-1
	          - roach-london-2
	          - roach-london-3
	    mode: all
	############
	# SERVICES #
	############
	---
	# us-west2: NodePort Service for the CockroachDB pods labelled region=us-west2.
	apiVersion: v1
	kind: Service
	metadata:
	  name: us-west2
	  labels:
	    app: cockroachdb
	spec:
	  type: NodePort
	  ports:
	    # SQL client port
	    - name: grpc
	      port: 26257
	      targetPort: 26257
	      nodePort: 31257
	    # Admin UI
	    - name: http
	      port: 8080
	      targetPort: 8080
	      nodePort: 31080
	  selector:
	    app: cockroachdb
	    region: us-west2

	---
	# us-east4: NodePort Service for the CockroachDB pods labelled region=us-east4.
	apiVersion: v1
	kind: Service
	metadata:
	  name: us-east4
	  labels:
	    app: cockroachdb
	spec:
	  type: NodePort
	  ports:
	    # SQL client port
	    - name: grpc
	      port: 26257
	      targetPort: 26257
	      nodePort: 31258
	    # Admin UI
	    - name: http
	      port: 8080
	      targetPort: 8080
	      nodePort: 31180
	  selector:
	    app: cockroachdb
	    region: us-east4

	---
	# eu-west2: NodePort Service for the CockroachDB pods labelled region=eu-west2.
	apiVersion: v1
	kind: Service
	metadata:
	  name: eu-west2
	  labels:
	    app: cockroachdb
	spec:
	  type: NodePort
	  ports:
	    # SQL client port
	    - name: grpc
	      port: 26257
	      targetPort: 26257
	      nodePort: 31259
	    # Admin UI
	    - name: http
	      port: 8080
	      targetPort: 8080
	      nodePort: 31280
	  selector:
	    app: cockroachdb
	    region: eu-west2

	---
	# intra-node service: headless (clusterIP: None) Service over all app=cockroachdb
	# pods; addresses are published even for pods that are not ready.
	apiVersion: v1
	kind: Service
	metadata:
	  name: cockroachdb
	  labels:
	    app: cockroachdb
	  annotations:
	    service.alpha.kubernetes.io/tolerate-unready-endpoints: "true"
	    prometheus.io/scrape: "true"
	    prometheus.io/path: "_status/vars"
	    prometheus.io/port: "8080"
	spec:
	  ports:
	    - port: 26257
	      targetPort: 26257
	      name: grpc
	    - port: 8080
	      targetPort: 8080
	      name: http
	  publishNotReadyAddresses: true
	  clusterIP: None
	  selector:
	    app: cockroachdb


	##############
	# PODS + PVC #
	##############
	---
	# roach-seattle-1: CockroachDB node, locality region=us-west2 / zone=a.
	apiVersion: v1
	kind: Pod
	metadata:
	  name: roach-seattle-1
	  labels:
	    app: cockroachdb
	    region: us-west2
	spec:
	  hostname: roach-seattle-1
	  subdomain: cockroachdb
	  containers:
	    - name: roach-seattle-1
	      image: cockroachdb/cockroach:latest
	      imagePullPolicy: IfNotPresent
	      ports:
	        - containerPort: 26257
	          name: grpc
	        - containerPort: 8080
	          name: http
	      livenessProbe:
	        httpGet:
	          path: "/health"
	          port: http
	        initialDelaySeconds: 30
	        periodSeconds: 5
	      readinessProbe:
	        httpGet:
	          path: "/health?ready=1"
	          port: http
	        initialDelaySeconds: 10
	        periodSeconds: 5
	        failureThreshold: 2
	      volumeMounts:
	        - name: datadir
	          mountPath: /cockroach/cockroach-data
	      env:
	        - name: COCKROACH_CHANNEL
	          value: kubernetes-insecure
	        # Populated from the container's CPU limit.
	        - name: GOMAXPROCS
	          valueFrom:
	            resourceFieldRef:
	              resource: limits.cpu
	              divisor: "1"
	        # Container memory limit expressed in MiB; used to size the caches below.
	        - name: MEMORY_LIMIT_MIB
	          valueFrom:
	            resourceFieldRef:
	              resource: limits.memory
	              divisor: "1Mi"
	      # The third item is one folded string passed to bash -ecx; --cache and
	      # --max-sql-memory each get a quarter of MEMORY_LIMIT_MIB.
	      command:
	        - "/bin/bash"
	        - "-ecx"
	        - exec
	          /cockroach/cockroach
	          start
	          --logtostderr
	          --insecure
	          --advertise-host $(hostname -f)
	          --http-addr 0.0.0.0
	          --join roach-seattle-1.cockroachdb,roach-newyork-1.cockroachdb,roach-london-1.cockroachdb
	          --cache $(expr $MEMORY_LIMIT_MIB / 4)MiB
	          --max-sql-memory $(expr $MEMORY_LIMIT_MIB / 4)MiB
	          --locality=region=us-west2,zone=a
	  terminationGracePeriodSeconds: 60
	  volumes:
	    - name: datadir
	      persistentVolumeClaim:
	        claimName: roach-seattle-1-data

	---
	# 1Gi data volume claim mounted by pod roach-seattle-1.
	apiVersion: v1
	kind: PersistentVolumeClaim
	metadata:
	  name: roach-seattle-1-data
	  labels:
	    app: cockroachdb
	spec:
	  accessModes:
	    - ReadWriteMany
	  volumeMode: Filesystem
	  storageClassName: standard
	  resources:
	    requests:
	      storage: 1Gi

	---
	# roach-seattle-2: CockroachDB node, locality region=us-west2 / zone=b.
	apiVersion: v1
	kind: Pod
	metadata:
	  name: roach-seattle-2
	  labels:
	    app: cockroachdb
	    region: us-west2
	spec:
	  hostname: roach-seattle-2
	  subdomain: cockroachdb
	  containers:
	    - name: roach-seattle-2
	      image: cockroachdb/cockroach:latest
	      imagePullPolicy: IfNotPresent
	      ports:
	        - containerPort: 26257
	          name: grpc
	        - containerPort: 8080
	          name: http
	      livenessProbe:
	        httpGet:
	          path: "/health"
	          port: http
	        initialDelaySeconds: 30
	        periodSeconds: 5
	      readinessProbe:
	        httpGet:
	          path: "/health?ready=1"
	          port: http
	        initialDelaySeconds: 10
	        periodSeconds: 5
	        failureThreshold: 2
	      volumeMounts:
	        - name: datadir
	          mountPath: /cockroach/cockroach-data
	      env:
	        - name: COCKROACH_CHANNEL
	          value: kubernetes-insecure
	        # Populated from the container's CPU limit.
	        - name: GOMAXPROCS
	          valueFrom:
	            resourceFieldRef:
	              resource: limits.cpu
	              divisor: "1"
	        # Container memory limit expressed in MiB; used to size the caches below.
	        - name: MEMORY_LIMIT_MIB
	          valueFrom:
	            resourceFieldRef:
	              resource: limits.memory
	              divisor: "1Mi"
	      # The third item is one folded string passed to bash -ecx; --cache and
	      # --max-sql-memory each get a quarter of MEMORY_LIMIT_MIB.
	      command:
	        - "/bin/bash"
	        - "-ecx"
	        - exec
	          /cockroach/cockroach
	          start
	          --logtostderr
	          --insecure
	          --advertise-host $(hostname -f)
	          --http-addr 0.0.0.0
	          --join roach-seattle-1.cockroachdb,roach-newyork-1.cockroachdb,roach-london-1.cockroachdb
	          --cache $(expr $MEMORY_LIMIT_MIB / 4)MiB
	          --max-sql-memory $(expr $MEMORY_LIMIT_MIB / 4)MiB
	          --locality=region=us-west2,zone=b
	  terminationGracePeriodSeconds: 60
	  volumes:
	    - name: datadir
	      persistentVolumeClaim:
	        claimName: roach-seattle-2-data

	---
	# 1Gi data volume claim mounted by pod roach-seattle-2.
	apiVersion: v1
	kind: PersistentVolumeClaim
	metadata:
	  name: roach-seattle-2-data
	  labels:
	    app: cockroachdb
	spec:
	  accessModes:
	    - ReadWriteMany
	  volumeMode: Filesystem
	  storageClassName: standard
	  resources:
	    requests:
	      storage: 1Gi

	---
	# roach-seattle-3: CockroachDB node, locality region=us-west2 / zone=c.
	apiVersion: v1
	kind: Pod
	metadata:
	  name: roach-seattle-3
	  labels:
	    app: cockroachdb
	    region: us-west2
	spec:
	  hostname: roach-seattle-3
	  subdomain: cockroachdb
	  containers:
	    - name: roach-seattle-3
	      image: cockroachdb/cockroach:latest
	      imagePullPolicy: IfNotPresent
	      ports:
	        - containerPort: 26257
	          name: grpc
	        - containerPort: 8080
	          name: http
	      livenessProbe:
	        httpGet:
	          path: "/health"
	          port: http
	        initialDelaySeconds: 30
	        periodSeconds: 5
	      readinessProbe:
	        httpGet:
	          path: "/health?ready=1"
	          port: http
	        initialDelaySeconds: 10
	        periodSeconds: 5
	        failureThreshold: 2
	      volumeMounts:
	        - name: datadir
	          mountPath: /cockroach/cockroach-data
	      env:
	        - name: COCKROACH_CHANNEL
	          value: kubernetes-insecure
	        # Populated from the container's CPU limit.
	        - name: GOMAXPROCS
	          valueFrom:
	            resourceFieldRef:
	              resource: limits.cpu
	              divisor: "1"
	        # Container memory limit expressed in MiB; used to size the caches below.
	        - name: MEMORY_LIMIT_MIB
	          valueFrom:
	            resourceFieldRef:
	              resource: limits.memory
	              divisor: "1Mi"
	      # The third item is one folded string passed to bash -ecx; --cache and
	      # --max-sql-memory each get a quarter of MEMORY_LIMIT_MIB.
	      command:
	        - "/bin/bash"
	        - "-ecx"
	        - exec
	          /cockroach/cockroach
	          start
	          --logtostderr
	          --insecure
	          --advertise-host $(hostname -f)
	          --http-addr 0.0.0.0
	          --join roach-seattle-1.cockroachdb,roach-newyork-1.cockroachdb,roach-london-1.cockroachdb
	          --cache $(expr $MEMORY_LIMIT_MIB / 4)MiB
	          --max-sql-memory $(expr $MEMORY_LIMIT_MIB / 4)MiB
	          --locality=region=us-west2,zone=c
	  terminationGracePeriodSeconds: 60
	  volumes:
	    - name: datadir
	      persistentVolumeClaim:
	        claimName: roach-seattle-3-data

	---
	# 1Gi data volume claim mounted by pod roach-seattle-3.
	apiVersion: v1
	kind: PersistentVolumeClaim
	metadata:
	  name: roach-seattle-3-data
	  labels:
	    app: cockroachdb
	spec:
	  accessModes:
	    - ReadWriteMany
	  volumeMode: Filesystem
	  storageClassName: standard
	  resources:
	    requests:
	      storage: 1Gi


	---
	# roach-newyork-1: CockroachDB node, locality region=us-east4 / zone=a.
	apiVersion: v1
	kind: Pod
	metadata:
	  name: roach-newyork-1
	  labels:
	    app: cockroachdb
	    region: us-east4
	spec:
	  hostname: roach-newyork-1
	  subdomain: cockroachdb
	  containers:
	    - name: roach-newyork-1
	      image: cockroachdb/cockroach:latest
	      imagePullPolicy: IfNotPresent
	      ports:
	        - containerPort: 26257
	          name: grpc
	        - containerPort: 8080
	          name: http
	      livenessProbe:
	        httpGet:
	          path: "/health"
	          port: http
	        initialDelaySeconds: 30
	        periodSeconds: 5
	      readinessProbe:
	        httpGet:
	          path: "/health?ready=1"
	          port: http
	        initialDelaySeconds: 10
	        periodSeconds: 5
	        failureThreshold: 2
	      volumeMounts:
	        - name: datadir
	          mountPath: /cockroach/cockroach-data
	      env:
	        - name: COCKROACH_CHANNEL
	          value: kubernetes-insecure
	        # Populated from the container's CPU limit.
	        - name: GOMAXPROCS
	          valueFrom:
	            resourceFieldRef:
	              resource: limits.cpu
	              divisor: "1"
	        # Container memory limit expressed in MiB; used to size the caches below.
	        - name: MEMORY_LIMIT_MIB
	          valueFrom:
	            resourceFieldRef:
	              resource: limits.memory
	              divisor: "1Mi"
	      # The third item is one folded string passed to bash -ecx; --cache and
	      # --max-sql-memory each get a quarter of MEMORY_LIMIT_MIB.
	      command:
	        - "/bin/bash"
	        - "-ecx"
	        - exec
	          /cockroach/cockroach
	          start
	          --logtostderr
	          --insecure
	          --advertise-host $(hostname -f)
	          --http-addr 0.0.0.0
	          --join roach-seattle-1.cockroachdb,roach-newyork-1.cockroachdb,roach-london-1.cockroachdb
	          --cache $(expr $MEMORY_LIMIT_MIB / 4)MiB
	          --max-sql-memory $(expr $MEMORY_LIMIT_MIB / 4)MiB
	          --locality=region=us-east4,zone=a
	  terminationGracePeriodSeconds: 60
	  volumes:
	    - name: datadir
	      persistentVolumeClaim:
	        claimName: roach-newyork-1-data

	---
	# 1Gi data volume claim mounted by pod roach-newyork-1.
	apiVersion: v1
	kind: PersistentVolumeClaim
	metadata:
	  name: roach-newyork-1-data
	  labels:
	    app: cockroachdb
	spec:
	  accessModes:
	    - ReadWriteMany
	  volumeMode: Filesystem
	  storageClassName: standard
	  resources:
	    requests:
	      storage: 1Gi

	---
	# roach-newyork-2: CockroachDB node, locality region=us-east4 / zone=b.
	apiVersion: v1
	kind: Pod
	metadata:
	  name: roach-newyork-2
	  labels:
	    app: cockroachdb
	    region: us-east4
	spec:
	  hostname: roach-newyork-2
	  subdomain: cockroachdb
	  containers:
	    - name: roach-newyork-2
	      image: cockroachdb/cockroach:latest
	      imagePullPolicy: IfNotPresent
	      ports:
	        - containerPort: 26257
	          name: grpc
	        - containerPort: 8080
	          name: http
	      livenessProbe:
	        httpGet:
	          path: "/health"
	          port: http
	        initialDelaySeconds: 30
	        periodSeconds: 5
	      readinessProbe:
	        httpGet:
	          path: "/health?ready=1"
	          port: http
	        initialDelaySeconds: 10
	        periodSeconds: 5
	        failureThreshold: 2
	      volumeMounts:
	        - name: datadir
	          mountPath: /cockroach/cockroach-data
	      env:
	        - name: COCKROACH_CHANNEL
	          value: kubernetes-insecure
	        # Populated from the container's CPU limit.
	        - name: GOMAXPROCS
	          valueFrom:
	            resourceFieldRef:
	              resource: limits.cpu
	              divisor: "1"
	        # Container memory limit expressed in MiB; used to size the caches below.
	        - name: MEMORY_LIMIT_MIB
	          valueFrom:
	            resourceFieldRef:
	              resource: limits.memory
	              divisor: "1Mi"
	      # The third item is one folded string passed to bash -ecx; --cache and
	      # --max-sql-memory each get a quarter of MEMORY_LIMIT_MIB.
	      command:
	        - "/bin/bash"
	        - "-ecx"
	        - exec
	          /cockroach/cockroach
	          start
	          --logtostderr
	          --insecure
	          --advertise-host $(hostname -f)
	          --http-addr 0.0.0.0
	          --join roach-seattle-1.cockroachdb,roach-newyork-1.cockroachdb,roach-london-1.cockroachdb
	          --cache $(expr $MEMORY_LIMIT_MIB / 4)MiB
	          --max-sql-memory $(expr $MEMORY_LIMIT_MIB / 4)MiB
	          --locality=region=us-east4,zone=b
	  terminationGracePeriodSeconds: 60
	  volumes:
	    - name: datadir
	      persistentVolumeClaim:
	        claimName: roach-newyork-2-data

	---
	# 1Gi data volume claim mounted by pod roach-newyork-2.
	apiVersion: v1
	kind: PersistentVolumeClaim
	metadata:
	  name: roach-newyork-2-data
	  labels:
	    app: cockroachdb
	spec:
	  accessModes:
	    - ReadWriteMany
	  volumeMode: Filesystem
	  storageClassName: standard
	  resources:
	    requests:
	      storage: 1Gi

	---
	# roach-newyork-3: CockroachDB node, locality region=us-east4 / zone=c.
	apiVersion: v1
	kind: Pod
	metadata:
	  name: roach-newyork-3
	  labels:
	    app: cockroachdb
	    region: us-east4
	spec:
	  hostname: roach-newyork-3
	  subdomain: cockroachdb
	  containers:
	    - name: roach-newyork-3
	      image: cockroachdb/cockroach:latest
	      imagePullPolicy: IfNotPresent
	      ports:
	        - containerPort: 26257
	          name: grpc
	        - containerPort: 8080
	          name: http
	      livenessProbe:
	        httpGet:
	          path: "/health"
	          port: http
	        initialDelaySeconds: 30
	        periodSeconds: 5
	      readinessProbe:
	        httpGet:
	          path: "/health?ready=1"
	          port: http
	        initialDelaySeconds: 10
	        periodSeconds: 5
	        failureThreshold: 2
	      volumeMounts:
	        - name: datadir
	          mountPath: /cockroach/cockroach-data
	      env:
	        - name: COCKROACH_CHANNEL
	          value: kubernetes-insecure
	        # Populated from the container's CPU limit.
	        - name: GOMAXPROCS
	          valueFrom:
	            resourceFieldRef:
	              resource: limits.cpu
	              divisor: "1"
	        # Container memory limit expressed in MiB; used to size the caches below.
	        - name: MEMORY_LIMIT_MIB
	          valueFrom:
	            resourceFieldRef:
	              resource: limits.memory
	              divisor: "1Mi"
	      # The third item is one folded string passed to bash -ecx; --cache and
	      # --max-sql-memory each get a quarter of MEMORY_LIMIT_MIB.
	      command:
	        - "/bin/bash"
	        - "-ecx"
	        - exec
	          /cockroach/cockroach
	          start
	          --logtostderr
	          --insecure
	          --advertise-host $(hostname -f)
	          --http-addr 0.0.0.0
	          --join roach-seattle-1.cockroachdb,roach-newyork-1.cockroachdb,roach-london-1.cockroachdb
	          --cache $(expr $MEMORY_LIMIT_MIB / 4)MiB
	          --max-sql-memory $(expr $MEMORY_LIMIT_MIB / 4)MiB
	          --locality=region=us-east4,zone=c
	  terminationGracePeriodSeconds: 60
	  volumes:
	    - name: datadir
	      persistentVolumeClaim:
	        claimName: roach-newyork-3-data

	---
	# 1Gi data volume claim mounted by pod roach-newyork-3.
	apiVersion: v1
	kind: PersistentVolumeClaim
	metadata:
	  name: roach-newyork-3-data
	  labels:
	    app: cockroachdb
	spec:
	  accessModes:
	    - ReadWriteMany
	  volumeMode: Filesystem
	  storageClassName: standard
	  resources:
	    requests:
	      storage: 1Gi


	---
	# roach-london-1: CockroachDB node, locality region=eu-west2 / zone=a.
	apiVersion: v1
	kind: Pod
	metadata:
	  name: roach-london-1
	  labels:
	    app: cockroachdb
	    region: eu-west2
	spec:
	  hostname: roach-london-1
	  subdomain: cockroachdb
	  containers:
	    - name: roach-london-1
	      image: cockroachdb/cockroach:latest
	      imagePullPolicy: IfNotPresent
	      ports:
	        - containerPort: 26257
	          name: grpc
	        - containerPort: 8080
	          name: http
	      livenessProbe:
	        httpGet:
	          path: "/health"
	          port: http
	        initialDelaySeconds: 30
	        periodSeconds: 5
	      readinessProbe:
	        httpGet:
	          path: "/health?ready=1"
	          port: http
	        initialDelaySeconds: 10
	        periodSeconds: 5
	        failureThreshold: 2
	      volumeMounts:
	        - name: datadir
	          mountPath: /cockroach/cockroach-data
	      env:
	        - name: COCKROACH_CHANNEL
	          value: kubernetes-insecure
	        # Populated from the container's CPU limit.
	        - name: GOMAXPROCS
	          valueFrom:
	            resourceFieldRef:
	              resource: limits.cpu
	              divisor: "1"
	        # Container memory limit expressed in MiB; used to size the caches below.
	        - name: MEMORY_LIMIT_MIB
	          valueFrom:
	            resourceFieldRef:
	              resource: limits.memory
	              divisor: "1Mi"
	      # The third item is one folded string passed to bash -ecx; --cache and
	      # --max-sql-memory each get a quarter of MEMORY_LIMIT_MIB.
	      command:
	        - "/bin/bash"
	        - "-ecx"
	        - exec
	          /cockroach/cockroach
	          start
	          --logtostderr
	          --insecure
	          --advertise-host $(hostname -f)
	          --http-addr 0.0.0.0
	          --join roach-seattle-1.cockroachdb,roach-newyork-1.cockroachdb,roach-london-1.cockroachdb
	          --cache $(expr $MEMORY_LIMIT_MIB / 4)MiB
	          --max-sql-memory $(expr $MEMORY_LIMIT_MIB / 4)MiB
	          --locality=region=eu-west2,zone=a
	  terminationGracePeriodSeconds: 60
	  volumes:
	    - name: datadir
	      persistentVolumeClaim:
	        claimName: roach-london-1-data

	---
	# 1Gi data volume claim mounted by pod roach-london-1.
	apiVersion: v1
	kind: PersistentVolumeClaim
	metadata:
	  name: roach-london-1-data
	  labels:
	    app: cockroachdb
	spec:
	  accessModes:
	    - ReadWriteMany
	  volumeMode: Filesystem
	  storageClassName: standard
	  resources:
	    requests:
	      storage: 1Gi

	---
	# roach-london-2: CockroachDB node, locality region=eu-west2 / zone=b.
	apiVersion: v1
	kind: Pod
	metadata:
	  name: roach-london-2
	  labels:
	    app: cockroachdb
	    region: eu-west2
	spec:
	  hostname: roach-london-2
	  subdomain: cockroachdb
	  containers:
	    - name: roach-london-2
	      image: cockroachdb/cockroach:latest
	      imagePullPolicy: IfNotPresent
	      ports:
	        - containerPort: 26257
	          name: grpc
	        - containerPort: 8080
	          name: http
	      livenessProbe:
	        httpGet:
	          path: "/health"
	          port: http
	        initialDelaySeconds: 30
	        periodSeconds: 5
	      readinessProbe:
	        httpGet:
	          path: "/health?ready=1"
	          port: http
	        initialDelaySeconds: 10
	        periodSeconds: 5
	        failureThreshold: 2
	      volumeMounts:
	        - name: datadir
	          mountPath: /cockroach/cockroach-data
	      env:
	        - name: COCKROACH_CHANNEL
	          value: kubernetes-insecure
	        # Populated from the container's CPU limit.
	        - name: GOMAXPROCS
	          valueFrom:
	            resourceFieldRef:
	              resource: limits.cpu
	              divisor: "1"
	        # Container memory limit expressed in MiB; used to size the caches below.
	        - name: MEMORY_LIMIT_MIB
	          valueFrom:
	            resourceFieldRef:
	              resource: limits.memory
	              divisor: "1Mi"
	      # The third item is one folded string passed to bash -ecx; --cache and
	      # --max-sql-memory each get a quarter of MEMORY_LIMIT_MIB.
	      command:
	        - "/bin/bash"
	        - "-ecx"
	        - exec
	          /cockroach/cockroach
	          start
	          --logtostderr
	          --insecure
	          --advertise-host $(hostname -f)
	          --http-addr 0.0.0.0
	          --join roach-seattle-1.cockroachdb,roach-newyork-1.cockroachdb,roach-london-1.cockroachdb
	          --cache $(expr $MEMORY_LIMIT_MIB / 4)MiB
	          --max-sql-memory $(expr $MEMORY_LIMIT_MIB / 4)MiB
	          --locality=region=eu-west2,zone=b
	  terminationGracePeriodSeconds: 60
	  volumes:
	    - name: datadir
	      persistentVolumeClaim:
	        claimName: roach-london-2-data

	---
	# 1Gi data volume claim mounted by pod roach-london-2.
	apiVersion: v1
	kind: PersistentVolumeClaim
	metadata:
	  name: roach-london-2-data
	  labels:
	    app: cockroachdb
	spec:
	  accessModes:
	    - ReadWriteMany
	  volumeMode: Filesystem
	  storageClassName: standard
	  resources:
	    requests:
	      storage: 1Gi

	---
	# roach-london-3: CockroachDB node, locality region=eu-west2 / zone=c.
	apiVersion: v1
	kind: Pod
	metadata:
	  name: roach-london-3
	  labels:
	    app: cockroachdb
	    region: eu-west2
	spec:
	  hostname: roach-london-3
	  subdomain: cockroachdb
	  containers:
	    - name: roach-london-3
	      image: cockroachdb/cockroach:latest
	      imagePullPolicy: IfNotPresent
	      ports:
	        - containerPort: 26257
	          name: grpc
	        - containerPort: 8080
	          name: http
	      livenessProbe:
	        httpGet:
	          path: "/health"
	          port: http
	        initialDelaySeconds: 30
	        periodSeconds: 5
	      readinessProbe:
	        httpGet:
	          path: "/health?ready=1"
	          port: http
	        initialDelaySeconds: 10
	        periodSeconds: 5
	        failureThreshold: 2
	      volumeMounts:
	        - name: datadir
	          mountPath: /cockroach/cockroach-data
	      env:
	        - name: COCKROACH_CHANNEL
	          value: kubernetes-insecure
	        # Populated from the container's CPU limit.
	        - name: GOMAXPROCS
	          valueFrom:
	            resourceFieldRef:
	              resource: limits.cpu
	              divisor: "1"
	        # Container memory limit expressed in MiB; used to size the caches below.
	        - name: MEMORY_LIMIT_MIB
	          valueFrom:
	            resourceFieldRef:
	              resource: limits.memory
	              divisor: "1Mi"
	      # The third item is one folded string passed to bash -ecx; --cache and
	      # --max-sql-memory each get a quarter of MEMORY_LIMIT_MIB.
	      command:
	        - "/bin/bash"
	        - "-ecx"
	        - exec
	          /cockroach/cockroach
	          start
	          --logtostderr
	          --insecure
	          --advertise-host $(hostname -f)
	          --http-addr 0.0.0.0
	          --join roach-seattle-1.cockroachdb,roach-newyork-1.cockroachdb,roach-london-1.cockroachdb
	          --cache $(expr $MEMORY_LIMIT_MIB / 4)MiB
	          --max-sql-memory $(expr $MEMORY_LIMIT_MIB / 4)MiB
	          --locality=region=eu-west2,zone=c
	  terminationGracePeriodSeconds: 60
	  volumes:
	    - name: datadir
	      persistentVolumeClaim:
	        claimName: roach-london-3-data

	---
	# 1Gi data volume claim mounted by pod roach-london-3.
	apiVersion: v1
	kind: PersistentVolumeClaim
	metadata:
	  name: roach-london-3-data
	  labels:
	    app: cockroachdb
	spec:
	  accessModes:
	    - ReadWriteMany
	  volumeMode: Filesystem
	  storageClassName: standard
	  resources:
	    requests:
	      storage: 1Gi

	########################
	# INIT AND CONFIG JOBS #
	########################
	---
	# Job that runs "cockroach init" against roach-seattle-1; retried on failure.
	apiVersion: batch/v1
	kind: Job
	metadata:
	  name: cluster-init
	  labels:
	    app: cockroachdb
	spec:
	  template:
	    spec:
	      containers:
	        - name: cluster-init
	          image: cockroachdb/cockroach:latest
	          imagePullPolicy: IfNotPresent
	          command:
	            - "/cockroach/cockroach"
	            - "init"
	            - "--insecure"
	            - "--host=roach-seattle-1.cockroachdb"
	      restartPolicy: OnFailure

	---
	# Job that upserts coordinates for the three regions into system.locations
	# through roach-seattle-1; retried on failure.
	apiVersion: batch/v1
	kind: Job
	metadata:
	  name: cluster-sql-init
	  labels:
	    app: cockroachdb
	spec:
	  template:
	    spec:
	      containers:
	        - name: cluster-sql-init
	          image: cockroachdb/cockroach:latest
	          imagePullPolicy: IfNotPresent
	          command:
	            - "/cockroach/cockroach"
	            - "sql"
	            - "--insecure"
	            - "--url"
	            - "postgresql://roach-seattle-1.cockroachdb:26257/defaultdb?sslmode=disable"
	            - "-e"
	            - "UPSERT into system.locations VALUES ('region', 'us-east4', 37.478397, -76.453077), ('region', 'us-west2', 43.804133, -120.554201), ('region', 'eu-west2', 51.5073509, -0.1277583);"
	      restartPolicy: OnFailure
	#############
	# MINIO #
	#############
	---
	# NodePort Service exposing the pods labelled app=minio (port 9000) on node port 31900.
	apiVersion: v1
	kind: Service
	metadata:
	  name: minio
	  labels:
	    app: minio
	spec:
	  type: NodePort
	  ports:
	    # UI
	    - name: http
	      port: 9000
	      targetPort: 9000
	      nodePort: 31900
	  selector:
	    app: minio

	---
	# MinIO pod started with args "server /data"; /data is backed by the minio-data claim.
	apiVersion: v1
	kind: Pod
	metadata:
	  name: minio
	  labels:
	    app: minio
	spec:
	  hostname: minio
	  containers:
	    - name: minio
	      image: minio/minio
	      imagePullPolicy: IfNotPresent
	      ports:
	        - containerPort: 9000
	          name: http
	      volumeMounts:
	        - name: minio-data
	          mountPath: /data
	      args:
	        - server
	        - /data
	  volumes:
	    - name: minio-data
	      persistentVolumeClaim:
	        claimName: minio-data

	---
	# 1Gi volume claim referenced by the minio pod.
	apiVersion: v1
	kind: PersistentVolumeClaim
	metadata:
	  name: minio-data
	  labels:
	    app: cockroachdb
	spec:
	  accessModes:
	    - ReadWriteMany
	  volumeMode: Filesystem
	  storageClassName: standard
	  resources:
	    requests:
	      storage: 1Gi

	##################
	# PROMETHEUS #
	##################
	---
	# ConfigMap carrying the three files that the `prom` Pod mounts under
	# /etc/prometheus via subPath:
	#   prometheus.yml         - main Prometheus configuration
	#   alerts.rules.yml       - alerting rules
	#   aggregation.rules.yml  - recording (aggregation) rules
	# Each value is a literal block scalar (`|`) so the embedded YAML is handed
	# to Prometheus verbatim.
	apiVersion: v1
	kind: ConfigMap
	metadata:
	  name: prometheus-config
	data:
	  prometheus.yml: |
	    ---
	    global:
	      scrape_interval: 10s
	      evaluation_interval: 10s

	    rule_files:
	      # what to alert for
	      - /etc/prometheus/alerts.rules.yml
	      # what metrics to collect
	      - /etc/prometheus/aggregation.rules.yml

	    # whom to alert
	    alerting:
	      alertmanagers:
	        - static_configs:
	            - targets:
	                # Alertmanager is published by the `alertmgr` Service on 9093.
	                - alertmgr:9093

	    scrape_configs:
	      - job_name: "cockroachdb"
	        metrics_path: "/_status/vars"
	        scheme: "http"
	        tls_config:
	          insecure_skip_verify: true
	        static_configs:
	          # what hosts to monitor
	          - targets:
	              - roach-seattle-1.cockroachdb:8080
	              - roach-seattle-2.cockroachdb:8080
	              - roach-seattle-3.cockroachdb:8080
	              - roach-newyork-1.cockroachdb:8080
	              - roach-newyork-2.cockroachdb:8080
	              - roach-newyork-3.cockroachdb:8080
	              - roach-london-1.cockroachdb:8080
	              - roach-london-2.cockroachdb:8080
	              - roach-london-3.cockroachdb:8080
	            labels:
	              cluster: "crdb"
	  alerts.rules.yml: |
	    groups:
	      - name: rules/alerts.rules
	        rules:
	          # Alert for any instance that is unreachable for more than 1 minute.
	          - alert: InstanceDead
	            expr: up{job="cockroachdb"} == 0
	            for: 1m
	            annotations:
	              description: '{{ $labels.instance }} for cluster {{ $labels.cluster }} has been down for more than 1 minute.'
	              summary: Instance {{ $labels.instance }} dead
	          # Alert for any instance that is not ready for a while.
	          - alert: InstanceNotReady
	            # This alert applies only to Kubernetes deployments and requires that you run kube-state-metrics: https://github.com/kubernetes/kube-state-metrics
	            expr: kube_statefulset_status_replicas_ready{statefulset="cockroachdb"} != kube_statefulset_status_replicas{statefulset="cockroachdb"}
	            for: 45m
	            annotations:
	              description: 'there has been an unready replica for cluster {{ $labels.cluster }} for more than 45 minutes.'
	              summary: Instance not ready
	          # Alert on instance restarts.
	          - alert: InstanceRestart
	            expr: resets(sys_uptime{job="cockroachdb"}[24h]) > 1
	            annotations:
	              description: '{{ $labels.instance }} for cluster {{ $labels.cluster }} restarted {{ $value }} time(s) in 24h'
	              summary: Instance {{ $labels.instance }} restarted
	          # Alert on flapping instances (frequent restarts).
	          - alert: InstancesFlapping
	            # Aggregated.
	            # This alert assumes that rolling restarts or rolling upgrades leave at least 3 minutes between each node being updated or restarted.
	            expr: sum by (cluster)(resets(sys_uptime{job="cockroachdb"}[5m])) > 2
	            annotations:
	              description: 'instances in cluster {{ $labels.cluster }} restarted {{ $value }} time(s) in 5m'
	              summary: Instances in {{ $labels.cluster }} flapping
	          # Alert on flapping instances (frequent restarts).
	          - alert: InstanceFlapping
	            # Un-aggregated.
	            expr: resets(sys_uptime{job="cockroachdb"}[10m]) > 1
	            annotations:
	              description: '{{ $labels.instance }} for cluster {{ $labels.cluster }} restarted {{ $value }} time(s) in 10m'
	              summary: Instance {{ $labels.instance }} flapping
	          # Alert on version mismatch.
	          # This alert is intentionally loose (4 hours) to allow for rolling upgrades.
	          # This may need to be adjusted for large clusters.
	          - alert: VersionMismatch
	            # Folded scalar: the lines below are joined with single spaces.
	            expr: >-
	              count by(cluster) (count_values by(tag, cluster) ("version", build_timestamp{job="cockroachdb"}))
	              > 1
	            for: 4h
	            annotations:
	              description: Cluster {{ $labels.cluster }} running {{ $value }} different versions
	              summary: Binary version mismatch on {{ $labels.cluster }}
	          # Available capacity alerts.
	          - alert: StoreDiskLow
	            expr: capacity_available:ratio{job="cockroachdb"} < 0.15
	            annotations:
	              summary: Store {{ $labels.store }} on node {{ $labels.instance }} at {{ $value }} available disk fraction
	          - alert: ClusterDiskLow
	            expr: cluster:capacity_available:ratio{job="cockroachdb"} < 0.2
	            annotations:
	              summary: Cluster {{ $labels.cluster }} at {{ $value }} available disk fraction
	          # Unavailable ranges.
	          - alert: UnavailableRanges
	            expr: (sum by(instance, cluster) (ranges_unavailable{job="cockroachdb"})) > 0
	            for: 10m
	            annotations:
	              summary: Instance {{ $labels.instance }} has {{ $value }} unavailable ranges
	          # Cockroach-measured clock offset nearing limit (by default, servers kill themselves at 400ms from the mean, so alert at 300ms)
	          - alert: ClockOffsetNearMax
	            expr: clock_offset_meannanos{job="cockroachdb"} > 300 * 1000 * 1000
	            for: 5m
	            annotations:
	              summary: Clock on {{ $labels.instance }} as measured by cockroach is offset by {{ $value }} nanoseconds from the cluster mean
	          # Certificate expiration. Alerts are per node.
	          - alert: CACertificateExpiresSoon
	            expr: >-
	              (security_certificate_expiration_ca{job="cockroachdb"} > 0) and (security_certificate_expiration_ca{job="cockroachdb"}
	              - time()) < 86400 * 366
	            labels:
	              frequency: daily
	            annotations:
	              summary: CA certificate for {{ $labels.instance }} expires in less than a year
	          - alert: ClientCACertificateExpiresSoon
	            expr: >-
	              (security_certificate_expiration_client_ca{job="cockroachdb"} > 0) and (security_certificate_expiration_client_ca{job="cockroachdb"}
	              - time()) < 86400 * 366
	            labels:
	              frequency: daily
	            annotations:
	              summary: Client CA certificate for {{ $labels.instance }} expires in less than a year
	          - alert: UICACertificateExpiresSoon
	            expr: >-
	              (security_certificate_expiration_ui_ca{job="cockroachdb"} > 0) and (security_certificate_expiration_ui_ca{job="cockroachdb"}
	              - time()) < 86400 * 366
	            labels:
	              frequency: daily
	            annotations:
	              summary: UI CA certificate for {{ $labels.instance }} expires in less than a year
	          - alert: NodeCertificateExpiresSoon
	            expr: >-
	              (security_certificate_expiration_node{job="cockroachdb"} > 0) and (security_certificate_expiration_node{job="cockroachdb"}
	              - time()) < 86400 * 183
	            labels:
	              frequency: daily
	            annotations:
	              summary: Node certificate for {{ $labels.instance }} expires in less than six months
	          - alert: NodeClientCertificateExpiresSoon
	            expr: >-
	              (security_certificate_expiration_node_client{job="cockroachdb"} > 0) and (security_certificate_expiration_node_client{job="cockroachdb"}
	              - time()) < 86400 * 183
	            labels:
	              frequency: daily
	            annotations:
	              summary: Client certificate for {{ $labels.instance }} expires in less than six months
	          - alert: UICertificateExpiresSoon
	            expr: >-
	              (security_certificate_expiration_ui{job="cockroachdb"} > 0) and (security_certificate_expiration_ui{job="cockroachdb"}
	              - time()) < 86400 * 20
	            labels:
	              frequency: daily
	            annotations:
	              summary: UI certificate for {{ $labels.instance }} expires in less than 20 days
	          # Slow Latch/Lease/Raft requests.
	          - alert: SlowLatchRequest
	            expr: requests_slow_latch{job="cockroachdb"} > 0
	            for: 5m
	            labels:
	              severity: testing
	            annotations:
	              summary: '{{ $value }} slow latch requests on {{ $labels.instance }}'
	          - alert: SlowLeaseRequest
	            expr: requests_slow_lease{job="cockroachdb"} > 0
	            for: 5m
	            labels:
	              severity: testing
	            annotations:
	              summary: '{{ $value }} slow lease requests on {{ $labels.instance }}'
	          - alert: SlowRaftRequest
	            expr: requests_slow_raft{job="cockroachdb"} > 0
	            for: 5m
	            labels:
	              severity: testing
	            annotations:
	              summary: '{{ $value }} slow raft requests on {{ $labels.instance }}'
	          # Getting close to open file descriptor limit.
	          - alert: HighOpenFDCount
	            expr: sys_fd_open{job="cockroachdb"} / sys_fd_softlimit{job="cockroachdb"} > 0.8
	            for: 10m
	            annotations:
	              summary: 'Too many open file descriptors on {{ $labels.instance }}: {{ $value }} fraction used'
	          # Prometheus disk getting full.
	          - alert: PrometheusDiskLow
	            expr: >-
	              node_filesystem_free{cluster="prometheus",job="node_exporter_prometheus",mountpoint="/data"}
	              / node_filesystem_size{cluster="prometheus",job="node_exporter_prometheus",mountpoint="/data"}
	              < 0.2
	            for: 10m
	            labels:
	              severity: testing
	            annotations:
	              summary: 'Prometheus storage is almost full: {{ $value }} fraction free'
	  aggregation.rules.yml: |
	    # This file contains aggregation rules, specifically:
	    # "node:X" node-level aggregation of a per-store metric X
	    # "cluster:X" cluster-level aggregation of a per-store or per-node metric X
	    #
	    # Most aggregation rules should use the "without (label1, label2, ...)" keyword
	    # to keep all labels but the ones specified.

	    groups:
	      - name: rules/aggregation.rules
	        rules:
	          - record: node:capacity
	            expr: sum without(store) (capacity{job="cockroachdb"})
	          - record: cluster:capacity
	            expr: sum without(instance) (node:capacity{job="cockroachdb"})
	          - record: node:capacity_available
	            expr: sum without(store) (capacity_available{job="cockroachdb"})
	          - record: cluster:capacity_available
	            expr: sum without(instance) (node:capacity_available{job="cockroachdb"})
	          - record: capacity_available:ratio
	            expr: capacity_available{job="cockroachdb"} / capacity{job="cockroachdb"}
	          - record: node:capacity_available:ratio
	            expr: node:capacity_available{job="cockroachdb"} / node:capacity{job="cockroachdb"}
	          - record: cluster:capacity_available:ratio
	            expr: cluster:capacity_available{job="cockroachdb"} / cluster:capacity{job="cockroachdb"}
	          # Histogram rules: these are fairly expensive to compute live, so we precompute a few percentiles.
	          - record: txn_durations_bucket:rate1m
	            expr: rate(txn_durations_bucket{job="cockroachdb"}[1m])
	          - record: txn_durations:rate1m:quantile_50
	            expr: histogram_quantile(0.5, txn_durations_bucket:rate1m)
	          - record: txn_durations:rate1m:quantile_75
	            expr: histogram_quantile(0.75, txn_durations_bucket:rate1m)
	          - record: txn_durations:rate1m:quantile_90
	            expr: histogram_quantile(0.9, txn_durations_bucket:rate1m)
	          - record: txn_durations:rate1m:quantile_95
	            expr: histogram_quantile(0.95, txn_durations_bucket:rate1m)
	          - record: txn_durations:rate1m:quantile_99
	            expr: histogram_quantile(0.99, txn_durations_bucket:rate1m)
	          - record: exec_latency_bucket:rate1m
	            expr: rate(exec_latency_bucket{job="cockroachdb"}[1m])
	          - record: exec_latency:rate1m:quantile_50
	            expr: histogram_quantile(0.5, exec_latency_bucket:rate1m)
	          - record: exec_latency:rate1m:quantile_75
	            expr: histogram_quantile(0.75, exec_latency_bucket:rate1m)
	          - record: exec_latency:rate1m:quantile_90
	            expr: histogram_quantile(0.9, exec_latency_bucket:rate1m)
	          - record: exec_latency:rate1m:quantile_95
	            expr: histogram_quantile(0.95, exec_latency_bucket:rate1m)
	          - record: exec_latency:rate1m:quantile_99
	            expr: histogram_quantile(0.99, exec_latency_bucket:rate1m)
	          - record: round_trip_latency_bucket:rate1m
	            expr: rate(round_trip_latency_bucket{job="cockroachdb"}[1m])
	          - record: round_trip_latency:rate1m:quantile_50
	            expr: histogram_quantile(0.5, round_trip_latency_bucket:rate1m)
	          - record: round_trip_latency:rate1m:quantile_75
	            expr: histogram_quantile(0.75, round_trip_latency_bucket:rate1m)
	          - record: round_trip_latency:rate1m:quantile_90
	            expr: histogram_quantile(0.9, round_trip_latency_bucket:rate1m)
	          - record: round_trip_latency:rate1m:quantile_95
	            expr: histogram_quantile(0.95, round_trip_latency_bucket:rate1m)
	          - record: round_trip_latency:rate1m:quantile_99
	            expr: histogram_quantile(0.99, round_trip_latency_bucket:rate1m)
	          - record: sql_exec_latency_bucket:rate1m
	            expr: rate(sql_exec_latency_bucket{job="cockroachdb"}[1m])
	          - record: sql_exec_latency:rate1m:quantile_50
	            expr: histogram_quantile(0.5, sql_exec_latency_bucket:rate1m)
	          - record: sql_exec_latency:rate1m:quantile_75
	            expr: histogram_quantile(0.75, sql_exec_latency_bucket:rate1m)
	          - record: sql_exec_latency:rate1m:quantile_90
	            expr: histogram_quantile(0.9, sql_exec_latency_bucket:rate1m)
	          - record: sql_exec_latency:rate1m:quantile_95
	            expr: histogram_quantile(0.95, sql_exec_latency_bucket:rate1m)
	          - record: sql_exec_latency:rate1m:quantile_99
	            expr: histogram_quantile(0.99, sql_exec_latency_bucket:rate1m)
	          - record: raft_process_logcommit_latency_bucket:rate1m
	            expr: rate(raft_process_logcommit_latency_bucket{job="cockroachdb"}[1m])
	          - record: raft_process_logcommit_latency:rate1m:quantile_50
	            expr: histogram_quantile(0.5, raft_process_logcommit_latency_bucket:rate1m)
	          - record: raft_process_logcommit_latency:rate1m:quantile_75
	            expr: histogram_quantile(0.75, raft_process_logcommit_latency_bucket:rate1m)
	          - record: raft_process_logcommit_latency:rate1m:quantile_90
	            expr: histogram_quantile(0.9, raft_process_logcommit_latency_bucket:rate1m)
	          - record: raft_process_logcommit_latency:rate1m:quantile_95
	            expr: histogram_quantile(0.95, raft_process_logcommit_latency_bucket:rate1m)
	          - record: raft_process_logcommit_latency:rate1m:quantile_99
	            expr: histogram_quantile(0.99, raft_process_logcommit_latency_bucket:rate1m)
	          - record: raft_process_commandcommit_latency_bucket:rate1m
	            expr: rate(raft_process_commandcommit_latency_bucket{job="cockroachdb"}[1m])
	          - record: raft_process_commandcommit_latency:rate1m:quantile_50
	            expr: histogram_quantile(0.5, raft_process_commandcommit_latency_bucket:rate1m)
	          - record: raft_process_commandcommit_latency:rate1m:quantile_75
	            expr: histogram_quantile(0.75, raft_process_commandcommit_latency_bucket:rate1m)
	          - record: raft_process_commandcommit_latency:rate1m:quantile_90
	            expr: histogram_quantile(0.9, raft_process_commandcommit_latency_bucket:rate1m)
	          - record: raft_process_commandcommit_latency:rate1m:quantile_95
	            expr: histogram_quantile(0.95, raft_process_commandcommit_latency_bucket:rate1m)
	          - record: raft_process_commandcommit_latency:rate1m:quantile_99
	            expr: histogram_quantile(0.99, raft_process_commandcommit_latency_bucket:rate1m)

	---
	# NodePort Service exposing the Prometheus UI (node port 31990 -> 9090).
	apiVersion: v1
	kind: Service
	metadata:
	  name: prom
	  labels:
	    app: prom
	spec:
	  type: NodePort
	  ports:
	    # UI
	    - name: http
	      port: 9090
	      targetPort: 9090
	      nodePort: 31990
	  # `app: cockroachdb` alone also matches the alertmgr Pod, which carries the
	  # same label; the extra key narrows the selector to the prom Pod below.
	  selector:
	    app: cockroachdb
	    app.kubernetes.io/name: prometheus

	---
	# Prometheus Pod. Its three configuration files come from the
	# `prometheus-config` ConfigMap, each mounted individually via subPath.
	apiVersion: v1
	kind: Pod
	metadata:
	  name: prom
	  labels:
	    # Kept so existing `app: cockroachdb` selectors still match this Pod.
	    app: cockroachdb
	    # Distinguishing label used by the prom Service selector.
	    app.kubernetes.io/name: prometheus
	spec:
	  hostname: prom
	  #subdomain: cockroachdb
	  containers:
	    - name: prom
	      image: prom/prometheus
	      imagePullPolicy: IfNotPresent
	      ports:
	        # Same port the prom Service targets (was declared as 9000).
	        - containerPort: 9090
	          name: http
	      volumeMounts:
	        - name: prometheus-config
	          mountPath: /etc/prometheus/prometheus.yml
	          subPath: prometheus.yml
	        - name: prometheus-config
	          mountPath: /etc/prometheus/aggregation.rules.yml
	          subPath: aggregation.rules.yml
	        - name: prometheus-config
	          mountPath: /etc/prometheus/alerts.rules.yml
	          subPath: alerts.rules.yml

	  volumes:
	    - name: prometheus-config
	      configMap:
	        name: prometheus-config

	################
	# ALERTMANAGER #
	################
	---
	# NodePort Service exposing the Alertmanager UI (node port 31993 -> 9093).
	apiVersion: v1
	kind: Service
	metadata:
	  name: alertmgr
	  labels:
	    app: alertmgr
	spec:
	  type: NodePort
	  ports:
	    # UI
	    - name: http
	      port: 9093
	      targetPort: 9093
	      nodePort: 31993
	  # `app: cockroachdb` alone also matches the prom Pod, which carries the
	  # same label; the extra key narrows the selector to the alertmgr Pod below.
	  selector:
	    app: cockroachdb
	    app.kubernetes.io/name: alertmanager
	---
	# Alertmanager Pod, declaring port 9093 under the name "http".
	apiVersion: v1
	kind: Pod
	metadata:
	  name: alertmgr
	  labels:
	    # Kept so existing `app: cockroachdb` selectors still match this Pod.
	    app: cockroachdb
	    # Distinguishing label used by the alertmgr Service selector.
	    app.kubernetes.io/name: alertmanager
	spec:
	  hostname: alertmgr
	  #subdomain: cockroachdb
	  containers:
	    - name: alertmgr
	      # NOTE(review): a `latest` tag with IfNotPresent reuses whatever image
	      # a node has already cached; consider pinning a version.
	      image: quay.io/prometheus/alertmanager:latest
	      imagePullPolicy: IfNotPresent
	      ports:
	        - containerPort: 9093
	          name: http

	###########
	# GRAFANA #
	###########
	---
	# NodePort Service for the Grafana UI: node port 32000 forwards to port
	# 3000 on pods labelled `app: grafana`.
	apiVersion: v1
	kind: Service
	metadata:
	  name: grafana
	  labels:
	    app: grafana
	spec:
	  type: NodePort
	  selector:
	    app: grafana
	  ports:
	    # UI
	    - name: http
	      nodePort: 32000
	      port: 3000
	      targetPort: 3000
	---
	# Grafana Pod. The `app: grafana` label below is what the grafana Service
	# selector matches.
	apiVersion: v1
	kind: Pod
	metadata:
	name: grafana
	labels:
	app: grafana
	spec:
	hostname: grafana
	containers:
	- name: grafana
	image: grafana/grafana
	imagePullPolicy: IfNotPresent
	ports:
	# Same port number the grafana Service uses as its targetPort.
	- containerPort: 3000
	name: http