@rewanthtammana
Created December 20, 2022 08:59
---
rule_files:
  - loki.all.rules.yml
tests:
  - interval: 1m
    input_series:
      - series: 'cortex_ring_members{container="service", job="zj88t-prometheus/workload-zj88t/0", name="service", namespace="loki", node="ip-10-6-2-178.eu-central-1.compute.internal", pod="loki-service-676b8c897b-rq298", provider="aws", service_priority="highest", state="Unhealthy"}'
        values: "0+0x20 1+0x160" # 1 unhealthy value after 20 minutes
      - series: 'loki_panic_total{app="loki-service", container="service", job="zj88t-prometheus/workload-zj88t/0", namespace="loki", node="ip-10-6-2-178.eu-central-1.compute.internal", pod="loki-service-676b8c897b-rq29", provider="aws", service_priority="highest"}'
        values: "0+0x20 1+0x160" # 1 panic after 20 minutes
      - series: 'loki_request_duration_seconds_count{app="loki-distributor", container="distributor", job="zj88t-prometheus/workload-zj88t/0", method="POST", namespace="loki", node="ip-10-6-2-141.eu-central-1.compute.internal", pod="loki-distributor-74b78f5559-tz6zs", provider="aws", route="loki_api_v1_push", service_priority="highest", status_code="204", ws="false"}'
        values: "0+60x180" # 1 request per second OK for 3 hours
      - series: 'loki_request_duration_seconds_count{app="loki-distributor", container="distributor", job="zj88t-prometheus/workload-zj88t/0", method="POST", namespace="loki", node="ip-10-6-2-141.eu-central-1.compute.internal", pod="loki-distributor-74b78f5559-tz6zs", provider="aws", route="loki_api_v1_push", service_priority="highest", status_code="503", ws="false"}'
        values: "0+0x20 0+30x160" # After 20 minutes, we also have 0.5 rq/s failing
    alert_rule_test:
      - alertname: LokiRequestPanics
        eval_time: 15m # should be OK after 15 minutes
        exp_alerts:
      - alertname: LokiRequestPanics
        eval_time: 25m # After 25 minutes, the alert should fire for the panic introduced at t+20m
        exp_alerts:
          - exp_labels:
              area: services
              cancel_if_apiserver_down: "true"
              cancel_if_cluster_status_creating: "true"
              cancel_if_cluster_status_deleting: "true"
              cancel_if_cluster_status_updating: "true"
              cancel_if_outside_working_hours: "false"
              cancel_if_scrape_timeout: "true"
              job: zj88t-prometheus/workload-zj88t/0
              namespace: loki
              severity: page
              topic: observability
            exp_annotations:
              description: This alert checks that we have no panic errors on Loki.
      - alertname: LokiRequestPanics
        eval_time: 40m # After 40 minutes, all should be back to normal
        exp_alerts:
      - alertname: LokiRequestErrors
        eval_time: 15m # should be OK after 15 minutes
        exp_alerts:
      - alertname: LokiRequestErrors
        eval_time: 160m # the alert fires after more than 120 minutes of the incident
        exp_alerts:
          - exp_labels:
              area: services
              cancel_if_apiserver_down: "true"
              cancel_if_cluster_status_creating: "true"
              cancel_if_cluster_status_deleting: "true"
              cancel_if_cluster_status_updating: "true"
              cancel_if_outside_working_hours: "false"
              cancel_if_scrape_timeout: "true"
              job: zj88t-prometheus/workload-zj88t/0
              namespace: loki
              route: loki_api_v1_push
              severity: page
              topic: observability
            exp_annotations:
              description: This alert checks that we have less than 10% errors on Loki requests.
      - alertname: LokiRingUnhealthy
        eval_time: 15m # should be OK after 15 minutes
        exp_alerts:
      - alertname: LokiRingUnhealthy
        eval_time: 25m # after 25 minutes we have an unhealthy member, but we want to filter out short-lived events, so no alert yet
        exp_alerts:
      - alertname: LokiRingUnhealthy
        eval_time: 40m # now the event has lasted 20 minutes, so we should have an alert
        exp_alerts:
          - exp_labels:
              app: loki-service
              cancel_if_apiserver_down: "true"
              cancel_if_cluster_status_creating: "true"
              cancel_if_cluster_status_deleting: "true"
              cancel_if_cluster_status_updating: "true"
              cancel_if_scrape_timeout: "true"
              cancel_if_outside_working_hours: "true"
              container: service
              name: service
              namespace: loki
              pod: loki-service-676b8c897b-rq29
              severity: page
              topic: observability
            exp_annotations:
              description: "Loki pod loki-service-676b8c897b-rq298 (namespace loki) sees 1 unhealthy ring members"