ceph monitoring
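# Grafana alert-rule provisioning file covering the standard Ceph health checks
# (apiVersion: 1; one rule group per subsystem: cluster health, mon, osd, mds, mgr,
# pgs, nodes, pools, healthchecks, cephadm, PrometheusServer, rados, generic).
# Every rule evaluates a single instant PromQL query (refId A) against a Prometheus
# datasource with UID "ceph", over a relativeTimeRange of the last 300 seconds.
# Assuming Grafana 9+ file provisioning, a file like this is typically placed in the
# Grafana provisioning/alerting/ directory and loaded at startup; "folder" is the
# Grafana folder the rules appear in, and "interval" is the group evaluation interval.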
apiVersion: 1
groups:
- folder: ceph-alerts
interval: 10s
name: cluster health
orgId: 1
rules:
- annotations:
description: The cluster state has been HEALTH_ERROR for more than 5 minutes.
Please check 'ceph health detail' for more information.
summary: Ceph is in the ERROR state
condition: A
data:
- datasourceUid: ceph
model:
disableTextWrap: false
editorMode: code
expr: ceph_health_status == 2
fullMetaSearch: false
includeNullMetadata: true
instant: true
intervalMs: 10000
legendFormat: __auto
maxDataPoints: 43200
range: false
refId: A
useBackend: false
refId: A
relativeTimeRange:
from: 300
to: 0
execErrState: Error
for: 5m
isPaused: false
labels:
oid: 1.3.6.1.4.1.50495.1.2.1.2.1
severity: critical
type: ceph_default
noDataState: OK
title: CephHealthError
uid: ceph-ceph_health_error
- annotations:
description: The cluster state has been HEALTH_WARN for more than 15 minutes.
Please check 'ceph health detail' for more information.
summary: Ceph is in the WARNING state
condition: A
data:
- datasourceUid: ceph
model:
disableTextWrap: false
editorMode: code
expr: ceph_health_status == 1
fullMetaSearch: false
includeNullMetadata: true
instant: true
intervalMs: 10000
legendFormat: __auto
maxDataPoints: 43200
range: false
refId: A
useBackend: false
refId: A
relativeTimeRange:
from: 300
to: 0
execErrState: Error
for: 15m
isPaused: false
labels:
severity: warning
type: ceph_default
noDataState: OK
title: CephHealthWarning
uid: ceph-ceph_health_warning
- folder: ceph-alerts
interval: 10s
name: mon
orgId: 1
rules:
- annotations:
description: '{{ $min := query "floor(count(ceph_mon_metadata) / 2) + 1" | first
| value }}Quorum requires a majority of monitors (x {{ $min }}) to be active.
Without quorum the cluster will become inoperable, affecting all services
and connected clients. The following monitors are down: {{- range query "(ceph_mon_quorum_status
== 0) + on(ceph_daemon) group_left(hostname) (ceph_mon_metadata * 0)" }} -
{{ .Labels.ceph_daemon }} on {{ .Labels.hostname }} {{- end }}'
documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#mon-down
summary: Monitor quorum is at risk
condition: A
data:
- datasourceUid: ceph
model:
disableTextWrap: false
editorMode: code
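# The expression below fires only when MON_DOWN is reported AND the number of
# monitors still in quorum equals the bare majority, floor(n/2) + 1, meaning the
# loss of one more monitor would break quorum.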
expr: "(\n (ceph_health_detail{name=\"MON_DOWN\"} == 1) * on() (\n count(ceph_mon_quorum_status\
\ == 1) == bool (floor(count(ceph_mon_metadata) / 2) + 1)\n )\n) == 1\n"
fullMetaSearch: false
includeNullMetadata: true
instant: true
intervalMs: 10000
legendFormat: __auto
maxDataPoints: 43200
range: false
refId: A
useBackend: false
refId: A
relativeTimeRange:
from: 300
to: 0
execErrState: Error
for: 30s
isPaused: false
labels:
oid: 1.3.6.1.4.1.50495.1.2.1.3.1
severity: critical
type: ceph_default
noDataState: OK
title: CephMonDownQuorumAtRisk
uid: ceph-ceph_mon_down_quorum_at_risk
- annotations:
description: '{{ $down := query "count(ceph_mon_quorum_status == 0)" | first
| value }}{{ $s := "" }}{{ if gt $down 1.0 }}{{ $s = "s" }}{{ end }}You have
{{ $down }} monitor{{ $s }} down. Quorum is still intact, but the loss of
an additional monitor will make your cluster inoperable. The following monitors
are down: {{- range query "(ceph_mon_quorum_status == 0) + on(ceph_daemon)
group_left(hostname) (ceph_mon_metadata * 0)" }} - {{ .Labels.ceph_daemon
}} on {{ .Labels.hostname }} {{- end }}
'
documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#mon-down
summary: One or more monitors down
condition: A
data:
- datasourceUid: ceph
model:
disableTextWrap: false
editorMode: code
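# The expression counts monitors reporting out of quorum (ceph_mon_quorum_status == 0);
# it fires while at least one monitor is down but the count stays within the tolerance
# derived from the total monitor count, i.e. quorum itself is presumed intact (the
# critical quorum-at-risk rule above covers the other case).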
expr: 'count(ceph_mon_quorum_status == 0) <= (count(ceph_mon_metadata) - floor(count(ceph_mon_metadata)
/ 2) + 1)
'
fullMetaSearch: false
includeNullMetadata: true
instant: true
intervalMs: 10000
legendFormat: __auto
maxDataPoints: 43200
range: false
refId: A
useBackend: false
refId: A
relativeTimeRange:
from: 300
to: 0
execErrState: Error
for: 30s
isPaused: false
labels:
severity: warning
type: ceph_default
noDataState: OK
title: CephMonDown
uid: ceph-ceph_mon_down
- annotations:
description: The free space available to a monitor's store is critically low.
You should increase the space available to the monitor(s). The default directory
is /var/lib/ceph/mon-*/data/store.db on traditional deployments, and /var/lib/rook/mon-*/data/store.db
on the mon pod's worker node for Rook. Look for old, rotated versions of *.log
and MANIFEST*. Do NOT touch any *.sst files. Also check any other directories
under /var/lib/rook and other directories on the same filesystem, often /var/log
and /var/tmp are culprits. Your monitor hosts are: {{- range query "ceph_mon_metadata"}}
- {{ .Labels.hostname }} {{- end }}
documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#mon-disk-crit
summary: Filesystem space on at least one monitor is critically low
condition: A
data:
- datasourceUid: ceph
model:
disableTextWrap: false
editorMode: code
expr: ceph_health_detail{name="MON_DISK_CRIT"} == 1
fullMetaSearch: false
includeNullMetadata: true
instant: true
intervalMs: 10000
legendFormat: __auto
maxDataPoints: 43200
range: false
refId: A
useBackend: false
refId: A
relativeTimeRange:
from: 300
to: 0
execErrState: Error
for: 1m
isPaused: false
labels:
oid: 1.3.6.1.4.1.50495.1.2.1.3.2
severity: critical
type: ceph_default
noDataState: OK
title: CephMonDiskspaceCritical
uid: ceph-ceph_mon_diskspace_critical
- annotations:
description: The space available to a monitor's store is approaching full (>70%
is the default). You should increase the space available to the monitor(s).
The default directory is /var/lib/ceph/mon-*/data/store.db on traditional
deployments, and /var/lib/rook/mon-*/data/store.db on the mon pod's worker
node for Rook. Look for old, rotated versions of *.log and MANIFEST*. Do
NOT touch any *.sst files. Also check any other directories under /var/lib/rook
and other directories on the same filesystem, often /var/log and /var/tmp
are culprits. Your monitor hosts are: {{- range query "ceph_mon_metadata"}}
- {{ .Labels.hostname }} {{- end }}
documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#mon-disk-low
summary: Drive space on at least one monitor is approaching full
condition: A
data:
- datasourceUid: ceph
model:
disableTextWrap: false
editorMode: code
expr: ceph_health_detail{name="MON_DISK_LOW"} == 1
fullMetaSearch: false
includeNullMetadata: true
instant: true
intervalMs: 10000
legendFormat: __auto
maxDataPoints: 43200
range: false
refId: A
useBackend: false
refId: A
relativeTimeRange:
from: 300
to: 0
execErrState: Error
for: 5m
isPaused: false
labels:
severity: warning
type: ceph_default
noDataState: OK
title: CephMonDiskspaceLow
uid: ceph-ceph_mon_diskspace_low
- annotations:
description: Ceph monitors rely on closely synchronized time to maintain quorum
and cluster consistency. This event indicates that the time on at least one
mon has drifted too far from the lead mon. Review cluster status with ceph
-s. This will show which monitors are affected. Check the time sync status
on each monitor host with 'ceph time-sync-status' and the state and peers
of your ntpd or chrony daemon.
documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#mon-clock-skew
summary: Clock skew detected among monitors
condition: A
data:
- datasourceUid: ceph
model:
disableTextWrap: false
editorMode: code
expr: ceph_health_detail{name="MON_CLOCK_SKEW"} == 1
fullMetaSearch: false
includeNullMetadata: true
instant: true
intervalMs: 10000
legendFormat: __auto
maxDataPoints: 43200
range: false
refId: A
useBackend: false
refId: A
relativeTimeRange:
from: 300
to: 0
execErrState: Error
for: 1m
isPaused: false
labels:
severity: warning
type: ceph_default
noDataState: OK
title: CephMonClockSkew
uid: ceph-ceph_mon_clock_skew
- folder: ceph-alerts
interval: 10s
name: osd
orgId: 1
rules:
- annotations:
description: '{{ $value | humanize }}% or {{ with query "count(ceph_osd_up ==
0)" }}{{ . | first | value }}{{ end }} of {{ with query "count(ceph_osd_up)"
}}{{ . | first | value }}{{ end }} OSDs are down (>= 10%). The following OSDs
are down: {{- range query "(ceph_osd_up * on(ceph_daemon) group_left(hostname)
ceph_osd_metadata) == 0" }} - {{ .Labels.ceph_daemon }} on {{ .Labels.hostname
}} {{- end }}'
summary: More than 10% of OSDs are down
condition: A
data:
- datasourceUid: ceph
model:
disableTextWrap: false
editorMode: code
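# Percentage of OSDs currently marked down out of all OSDs; fires at 10% or more.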
expr: count(ceph_osd_up == 0) / count(ceph_osd_up) * 100 >= 10
fullMetaSearch: false
includeNullMetadata: true
instant: true
intervalMs: 10000
legendFormat: __auto
maxDataPoints: 43200
range: false
refId: A
useBackend: false
refId: A
relativeTimeRange:
from: 300
to: 0
execErrState: Error
for: 0s
isPaused: false
labels:
oid: 1.3.6.1.4.1.50495.1.2.1.4.1
severity: critical
type: ceph_default
noDataState: OK
title: CephOSDDownHigh
uid: ceph-ceph_osddown_high
- annotations:
description: 'The following OSDs are down: {{- range query "(ceph_osd_up * on(ceph_daemon)
group_left(hostname) ceph_osd_metadata) == 0" }} - {{ .Labels.hostname }}
: {{ .Labels.ceph_daemon }} {{- end }}'
summary: An OSD host is offline
condition: A
data:
- datasourceUid: ceph
model:
disableTextWrap: false
editorMode: code
expr: ceph_health_detail{name="OSD_HOST_DOWN"} == 1
fullMetaSearch: false
includeNullMetadata: true
instant: true
intervalMs: 10000
legendFormat: __auto
maxDataPoints: 43200
range: false
refId: A
useBackend: false
refId: A
relativeTimeRange:
from: 300
to: 0
execErrState: Error
for: 5m
isPaused: false
labels:
oid: 1.3.6.1.4.1.50495.1.2.1.4.8
severity: warning
type: ceph_default
noDataState: OK
title: CephOSDHostDown
uid: ceph-ceph_osdhost_down
- annotations:
description: '{{ $num := query "count(ceph_osd_up == 0)" | first | value }}{{
$s := "" }}{{ if gt $num 1.0 }}{{ $s = "s" }}{{ end }}{{ $num }} OSD{{ $s
}} down for over 5 minutes. The following OSD{{ $s }} {{ if eq $s "" }}is{{ else
}}are{{ end }} down: {{- range query "(ceph_osd_up * on(ceph_daemon) group_left(hostname)
ceph_osd_metadata) == 0"}} - {{ .Labels.ceph_daemon }} on {{ .Labels.hostname
}} {{- end }}
'
documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#osd-down
summary: An OSD has been marked down
condition: A
data:
- datasourceUid: ceph
model:
disableTextWrap: false
editorMode: code
expr: ceph_health_detail{name="OSD_DOWN"} == 1
fullMetaSearch: false
includeNullMetadata: true
instant: true
intervalMs: 10000
legendFormat: __auto
maxDataPoints: 43200
range: false
refId: A
useBackend: false
refId: A
relativeTimeRange:
from: 300
to: 0
execErrState: Error
for: 5m
isPaused: false
labels:
oid: 1.3.6.1.4.1.50495.1.2.1.4.2
severity: warning
type: ceph_default
noDataState: OK
title: CephOSDDown
uid: ceph-ceph_osddown
- annotations:
description: One or more OSDs have reached the NEARFULL threshold. Use 'ceph
health detail' and 'ceph osd df' to identify the problem. To resolve, add
capacity to the affected OSD's failure domain, restore down/out OSDs, or delete
unwanted data.
documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#osd-nearfull
summary: OSD(s) running low on free space (NEARFULL)
condition: A
data:
- datasourceUid: ceph
model:
disableTextWrap: false
editorMode: code
expr: ceph_health_detail{name="OSD_NEARFULL"} == 1
fullMetaSearch: false
includeNullMetadata: true
instant: true
intervalMs: 10000
legendFormat: __auto
maxDataPoints: 43200
range: false
refId: A
useBackend: false
refId: A
relativeTimeRange:
from: 300
to: 0
execErrState: Error
for: 5m
isPaused: false
labels:
oid: 1.3.6.1.4.1.50495.1.2.1.4.3
severity: warning
type: ceph_default
noDataState: OK
title: CephOSDNearFull
uid: ceph-ceph_osdnear_full
- annotations:
description: An OSD has reached the FULL threshold. Writes to pools that share
the affected OSD will be blocked. Use 'ceph health detail' and 'ceph osd df'
to identify the problem. To resolve, add capacity to the affected OSD's failure
domain, restore down/out OSDs, or delete unwanted data.
documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#osd-full
summary: OSD full, writes blocked
condition: A
data:
- datasourceUid: ceph
model:
disableTextWrap: false
editorMode: code
expr: ceph_health_detail{name="OSD_FULL"} > 0
fullMetaSearch: false
includeNullMetadata: true
instant: true
intervalMs: 10000
legendFormat: __auto
maxDataPoints: 43200
range: false
refId: A
useBackend: false
refId: A
relativeTimeRange:
from: 300
to: 0
execErrState: Error
for: 1m
isPaused: false
labels:
oid: 1.3.6.1.4.1.50495.1.2.1.4.6
severity: critical
type: ceph_default
noDataState: OK
title: CephOSDFull
uid: ceph-ceph_osdfull
- annotations:
description: An OSD has reached the BACKFILL FULL threshold. This will prevent
rebalance operations from completing. Use 'ceph health detail' and 'ceph osd
df' to identify the problem. To resolve, add capacity to the affected OSD's
failure domain, restore down/out OSDs, or delete unwanted data.
documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#osd-backfillfull
summary: OSD(s) too full for backfill operations
condition: A
data:
- datasourceUid: ceph
model:
disableTextWrap: false
editorMode: code
expr: ceph_health_detail{name="OSD_BACKFILLFULL"} > 0
fullMetaSearch: false
includeNullMetadata: true
instant: true
intervalMs: 10000
legendFormat: __auto
maxDataPoints: 43200
range: false
refId: A
useBackend: false
refId: A
relativeTimeRange:
from: 300
to: 0
execErrState: Error
for: 1m
isPaused: false
labels:
severity: warning
type: ceph_default
noDataState: OK
title: CephOSDBackfillFull
uid: ceph-ceph_osdbackfill_full
- annotations:
description: Reads from an OSD have used a secondary PG to return data to the
client, indicating a potential failing drive.
documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#osd-too-many-repairs
summary: OSD reports a high number of read errors
condition: A
data:
- datasourceUid: ceph
model:
disableTextWrap: false
editorMode: code
expr: ceph_health_detail{name="OSD_TOO_MANY_REPAIRS"} == 1
fullMetaSearch: false
includeNullMetadata: true
instant: true
intervalMs: 10000
legendFormat: __auto
maxDataPoints: 43200
range: false
refId: A
useBackend: false
refId: A
relativeTimeRange:
from: 300
to: 0
execErrState: Error
for: 30s
isPaused: false
labels:
severity: warning
type: ceph_default
noDataState: OK
title: CephOSDTooManyRepairs
uid: ceph-ceph_osdtoo_many_repairs
- annotations:
description: OSD heartbeats on the cluster's 'public' network (frontend) are
running slow. Investigate the network for latency or loss issues. Use 'ceph
health detail' to show the affected OSDs.
summary: Network issues delaying OSD heartbeats (public network)
condition: A
data:
- datasourceUid: ceph
model:
disableTextWrap: false
editorMode: code
expr: ceph_health_detail{name="OSD_SLOW_PING_TIME_FRONT"} == 1
fullMetaSearch: false
includeNullMetadata: true
instant: true
intervalMs: 10000
legendFormat: __auto
maxDataPoints: 43200
range: false
refId: A
useBackend: false
refId: A
relativeTimeRange:
from: 300
to: 0
execErrState: Error
for: 1m
isPaused: false
labels:
severity: warning
type: ceph_default
noDataState: OK
title: CephOSDTimeoutsPublicNetwork
uid: ceph-ceph_osdtimeouts_public_network
- annotations:
description: OSD heartbeats on the cluster's 'cluster' network (backend) are
slow. Investigate the network for latency issues on this subnet. Use 'ceph
health detail' to show the affected OSDs.
summary: Network issues delaying OSD heartbeats (cluster network)
condition: A
data:
- datasourceUid: ceph
model:
disableTextWrap: false
editorMode: code
expr: ceph_health_detail{name="OSD_SLOW_PING_TIME_BACK"} == 1
fullMetaSearch: false
includeNullMetadata: true
instant: true
intervalMs: 10000
legendFormat: __auto
maxDataPoints: 43200
range: false
refId: A
useBackend: false
refId: A
relativeTimeRange:
from: 300
to: 0
execErrState: Error
for: 1m
isPaused: false
labels:
severity: warning
type: ceph_default
noDataState: OK
title: CephOSDTimeoutsClusterNetwork
uid: ceph-ceph_osdtimeouts_cluster_network
- annotations:
description: One or more OSDs have an internal inconsistency between metadata
and the size of the device. This could lead to the OSD(s) crashing in the future.
You should redeploy the affected OSDs.
documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#bluestore-disk-size-mismatch
summary: OSD size inconsistency error
condition: A
data:
- datasourceUid: ceph
model:
disableTextWrap: false
editorMode: code
expr: ceph_health_detail{name="BLUESTORE_DISK_SIZE_MISMATCH"} == 1
fullMetaSearch: false
includeNullMetadata: true
instant: true
intervalMs: 10000
legendFormat: __auto
maxDataPoints: 43200
range: false
refId: A
useBackend: false
refId: A
relativeTimeRange:
from: 300
to: 0
execErrState: Error
for: 1m
isPaused: false
labels:
severity: warning
type: ceph_default
noDataState: OK
title: CephOSDInternalDiskSizeMismatch
uid: ceph-ceph_osdinternal_disk_size_mismatch
- annotations:
description: The device health module has determined that one or more devices
will fail soon. To review device status use 'ceph device ls'. To show a specific
device use 'ceph device info <dev id>'. Mark the OSD out so that data may
migrate to other OSDs. Once the OSD has drained, destroy the OSD, replace
the device, and redeploy the OSD.
documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#id2
summary: Device(s) predicted to fail soon
condition: A
data:
- datasourceUid: ceph
model:
disableTextWrap: false
editorMode: code
expr: ceph_health_detail{name="DEVICE_HEALTH"} == 1
fullMetaSearch: false
includeNullMetadata: true
instant: true
intervalMs: 10000
legendFormat: __auto
maxDataPoints: 43200
range: false
refId: A
useBackend: false
refId: A
relativeTimeRange:
from: 300
to: 0
execErrState: Error
for: 1m
isPaused: false
labels:
severity: warning
type: ceph_default
noDataState: OK
title: CephDeviceFailurePredicted
uid: ceph-ceph_device_failure_predicted
- annotations:
description: The device health module has determined that devices predicted
to fail cannot be remediated automatically, since too many OSDs would be
removed from the cluster to ensure performance and availability. Prevent
data integrity issues by adding new OSDs so that data may be relocated.
documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#device-health-toomany
summary: Too many devices are predicted to fail, unable to resolve
condition: A
data:
- datasourceUid: ceph
model:
disableTextWrap: false
editorMode: code
expr: ceph_health_detail{name="DEVICE_HEALTH_TOOMANY"} == 1
fullMetaSearch: false
includeNullMetadata: true
instant: true
intervalMs: 10000
legendFormat: __auto
maxDataPoints: 43200
range: false
refId: A
useBackend: false
refId: A
relativeTimeRange:
from: 300
to: 0
execErrState: Error
for: 1m
isPaused: false
labels:
oid: 1.3.6.1.4.1.50495.1.2.1.4.7
severity: critical
type: ceph_default
noDataState: OK
title: CephDeviceFailurePredictionTooHigh
uid: ceph-ceph_device_failure_prediction_too_high
- annotations:
description: "The device health module has determined that one or more devices\
\ will fail soon, but the normal process of relocating the data on the device\
\ to other OSDs in the cluster is blocked. \nEnsure that the cluster has available\
\ free space. It may be necessary to add capacity to the cluster to allow\
\ data from the failing device to successfully migrate, or to enable the balancer."
documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#device-health-in-use
summary: Device failure is predicted, but unable to relocate data
condition: A
data:
- datasourceUid: ceph
model:
disableTextWrap: false
editorMode: code
expr: ceph_health_detail{name="DEVICE_HEALTH_IN_USE"} == 1
fullMetaSearch: false
includeNullMetadata: true
instant: true
intervalMs: 10000
legendFormat: __auto
maxDataPoints: 43200
range: false
refId: A
useBackend: false
refId: A
relativeTimeRange:
from: 300
to: 0
execErrState: Error
for: 1m
isPaused: false
labels:
severity: warning
type: ceph_default
noDataState: OK
title: CephDeviceFailureRelocationIncomplete
uid: ceph-ceph_device_failure_relocation_incomplete
- annotations:
description: OSD {{ $labels.ceph_daemon }} on {{ $labels.hostname }} was marked
down and back up {{ $value | humanize }} times per minute, averaged over the last 5 minutes.
This may indicate a network issue (latency, packet loss, MTU mismatch) on
the cluster network, or the public network if no cluster network is deployed.
Check the network stats on the listed host(s).
documentation: https://docs.ceph.com/en/latest/rados/troubleshooting/troubleshooting-osd#flapping-osds
summary: Network issues are causing OSDs to flap (mark each other down)
condition: A
data:
- datasourceUid: ceph
model:
disableTextWrap: false
editorMode: code
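# rate() over the ceph_osd_up gauge for 5 minutes, scaled to a per-minute figure;
# values above 1 mean the up/down state changed more than once per minute. The join
# with ceph_osd_metadata attaches the hostname label used in the description.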
expr: (rate(ceph_osd_up[5m]) * on(ceph_daemon) group_left(hostname) ceph_osd_metadata)
* 60 > 1
fullMetaSearch: false
includeNullMetadata: true
instant: true
intervalMs: 10000
legendFormat: __auto
maxDataPoints: 43200
range: false
refId: A
useBackend: false
refId: A
relativeTimeRange:
from: 300
to: 0
execErrState: Error
for: 0s
isPaused: false
labels:
oid: 1.3.6.1.4.1.50495.1.2.1.4.4
severity: warning
type: ceph_default
noDataState: OK
title: CephOSDFlapping
uid: ceph-ceph_osdflapping
- annotations:
description: An OSD has encountered read errors, but the OSD has recovered by
retrying the reads. This may indicate an issue with hardware or the kernel.
documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#bluestore-spurious-read-errors
summary: Device read errors detected
condition: A
data:
- datasourceUid: ceph
model:
disableTextWrap: false
editorMode: code
expr: ceph_health_detail{name="BLUESTORE_SPURIOUS_READ_ERRORS"} == 1
fullMetaSearch: false
includeNullMetadata: true
instant: true
intervalMs: 10000
legendFormat: __auto
maxDataPoints: 43200
range: false
refId: A
useBackend: false
refId: A
relativeTimeRange:
from: 300
to: 0
execErrState: Error
for: 30s
isPaused: false
labels:
severity: warning
type: ceph_default
noDataState: OK
title: CephOSDReadErrors
uid: ceph-ceph_osdread_errors
- annotations:
description: OSD {{ $labels.ceph_daemon }} on {{ $labels.hostname }} deviates
by more than 30% from average PG count.
summary: PGs are not balanced across OSDs
condition: A
data:
- datasourceUid: ceph
model:
disableTextWrap: false
editorMode: code
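# Relative deviation of each OSD's PG count from the job-wide average PG count;
# OSDs deviating by more than 30% match. The final join with ceph_osd_metadata
# attaches the hostname label.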
expr: "abs(\n ((ceph_osd_numpg > 0) - on (job) group_left avg(ceph_osd_numpg\
\ > 0) by (job)) /\n on (job) group_left avg(ceph_osd_numpg > 0) by (job)\n\
) * on (ceph_daemon) group_left(hostname) ceph_osd_metadata > 0.30\n"
fullMetaSearch: false
includeNullMetadata: true
instant: true
intervalMs: 10000
legendFormat: __auto
maxDataPoints: 43200
range: false
refId: A
useBackend: false
refId: A
relativeTimeRange:
from: 300
to: 0
execErrState: Error
for: 5m
isPaused: false
labels:
oid: 1.3.6.1.4.1.50495.1.2.1.4.5
severity: warning
type: ceph_default
noDataState: OK
title: CephPGImbalance
uid: ceph-ceph_pgimbalance
- folder: ceph-alerts
interval: 10s
name: mds
orgId: 1
rules:
- annotations:
description: Filesystem metadata has been corrupted. Data may be inaccessible.
Analyze metrics from the MDS daemon admin socket, or escalate to support.
documentation: https://docs.ceph.com/en/latest/cephfs/health-messages#cephfs-health-messages
summary: CephFS filesystem is damaged
condition: A
data:
- datasourceUid: ceph
model:
disableTextWrap: false
editorMode: code
expr: ceph_health_detail{name="MDS_DAMAGE"} > 0
fullMetaSearch: false
includeNullMetadata: true
instant: true
intervalMs: 10000
legendFormat: __auto
maxDataPoints: 43200
range: false
refId: A
useBackend: false
refId: A
relativeTimeRange:
from: 300
to: 0
execErrState: Error
for: 1m
isPaused: false
labels:
oid: 1.3.6.1.4.1.50495.1.2.1.5.1
severity: critical
type: ceph_default
noDataState: OK
title: CephFilesystemDamaged
uid: ceph-ceph_filesystem_damaged
- annotations:
description: All MDS ranks are unavailable. The MDS daemons managing metadata
are down, rendering the filesystem offline.
documentation: https://docs.ceph.com/en/latest/cephfs/health-messages/#mds-all-down
summary: CephFS filesystem is offline
condition: A
data:
- datasourceUid: ceph
model:
disableTextWrap: false
editorMode: code
expr: ceph_health_detail{name="MDS_ALL_DOWN"} > 0
fullMetaSearch: false
includeNullMetadata: true
instant: true
intervalMs: 10000
legendFormat: __auto
maxDataPoints: 43200
range: false
refId: A
useBackend: false
refId: A
relativeTimeRange:
from: 300
to: 0
execErrState: Error
for: 1m
isPaused: false
labels:
oid: 1.3.6.1.4.1.50495.1.2.1.5.3
severity: critical
type: ceph_default
noDataState: OK
title: CephFilesystemOffline
uid: ceph-ceph_filesystem_offline
- annotations:
description: One or more metadata daemons (MDS ranks) are failed or in a damaged
state. At best the filesystem is partially available, at worst the filesystem
is completely unusable.
documentation: https://docs.ceph.com/en/latest/cephfs/health-messages/#fs-degraded
summary: CephFS filesystem is degraded
condition: A
data:
- datasourceUid: ceph
model:
disableTextWrap: false
editorMode: code
expr: ceph_health_detail{name="FS_DEGRADED"} > 0
fullMetaSearch: false
includeNullMetadata: true
instant: true
intervalMs: 10000
legendFormat: __auto
maxDataPoints: 43200
range: false
refId: A
useBackend: false
refId: A
relativeTimeRange:
from: 300
to: 0
execErrState: Error
for: 1m
isPaused: false
labels:
oid: 1.3.6.1.4.1.50495.1.2.1.5.4
severity: critical
type: ceph_default
noDataState: OK
title: CephFilesystemDegraded
uid: ceph-ceph_filesystem_degraded
- annotations:
description: The filesystem's 'max_mds' setting defines the number of MDS ranks
in the filesystem. The current number of active MDS daemons is less than this
value.
documentation: https://docs.ceph.com/en/latest/cephfs/health-messages/#mds-up-less-than-max
summary: Ceph MDS daemon count is lower than configured
condition: A
data:
- datasourceUid: ceph
model:
disableTextWrap: false
editorMode: code
expr: ceph_health_detail{name="MDS_UP_LESS_THAN_MAX"} > 0
fullMetaSearch: false
includeNullMetadata: true
instant: true
intervalMs: 10000
legendFormat: __auto
maxDataPoints: 43200
range: false
refId: A
useBackend: false
refId: A
relativeTimeRange:
from: 300
to: 0
execErrState: Error
for: 1m
isPaused: false
labels:
severity: warning
type: ceph_default
noDataState: OK
title: CephFilesystemMDSRanksLow
uid: ceph-ceph_filesystem_mdsranks_low
- annotations:
description: The current number of standby daemons is less than the minimum
required by standby_count_wanted. Adjust the standby count or increase the number
of MDS daemons.
documentation: https://docs.ceph.com/en/latest/cephfs/health-messages/#mds-insufficient-standby
summary: Ceph filesystem standby daemons too few
condition: A
data:
- datasourceUid: ceph
model:
disableTextWrap: false
editorMode: code
expr: ceph_health_detail{name="MDS_INSUFFICIENT_STANDBY"} > 0
fullMetaSearch: false
includeNullMetadata: true
instant: true
intervalMs: 10000
legendFormat: __auto
maxDataPoints: 43200
range: false
refId: A
useBackend: false
refId: A
relativeTimeRange:
from: 300
to: 0
execErrState: Error
for: 1m
isPaused: false
labels:
severity: warning
type: ceph_default
noDataState: OK
title: CephFilesystemInsufficientStandby
uid: ceph-ceph_filesystem_insufficient_standby
- annotations:
description: An MDS daemon has failed, leaving only one active rank and no available
standby. Investigate the cause of the failure or add a standby MDS.
documentation: https://docs.ceph.com/en/latest/cephfs/health-messages/#fs-with-failed-mds
summary: MDS daemon failed, no further standby available
condition: A
data:
- datasourceUid: ceph
model:
disableTextWrap: false
editorMode: code
expr: ceph_health_detail{name="FS_WITH_FAILED_MDS"} > 0
fullMetaSearch: false
includeNullMetadata: true
instant: true
intervalMs: 10000
legendFormat: __auto
maxDataPoints: 43200
range: false
refId: A
useBackend: false
refId: A
relativeTimeRange:
from: 300
to: 0
execErrState: Error
for: 1m
isPaused: false
labels:
oid: 1.3.6.1.4.1.50495.1.2.1.5.5
severity: critical
type: ceph_default
noDataState: OK
title: CephFilesystemFailureNoStandby
uid: ceph-ceph_filesystem_failure_no_standby
- annotations:
description: The filesystem has switched to READ ONLY due to an unexpected error
when writing to the metadata pool. Either analyze the output from the MDS
daemon admin socket, or escalate to support.
documentation: https://docs.ceph.com/en/latest/cephfs/health-messages#cephfs-health-messages
summary: CephFS filesystem in read only mode due to write error(s)
condition: A
data:
- datasourceUid: ceph
model:
disableTextWrap: false
editorMode: code
expr: ceph_health_detail{name="MDS_HEALTH_READ_ONLY"} > 0
fullMetaSearch: false
includeNullMetadata: true
instant: true
intervalMs: 10000
legendFormat: __auto
maxDataPoints: 43200
range: false
refId: A
useBackend: false
refId: A
relativeTimeRange:
from: 300
to: 0
execErrState: Error
for: 1m
isPaused: false
labels:
oid: 1.3.6.1.4.1.50495.1.2.1.5.2
severity: critical
type: ceph_default
noDataState: OK
title: CephFilesystemReadOnly
uid: ceph-ceph_filesystem_read_only
- folder: ceph-alerts
interval: 10s
name: mgr
orgId: 1
rules:
- annotations:
description: One or more mgr modules have crashed and have yet to be acknowledged
by an administrator. A crashed module may impact functionality within the
cluster. Use the 'ceph crash' command to determine which module has failed,
and archive it to acknowledge the failure.
documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#recent-mgr-module-crash
summary: A manager module has recently crashed
condition: A
data:
- datasourceUid: ceph
model:
disableTextWrap: false
editorMode: code
expr: ceph_health_detail{name="RECENT_MGR_MODULE_CRASH"} == 1
fullMetaSearch: false
includeNullMetadata: true
instant: true
intervalMs: 10000
legendFormat: __auto
maxDataPoints: 43200
range: false
refId: A
useBackend: false
refId: A
relativeTimeRange:
from: 300
to: 0
execErrState: Error
for: 5m
isPaused: false
labels:
oid: 1.3.6.1.4.1.50495.1.2.1.6.1
severity: critical
type: ceph_default
noDataState: OK
title: CephMgrModuleCrash
uid: ceph-ceph_mgr_module_crash
- annotations:
description: The mgr/prometheus module at {{ $labels.instance }} is unreachable.
This could mean that the module has been disabled or the mgr daemon itself
is down. Without the mgr/prometheus module, metrics and alerts will no longer
function. Open a shell to an admin node or toolbox pod and use 'ceph -s' to
determine whether the mgr is active. If the mgr is not active, restart
it, otherwise you can determine module status with 'ceph mgr module ls'. If
it is not listed as enabled, enable it with 'ceph mgr module enable prometheus'.
summary: The mgr/prometheus module is not available
condition: A
data:
- datasourceUid: ceph
model:
disableTextWrap: false
editorMode: code
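# Note: this assumes the Prometheus scrape job for the mgr exporter is named "ceph";
# adjust the job label if your scrape config uses a different job name.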
expr: up{job="ceph"} == 0
fullMetaSearch: false
includeNullMetadata: true
instant: true
intervalMs: 10000
legendFormat: __auto
maxDataPoints: 43200
range: false
refId: A
useBackend: false
refId: A
relativeTimeRange:
from: 300
to: 0
execErrState: Error
for: 1m
isPaused: false
labels:
oid: 1.3.6.1.4.1.50495.1.2.1.6.2
severity: critical
type: ceph_default
noDataState: OK
title: CephMgrPrometheusModuleInactive
uid: ceph-ceph_mgr_prometheus_module_inactive
- folder: ceph-alerts
interval: 10s
name: pgs
orgId: 1
rules:
- annotations:
description: '{{ $value }} PGs have been inactive for more than 5 minutes in
pool {{ $labels.name }}. Inactive placement groups are not able to serve read/write
requests.'
summary: One or more placement groups are inactive
condition: A
data:
- datasourceUid: ceph
model:
disableTextWrap: false
editorMode: code
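# Per-pool count of placement groups that are not active (ceph_pg_total minus
# ceph_pg_active), joined with ceph_pool_metadata so the pool name is available.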
expr: ceph_pool_metadata * on(pool_id,instance) group_left() (ceph_pg_total
- ceph_pg_active) > 0
fullMetaSearch: false
includeNullMetadata: true
instant: true
intervalMs: 10000
legendFormat: __auto
maxDataPoints: 43200
range: false
refId: A
useBackend: false
refId: A
relativeTimeRange:
from: 300
to: 0
execErrState: Error
for: 5m
isPaused: false
labels:
oid: 1.3.6.1.4.1.50495.1.2.1.7.1
severity: critical
type: ceph_default
noDataState: OK
title: CephPGsInactive
uid: ceph-ceph_pgs_inactive
- annotations:
description: '{{ $value }} PGs have been unclean for more than 15 minutes in
pool {{ $labels.name }}. Unclean PGs have not recovered from a previous failure.'
summary: One or more placement groups are marked unclean
condition: A
data:
- datasourceUid: ceph
model:
disableTextWrap: false
editorMode: code
expr: ceph_pool_metadata * on(pool_id,instance) group_left() (ceph_pg_total
- ceph_pg_clean) > 0
fullMetaSearch: false
includeNullMetadata: true
instant: true
intervalMs: 10000
legendFormat: __auto
maxDataPoints: 43200
range: false
refId: A
useBackend: false
refId: A
relativeTimeRange:
from: 300
to: 0
execErrState: Error
for: 15m
isPaused: false
labels:
oid: 1.3.6.1.4.1.50495.1.2.1.7.2
severity: warning
type: ceph_default
noDataState: OK
title: CephPGsUnclean
uid: ceph-ceph_pgs_unclean
- annotations:
description: During data consistency checks (scrub), at least one PG has been
flagged as being damaged or inconsistent. Check to see which PG is affected,
and attempt a manual repair if necessary. To list problematic placement groups,
use 'rados list-inconsistent-pg <pool>'. To repair PGs use the 'ceph pg repair
<pg_num>' command.
documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#pg-damaged
summary: Placement group damaged, manual intervention needed
condition: A
data:
- datasourceUid: ceph
model:
disableTextWrap: false
editorMode: code
expr: ceph_health_detail{name=~"PG_DAMAGED|OSD_SCRUB_ERRORS"} == 1
fullMetaSearch: false
includeNullMetadata: true
instant: true
intervalMs: 10000
legendFormat: __auto
maxDataPoints: 43200
range: false
refId: A
useBackend: false
refId: A
relativeTimeRange:
from: 300
to: 0
execErrState: Error
for: 5m
isPaused: false
labels:
oid: 1.3.6.1.4.1.50495.1.2.1.7.4
severity: critical
type: ceph_default
noDataState: OK
title: CephPGsDamaged
uid: ceph-ceph_pgs_damaged
- annotations:
description: Data redundancy is at risk since one or more OSDs are at or above
the 'full' threshold. Add more capacity to the cluster, restore down/out OSDs,
or delete unwanted data.
documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#pg-recovery-full
summary: OSDs are too full for recovery
condition: A
data:
- datasourceUid: ceph
model:
disableTextWrap: false
editorMode: code
expr: ceph_health_detail{name="PG_RECOVERY_FULL"} == 1
fullMetaSearch: false
includeNullMetadata: true
instant: true
intervalMs: 10000
legendFormat: __auto
maxDataPoints: 43200
range: false
refId: A
useBackend: false
refId: A
relativeTimeRange:
from: 300
to: 0
execErrState: Error
for: 1m
isPaused: false
labels:
oid: 1.3.6.1.4.1.50495.1.2.1.7.5
severity: critical
type: ceph_default
noDataState: OK
title: CephPGRecoveryAtRisk
uid: ceph-ceph_pgrecovery_at_risk
- annotations:
description: Data availability is reduced, impacting the cluster's ability to
service I/O. One or more placement groups (PGs) are in a state that blocks
I/O.
documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#pg-availability
summary: PG is unavailable, blocking I/O
condition: A
data:
- datasourceUid: ceph
model:
disableTextWrap: false
editorMode: code
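# Fires on PG_AVAILABILITY only while OSD_DOWN is not also set: subtracting the
# scalar OSD_DOWN value suppresses this alert when the more specific OSD-down
# alerts already cover the situation.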
expr: ((ceph_health_detail{name="PG_AVAILABILITY"} == 1) - scalar(ceph_health_detail{name="OSD_DOWN"}))
== 1
fullMetaSearch: false
includeNullMetadata: true
instant: true
intervalMs: 10000
legendFormat: __auto
maxDataPoints: 43200
range: false
refId: A
useBackend: false
refId: A
relativeTimeRange:
from: 300
to: 0
execErrState: Error
for: 1m
isPaused: false
labels:
oid: 1.3.6.1.4.1.50495.1.2.1.7.3
severity: critical
type: ceph_default
noDataState: OK
title: CephPGUnavilableBlockingIO
uid: ceph-ceph_pgunavilable_blocking_io
- annotations:
description: Data redundancy may be at risk due to lack of free space within
the cluster. One or more OSDs have reached the 'backfillfull' threshold. Add
more capacity, or delete unwanted data.
documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#pg-backfill-full
summary: Backfill operations are blocked due to lack of free space
condition: A
data:
- datasourceUid: ceph
model:
disableTextWrap: false
editorMode: code
expr: ceph_health_detail{name="PG_BACKFILL_FULL"} == 1
fullMetaSearch: false
includeNullMetadata: true
instant: true
intervalMs: 10000
legendFormat: __auto
maxDataPoints: 43200
range: false
refId: A
useBackend: false
refId: A
relativeTimeRange:
from: 300
to: 0
execErrState: Error
for: 1m
isPaused: false
labels:
oid: 1.3.6.1.4.1.50495.1.2.1.7.6
severity: critical
type: ceph_default
noDataState: OK
title: CephPGBackfillAtRisk
uid: ceph-ceph_pgbackfill_at_risk
- annotations:
description: 'One or more PGs have not been scrubbed recently. Scrubs check
metadata integrity, protecting against bit-rot. They check that metadata is
consistent across data replicas. When PGs miss their scrub interval, it may
indicate that the scrub window is too small, or PGs were not in a ''clean''
state during the scrub window. You can manually initiate a scrub with: ceph
pg scrub <pgid>'
documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#pg-not-scrubbed
summary: Placement group(s) have not been scrubbed
condition: A
data:
- datasourceUid: ceph
model:
disableTextWrap: false
editorMode: code
expr: ceph_health_detail{name="PG_NOT_SCRUBBED"} == 1
fullMetaSearch: false
includeNullMetadata: true
instant: true
intervalMs: 10000
legendFormat: __auto
maxDataPoints: 43200
range: false
refId: A
useBackend: false
refId: A
relativeTimeRange:
from: 300
to: 0
execErrState: Error
for: 5m
isPaused: false
labels:
severity: warning
type: ceph_default
noDataState: OK
title: CephPGNotScrubbed
uid: ceph-ceph_pgnot_scrubbed
- annotations:
description: "The number of placement groups per OSD is too high (exceeds the\
\ mon_max_pg_per_osd setting).\n Check that the pg_autoscaler has not been\
\ disabled for any pools with 'ceph osd pool autoscale-status', and that the\
\ profile selected is appropriate. You may also adjust the target_size_ratio\
\ of a pool to guide the autoscaler based on the expected relative size of\
\ the pool ('ceph osd pool set cephfs.cephfs.meta target_size_ratio .1') or\
\ set the pg_autoscaler mode to 'warn' and adjust pg_num appropriately for\
\ one or more pools."
documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks/#too-many-pgs
summary: Placement groups per OSD is too high
condition: A
data:
- datasourceUid: ceph
model:
disableTextWrap: false
editorMode: code
expr: ceph_health_detail{name="TOO_MANY_PGS"} == 1
fullMetaSearch: false
includeNullMetadata: true
instant: true
intervalMs: 10000
legendFormat: __auto
maxDataPoints: 43200
range: false
refId: A
useBackend: false
refId: A
relativeTimeRange:
from: 300
to: 0
execErrState: Error
for: 1m
isPaused: false
labels:
severity: warning
type: ceph_default
noDataState: OK
title: CephPGsHighPerOSD
uid: ceph-ceph_pgs_high_per_osd
- annotations:
description: One or more PGs have not been deep scrubbed recently. Deep scrubs
protect against bit-rot. They compare data replicas to ensure consistency.
When PGs miss their deep scrub interval, it may indicate that the window is
too small or PGs were not in a 'clean' state during the deep-scrub window.
documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#pg-not-deep-scrubbed
summary: Placement group(s) have not been deep scrubbed
condition: A
data:
- datasourceUid: ceph
model:
disableTextWrap: false
editorMode: code
expr: ceph_health_detail{name="PG_NOT_DEEP_SCRUBBED"} == 1
fullMetaSearch: false
includeNullMetadata: true
instant: true
intervalMs: 10000
legendFormat: __auto
maxDataPoints: 43200
range: false
refId: A
useBackend: false
refId: A
relativeTimeRange:
from: 300
to: 0
execErrState: Error
for: 5m
isPaused: false
labels:
severity: warning
type: ceph_default
noDataState: OK
title: CephPGNotDeepScrubbed
uid: ceph-ceph_pgnot_deep_scrubbed
- folder: ceph-alerts
interval: 10s
name: nodes
orgId: 1
rules:
- annotations:
description: 'Root volume is dangerously full: {{ $value | humanize }}% free.'
summary: Root filesystem is dangerously full
condition: A
data:
- datasourceUid: ceph
model:
disableTextWrap: false
editorMode: code
expr: node_filesystem_avail_bytes{mountpoint="/"} / node_filesystem_size_bytes{mountpoint="/"}
* 100 < 5
fullMetaSearch: false
includeNullMetadata: true
instant: true
intervalMs: 10000
legendFormat: __auto
maxDataPoints: 43200
range: false
refId: A
useBackend: false
refId: A
relativeTimeRange:
from: 300
to: 0
execErrState: Error
for: 5m
isPaused: false
labels:
oid: 1.3.6.1.4.1.50495.1.2.1.8.1
severity: critical
type: ceph_default
noDataState: OK
title: CephNodeRootFilesystemFull
uid: ceph-ceph_node_root_filesystem_full
- annotations:
description: Node {{ $labels.instance }} experiences packet drop > 0.5% and >
10 packets/s on interface {{ $labels.device }}.
summary: One or more NICs reports packet drops
condition: A
data:
- datasourceUid: ceph
model:
disableTextWrap: false
editorMode: code
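# Both conditions must hold: the drop ratio over the last minute is at least 0.5%
# AND the absolute drop rate is at least 10 packets/s; loopback is excluded.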
expr: "(\n rate(node_network_receive_drop_total{device!=\"lo\"}[1m]) +\n\
\ rate(node_network_transmit_drop_total{device!=\"lo\"}[1m])\n) / (\n \
\ rate(node_network_receive_packets_total{device!=\"lo\"}[1m]) +\n rate(node_network_transmit_packets_total{device!=\"\
lo\"}[1m])\n) >= 0.0050000000000000001 and (\n rate(node_network_receive_drop_total{device!=\"\
lo\"}[1m]) +\n rate(node_network_transmit_drop_total{device!=\"lo\"}[1m])\n\
) >= 10\n"
fullMetaSearch: false
includeNullMetadata: true
instant: true
intervalMs: 10000
legendFormat: __auto
maxDataPoints: 43200
range: false
refId: A
useBackend: false
refId: A
relativeTimeRange:
from: 300
to: 0
execErrState: Error
for: 0s
isPaused: false
labels:
oid: 1.3.6.1.4.1.50495.1.2.1.8.2
severity: warning
type: ceph_default
noDataState: OK
title: CephNodeNetworkPacketDrops
uid: ceph-ceph_node_network_packet_drops
- annotations:
description: Node {{ $labels.instance }} experiences packet errors > 0.01% or
> 10 packets/s on interface {{ $labels.device }}.
summary: One or more NICs reports packet errors
condition: A
data:
- datasourceUid: ceph
model:
disableTextWrap: false
editorMode: code
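# Fires when either the error ratio over the last minute reaches 0.01% OR the
# absolute error rate reaches 10 packets/s; loopback is excluded.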
expr: "(\n rate(node_network_receive_errs_total{device!=\"lo\"}[1m]) +\n\
\ rate(node_network_transmit_errs_total{device!=\"lo\"}[1m])\n) / (\n \
\ rate(node_network_receive_packets_total{device!=\"lo\"}[1m]) +\n rate(node_network_transmit_packets_total{device!=\"\
lo\"}[1m])\n) >= 0.0001 or (\n rate(node_network_receive_errs_total{device!=\"\
lo\"}[1m]) +\n rate(node_network_transmit_errs_total{device!=\"lo\"}[1m])\n\
) >= 10\n"
fullMetaSearch: false
includeNullMetadata: true
instant: true
intervalMs: 10000
legendFormat: __auto
maxDataPoints: 43200
range: false
refId: A
useBackend: false
refId: A
relativeTimeRange:
from: 300
to: 0
execErrState: Error
for: 0s
isPaused: false
labels:
oid: 1.3.6.1.4.1.50495.1.2.1.8.3
severity: warning
type: ceph_default
noDataState: OK
title: CephNodeNetworkPacketErrors
uid: ceph-ceph_node_network_packet_errors
- annotations:
description: Mountpoint {{ $labels.mountpoint }} on {{ $labels.nodename }} will
be full in less than 5 days based on the 48 hour trailing fill rate.
summary: Host filesystem free space is getting low
condition: A
data:
- datasourceUid: ceph
model:
disableTextWrap: false
editorMode: code
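# Linear regression over the last 2 days of node_filesystem_free_bytes, projected
# 5 days ahead (3600 * 24 * 5 seconds); a negative projection means the filesystem
# is expected to fill within 5 days. The join with node_uname_info attaches the
# nodename label.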
expr: predict_linear(node_filesystem_free_bytes{device=~"/.*"}[2d], 3600 *
24 * 5) *on(instance) group_left(nodename) node_uname_info < 0
fullMetaSearch: false
includeNullMetadata: true
instant: true
intervalMs: 10000
legendFormat: __auto
maxDataPoints: 43200
range: false
refId: A
useBackend: false
refId: A
relativeTimeRange:
from: 300
to: 0
execErrState: Error
for: 0s
isPaused: false
labels:
oid: 1.3.6.1.4.1.50495.1.2.1.8.4
severity: warning
type: ceph_default
noDataState: OK
title: CephNodeDiskspaceWarning
uid: ceph-ceph_node_diskspace_warning
- annotations:
description: Node {{ $labels.instance }} has a different MTU size ({{ $value
}}) than the median of devices named {{ $labels.device }}.
summary: MTU settings across Ceph hosts are inconsistent
condition: A
data:
- datasourceUid: ceph
model:
disableTextWrap: false
editorMode: code
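# Compares each up, non-loopback interface's MTU against per-device-name aggregates:
# an interface matches when its MTU sits at the maximum or minimum seen for that
# device name and that extreme differs from the median (0.5 quantile) across hosts.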
expr: node_network_mtu_bytes * (node_network_up{device!="lo"} > 0) == scalar( max
by (device) (node_network_mtu_bytes * (node_network_up{device!="lo"} > 0))
!= quantile by (device) (.5, node_network_mtu_bytes * (node_network_up{device!="lo"}
> 0)) )or node_network_mtu_bytes * (node_network_up{device!="lo"} > 0)
== scalar( min by (device) (node_network_mtu_bytes * (node_network_up{device!="lo"}
> 0)) != quantile by (device) (.5, node_network_mtu_bytes * (node_network_up{device!="lo"}
> 0)) )
fullMetaSearch: false
includeNullMetadata: true
instant: true
intervalMs: 10000
legendFormat: __auto
maxDataPoints: 43200
range: false
refId: A
useBackend: false
refId: A
relativeTimeRange:
from: 300
to: 0
execErrState: Error
for: 0s
isPaused: false
labels:
severity: warning
type: ceph_default
noDataState: OK
title: CephNodeInconsistentMTU
uid: ceph-ceph_node_inconsistent_mtu
- folder: ceph-alerts
interval: 10s
name: pools
orgId: 1
rules:
- annotations:
description: Pool '{{ $labels.name }}' will be full in less than 5 days assuming
the average fill-up rate of the past 48 hours.
summary: Pool growth rate may soon exceed capacity
condition: A
data:
- datasourceUid: ceph
model:
disableTextWrap: false
editorMode: code
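# Projects each pool's percent-used 5 days ahead from the last 2 days of samples;
# fires if the projection reaches 95%. The join with ceph_pool_metadata attaches
# the pool name.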
expr: (predict_linear(ceph_pool_percent_used[2d], 3600 * 24 * 5) * on(pool_id,
instance) group_right() ceph_pool_metadata) >= 95
fullMetaSearch: false
includeNullMetadata: true
instant: true
intervalMs: 10000
legendFormat: __auto
maxDataPoints: 43200
range: false
refId: A
useBackend: false
refId: A
relativeTimeRange:
from: 300
to: 0
execErrState: Error
for: 0s
isPaused: false
labels:
oid: 1.3.6.1.4.1.50495.1.2.1.9.2
severity: warning
type: ceph_default
noDataState: OK
title: CephPoolGrowthWarning
uid: ceph-ceph_pool_growth_warning
- annotations:
description: A pool is approaching the near full threshold, which will prevent
recovery/backfill operations from completing. Consider adding more capacity.
summary: Free space in a pool is too low for recovery/backfill
condition: A
data:
- datasourceUid: ceph
model:
disableTextWrap: false
editorMode: code
expr: ceph_health_detail{name="POOL_BACKFILLFULL"} > 0
fullMetaSearch: false
includeNullMetadata: true
instant: true
intervalMs: 10000
legendFormat: __auto
maxDataPoints: 43200
range: false
refId: A
useBackend: false
refId: A
relativeTimeRange:
from: 300
to: 0
execErrState: Error
for: 0s
isPaused: false
labels:
severity: warning
type: ceph_default
noDataState: OK
title: CephPoolBackfillFull
uid: ceph-ceph_pool_backfill_full
- annotations:
description: A pool has reached its MAX quota, or OSDs supporting the pool have
reached the FULL threshold. Until this is resolved, writes to the pool will
be blocked. Pool Breakdown (top 5) {{- range query "topk(5, sort_desc(ceph_pool_percent_used
* on(pool_id) group_right ceph_pool_metadata))" }} - {{ .Labels.name }} at
{{ .Value }}% {{- end }} Increase the pool's quota, or add capacity to the
cluster first, then increase the pool's quota (e.g. ceph osd pool set quota
<pool_name> max_bytes <bytes>).
documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#pool-full
summary: Pool is full - writes are blocked
condition: A
data:
- datasourceUid: ceph
model:
disableTextWrap: false
editorMode: code
expr: ceph_health_detail{name="POOL_FULL"} > 0
fullMetaSearch: false
includeNullMetadata: true
instant: true
intervalMs: 10000
legendFormat: __auto
maxDataPoints: 43200
range: false
refId: A
useBackend: false
refId: A
relativeTimeRange:
from: 300
to: 0
execErrState: Error
for: 1m
isPaused: false
labels:
oid: 1.3.6.1.4.1.50495.1.2.1.9.1
severity: critical
type: ceph_default
noDataState: OK
title: CephPoolFull
uid: ceph-ceph_pool_full
- annotations:
description: A pool has exceeded the warning (percent full) threshold, or OSDs
supporting the pool have reached the NEARFULL threshold. Writes may continue,
but you are at risk of the pool going read-only if more capacity isn't made
available. Determine the affected pool with 'ceph df detail', looking at QUOTA
BYTES and STORED. Increase the pool's quota, or add capacity to the cluster
first, then increase the pool's quota (e.g. ceph osd pool set quota <pool_name>
max_bytes <bytes>). Also ensure that the balancer is active.
summary: One or more Ceph pools are nearly full
condition: A
data:
- datasourceUid: ceph
model:
disableTextWrap: false
editorMode: code
expr: ceph_health_detail{name="POOL_NEAR_FULL"} > 0
fullMetaSearch: false
includeNullMetadata: true
instant: true
intervalMs: 10000
legendFormat: __auto
maxDataPoints: 43200
range: false
refId: A
useBackend: false
refId: A
relativeTimeRange:
from: 300
to: 0
execErrState: Error
for: 5m
isPaused: false
labels:
severity: warning
type: ceph_default
noDataState: OK
title: CephPoolNearFull
uid: ceph-ceph_pool_near_full
- folder: ceph-alerts
interval: 10s
name: healthchecks
orgId: 1
rules:
- annotations:
description: '{{ $value }} OSD requests are taking too long to process (osd_op_complaint_time
exceeded)'
documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#slow-ops
summary: OSD operations are slow to complete
condition: A
data:
- datasourceUid: ceph
model:
disableTextWrap: false
editorMode: code
expr: ceph_healthcheck_slow_ops > 0
fullMetaSearch: false
includeNullMetadata: true
instant: true
intervalMs: 10000
legendFormat: __auto
maxDataPoints: 43200
range: false
refId: A
useBackend: false
refId: A
relativeTimeRange:
from: 300
to: 0
execErrState: Error
for: 30s
isPaused: false
labels:
severity: warning
type: ceph_default
noDataState: OK
title: CephSlowOps
uid: ceph-ceph_slow_ops
- annotations:
description: '{{ $labels.ceph_daemon }} operations are taking too long to process
(complaint time exceeded)'
documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#slow-ops
summary: '{{ $labels.ceph_daemon }} operations are slow to complete'
condition: A
data:
- datasourceUid: ceph
model:
disableTextWrap: false
editorMode: code
expr: ceph_daemon_health_metrics{type="SLOW_OPS"} > 0
fullMetaSearch: false
includeNullMetadata: true
instant: true
intervalMs: 10000
legendFormat: __auto
maxDataPoints: 43200
range: false
refId: A
useBackend: false
refId: A
relativeTimeRange:
from: 300
to: 0
execErrState: Error
for: 30s
isPaused: false
labels:
severity: warning
type: ceph_default
noDataState: OK
title: CephDaemonSlowOps
uid: ceph-ceph_daemon_slow_ops
- folder: ceph-alerts
interval: 10s
name: cephadm
orgId: 1
rules:
- annotations:
description: The cephadm cluster upgrade process has failed. The cluster remains
in an undetermined state. Please review the cephadm logs to understand the
nature of the issue.
summary: Ceph version upgrade has failed
condition: A
data:
- datasourceUid: ceph
model:
disableTextWrap: false
editorMode: code
expr: ceph_health_detail{name="UPGRADE_EXCEPTION"} > 0
fullMetaSearch: false
includeNullMetadata: true
instant: true
intervalMs: 10000
legendFormat: __auto
maxDataPoints: 43200
range: false
refId: A
useBackend: false
refId: A
relativeTimeRange:
from: 300
to: 0
execErrState: Error
for: 30s
isPaused: false
labels:
oid: 1.3.6.1.4.1.50495.1.2.1.11.2
severity: critical
type: ceph_default
noDataState: OK
title: CephadmUpgradeFailed
uid: ceph-cephadm_upgrade_failed
- annotations:
description: A daemon managed by cephadm is no longer active. Determine which
daemon is down with 'ceph health detail'. You may start daemons with 'ceph
orch daemon start <daemon_id>'.
summary: A Ceph daemon managed by cephadm is down
condition: A
data:
- datasourceUid: ceph
model:
disableTextWrap: false
editorMode: code
expr: ceph_health_detail{name="CEPHADM_FAILED_DAEMON"} > 0
fullMetaSearch: false
includeNullMetadata: true
instant: true
intervalMs: 10000
legendFormat: __auto
maxDataPoints: 43200
range: false
refId: A
useBackend: false
refId: A
relativeTimeRange:
from: 300
to: 0
execErrState: Error
for: 30s
isPaused: false
labels:
oid: 1.3.6.1.4.1.50495.1.2.1.11.1
severity: critical
type: ceph_default
noDataState: OK
title: CephadmDaemonFailed
uid: ceph-cephadm_daemon_failed
- annotations:
description: Cluster management has been paused manually. This prevents the
orchestrator from managing services and performing reconciliation. If this is
not intentional, resume cephadm operations with 'ceph orch resume'.
documentation: https://docs.ceph.com/en/latest/cephadm/operations#cephadm-paused
summary: Orchestration tasks via cephadm are PAUSED
condition: A
data:
- datasourceUid: ceph
model:
disableTextWrap: false
editorMode: code
expr: ceph_health_detail{name="CEPHADM_PAUSED"} > 0
fullMetaSearch: false
includeNullMetadata: true
instant: true
intervalMs: 10000
legendFormat: __auto
maxDataPoints: 43200
range: false
refId: A
useBackend: false
refId: A
relativeTimeRange:
from: 300
to: 0
execErrState: Error
for: 1m
isPaused: false
labels:
severity: warning
type: ceph_default
noDataState: OK
title: CephadmPaused
uid: ceph-cephadm_paused
- folder: ceph-alerts
interval: 10s
name: PrometheusServer
orgId: 1
rules:
- annotations:
description: The Prometheus job that scrapes Ceph is no longer defined. This
effectively means you will have no metrics or alerts for the cluster. Please
review the job definitions in the prometheus.yml file of the Prometheus instance.
summary: The scrape job for Ceph is missing from Prometheus
condition: A
data:
- datasourceUid: ceph
model:
disableTextWrap: false
editorMode: code
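# absent() returns 1 only when no up{job="ceph"} series exists at all; this assumes
# the Ceph scrape job in Prometheus is literally named "ceph".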
expr: absent(up{job="ceph"})
fullMetaSearch: false
includeNullMetadata: true
instant: true
intervalMs: 10000
legendFormat: __auto
maxDataPoints: 43200
range: false
refId: A
useBackend: false
refId: A
relativeTimeRange:
from: 300
to: 0
execErrState: Error
for: 30s
isPaused: false
labels:
oid: 1.3.6.1.4.1.50495.1.2.1.12.1
severity: critical
type: ceph_default
noDataState: OK
title: PrometheusJobMissing
uid: ceph-prometheus_job_missing
- folder: ceph-alerts
interval: 10s
name: rados
orgId: 1
rules:
- annotations:
description: The latest version of a RADOS object cannot be found, even though
all OSDs are up. I/O requests for this object from clients will block (hang).
Resolving this issue may require the object to be rolled back to a prior version
manually, and manually verified.
documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#object-unfound
summary: Object(s) marked UNFOUND
condition: A
data:
- datasourceUid: ceph
model:
disableTextWrap: false
editorMode: code
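# Fires on OBJECT_UNFOUND only when every known OSD is up (the count of up OSDs
# equals the count of OSDs in metadata); while OSDs are still down the unfound
# objects may yet reappear on their own.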
expr: (ceph_health_detail{name="OBJECT_UNFOUND"} == 1) * on() (count(ceph_osd_up
== 1) == bool count(ceph_osd_metadata)) == 1
fullMetaSearch: false
includeNullMetadata: true
instant: true
intervalMs: 10000
legendFormat: __auto
maxDataPoints: 43200
range: false
refId: A
useBackend: false
refId: A
relativeTimeRange:
from: 300
to: 0
execErrState: Error
for: 30s
isPaused: false
labels:
oid: 1.3.6.1.4.1.50495.1.2.1.10.1
severity: critical
type: ceph_default
noDataState: OK
title: CephObjectMissing
uid: ceph-ceph_object_missing
- folder: ceph-alerts
interval: 10s
name: generic
orgId: 1
rules:
- annotations:
description: One or more daemons have crashed recently, and need to be acknowledged.
This notification ensures that software crashes do not go unseen. To acknowledge
a crash, use the 'ceph crash archive <id>' command.
documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks/#recent-crash
summary: One or more Ceph daemons have crashed, and are pending acknowledgement
condition: A
data:
- datasourceUid: ceph
model:
disableTextWrap: false
editorMode: code
expr: ceph_health_detail{name="RECENT_CRASH"} == 1
fullMetaSearch: false
includeNullMetadata: true
instant: true
intervalMs: 10000
legendFormat: __auto
maxDataPoints: 43200
range: false
refId: A
useBackend: false
refId: A
relativeTimeRange:
from: 300
to: 0
execErrState: Error
for: 1m
isPaused: false
labels:
oid: 1.3.6.1.4.1.50495.1.2.1.1.2
severity: critical
type: ceph_default
noDataState: OK
title: CephDaemonCrash
uid: ceph-ceph_daemon_crash