Skip to content

Instantly share code, notes, and snippets.

@hagen1778
Created August 8, 2020 21:10
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save hagen1778/20ba5af021db40569000608992fecb7d to your computer and use it in GitHub Desktop.
Save hagen1778/20ba5af021db40569000608992fecb7d to your computer and use it in GitHub Desktop.
Migrating data from Prometheus to VM: Prometheus rules config
groups:
# Recording rules that derive CPU counts, per-CPU / per-mode usage rates and
# utilization ratios from node_cpu_seconds_total. Evaluated every 10s.
- name: CPU rules
  interval: 10s
  rules:
  # The count of CPUs per node, useful for getting CPU time as a percent of total.
  # Counts the idle-mode series after dropping the cpu and mode labels.
  - record: instance:node_cpus:count
    expr: >
      count without (cpu, mode) (
        node_cpu_seconds_total{mode="idle"}
      )
  # CPU in use by CPU.
  # Computed as 1 minus the 1m idle rate, so the cpu label is kept.
  - record: instance_cpu:node_cpu_seconds_not_idle:rate1m
    expr: >
      sum without (mode) (
        1 - rate(node_cpu_seconds_total{mode="idle"}[1m])
      )
  # CPU in use by mode.
  # Split recording for iowait to avoid reset bugs.
  # The two rules below intentionally share one record name: the first
  # selects mode!="iowait" and the second mode="iowait", so their output
  # series do not overlap.
  - record: instance_mode:node_cpu_seconds:rate1m
    expr: >
      sum without (cpu) (
        rate(node_cpu_seconds_total{mode!="iowait"}[1m])
      )
  # iowait uses deriv() and keeps only positive samples instead of rate().
  - record: instance_mode:node_cpu_seconds:rate1m
    expr: >
      sum without (cpu) (
        deriv(node_cpu_seconds_total{mode="iowait"}[1m]) > 0
      )
  # CPU in use ratio.
  # Average of the per-CPU not-idle rate across all CPUs of an instance.
  - record: instance:node_cpu_utilization:ratio
    expr: >
      avg without (cpu) (
        instance_cpu:node_cpu_seconds_not_idle:rate1m
      )
  # CPU summaries
  # min / avg / max of the instance ratio after dropping fqdn and instance.
  - record: job:node_cpu_utilization:min_ratio
    expr: >
      min without (fqdn, instance) (
        instance:node_cpu_utilization:ratio
      )
  - record: job:node_cpu_utilization:avg_ratio
    expr: >
      avg without (fqdn, instance) (
        instance:node_cpu_utilization:ratio
      )
  - record: job:node_cpu_utilization:max_ratio
    expr: >
      max without (fqdn, instance) (
        instance:node_cpu_utilization:ratio
      )
# Rules for calculating and alerting on long-term node utilization issues.
# Each rule summarises an instance-level ratio over a 300s window as its
# maximum, average or 0.95 quantile. Evaluated every 20s.
- name: Utilization
  interval: 20s
  rules:
  # CPU: summaries of instance:node_cpu_utilization:ratio.
  - record: instance:cpu_utilization:ratio_max
    expr: max_over_time(instance:node_cpu_utilization:ratio[300s])
  - record: instance:cpu_utilization:ratio_avg
    expr: avg_over_time(instance:node_cpu_utilization:ratio[300s])
  - record: instance:cpu_utilization:ratio_q95
    expr: quantile_over_time(0.95, instance:node_cpu_utilization:ratio[300s])
  # Memory: summaries of instance:node_memory_utilization:ratio.
  - record: instance:memory_utilization:ratio_max
    expr: max_over_time(instance:node_memory_utilization:ratio[300s])
  - record: instance:memory_utilization:ratio_avg
    expr: avg_over_time(instance:node_memory_utilization:ratio[300s])
  - record: instance:memory_utilization:ratio_q95
    expr: quantile_over_time(0.95, instance:node_memory_utilization:ratio[300s])
# Memory availability and utilization ratios per instance.
# No interval is set on this group.
- name: Node memory
  rules:
  # Available memory as a fraction of MemTotal. Uses MemAvailable where that
  # series exists; `or` falls back to Buffers + Cached + MemFree + Slab for
  # label sets that have no MemAvailable sample.
  - record: instance:node_memory_available:ratio
    expr: >
      (
        node_memory_MemAvailable_bytes or
        (
          node_memory_Buffers_bytes +
          node_memory_Cached_bytes +
          node_memory_MemFree_bytes +
          node_memory_Slab_bytes
        )
      ) /
      node_memory_MemTotal_bytes
  # Utilization is the complement of the available ratio recorded above.
  - record: instance:node_memory_utilization:ratio
    expr: 1 - instance:node_memory_available:ratio
# Filesystem, disk I/O and instance-count recording rules.
# No interval is set on this group.
- name: Node filesystem rules
  rules:
  # Available / total bytes for devices matching /dev/.+ or tank/dataset.
  - record: instance:node_filesystem_avail:ratio
    expr: node_filesystem_avail_bytes{device=~"(/dev/.+|tank/dataset)"} / node_filesystem_size_bytes{device=~"(/dev/.+|tank/dataset)"}
  # Completed writes / reads (irate over 1m) summed across sd* devices.
  - record: instance:node_disk_writes_completed:irate1m
    expr: sum(irate(node_disk_writes_completed_total{device=~"sd.*"}[1m])) WITHOUT (device)
  - record: instance:node_disk_reads_completed:irate1m
    expr: sum(irate(node_disk_reads_completed_total{device=~"sd.*"}[1m])) WITHOUT (device)
  # Per-node average disk I/O time. The node label is attached by joining on
  # (namespace, pod) with node_namespace_pod:kube_pod_info:.
  # NOTE(review): that series is not recorded by any group visible here; it
  # must come from another rule file - confirm it exists.
  - expr: |-
      avg by (node) (
        irate(node_disk_io_time_seconds_total{job="node-exporter",device=~"nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+"}[1m])
        * on (namespace, pod) group_left(node)
        node_namespace_pod:kube_pod_info:
      )
    record: node:node_disk_utilisation:avg_irate
  # Average weighted disk I/O time across all matched series.
  # The record name is quoted because it starts with ':'.
  - expr: avg(irate(node_disk_io_time_weighted_seconds_total{job="node-exporter",device=~"nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+"}[1m]))
    record: ':node_disk_saturation:avg_irate'
  # Per-node variant of the saturation rule, using the same
  # (namespace, pod) join as the utilisation rule above.
  - expr: |-
      avg by (node) (
        irate(node_disk_io_time_weighted_seconds_total{job="node-exporter",device=~"nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+"}[1m])
        * on (namespace, pod) group_left(node)
        node_namespace_pod:kube_pod_info:
      )
    record: node:node_disk_saturation:avg_irate
  # Used fraction, (size - avail) / size, for ext2/3/4, btrfs, xfs and zfs.
  # The record name is quoted because it ends with ':', which would otherwise
  # be read as a mapping-key indicator.
  - expr: |-
      max by (instance, namespace, pod, device) ((node_filesystem_size_bytes{fstype=~"ext[234]|btrfs|xfs|zfs"}
      - node_filesystem_avail_bytes{fstype=~"ext[234]|btrfs|xfs|zfs"})
      / node_filesystem_size_bytes{fstype=~"ext[234]|btrfs|xfs|zfs"})
    record: 'node:node_filesystem_usage:'
  # Available fraction, avail / size, for the same filesystem types.
  - expr: max by (instance, namespace, pod, device) (node_filesystem_avail_bytes{fstype=~"ext[234]|btrfs|xfs|zfs"} / node_filesystem_size_bytes{fstype=~"ext[234]|btrfs|xfs|zfs"})
    record: 'node:node_filesystem_avail:'
  # Number of targets of job "node" with a non-empty type label that are up,
  # counted after dropping the instance and fqdn labels.
  - record: instance:up:count
    expr: count(up{job="node",type!=""} == 1) WITHOUT (instance, fqdn)
# Per-instance CPU, load, memory, disk and network recording rules.
# No interval is set on this group.
- name: node-exporter.examples
  rules:
  # Number of CPUs per instance: collapse the modes first, then count the
  # remaining per-CPU series.
  - record: instance:node_num_cpu:sum
    expr: |-
      count without (cpu) (
        count without (mode) (
          node_cpu_seconds_total
        )
      )
  # CPU utilisation: 1 minus the average 1m idle rate across CPUs.
  - expr: |-
      1 - avg without (cpu, mode) (
        rate(node_cpu_seconds_total{mode="idle"}[1m])
      )
    record: instance:node_cpu_utilisation:rate1m
  # 1-minute load average divided by the CPU count recorded above.
  - expr: |-
      (
        node_load1
      /
        instance:node_num_cpu:sum
      )
    record: instance:node_load1_per_cpu:ratio
  # Memory utilisation: 1 minus MemAvailable / MemTotal.
  - expr: |-
      1 - (
        node_memory_MemAvailable_bytes
      /
        node_memory_MemTotal_bytes
      )
    record: instance:node_memory_utilisation:ratio
  # Rate of node_vmstat_pgmajfault over 1m.
  - expr: rate(node_vmstat_pgmajfault[1m])
    record: instance:node_vmstat_pgmajfault:rate1m
  # Disk I/O time and weighted I/O time rates, kept per device, for devices
  # matching the listed name prefixes.
  - expr: rate(node_disk_io_time_seconds_total{device=~"nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+"}[1m])
    record: instance_device:node_disk_io_time_seconds:rate1m
  - expr: rate(node_disk_io_time_weighted_seconds_total{device=~"nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+"}[1m])
    record: instance_device:node_disk_io_time_weighted_seconds:rate1m
  # Network byte and drop rates summed over every device except "lo".
  - expr: |-
      sum without (device) (
        rate(node_network_receive_bytes_total{ device!="lo"}[1m])
      )
    record: instance:node_network_receive_bytes_excluding_lo:rate1m
  - expr: |-
      sum without (device) (
        rate(node_network_transmit_bytes_total{device!="lo"}[1m])
      )
    record: instance:node_network_transmit_bytes_excluding_lo:rate1m
  - expr: |-
      sum without (device) (
        rate(node_network_receive_drop_total{device!="lo"}[1m])
      )
    record: instance:node_network_receive_drop_excluding_lo:rate1m
  - expr: |-
      sum without (device) (
        rate(node_network_transmit_drop_total{device!="lo"}[1m])
      )
    record: instance:node_network_transmit_drop_excluding_lo:rate1m
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment