Skip to content

Instantly share code, notes, and snippets.

@mattico
Last active November 15, 2023 16:57
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save mattico/d89172579cd69a4d8b8077c2e4fe8c17 to your computer and use it in GitHub Desktop.
Save mattico/d89172579cd69a4d8b8077c2e4fe8c17 to your computer and use it in GitHub Desktop.
Script for monitoring zpool errors
#!/usr/bin/env python3
import subprocess
import os
import csv
import re
from collections import defaultdict
from datetime import timedelta
# Command whose text output is parsed for per-pool scrub state and permanent errors.
STATUS_CMD = ["zpool", "status", "-v"]
# Stats file that is scanned for the "kcf_ops_failed" counter.
KCF_STATS = "/proc/spl/kstat/kcf/NONAME_provider_stats"
# CSV log that this script both reads back and appends to on every run.
ERROR_LOG = "/var/log/zpool_errors.csv"
def get_uptime():
    """Return the time since boot, formatted as a ``timedelta`` string.

    The value is taken from the first field of ``/proc/uptime`` (seconds).
    """
    with open("/proc/uptime", "r") as uptime_file:
        first_field = uptime_file.readline().split()[0]
    elapsed = timedelta(seconds=float(first_field))
    return str(elapsed)
def get_kcf_ops_failed():
    """Look up the ``kcf_ops_failed`` counter in KCF_STATS.

    The last whitespace-separated field of the first line that mentions the
    counter is returned as an int; 0 is returned when no line mentions it.
    """
    with open(KCF_STATS, "r") as stats_file:
        candidates = (entry for entry in stats_file if "kcf_ops_failed" in entry)
        first_hit = next(candidates, None)
    if first_hit is None:
        return 0  # counter not present in the stats file
    return int(first_hit.split()[-1])
def get_recorded_kcf_ops_failed():
    """Return the most recent KCF_Ops_Failed value stored in ERROR_LOG.

    Every data row whose fifth column is non-empty overwrites the result, so
    the value from the last such row wins.  Returns 0 when the log is missing,
    is zero-length, or holds no value yet.

    Raises:
        ValueError: if a non-empty fifth column is not an integer.
    """
    last_kcf_value = 0  # Default to 0 if not recorded before
    if not os.path.exists(ERROR_LOG):
        return last_kcf_value
    with open(ERROR_LOG, "r", newline="") as file:
        reader = csv.reader(file)
        next(reader, None)  # Skip header; the default avoids StopIteration on an empty file
        for row in reader:
            if len(row) >= 5 and row[4]:
                last_kcf_value = int(row[4])
    return last_kcf_value
# Matches the summary zpool prints once a scrub has finished, for example
#   scrub repaired 0B in 00:12:34 with 0 errors on ...
#   scrub repaired 1.50M in 0 days 00:00:02 with 0 errors on ...
#   scrub repaired 0 in 0h12m with 0 errors on ...
# Group 1 is the repaired amount, group 2 the error count.
SCRUB_COMPLETED_PATTERN = re.compile(
    r"scrub repaired ([\d.]+[BKMGTPEZ]?B?) in "
    r"(?:\d+ days )?(?:[\d:]+|\d+h\d+m) with (\d+) errors"
)


def _parse_zpool_status(output):
    """Split ``zpool status -v`` text into per-pool error sets and scrub states."""
    errors = {}
    scrub_status = {}
    current_pool = None
    recording = False
    for raw_line in output.splitlines():
        line = raw_line.strip()
        if line.startswith("pool: "):
            # Only a line that *starts* with the label opens a new pool; an
            # error path that merely contains "pool:" must not.  Everything
            # after the label is the name, which may itself contain ':'.
            current_pool = line[len("pool:"):].strip()
            recording = False
            errors[current_pool] = set()
            scrub_status[current_pool] = 'NOT_IN_PROGRESS'
        elif current_pool is None:
            continue  # Nothing to attribute this line to yet
        elif recording:
            # Inside the permanent-error list every non-blank line is an
            # entry, even if it happens to contain scrub-related wording.
            if line:
                errors[current_pool].add(line)
        elif 'scrub in progress' in line:
            scrub_status[current_pool] = 'IN_PROGRESS'
        elif 'scrub repaired' in line:
            match = SCRUB_COMPLETED_PATTERN.search(line)
            if match is None:
                # Unknown summary format: keep the raw text rather than crash.
                summary = line[line.index('scrub repaired'):]
                scrub_status[current_pool] = f"COMPLETED: {summary}"
            else:
                repaired, scrub_errors = match.groups()
                scrub_status[current_pool] = (
                    f"COMPLETED: Repaired {repaired} with {scrub_errors} errors"
                )
        elif "Permanent errors have been detected" in line:
            recording = True
    return errors, scrub_status


def get_current_status():
    """Run STATUS_CMD and summarise its output per pool.

    Returns:
        A ``(errors, scrub_status)`` tuple of dicts keyed by pool name.
        ``errors`` maps each pool to the set of lines listed under its
        "Permanent errors have been detected" notice (empty when there is
        none).  ``scrub_status`` maps each pool to ``'NOT_IN_PROGRESS'``,
        ``'IN_PROGRESS'`` or a ``'COMPLETED: ...'`` summary.

    Raises:
        subprocess.CalledProcessError: if the command exits non-zero.
    """
    output = subprocess.check_output(STATUS_CMD, text=True)
    return _parse_zpool_status(output)
def get_recorded_errors():
    """Return the present (non-removed) errors from the log, keyed by pool name.

    The log is replayed in order: an ``ADDED`` row inserts its error into the
    pool's set and a ``REMOVED`` row takes it out again.  Rows with any other
    action, and rows too short to carry an action column, are skipped.

    Returns:
        A ``defaultdict(set)``; it is empty when the log is missing or has no
        data rows.
    """
    errors = defaultdict(set)
    if not os.path.exists(ERROR_LOG):
        return errors
    with open(ERROR_LOG, 'r', newline='') as file:
        reader = csv.reader(file)
        next(reader, None)  # Skip header; the default avoids StopIteration on an empty file
        for row in reader:
            if len(row) < 4:
                continue  # Blank or truncated line: there is no action to replay
            pool, _uptime, error, action = row[:4]
            if action == "ADDED":
                errors[pool].add(error)
            elif action == "REMOVED":
                errors[pool].discard(error)
            # Other actions (e.g. KCF_VALUE_CHANGED rows) carry no error state
    return errors
def get_recorded_scrub_status():
    """Return the last recorded scrub status of each pool.

    Rows are read in order and each row with a sixth column overwrites the
    entry for its pool, so the final row per pool wins.  Rows without that
    column are ignored.

    Returns:
        A ``defaultdict(str)``: pools with no recorded status read as ``""``,
        which differs from every real status.
    """
    status = defaultdict(str)  # If there's nothing in the logs we'll notice any changes
    if not os.path.exists(ERROR_LOG):
        return status
    with open(ERROR_LOG, 'r', newline='') as file:
        reader = csv.reader(file)
        next(reader, None)  # Skip header; the default avoids StopIteration on an empty file
        for row in reader:
            if len(row) < 6:
                continue  # No Scrub column
            status[row[0]] = row[5]
    return status
def initialize_log():
    """Write the CSV header to ERROR_LOG if the file is missing or empty.

    A zero-length file is treated like a missing one so that the readers,
    which skip the first row as a header, never mistake a data row for it.
    A log that already has content is left untouched.
    """
    if os.path.exists(ERROR_LOG) and os.path.getsize(ERROR_LOG) > 0:
        return
    # newline='' lets the csv module control line endings itself.
    with open(ERROR_LOG, 'w', newline='') as file:
        writer = csv.writer(file)
        writer.writerow(["Pool", "Uptime", "Error", "Action", "KCF_Ops_Failed", "Scrub"])
def main():
    """Compare the live pool state with the log and append one row per change.

    For every pool reported by ``get_current_status`` this writes an ``ADDED``
    row for each error not yet in the log and a ``REMOVED`` row for each logged
    error that is gone.  Each of those rows also carries the current
    kcf_ops_failed value and scrub status.  A scrub status change that was not
    captured by such a row gets its own ``SCRUB_STATUS_CHANGED`` row, and a
    changed kcf_ops_failed value that was not captured gets a
    ``KCF_VALUE_CHANGED`` row.

    Pools and errors are processed in sorted order so that repeated runs over
    the same state always produce rows in the same order.
    """
    initialize_log()
    current_errors, current_scrub_status = get_current_status()
    pools = sorted(current_errors.keys() | current_scrub_status.keys())
    recorded_errors = get_recorded_errors()
    recorded_scrub_status = get_recorded_scrub_status()
    kcf_value = get_kcf_ops_failed()
    prev_kcf_value = get_recorded_kcf_ops_failed()
    # newline='' lets the csv module control line endings itself.
    with open(ERROR_LOG, "a", newline="") as file:
        writer = csv.writer(file)
        uptime = get_uptime()
        for pool in pools:
            cur_err = current_errors[pool]
            rec_err = recorded_errors[pool]
            cur_scrub = current_scrub_status[pool]
            rec_scrub = recorded_scrub_status[pool]
            added_errors = sorted(cur_err - rec_err)
            removed_errors = sorted(rec_err - cur_err)
            for error in added_errors:
                writer.writerow([pool, uptime, error, "ADDED", kcf_value, cur_scrub])
            for error in removed_errors:
                writer.writerow([pool, uptime, error, "REMOVED", kcf_value, cur_scrub])
            if added_errors or removed_errors:
                # Those rows already recorded the new kcf value and scrub status
                prev_kcf_value = kcf_value
                rec_scrub = cur_scrub
            # If we didn't record the new scrub status in an error row, output it standalone
            if rec_scrub != cur_scrub:
                writer.writerow([pool, uptime, 'SCRUB_STATUS_CHANGED', "", kcf_value, cur_scrub])
        # Make sure kcf value changes are recorded even without a zpool error
        # so we can notice if they are uncorrelated
        if kcf_value != prev_kcf_value:
            writer.writerow(["", uptime, "KCF_VALUE_CHANGED", "", kcf_value])


if __name__ == "__main__":
    main()
# Service unit: performs a single run of the zpool error check script.
[Unit]
Description=Check for ZFS permanent errors
[Service]
# oneshot: the script is expected to run to completion and exit on each activation.
Type=oneshot
ExecStart=/usr/bin/python3 /usr/local/bin/check_zpool_errors.py
# Hardening measures
PrivateTmp=yes
ProtectSystem=strict
ProtectHome=yes
NoNewPrivileges=yes
ProtectKernelTunables=yes
ProtectKernelModules=yes
ProtectControlGroups=yes
RestrictRealtime=yes
RestrictNamespaces=yes
# The script itself and the kstat file it reads only need read access.
ReadOnlyPaths=/usr/local/bin/check_zpool_errors.py /proc/spl/kstat/kcf/NONAME_provider_stats
# The script appends to /var/log/zpool_errors.csv, so /var/log/ must stay writable
# despite ProtectSystem=strict.
ReadWritePaths=/var/log/
# Timer unit: schedules the check service.
# NOTE(review): no Unit= is set, so this presumably relies on systemd pairing the timer
# with the service of the same base name - confirm the installed file names match.
[Unit]
Description=Run check_zpool_errors service every minute
[Timer]
# First run two minutes after boot, then one minute after each previous activation.
OnBootSec=2min
OnUnitActiveSec=1min
[Install]
WantedBy=timers.target
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment