mattico/check_zpool_errors.py

## check_zpool_errors.py
#!/usr/bin/env python3

import subprocess
import os
import csv
import re
from collections import defaultdict
from datetime import timedelta

# Command whose text output get_current_status() parses.
STATUS_CMD = ["zpool", "status", "-v"]
# File that get_kcf_ops_failed() scans for the "kcf_ops_failed" counter.
KCF_STATS = "/proc/spl/kstat/kcf/NONAME_provider_stats"
# CSV log that this script reads back and appends to.
ERROR_LOG = "/var/log/zpool_errors.csv"

def get_uptime():
    """Return the system uptime as a string.

    The first whitespace-separated field of /proc/uptime is read as a number
    of seconds and rendered with ``str(timedelta(...))``.
    """
    with open("/proc/uptime", "r") as uptime_file:
        first_field = uptime_file.readline().split()[0]
        elapsed = timedelta(seconds=float(first_field))
        return str(elapsed)

def get_kcf_ops_failed():
    """Return the kcf_ops_failed counter read from KCF_STATS.

    The first line containing "kcf_ops_failed" is used; its last
    whitespace-separated field is returned as an int. Returns 0 when no line
    mentions the counter.
    """
    with open(KCF_STATS, "r") as stats:
        matching = (entry for entry in stats if "kcf_ops_failed" in entry)
        first_hit = next(matching, None)
        if first_hit is not None:
            return int(first_hit.split()[-1])
    # Counter not present in the file.
    return 0

def get_recorded_kcf_ops_failed(log_path=None):
    """Return the most recent kcf_ops_failed value stored in the CSV log.

    Args:
        log_path: Path of the log to read. Defaults to ERROR_LOG.

    Returns:
        The integer from the fifth column (KCF_Ops_Failed) of the last row
        that carries a numeric value there, or 0 when the log is missing,
        empty, or holds no such value.
    """
    path = ERROR_LOG if log_path is None else log_path
    last_kcf_value = 0  # Default to 0 if not recorded before

    try:
        log_file = open(path, "r", newline="")
    except FileNotFoundError:
        # Same outcome as a log with no rows: nothing has been recorded yet.
        return last_kcf_value

    with log_file:
        reader = csv.reader(log_file)
        # Skip the header; the default keeps an empty file (for example one
        # recreated by log rotation) from raising StopIteration.
        next(reader, None)
        for row in reader:
            if len(row) >= 5 and row[4]:
                try:
                    last_kcf_value = int(row[4])
                except ValueError:
                    # A non-numeric cell (such as a repeated header row) is
                    # not a recorded value; keep the previous one.
                    continue
    return last_kcf_value

# Matches the summary of a finished scrub in `zpool status` output, e.g.
#   "scrub repaired 0B in 00:10:32 with 0 errors"
#   "scrub repaired 1.50M in 0 days 01:02:03 with 2 errors"
# Group 1 is the repaired amount, group 2 the error count.
# A raw string is used so "\d" is a regex escape rather than an invalid string
# escape. The amount accepts an optional decimal fraction and a unit letter
# with or without a trailing "B", so values like "1.50M" match as well as "0B".
SCRUB_COMPLETED_PATTERN = re.compile(
    r"scrub repaired (\d+(?:\.\d+)?(?:[BKMGTPE]B?)?) in (?:\d+ days )?[\d:]+ with (\d+) errors"
)

def get_current_status(output=None):
    """Parse `zpool status -v` output into per-pool errors and scrub state.

    Args:
        output: Text to parse. When None (the default) STATUS_CMD is run and
            its standard output is parsed.

    Returns:
        A tuple ``(errors, scrub_status)``. ``errors`` maps each pool name to
        the set of non-empty lines listed after "Permanent errors have been
        detected". ``scrub_status`` maps each pool name to 'NOT_IN_PROGRESS',
        'IN_PROGRESS', or a string starting with "COMPLETED: ".

    Raises:
        subprocess.CalledProcessError: If STATUS_CMD exits with a non-zero
            status (only when ``output`` is None).
    """
    if output is None:
        output = subprocess.check_output(STATUS_CMD, text=True)

    errors = {}
    scrub_status = {}
    current_pool = None
    recording = False

    for raw_line in output.splitlines():
        line = raw_line.strip()
        if line.startswith("pool: "):
            # Only a line that *begins* with the header starts a new pool. A
            # substring test would also fire on error entries whose path
            # merely contains "pool:".
            current_pool = line.partition(":")[2].strip()
            recording = False
            errors[current_pool] = set()
            scrub_status[current_pool] = 'NOT_IN_PROGRESS'
        elif current_pool is None:
            # Nothing to attribute this line to yet.
            continue
        elif recording:
            # Inside the error list every non-empty line is an entry, even if
            # its text happens to look like a scrub message.
            if line:
                errors[current_pool].add(line)
        elif 'scrub in progress' in line:
            scrub_status[current_pool] = 'IN_PROGRESS'
        elif 'scrub repaired' in line:
            match = SCRUB_COMPLETED_PATTERN.search(line)
            if match:
                repaired, scrub_errors = match.groups()
                scrub_status[current_pool] = f"COMPLETED: Repaired {repaired} with {scrub_errors} errors"
            else:
                # Unrecognised summary format: keep the raw line rather than
                # crashing on match.groups() with match being None.
                scrub_status[current_pool] = f"COMPLETED: {line}"
        elif "Permanent errors have been detected" in line:
            recording = True

    return errors, scrub_status

def get_recorded_errors(log_path=None):
    """Return the errors the log currently considers present, keyed by pool.

    The log is replayed in order: an "ADDED" row adds its error to the pool's
    set and a "REMOVED" row discards it. Rows with any other action are
    ignored.

    Args:
        log_path: Path of the log to read. Defaults to ERROR_LOG.

    Returns:
        A ``defaultdict(set)`` mapping pool name to its set of error strings.
        It is empty when the log does not exist or has no data rows.
    """
    errors = defaultdict(set)
    path = ERROR_LOG if log_path is None else log_path
    if not os.path.exists(path):
        return errors

    with open(path, 'r', newline='') as file:
        reader = csv.reader(file)
        # Skip the header; the default keeps an empty file from raising
        # StopIteration.
        next(reader, None)
        for row in reader:
            if len(row) < 4:
                # Blank or truncated row: no action column to interpret.
                continue
            pool, error, action = row[0], row[2], row[3]
            if action == "ADDED":
                errors[pool].add(error)
            elif action == "REMOVED":
                errors[pool].discard(error)
            # Any other action (e.g. KCF_VALUE_CHANGED rows) is ignored.

    return errors

def get_recorded_scrub_status(log_path=None):
    """Return the last scrub status recorded in the log for each pool.

    Args:
        log_path: Path of the log to read. Defaults to ERROR_LOG.

    Returns:
        A ``defaultdict(str)`` mapping pool name to the sixth column (Scrub)
        of the last row for that pool that has one. Pools never seen map to
        "", so any real status compares as changed.
    """
    status = defaultdict(str)  # If there's nothing in the logs we'll notice any changes
    path = ERROR_LOG if log_path is None else log_path
    if not os.path.exists(path):
        return status

    with open(path, 'r', newline='') as file:
        reader = csv.reader(file)
        # Skip the header; the default keeps an empty file from raising
        # StopIteration.
        next(reader, None)
        for row in reader:
            if len(row) < 6:
                continue  # No Scrub column
            status[row[0]] = row[5]

    return status

def initialize_log(log_path=None):
    """Create the log with its header row if it is missing or empty.

    An existing zero-length file is treated like a missing one. Without a
    header, the readers in this file would skip the first data row as if it
    were the header.

    Args:
        log_path: Path of the log to initialise. Defaults to ERROR_LOG.
    """
    path = ERROR_LOG if log_path is None else log_path
    if os.path.exists(path) and os.path.getsize(path) > 0:
        return  # Already has content; leave it untouched.

    # newline='' is what the csv module expects of files it writes to.
    with open(path, 'w', newline='') as file:
        writer = csv.writer(file)
        writer.writerow(["Pool", "Uptime", "Error", "Action", "KCF_Ops_Failed", "Scrub"])

def main():
    """Compare the live zpool state with the CSV log and append the differences.

    For every pool reported by get_current_status():
      * one "ADDED" row per error present now but not in the log,
      * one "REMOVED" row per error in the log but no longer present,
      * a "SCRUB_STATUS_CHANGED" row when the scrub status differs from the
        last recorded one and no error row was written for that pool (error
        rows already carry the current status).
    Finally a "KCF_VALUE_CHANGED" row is written when the kcf_ops_failed
    counter differs from the last recorded value and no error row recorded it.

    Pools and errors are written in sorted order so repeated runs over the
    same state produce the same log rows.
    """
    initialize_log()

    current_errors, current_scrub_status = get_current_status()
    recorded_errors = get_recorded_errors()
    recorded_scrub_status = get_recorded_scrub_status()
    kcf_value = get_kcf_ops_failed()
    prev_kcf_value = get_recorded_kcf_ops_failed()
    uptime = get_uptime()

    # newline="" is what the csv module expects of files it writes to.
    with open(ERROR_LOG, "a", newline="") as file:
        writer = csv.writer(file)

        for pool in sorted(current_errors.keys() | current_scrub_status.keys()):
            cur_err = current_errors[pool]
            rec_err = recorded_errors[pool]
            cur_scrub = current_scrub_status[pool]

            changes = [(error, "ADDED") for error in sorted(cur_err - rec_err)]
            changes += [(error, "REMOVED") for error in sorted(rec_err - cur_err)]

            for error, action in changes:
                writer.writerow([pool, uptime, error, action, kcf_value, cur_scrub])

            if changes:
                # The error rows above already recorded the new kcf value and
                # the current scrub status.
                prev_kcf_value = kcf_value
            elif recorded_scrub_status[pool] != cur_scrub:
                # If we didn't record the new scrub status in an error row, output it standalone
                writer.writerow([pool, uptime, 'SCRUB_STATUS_CHANGED', "", kcf_value, cur_scrub])

        # Make sure kcf value changes are recorded even without a zpool error
        # so we can notice if they are uncorrelated
        if kcf_value != prev_kcf_value:
            writer.writerow(["", uptime, "KCF_VALUE_CHANGED", "", kcf_value])

# Run the check only when executed as a script, not when imported.
if __name__ == "__main__":
    main()

## check_zpool_errors.service
[Unit]
Description=Check for ZFS permanent errors

[Service]
# The script runs once per activation and exits.
Type=oneshot
ExecStart=/usr/bin/python3 /usr/local/bin/check_zpool_errors.py

# Hardening measures
PrivateTmp=yes
ProtectSystem=strict
ProtectHome=yes
NoNewPrivileges=yes
ProtectKernelTunables=yes
ProtectKernelModules=yes
ProtectControlGroups=yes
RestrictRealtime=yes
RestrictNamespaces=yes
# The script itself and the kstat file it reads only need read access.
ReadOnlyPaths=/usr/local/bin/check_zpool_errors.py /proc/spl/kstat/kcf/NONAME_provider_stats
# The script writes its CSV log to /var/log/zpool_errors.csv, so /var/log/
# must stay writable despite ProtectSystem=strict.
ReadWritePaths=/var/log/

## check_zpool_errors.timer
[Unit]
Description=Run check_zpool_errors service every minute

[Timer]
# First run two minutes after boot, then one minute after each time the
# service was last activated.
OnBootSec=2min
OnUnitActiveSec=1min

[Install]
WantedBy=timers.target
	#!/usr/bin/env python3

	import subprocess
	import os
	import csv
	import re
	from collections import defaultdict
	from datetime import timedelta

	# Command to run, kstat file to scan for "kcf_ops_failed", and CSV log path.
	STATUS_CMD = ["zpool", "status", "-v"]
	KCF_STATS = "/proc/spl/kstat/kcf/NONAME_provider_stats"
	ERROR_LOG = "/var/log/zpool_errors.csv"

	# NOTE(review): block indentation is lost in this copy (every line sits at
	# one tab), so it is not valid Python as written.
	# Reads the first field of /proc/uptime as seconds and formats it via timedelta.
	def get_uptime():
	"""Return the system's uptime."""
	with open("/proc/uptime", "r") as f:
	seconds = float(f.readline().split()[0])
	return str(timedelta(seconds=seconds))

	# NOTE(review): block indentation is lost in this copy (every line sits at
	# one tab), so it is not valid Python as written.
	# Scans KCF_STATS for a line containing "kcf_ops_failed" and converts that
	# line's last whitespace-separated field to int.
	def get_kcf_ops_failed():
	"""Return the value of kcf_ops_failed from KCF_STATS."""
	with open(KCF_STATS, "r") as f:
	for line in f:
	if "kcf_ops_failed" in line:
	return int(line.split()[-1])
	return 0 # default if not found

	# NOTE(review): block indentation is lost in this copy (every line sits at
	# one tab), so it is not valid Python as written.
	# Reads ERROR_LOG and keeps the int from column index 4 of each row that
	# has a non-empty value there; the last one kept is returned.
	# NOTE(review): next(reader) has no default, so an empty log file would
	# raise StopIteration.
	def get_recorded_kcf_ops_failed():
	last_kcf_value = 0 # Default to 0 if not recorded before
	with open(ERROR_LOG, "r") as file:
	reader = csv.reader(file)
	next(reader) # Skip header
	for row in reader:
	if len(row) >= 5 and row[4]:
	last_kcf_value = int(row[4])
	return last_kcf_value

	# Group 1 captures the repaired amount, group 2 the error count.
	# NOTE(review): this is not a raw string, so "\d" is an invalid string
	# escape; prefer an r"..." literal.
	SCRUB_COMPLETED_PATTERN = re.compile("scrub repaired (\d+[BKMGTP]?B) in (?:\d+ days )?[\d:]+ with (\d+) errors")

	# NOTE(review): block indentation is lost in this copy (every line sits at
	# one tab), so it is not valid Python as written.
	# Runs STATUS_CMD and returns two dicts keyed by pool name: the set of
	# lines collected after "Permanent errors have been detected", and a
	# scrub status string.
	# NOTE(review): match may be None when SCRUB_COMPLETED_PATTERN does not
	# match a 'scrub repaired' line; match.groups() would then raise
	# AttributeError.
	# NOTE(review): "pool:" is tested as a substring, so any line containing
	# it (including a collected error line) is treated as a pool header.
	def get_current_status():
	"""Return a dictionary of current ZFS errors, keyed by pool name."""
	output = subprocess.check_output(STATUS_CMD, text=True)
	errors = {}
	scrub_status = {}
	current_pool = None
	recording = False

	for line in output.splitlines():
	line = line.strip()
	if "pool:" in line:
	current_pool = line.split(':')[1].strip()
	recording = False
	errors[current_pool] = set()
	scrub_status[current_pool] = 'NOT_IN_PROGRESS'
	elif current_pool and 'scrub in progress' in line:
	scrub_status[current_pool] = 'IN_PROGRESS'
	elif current_pool and 'scrub repaired' in line:
	match = SCRUB_COMPLETED_PATTERN.search(line)
	repaired, scrub_errors = match.groups()
	scrub_status[current_pool] = f"COMPLETED: Repaired {repaired} with {scrub_errors} errors"
	elif current_pool and "Permanent errors have been detected" in line:
	recording = True
	elif current_pool and recording and line:
	errors[current_pool].add(line)

	return errors, scrub_status

	# NOTE(review): block indentation is lost in this copy (every line sits at
	# one tab), so it is not valid Python as written.
	# Replays ERROR_LOG: "ADDED" rows add row[2] to the set for pool row[0],
	# "REMOVED" rows discard it.
	# NOTE(review): rows are indexed up to row[3] without a length check, so a
	# row with fewer than four fields would raise IndexError.
	def get_recorded_errors():
	"""Return a dictionary of the present (non-removed) errors from the log, keyed by pool name."""
	errors = defaultdict(set)
	if not os.path.exists(ERROR_LOG):
	return errors

	with open(ERROR_LOG, 'r') as file:
	reader = csv.reader(file)
	next(reader) # Skip header
	for row in reader:
	pool = row[0]
	error = row[2]
	action = row[3]
	if action == "ADDED":
	errors[pool].add(error)
	elif action == "REMOVED":
	errors[pool].discard(error)
	else:
	pass # Ignore KCF_VALUE_CHANGED actions

	return errors

	# NOTE(review): block indentation is lost in this copy (every line sits at
	# one tab), so it is not valid Python as written.
	# For each row with at least six fields, stores row[5] under pool row[0];
	# later rows overwrite earlier ones.
	def get_recorded_scrub_status():
	"""Return a dictionary of the last recorded scrub status from each pool."""
	status = defaultdict(str) # If there's nothing in the logs we'll notice any changes
	if not os.path.exists(ERROR_LOG):
	return status

	with open(ERROR_LOG, 'r') as file:
	reader = csv.reader(file)
	next(reader) # Skip header
	for row in reader:
	if len(row) < 6:
	continue # No Scrub column
	pool = row[0]
	scrub = row[5]
	status[pool] = scrub

	return status

	# NOTE(review): block indentation is lost in this copy (every line sits at
	# one tab), so it is not valid Python as written.
	# Writes the six-column header row to ERROR_LOG when the file is absent.
	def initialize_log():
	"""Initialize the log file if it doesn't exist."""
	if not os.path.exists(ERROR_LOG):
	with open(ERROR_LOG, 'w') as file:
	writer = csv.writer(file)
	writer.writerow(["Pool", "Uptime", "Error", "Action", "KCF_Ops_Failed", "Scrub"])

	# NOTE(review): block indentation is lost in this copy (every line sits at
	# one tab), so it is not valid Python as written.
	# Compares the current zpool state with what ERROR_LOG records and appends
	# ADDED / REMOVED / SCRUB_STATUS_CHANGED / KCF_VALUE_CHANGED rows.
	def main():
	initialize_log()

	current_errors, current_scrub_status = get_current_status()
	pools = set(list(current_errors.keys()) + list(current_scrub_status.keys()))
	recorded_errors = get_recorded_errors()
	recorded_scrub_status = get_recorded_scrub_status()
	kcf_value = get_kcf_ops_failed()
	prev_kcf_value = get_recorded_kcf_ops_failed()

	with open(ERROR_LOG, "a") as file:
	writer = csv.writer(file)
	uptime = get_uptime()

	# NOTE(review): pools and the error differences below are sets, so the
	# order in which rows are written is not fixed.
	for pool in pools:
	cur_err = current_errors[pool]
	rec_err = recorded_errors[pool]
	cur_scrub = current_scrub_status[pool]
	rec_scrub = recorded_scrub_status[pool]
	added_errors = cur_err - rec_err
	removed_errors = rec_err - cur_err

	for error in added_errors:
	writer.writerow([pool, uptime, error, "ADDED", kcf_value, cur_scrub])
	prev_kcf_value = kcf_value # already recorded new value
	rec_scrub = cur_scrub
	for error in removed_errors:
	writer.writerow([pool, uptime, error, "REMOVED", kcf_value, cur_scrub])
	prev_kcf_value = kcf_value
	rec_scrub = cur_scrub

	# If we didn't record the new scrub status in an error row, output it standalone
	if rec_scrub != cur_scrub:
	writer.writerow([pool, uptime, 'SCRUB_STATUS_CHANGED', "", kcf_value, cur_scrub])


	# Make sure kcf value changes are recorded even without a zpool error
	# so we can notice if they are uncorrelated
	if kcf_value != prev_kcf_value:
	writer.writerow(["", uptime, "KCF_VALUE_CHANGED", "", kcf_value])

	# Script entry point: call main() when run directly.
	if __name__ == "__main__":
	main()
	# Service unit that runs /usr/local/bin/check_zpool_errors.py once per activation.
	[Unit]
	Description=Check for ZFS permanent errors

	[Service]
	Type=oneshot
	ExecStart=/usr/bin/python3 /usr/local/bin/check_zpool_errors.py

	# Hardening measures
	PrivateTmp=yes
	ProtectSystem=strict
	ProtectHome=yes
	NoNewPrivileges=yes
	ProtectKernelTunables=yes
	ProtectKernelModules=yes
	ProtectControlGroups=yes
	RestrictRealtime=yes
	RestrictNamespaces=yes
	ReadOnlyPaths=/usr/local/bin/check_zpool_errors.py /proc/spl/kstat/kcf/NONAME_provider_stats
	# /var/log/ stays writable despite ProtectSystem=strict.
	ReadWritePaths=/var/log/
	# Timer unit: first run two minutes after boot, then one minute after each
	# activation of the service.
	[Unit]
	Description=Run check_zpool_errors service every minute

	[Timer]
	OnBootSec=2min
	OnUnitActiveSec=1min

	[Install]
	WantedBy=timers.target