Last active
November 15, 2023 16:57
-
-
Save mattico/d89172579cd69a4d8b8077c2e4fe8c17 to your computer and use it in GitHub Desktop.
Script for monitoring zpool errors
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
import subprocess | |
import os | |
import csv | |
import re | |
from collections import defaultdict | |
from datetime import timedelta | |
# Command whose text output get_current_status() parses for pool names,
# scrub state and permanent-error listings.
STATUS_CMD = ["zpool", "status", "-v"]
# Stats file that get_kcf_ops_failed() scans for the kcf_ops_failed counter.
KCF_STATS = "/proc/spl/kstat/kcf/NONAME_provider_stats"
# CSV log that this script reads back and appends to on every run.
ERROR_LOG = "/var/log/zpool_errors.csv"
def get_uptime():
    """Report how long the system has been running.

    Reads the first field of ``/proc/uptime`` (seconds since boot) and
    returns it rendered by ``str(timedelta(...))``, for example
    ``"1 day, 2:03:04.500000"``.
    """
    with open("/proc/uptime", "r") as uptime_file:
        first_field = uptime_file.readline().split()[0]
    elapsed = timedelta(seconds=float(first_field))
    return str(elapsed)
def get_kcf_ops_failed():
    """Look up the ``kcf_ops_failed`` counter in the KCF_STATS file.

    Returns the last whitespace-separated field of the first line that
    mentions ``kcf_ops_failed``, converted to int, or 0 when no line
    mentions it.
    """
    with open(KCF_STATS, "r") as stats_file:
        candidates = (entry for entry in stats_file if "kcf_ops_failed" in entry)
        first_hit = next(candidates, None)
    if first_hit is None:
        return 0  # counter is not listed in the stats file
    return int(first_hit.split()[-1])
def get_recorded_kcf_ops_failed(path=None):
    """Return the most recently logged ``KCF_Ops_Failed`` value.

    Args:
        path: CSV log to read. Defaults to ``ERROR_LOG``.

    Returns:
        The integer from the fifth column of the last row that has a
        non-empty fifth column, or 0 when the log is missing, empty, or
        holds no such row.

    Raises:
        ValueError: If a non-empty fifth column is not an integer.
    """
    if path is None:
        path = ERROR_LOG
    last_kcf_value = 0  # Default to 0 if not recorded before
    try:
        log = open(path, "r", newline="")
    except FileNotFoundError:
        # Same "nothing recorded yet" answer the other log readers give.
        return last_kcf_value
    with log:
        reader = csv.reader(log)
        # A default keeps an existing-but-empty log (e.g. after truncation)
        # from raising StopIteration here.
        next(reader, None)  # Skip header
        for row in reader:
            if len(row) >= 5 and row[4]:
                last_kcf_value = int(row[4])
    return last_kcf_value
# Summary line of a finished scrub, e.g.
#   "scan: scrub repaired 1.50M in 00:10:32 with 0 errors on Sun Nov 12 ..."
# Group 1 is the repaired amount, group 2 the error count. The amount is
# matched loosely because it is not always a whole number ending in "B".
SCRUB_COMPLETED_PATTERN = re.compile(
    r"scrub repaired (\S+) in (?:\d+ days )?[\d:]+ with (\d+) errors"
)


def get_current_status(output=None):
    """Parse ``zpool status -v`` output into per-pool errors and scrub state.

    Args:
        output: Text to parse. When omitted, ``STATUS_CMD`` is run and its
            standard output is parsed.

    Returns:
        A tuple ``(errors, scrub_status)`` of dicts keyed by pool name.
        ``errors[pool]`` is the set of non-blank lines listed after that
        pool's "Permanent errors have been detected" line.
        ``scrub_status[pool]`` is ``'NOT_IN_PROGRESS'``, ``'IN_PROGRESS'`` or
        a string starting with ``'COMPLETED: '``.

    Raises:
        subprocess.CalledProcessError: If the command is run and fails.
    """
    if output is None:
        output = subprocess.check_output(STATUS_CMD, text=True)

    errors = {}
    scrub_status = {}
    current_pool = None
    recording = False

    for line in output.splitlines():
        line = line.strip()
        # Only a line that *starts* with "pool: " opens a new pool section;
        # a substring test would also fire on error paths containing "pool:".
        if line.startswith("pool: "):
            # Split once so pool names that contain ':' stay intact.
            current_pool = line.split(":", 1)[1].strip()
            recording = False
            errors[current_pool] = set()
            scrub_status[current_pool] = 'NOT_IN_PROGRESS'
        elif current_pool is None:
            continue
        elif recording:
            # Everything after the "Permanent errors" line belongs to the
            # error listing, whatever text it happens to contain.
            if line:
                errors[current_pool].add(line)
        elif 'scrub in progress' in line:
            scrub_status[current_pool] = 'IN_PROGRESS'
        elif 'scrub repaired' in line:
            match = SCRUB_COMPLETED_PATTERN.search(line)
            if match is None:
                # Unrecognised summary format: keep the raw line so a status
                # change is still noticed instead of crashing on None.
                scrub_status[current_pool] = f"COMPLETED: {line}"
            else:
                repaired, scrub_errors = match.groups()
                scrub_status[current_pool] = f"COMPLETED: Repaired {repaired} with {scrub_errors} errors"
        elif "Permanent errors have been detected" in line:
            recording = True

    return errors, scrub_status
def get_recorded_errors(path=None):
    """Return the present (non-removed) errors from the log, keyed by pool name.

    The log is replayed in order: an ``ADDED`` row puts the error into the
    pool's set, a ``REMOVED`` row takes it out, and rows with any other
    action are ignored.

    Args:
        path: CSV log to read. Defaults to ``ERROR_LOG``.

    Returns:
        A ``defaultdict(set)`` mapping pool name to its outstanding errors;
        empty when the log is missing or has no data rows.
    """
    if path is None:
        path = ERROR_LOG
    errors = defaultdict(set)
    try:
        log = open(path, "r", newline="")
    except FileNotFoundError:
        return errors
    with log:
        reader = csv.reader(log)
        # A default keeps an existing-but-empty log from raising StopIteration.
        next(reader, None)  # Skip header
        for row in reader:
            if len(row) < 4:
                continue  # Blank or truncated row: no action column to read
            pool, error, action = row[0], row[2], row[3]
            if action == "ADDED":
                errors[pool].add(error)
            elif action == "REMOVED":
                errors[pool].discard(error)
            # Any other action (the scrub/KCF marker rows leave it empty)
            # does not change the error sets.
    return errors
def get_recorded_scrub_status(path=None):
    """Return the last recorded scrub status of each pool.

    Args:
        path: CSV log to read. Defaults to ``ERROR_LOG``.

    Returns:
        A ``defaultdict(str)`` mapping pool name to the sixth column of the
        last row logged for that pool. Pools with nothing logged yield
        ``""``, which differs from every real status, so their first
        observed status is treated as a change.
    """
    if path is None:
        path = ERROR_LOG
    status = defaultdict(str)  # If there's nothing in the logs we'll notice any changes
    try:
        log = open(path, "r", newline="")
    except FileNotFoundError:
        return status
    with log:
        reader = csv.reader(log)
        # A default keeps an existing-but-empty log from raising StopIteration.
        next(reader, None)  # Skip header
        for row in reader:
            if len(row) < 6:
                continue  # No Scrub column
            status[row[0]] = row[5]
    return status
def initialize_log(path=None):
    """Write the CSV header if the log file is missing or empty.

    An existing file that already has content is left untouched.

    Args:
        path: CSV log to initialize. Defaults to ``ERROR_LOG``.
    """
    if path is None:
        path = ERROR_LOG
    try:
        if os.path.getsize(path) > 0:
            return
    except FileNotFoundError:
        pass  # No log yet: fall through and create it
    # A zero-byte log (e.g. one that was truncated) also needs the header,
    # otherwise the first data row would later be skipped as if it were one.
    with open(path, 'w', newline="") as file:
        writer = csv.writer(file)
        writer.writerow(["Pool", "Uptime", "Error", "Action", "KCF_Ops_Failed", "Scrub"])
def main():
    """Compare the live zpool state with the CSV log and append any changes.

    For every pool in the current status this appends:
      * one ``ADDED`` row per error that is present now but not in the log,
      * one ``REMOVED`` row per error that is in the log but gone now,
      * a ``SCRUB_STATUS_CHANGED`` row when the scrub status changed and no
        error row for that pool already carried the new status.
    A final ``KCF_VALUE_CHANGED`` row is appended when the KCF failure
    counter differs from the last logged value and no error row recorded it.

    Pools and errors are written in sorted order so that identical inputs
    always produce identical log rows.
    """
    initialize_log()
    current_errors, current_scrub_status = get_current_status()
    recorded_errors = get_recorded_errors()
    recorded_scrub_status = get_recorded_scrub_status()
    kcf_value = get_kcf_ops_failed()
    prev_kcf_value = get_recorded_kcf_ops_failed()
    uptime = get_uptime()

    pools = sorted(current_errors.keys() | current_scrub_status.keys())

    # newline="" lets the csv module control line endings itself.
    with open(ERROR_LOG, "a", newline="") as file:
        writer = csv.writer(file)
        for pool in pools:
            cur_err = current_errors[pool]
            rec_err = recorded_errors[pool]
            cur_scrub = current_scrub_status[pool]
            rec_scrub = recorded_scrub_status[pool]

            changes = (
                ("ADDED", cur_err - rec_err),
                ("REMOVED", rec_err - cur_err),
            )
            for action, changed in changes:
                for error in sorted(changed):
                    writer.writerow([pool, uptime, error, action, kcf_value, cur_scrub])
                    prev_kcf_value = kcf_value  # already recorded new value
                    rec_scrub = cur_scrub  # ...and the new scrub status

            # If we didn't record the new scrub status in an error row, output it standalone
            if rec_scrub != cur_scrub:
                writer.writerow([pool, uptime, 'SCRUB_STATUS_CHANGED', "", kcf_value, cur_scrub])

        # Make sure kcf value changes are recorded even without a zpool error
        # so we can notice if they are uncorrelated
        if kcf_value != prev_kcf_value:
            writer.writerow(["", uptime, "KCF_VALUE_CHANGED", "", kcf_value])


if __name__ == "__main__":
    main()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Service unit that runs the zpool error checker once per activation.
[Unit]
Description=Check for ZFS permanent errors
[Service]
# The script does one pass and exits, so the unit is a oneshot.
Type=oneshot
ExecStart=/usr/bin/python3 /usr/local/bin/check_zpool_errors.py
# Hardening measures
PrivateTmp=yes
ProtectSystem=strict
ProtectHome=yes
NoNewPrivileges=yes
ProtectKernelTunables=yes
ProtectKernelModules=yes
ProtectControlGroups=yes
RestrictRealtime=yes
RestrictNamespaces=yes
# The script itself and the KCF stats file it reads only need read access.
ReadOnlyPaths=/usr/local/bin/check_zpool_errors.py /proc/spl/kstat/kcf/NONAME_provider_stats
# The script creates and appends to /var/log/zpool_errors.csv.
ReadWritePaths=/var/log/
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Timer unit that schedules the zpool error check.
# NOTE(review): no Unit= is set, so this presumably relies on the timer file
# sharing its base name with the service unit - confirm the file names.
[Unit]
Description=Run check_zpool_errors service every minute
[Timer]
# First run 2 minutes after boot, then 1 minute after each activation.
OnBootSec=2min
OnUnitActiveSec=1min
[Install]
WantedBy=timers.target
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment