@rwarren · Created May 3, 2019 19:03
A script hacked together to get the ZFS reporting I wanted (that the telegraf zfs plugin didn't provide)
#!/usr/bin/env python3
# file: check_zfs.py
# This script is intended to be run as a cron job. Given a URL, it collects various zfs stats and
# sends them off to our influxdb server. Credentials are hard coded, but the URL is not.
# Usage: check_zfs.py [url]
# - without the url it dumps the stats to stdout
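#
# A sample crontab entry, for reference (the install path and URL here are hypothetical;
# adjust to taste):
#   */5 * * * * /usr/local/bin/check_zfs.py http://influx.example.com:8086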
from collections import OrderedDict
import datetime
import os
import platform
from pprint import pprint as pp
import re
import subprocess
import sys
from influxdb import InfluxDBClient

interval_dict = OrderedDict([("Y", 365*86400),  # 1 year
                             ("M", 30*86400),   # 1 month
                             ("W", 7*86400),    # 1 week
                             ("D", 86400),      # 1 day
                             ("h", 3600),       # 1 hour
                             ("m", 60),         # 1 minute
                             ("s", 1)])         # 1 second

# Zpool health reports are strings, but grafana thresholding/alerting works best with numbers...
# - the map below is in rough order of increasing severity, with 0 being OK
HEALTH_MAP = dict(
    ONLINE = 0,
    DEGRADED = 11,
    OFFLINE = 21,
    UNAVAIL = 22,
    FAULTED = 23,
    REMOVED = 24,
)
UNKNOWN_HEALTH_VALUE = 10  # count as a major error until we update our dict

def human_to_seconds(string):
    # converts human-readable durations ("3h5m") to seconds (for scrub timing)
    # - cribbed completely from https://goo.gl/Cpf9yn
    interval_exc = "Bad interval format for {0}".format(string)
    interval_regex = re.compile("^(?P<value>[0-9]+)(?P<unit>[{0}])".format("".join(interval_dict.keys())))
    seconds = 0
    while string:
        match = interval_regex.match(string)
        if match:
            value, unit = int(match.group("value")), match.group("unit")
            if unit in interval_dict:  # zero values are legitimate (e.g. "0h5m")
                seconds += value * interval_dict[unit]
                string = string[match.end():]
            else:
                raise Exception(interval_exc)
        else:
            raise Exception(interval_exc)
    return seconds
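
# Quick self-check of the parser above (safe to delete; values chosen arbitrarily):
assert human_to_seconds("3h5m") == 3*3600 + 5*60 == 11100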

def GetZFSPools():
    lines = subprocess.check_output("/sbin/zpool list -H".split())
    lines = lines.decode("ascii").strip()
    return [line.split("\t")[0] for line in lines.split("\n")]
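
# For reference, `zpool list -H` (used above) emits one tab-separated row per pool with
# the pool name in the first column; columns vary by version, e.g. (illustrative values):
#   tank    928G    400G    528G    ...    ONLINE    -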

SCRUB_PROBLEM_RET = dict(zpool_scrub_errors = 1,
                         zpool_scrub_repaired = 0,
                         zpool_scrub_duration = 0)

def _GetScrubStats(PoolName):
    cmd = "/sbin/zpool status %s" % PoolName
    lines = subprocess.check_output(cmd.split())
    lines = lines.decode("ascii").strip()
    for line in lines.split("\n"):
        line = line.strip()
        if line.startswith("scan:"):  # the only line we need
            try:
                toks = line.split()
                repairedIdx = toks.index("repaired")
                errorsIdx = toks.index("errors")
                lastRepairCount = int(toks[repairedIdx + 1])
                lastErrorCount = int(toks[errorsIdx - 1])
                lastScrubDuration = human_to_seconds(toks[repairedIdx + 3])  # in seconds
                return dict(zpool_scrub_errors = lastErrorCount,
                            zpool_scrub_repaired = lastRepairCount,  # name matches SCRUB_PROBLEM_RET
                            zpool_scrub_duration = lastScrubDuration)
            except Exception:  # ValueError
                if ("scrub in progress" in line) or ("canceled" in line):
                    return dict(zpool_scrub_errors = 0, zpool_scrub_repaired = 0)  # no duration
                return SCRUB_PROBLEM_RET
    return SCRUB_PROBLEM_RET  # no "scan:" line found

def GetPoolStats(PoolName):
    statmap = {
        "allocated": int,  # bytes used in pool
        "free": int,  # bytes free in pool
        "size": int,  # total size of pool (bytes)
        "dedupratio": lambda v: float(v[:-1]),  # how much dedup there is ("1.00x" if not enabled)
        "fragmentation": lambda v: float(v[:-1]),  # % fragmentation of the free space (e.g. "28%")
        "health": str,  # health of pool (one of ONLINE, DEGRADED, FAULTED, OFFLINE, REMOVED, or UNAVAIL)
    }
    cmd = "/sbin/zpool get %s -pH %s" % (",".join(statmap), PoolName)
    lines = subprocess.check_output(cmd.split())
    lines = lines.decode("ascii").strip()
    ret = {}
    for line in lines.split("\n"):
        pool, prop, val, _ = line.split("\t", maxsplit = 3)  # safe against added columns; _ is currently "source"
        assert pool == PoolName
        ret["zpool_" + prop] = statmap[prop](val)
    ret["zpool_health_int"] = HEALTH_MAP.get(ret["zpool_health"], UNKNOWN_HEALTH_VALUE)
    # Add the tough-to-parse scrub stats...
    ret.update(_GetScrubStats(PoolName))
    return ret
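
# For reference, `zpool get -pH` (used above) emits one tab-separated line per property,
# e.g. (illustrative values):
#   tank    allocated    429496729600    default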

def GetArcStats():
    statmap = {
        "size": int,  # number of currently allocated bytes for the ARC
        "c": int,  # target size for the ARC (in bytes). ZFS dynamically adjusts this.
        "c_max": int,  # maximum size allowed to allocate for the ARC
        "p": int,  # size of recently used cache
        "hits": int,  # cumulative number of cache hits
        "misses": int,  # cumulative number of cache misses
        "mfu_hits": int,  # cumulative number of mfu cache hits
        "mru_hits": int,  # cumulative number of mru cache hits
    }
    with open("/proc/spl/kstat/zfs/arcstats", "r") as fp:
        next(fp)  # skip the junky opening line
        next(fp)  # skip the header
        arcstats = {}
        for line in fp:
            name, _type, data = line.strip().split(maxsplit = 2)  # fields are: name, type, data
            try:
                arcstats[name] = statmap[name](data)
            except KeyError:
                pass  # don't care about the stat
    # variable naming below emulates the names from zfs-util's arc_summary.py...
    arc_size = arcstats["size"]
    target_size = arcstats["c"]
    target_max_size = arcstats["c_max"]
    mru_size = arcstats["p"]
    mfu_size = (arc_size - mru_size) if (arc_size > target_size) else (target_size - mru_size)
    arc_hits = arcstats["hits"]
    arc_misses = arcstats["misses"]
    mfu_hits = arcstats["mfu_hits"]
    mru_hits = arcstats["mru_hits"]
    real_hits = mfu_hits + mru_hits
    arc_accesses_total = arc_hits + arc_misses
    if arc_accesses_total == 0:  # avoid a ZeroDivisionError right after boot
        arc_accesses_total = 1
    cache_hit_ratio = arc_hits / arc_accesses_total
    cache_miss_ratio = arc_misses / arc_accesses_total
    actual_hit_ratio = real_hits / arc_accesses_total
    # Return stats using sensible names that can be located via prefixes in influxdb...
    return dict(
        arc_size_current = arc_size,  # the current value
        arc_size_target = target_size,  # the adaptive value
        arc_size_max = target_max_size,  # the max possible value
        arc_mru_size = mru_size,
        arc_mfu_size = mfu_size,
        arc_hit_ratio = cache_hit_ratio,
        arc_miss_ratio = cache_miss_ratio,
        arc_hit_ratio_actual = actual_hit_ratio,
    )
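
# For reference, rows in /proc/spl/kstat/zfs/arcstats are "name type data" triples,
# e.g. (illustrative values):
#   hits    4    123456789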

def GetDatasetStats():
    # args are:
    # -p -- parseable output (gets bytes instead of SI suffixes)
    # -H -- no header
    # -o -- requests specific dataset properties (name, used, etc)
    cmd = "/sbin/zfs list -pHo name,used,referenced,compressratio"
    stats = {}
    lines = subprocess.check_output(cmd.split())
    lines = lines.decode("ascii").strip()
    for line in lines.split("\n"):
        path, used, referenced, compressratio = line.split()
        stats[path] = dict(
            size = int(referenced),
            size_with_snapshots = int(used),
            compressratio = float(compressratio[:-1]),
        )
    return stats
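
# For reference, the `zfs list` invocation above emits one line per dataset with the
# requested columns in order, e.g. (illustrative values):
#   tank/home    123456789    98765432    1.23x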

def PrintStats(PoolStats, ArcStats, DatasetStats):
    print("Pool stats:")
    pp(PoolStats)
    print("ARC stats:")
    pp(ArcStats)
    print("Dataset stats:")
    pp(DatasetStats)

def SendStats(URL, PoolStats, ArcStats, DatasetStats):
    influxClient = InfluxDBClient(URL, *my_personal_info)  # <-- add your own credentials (redacted from this gist)
    sampleTime = datetime.datetime.now().isoformat()
    measName = "ptvm_zfs"  # to distinguish from "normal"/limited telegraf zfs data
    baseTags = {
        "host": platform.node(),
        "srv_loc": os.environ["SRV_LOC"],
        "srv_owner": os.environ["SRV_OWNER"],
        "srv_purpose": os.environ["SRV_PURPOSE"],
        "srv_vtype": os.environ["SRV_VTYPE"],
    }
    jsonBody = []
    # Assemble the ArcStats...
    jsonBody.append(dict(measurement = measName,
                         time = sampleTime,
                         tags = baseTags,
                         fields = ArcStats))
    # Assemble the PoolStats...
    for pName, pStats in PoolStats.items():
        jsonBody.append(dict(measurement = measName,
                             time = sampleTime,
                             tags = dict(zpool = pName, **baseTags),
                             fields = pStats))
    # Assemble the DatasetStats...
    for dsPath, dsStats in DatasetStats.items():
        zpool = dsPath.split("/", 1)[0]
        jsonBody.append(dict(measurement = measName,
                             time = sampleTime,
                             tags = dict(dataset = dsPath, zpool = zpool, **baseTags),
                             fields = dsStats))
    influxClient.write_points(jsonBody)
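
# For reference, each jsonBody entry is a point dict in the shape influxdb-python's
# write_points() expects, e.g. (illustrative values):
#   {"measurement": "ptvm_zfs", "time": "2019-05-03T19:03:00",
#    "tags": {"host": "zfsbox", "zpool": "tank", ...},
#    "fields": {"zpool_free": 12345, ...}}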

def Main():
    pools = GetZFSPools()
    poolStats = {pool: GetPoolStats(pool) for pool in pools}
    arcStats = GetArcStats()
    datasetStats = GetDatasetStats()
    try:
        influxdbURL = sys.argv[1]
    except IndexError:
        PrintStats(poolStats, arcStats, datasetStats)
    else:
        SendStats(influxdbURL, poolStats, arcStats, datasetStats)

if __name__ == "__main__":
    Main()