A script hacked together to get the ZFS reporting I wanted (that the telegraf zfs plugin didn't provide)
#!/usr/bin/env python
# file: check_zfs.py
# This script is intended to be run as a cron job. Given a URL, it collects various zfs stats and
# sends them off to our influxdb server. Credentials are hard coded, but the URL is not.
# Usage: check_zfs.py [url]
# - without the url it dumps the stats to stdout
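#
# For example (hypothetical path and URL, for illustration only), a crontab entry that
# runs it every 5 minutes might look like:
#   */5 * * * *  /usr/local/bin/check_zfs.py http://influx.example.com:8086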

from collections import OrderedDict
import datetime
import os
import platform
from pprint import pprint as pp
import re
import subprocess
import sys

from influxdb import InfluxDBClient

interval_dict = OrderedDict([("Y", 365*86400),  # 1 year
                             ("M", 30*86400),   # 1 month
                             ("W", 7*86400),    # 1 week
                             ("D", 86400),      # 1 day
                             ("h", 3600),       # 1 hour
                             ("m", 60),         # 1 minute
                             ("s", 1)])         # 1 second

# Zpool health reports are strings, but grafana thresholding/alerting works best with numbers...
# - the map below is in rough order of increasing severity, with 0 being OK
HEALTH_MAP = dict(
    ONLINE   = 0,
    DEGRADED = 11,
    OFFLINE  = 21,
    UNAVAIL  = 22,
    FAULTED  = 23,
    REMOVED  = 24,
)
UNKNOWN_HEALTH_VALUE = 10  # count as a major error until we update our dict

def human_to_seconds(string):
    # converts human-readable durations ("3h5m") to seconds (for scrub timing)
    # - cribbed completely from https://goo.gl/Cpf9yn
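    # - e.g. human_to_seconds("3h5m") -> 3*3600 + 5*60 == 11100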
    interval_exc = "Bad interval format for {0}".format(string)
    interval_regex = re.compile("^(?P<value>[0-9]+)(?P<unit>[{0}])".format("".join(interval_dict.keys())))
    seconds = 0
    while string:
        match = interval_regex.match(string)
        if match:
            value, unit = int(match.group("value")), match.group("unit")
            if int(value) and unit in interval_dict:
                seconds += value * interval_dict[unit]
                string = string[match.end():]
            else:
                raise Exception(interval_exc)
        else:
            raise Exception(interval_exc)
    return seconds

def GetZFSPools():
    lines = subprocess.check_output("/sbin/zpool list -H".split())
    lines = lines.decode("ascii").strip()
    return [line.split("\t")[0] for line in lines.split("\n")]

SCRUB_PROBLEM_RET = dict(zpool_scrub_errors = 1,
                         zpool_scrub_repaired = 0,
                         zpool_scrub_duration = 0)

def _GetScrubStats(PoolName):
    cmd = "/sbin/zpool status %s" % PoolName
    lines = subprocess.check_output(cmd.split())
    lines = lines.decode("ascii").strip()
    for line in lines.split("\n"):
        line = line.strip()
        if line.startswith("scan:"):  # the only one we need
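            # The parsing below assumes the older-style zpool scan line, roughly:
            #   scan: scrub repaired 0 in 3h5m with 0 errors on Sun Mar  4 03:05:01 2018
            # i.e. toks[repairedIdx + 1] is the repaired count, toks[errorsIdx - 1] is the
            # error count, and toks[repairedIdx + 3] is the duration ("3h5m").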
            try:
                toks = line.split()
                repairedIdx = toks.index("repaired")
                errorsIdx = toks.index("errors")
                lastRepairCount = int(toks[repairedIdx + 1])
                lastErrorCount = int(toks[errorsIdx - 1])
                lastScrubDuration = human_to_seconds(toks[repairedIdx + 3])  # in seconds
                return dict(zpool_scrub_errors = lastErrorCount,
                            zpool_scrub_repaired = lastRepairCount,
                            zpool_scrub_duration = lastScrubDuration)
            except Exception:  # ValueError
                if ("scrub in progress" in line) or ("canceled" in line):
                    return dict(zpool_scrub_errors = 0, zpool_scrub_repaired = 0)  # no duration
                return SCRUB_PROBLEM_RET
    else:
        return SCRUB_PROBLEM_RET

def GetPoolStats(PoolName):
    statmap = {
        "allocated": int,                          # bytes used in pool
        "free": int,                               # bytes free in pool
        "size": int,                               # total size of pool (bytes)
        "dedupratio": lambda v: float(v[:-1]),     # how much dedup there is ("1.00x" if not enabled)
        "fragmentation": lambda v: float(v[:-1]),  # % fragmentation of the free space (e.g. "28%")
        "health": str,                             # health of pool (one of ONLINE, DEGRADED, FAULTED, OFFLINE, REMOVED, or UNAVAIL)
    }
    cmd = "/sbin/zpool get %s -pH %s" % (",".join(statmap), PoolName)
    lines = subprocess.check_output(cmd.split())
    lines = lines.decode("ascii").strip()
    ret = {}
    for line in lines.split("\n"):
        pool, prop, val, _ = line.split("\t", maxsplit = 3)  # safe against added columns; _ is currently "source"
        assert pool == PoolName
        ret["zpool_" + prop] = statmap[prop](val)
    ret["zpool_health_int"] = HEALTH_MAP.get(ret["zpool_health"], UNKNOWN_HEALTH_VALUE)
    # Add the tough-to-parse scrub stats...
    ret.update(_GetScrubStats(PoolName))
    return ret

def GetArcStats():
    statmap = {
        "size": int,      # number of currently allocated bytes for the ARC
        "c": int,         # target size for the ARC (in bytes). ZFS dynamically adjusts this.
        "c_max": int,     # maximum size allowed to allocate for the ARC
        "p": int,         # size of recently used cache
        "hits": int,      # cumulative number of cache hits
        "misses": int,    # cumulative number of cache misses
        "mfu_hits": int,  # cumulative number of mfu cache hits
        "mru_hits": int,  # cumulative number of mru cache hits
    }
    with open("/proc/spl/kstat/zfs/arcstats", "r") as fp:
        next(fp)  # skip the junky opening line
        next(fp)  # skip the header
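        # each remaining line is "<name> <type> <data>", e.g.:  hits  4  123456789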
        arcstats = {}
        for line in fp:
            name, _type, data = line.strip().split(maxsplit = 3)
            try:
                arcstats[name] = statmap[name](data)
            except KeyError:
                pass  # don't care about the stat
    # variable naming below emulates the names from zfs-util's arc_summary.py...
    arc_size = arcstats["size"]
    target_size = arcstats["c"]
    target_max_size = arcstats["c_max"]
    mru_size = arcstats["p"]
    mfu_size = (arc_size - mru_size) if (arc_size > target_size) else (target_size - mru_size)
    arc_hits = arcstats["hits"]
    arc_misses = arcstats["misses"]
    mfu_hits = arcstats["mfu_hits"]
    mru_hits = arcstats["mru_hits"]
    real_hits = mfu_hits + mru_hits
    arc_accesses_total = arc_hits + arc_misses
    cache_hit_ratio = arc_hits / arc_accesses_total
    cache_miss_ratio = arc_misses / arc_accesses_total
    actual_hit_ratio = real_hits / arc_accesses_total
    recently_used_cache_size = mru_size
    # Return stats using sensible names that can be located via prefixes in influxdb...
    return dict(
        arc_size_current = arc_size,      # the current value
        arc_size_target = target_size,    # the adaptive value
        arc_size_max = target_max_size,   # the max possible value
        arc_mru_size = mru_size,
        arc_mfu_size = mfu_size,
        arc_hit_ratio = cache_hit_ratio,
        arc_miss_ratio = cache_miss_ratio,
        arc_hit_ratio_actual = actual_hit_ratio,
    )

def GetDatasetStats():
    # args are:
    #   -p -- parseable output (gets bytes instead of SI suffixes)
    #   -H -- no header
    #   -o -- requests specific dataset properties (name, used, etc)
    cmd = "/sbin/zfs list -pHo name,used,referenced,compressratio"
    stats = {}
    lines = subprocess.check_output(cmd.split())
    lines = lines.decode("ascii").strip()
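    # each output line is one dataset with whitespace-separated fields; the parsing
    # below expects something like:  tank/home  52428800  41943040  1.25x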
    for line in lines.split("\n"):
        path, used, referenced, compressratio = line.split()
        stats[path] = dict(
            size = int(referenced),
            size_with_snapshots = int(used),
            compressratio = float(compressratio[:-1]),
        )
    return stats

def PrintStats(PoolStats, ArcStats, DatasetStats):
    print("Pool stats:")
    pp(PoolStats)
    print("ARC stats:")
    pp(ArcStats)
    print("Dataset stats:")
    pp(DatasetStats)
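
# Placeholder InfluxDB credentials (illustrative values only -- substitute your own).
# InfluxDBClient's positional arguments after the host are port, username, password,
# and database.
my_personal_info = (8086, "zfs_stats_user", "CHANGE_ME", "zfs_stats_db")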

def SendStats(URL, PoolStats, ArcStats, DatasetStats):
    influxClient = InfluxDBClient(URL, *my_personal_info)  # <-- add your own
    sampleTime = datetime.datetime.now().isoformat()
    measName = "ptvm_zfs"  # to distinguish from "normal"/limited telegraf zfs data
    baseTags = {
        "host": platform.node(),
        "srv_loc": os.environ["SRV_LOC"],
        "srv_owner": os.environ["SRV_OWNER"],
        "srv_purpose": os.environ["SRV_PURPOSE"],
        "srv_vtype": os.environ["SRV_VTYPE"],
    }
    jsonBody = []
    # Assemble the ArcStats...
    jsonBody.append(dict(measurement = measName,
                         time = sampleTime,
                         tags = baseTags,
                         fields = ArcStats))
    # Assemble the PoolStats...
    for pName, pStats in PoolStats.items():
        jsonBody.append(dict(measurement = measName,
                             time = sampleTime,
                             tags = dict(zpool = pName, **baseTags),
                             fields = pStats))
    # Assemble the DatasetStats...
    for dsPath, dsStats in DatasetStats.items():
        zpool = dsPath.split("/", 1)[0]
        jsonBody.append(dict(measurement = measName,
                             time = sampleTime,
                             tags = dict(dataset = dsPath, zpool = zpool, **baseTags),
                             fields = dsStats))
    influxClient.write_points(jsonBody)

def Main():
    pools = GetZFSPools()
    poolStats = {pool: GetPoolStats(pool) for pool in pools}
    arcStats = GetArcStats()
    datasetStats = GetDatasetStats()
    try:
        influxdbURL = sys.argv[1]
    except IndexError:
        PrintStats(poolStats, arcStats, datasetStats)
    else:
        SendStats(influxdbURL, poolStats, arcStats, datasetStats)


if __name__ == "__main__":
    Main()