A script hacked together to get the ZFS reporting I wanted (that the telegraf zfs plugin didn't provide)
#!/usr/bin/env python
# file: check_zfs.py
# This script is intended to be run as a cron job. Given a URL, it collects various zfs stats and
# sends them off to our influxdb server. Credentials are hard coded, but the URL is not.
# Usage: check_zfs.py [url]
# - without the url it dumps the stats to stdout
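#
# For example (hypothetical path and URL, for illustration only), a crontab entry that
# runs it every 5 minutes might look like:
#   */5 * * * *  /usr/local/bin/check_zfs.py http://influx.example.com:8086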

from collections import OrderedDict
import datetime
import os
import platform
from pprint import pprint as pp
import re
import subprocess
import sys

from influxdb import InfluxDBClient

interval_dict = OrderedDict([("Y", 365*86400),  # 1 year
                             ("M", 30*86400),   # 1 month
                             ("W", 7*86400),    # 1 week
                             ("D", 86400),      # 1 day
                             ("h", 3600),       # 1 hour
                             ("m", 60),         # 1 minute
                             ("s", 1)])         # 1 second

# Zpool health reports are strings, but grafana thresholding/alerting works best with numbers...
# - the map below is in rough order of increasing severity, with 0 being OK
HEALTH_MAP = dict(
    ONLINE   = 0,
    DEGRADED = 11,
    OFFLINE  = 21,
    UNAVAIL  = 22,
    FAULTED  = 23,
    REMOVED  = 24,
)
UNKNOWN_HEALTH_VALUE = 10  # count as a major error until we update our dict

def human_to_seconds(string):
    # converts human-readable durations ("3h5m") to seconds (for scrub timing)
    # - cribbed completely from https://goo.gl/Cpf9yn
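    # - e.g. human_to_seconds("3h5m") -> 3*3600 + 5*60 == 11100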
    interval_exc = "Bad interval format for {0}".format(string)
    interval_regex = re.compile("^(?P<value>[0-9]+)(?P<unit>[{0}])".format("".join(interval_dict.keys())))
    seconds = 0
    while string:
        match = interval_regex.match(string)
        if match:
            value, unit = int(match.group("value")), match.group("unit")
            if int(value) and unit in interval_dict:
                seconds += value * interval_dict[unit]
                string = string[match.end():]
            else:
                raise Exception(interval_exc)
        else:
            raise Exception(interval_exc)
    return seconds

def GetZFSPools():
    lines = subprocess.check_output("/sbin/zpool list -H".split())
    lines = lines.decode("ascii").strip()
    return [line.split("\t")[0] for line in lines.split("\n")]

SCRUB_PROBLEM_RET = dict(zpool_scrub_errors = 1,
                         zpool_scrub_repaired = 0,
                         zpool_scrub_duration = 0)

def _GetScrubStats(PoolName):
    cmd = "/sbin/zpool status %s" % PoolName
    lines = subprocess.check_output(cmd.split())
    lines = lines.decode("ascii").strip()
    for line in lines.split("\n"):
        line = line.strip()
        if line.startswith("scan:"):  # the only one we need
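            # The parsing below assumes the older-style zpool scan line, roughly:
            #   scan: scrub repaired 0 in 3h5m with 0 errors on Sun Mar  4 03:05:01 2018
            # i.e. toks[repairedIdx + 1] is the repaired count, toks[errorsIdx - 1] is the
            # error count, and toks[repairedIdx + 3] is the duration ("3h5m").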
            try:
                toks = line.split()
                repairedIdx = toks.index("repaired")
                errorsIdx = toks.index("errors")
                lastRepairCount = int(toks[repairedIdx + 1])
                lastErrorCount = int(toks[errorsIdx - 1])
                lastScrubDuration = human_to_seconds(toks[repairedIdx + 3])  # in seconds
                return dict(zpool_scrub_errors = lastErrorCount,
                            zpool_scrub_repaired = lastRepairCount,
                            zpool_scrub_duration = lastScrubDuration)
            except Exception:  # ValueError
                if ("scrub in progress" in line) or ("canceled" in line):
                    return dict(zpool_scrub_errors = 0, zpool_scrub_repaired = 0)  # no duration
                return SCRUB_PROBLEM_RET
    else:
        return SCRUB_PROBLEM_RET

def GetPoolStats(PoolName):
    statmap = {
        "allocated": int,                          # bytes used in pool
        "free": int,                               # bytes free in pool
        "size": int,                               # total size of pool (bytes)
        "dedupratio": lambda v: float(v[:-1]),     # how much dedup there is ("1.00x" if not enabled)
        "fragmentation": lambda v: float(v[:-1]),  # % fragmentation of the free space (e.g. "28%")
        "health": str,                             # health of pool (one of ONLINE, DEGRADED, FAULTED, OFFLINE, REMOVED, or UNAVAIL)
    }
    cmd = "/sbin/zpool get %s -pH %s" % (",".join(statmap), PoolName)
    lines = subprocess.check_output(cmd.split())
    lines = lines.decode("ascii").strip()
    ret = {}
    for line in lines.split("\n"):
        pool, prop, val, _ = line.split("\t", maxsplit = 3)  # safe against added columns; _ is currently "source"
        assert pool == PoolName
        ret["zpool_" + prop] = statmap[prop](val)
    ret["zpool_health_int"] = HEALTH_MAP.get(ret["zpool_health"], UNKNOWN_HEALTH_VALUE)
    # Add the tough-to-parse scrub stats...
    ret.update(_GetScrubStats(PoolName))
    return ret

def GetArcStats():
    statmap = {
        "size": int,      # number of currently allocated bytes for the ARC
        "c": int,         # target size for the ARC (in bytes). ZFS dynamically adjusts this.
        "c_max": int,     # maximum size allowed to allocate for the ARC
        "p": int,         # size of recently used cache
        "hits": int,      # cumulative number of cache hits
        "misses": int,    # cumulative number of cache misses
        "mfu_hits": int,  # cumulative number of mfu cache hits
        "mru_hits": int,  # cumulative number of mru cache hits
    }
    with open("/proc/spl/kstat/zfs/arcstats", "r") as fp:
        next(fp)  # skip the junky opening line
        next(fp)  # skip the header
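        # each remaining line is "<name> <type> <data>", e.g.:  hits  4  123456789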
        arcstats = {}
        for line in fp:
            name, _type, data = line.strip().split(maxsplit = 3)
            try:
                arcstats[name] = statmap[name](data)
            except KeyError:
                pass  # don't care about the stat
    # variable naming below emulates the names from zfs-util's arc_summary.py...
    arc_size = arcstats["size"]
    target_size = arcstats["c"]
    target_max_size = arcstats["c_max"]
    mru_size = arcstats["p"]
    mfu_size = (arc_size - mru_size) if (arc_size > target_size) else (target_size - mru_size)
    arc_hits = arcstats["hits"]
    arc_misses = arcstats["misses"]
    mfu_hits = arcstats["mfu_hits"]
    mru_hits = arcstats["mru_hits"]
    real_hits = mfu_hits + mru_hits
    arc_accesses_total = arc_hits + arc_misses
    cache_hit_ratio = arc_hits / arc_accesses_total
    cache_miss_ratio = arc_misses / arc_accesses_total
    actual_hit_ratio = real_hits / arc_accesses_total
    recently_used_cache_size = mru_size
    # Return stats using sensible names that can be located via prefixes in influxdb...
    return dict(
        arc_size_current = arc_size,      # the current value
        arc_size_target = target_size,    # the adaptive value
        arc_size_max = target_max_size,   # the max possible value
        arc_mru_size = mru_size,
        arc_mfu_size = mfu_size,
        arc_hit_ratio = cache_hit_ratio,
        arc_miss_ratio = cache_miss_ratio,
        arc_hit_ratio_actual = actual_hit_ratio,
    )

def GetDatasetStats():
    # args are:
    #   -p -- parseable output (gets bytes instead of SI suffixes)
    #   -H -- no header
    #   -o -- requests specific dataset properties (name, used, etc)
    cmd = "/sbin/zfs list -pHo name,used,referenced,compressratio"
    stats = {}
    lines = subprocess.check_output(cmd.split())
    lines = lines.decode("ascii").strip()
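    # each output line is one dataset with whitespace-separated fields; the parsing
    # below expects something like:  tank/home  52428800  41943040  1.25x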
    for line in lines.split("\n"):
        path, used, referenced, compressratio = line.split()
        stats[path] = dict(
            size = int(referenced),
            size_with_snapshots = int(used),
            compressratio = float(compressratio[:-1]),
        )
    return stats

def PrintStats(PoolStats, ArcStats, DatasetStats):
    print("Pool stats:")
    pp(PoolStats)
    print("ARC stats:")
    pp(ArcStats)
    print("Dataset stats:")
    pp(DatasetStats)
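
# Placeholder InfluxDB credentials (illustrative values only -- substitute your own).
# InfluxDBClient's positional arguments after the host are port, username, password,
# and database.
my_personal_info = (8086, "zfs_stats_user", "CHANGE_ME", "zfs_stats_db")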

def SendStats(URL, PoolStats, ArcStats, DatasetStats):
    influxClient = InfluxDBClient(URL, *my_personal_info)  # <-- add your own
    sampleTime = datetime.datetime.now().isoformat()
    measName = "ptvm_zfs"  # to distinguish from "normal"/limited telegraf zfs data
    baseTags = {
        "host": platform.node(),
        "srv_loc": os.environ["SRV_LOC"],
        "srv_owner": os.environ["SRV_OWNER"],
        "srv_purpose": os.environ["SRV_PURPOSE"],
        "srv_vtype": os.environ["SRV_VTYPE"],
    }
    jsonBody = []
    # Assemble the ArcStats...
    jsonBody.append(dict(measurement = measName,
                         time = sampleTime,
                         tags = baseTags,
                         fields = ArcStats))
    # Assemble the PoolStats...
    for pName, pStats in PoolStats.items():
        jsonBody.append(dict(measurement = measName,
                             time = sampleTime,
                             tags = dict(zpool = pName, **baseTags),
                             fields = pStats))
    # Assemble the DatasetStats...
    for dsPath, dsStats in DatasetStats.items():
        zpool = dsPath.split("/", 1)[0]
        jsonBody.append(dict(measurement = measName,
                             time = sampleTime,
                             tags = dict(dataset = dsPath, zpool = zpool, **baseTags),
                             fields = dsStats))
    influxClient.write_points(jsonBody)

def Main():
    pools = GetZFSPools()
    poolStats = {pool: GetPoolStats(pool) for pool in pools}
    arcStats = GetArcStats()
    datasetStats = GetDatasetStats()
    try:
        influxdbURL = sys.argv[1]
    except IndexError:
        PrintStats(poolStats, arcStats, datasetStats)
    else:
        SendStats(influxdbURL, poolStats, arcStats, datasetStats)


if __name__ == "__main__":
    Main()