Skip to content

Instantly share code, notes, and snippets.

@alek-p
Last active July 30, 2018 22:30
Show Gist options
  • Save alek-p/24e1ad9437bfd694e03ad216b5386f56 to your computer and use it in GitHub Desktop.
Save alek-p/24e1ad9437bfd694e03ad216b5386f56 to your computer and use it in GitHub Desktop.
Capture zfs and other debugging info
#!/usr/bin/python3
#
# This file and its contents are supplied under the terms of the
# Common Development and Distribution License ("CDDL"), version 1.0.
# You may only use this file in accordance with the terms of version
# 1.0 of the CDDL.
#
# A copy of the CDDL is available via the Internet at
# http://www.illumos.org/license/CDDL.
#
#
# Copyright (c) 2018 Datto Inc.
#
import sys
import getopt
import signal
import subprocess
import json
import datetime
import time
import os
import tarfile
import shutil
# Timeout, in seconds, applied to any non-shell command whose configuration
# entry does not carry its own "timeout" value.
default_timeout = 5

# Built-in collection plan, as JSON text.  It is a list of sections:
#   "category" - section name; also the sub-directory the logs are written to
#   "enabled"  - 1 to run the section (a missing key counts as 1); a section
#                with any other value runs only when named via "-c"
#   "cmds"     - list of command entries, each with:
#       "cmd"     - command line, split on single spaces before execution
#       "enabled" - 1 to run the command (default 1)
#       "timeout" - seconds allowed for a non-shell command
#       "repeat"  - total number of runs (default: run once)
#       "sleep"   - seconds to wait between repeated runs (default 1)
#       "shell"   - 1 to hand the command line to the shell, which is needed
#                   for redirections such as "echo ... > file"
# Flags are integers throughout; "shell" used to be the string "1" in one
# entry, which only worked because any non-empty string is truthy.
default_conf = """
[
{
"category": "version",
"enabled": 1,
"cmds": [
{ "cmd": "uname -a" },
{ "cmd": "cat /sys/module/spl/version" },
{ "cmd": "cat /sys/module/zfs/version", "timeout": 5, "enabled": 1}
]
},
{
"category": "zstat",
"enabled": 1,
"cmds": [
{ "cmd": "cat /proc/spl/kstat/zfs/dbgmsg" },
{ "cmd": "cat /proc/spl/taskq-all" },
{ "cmd": "cat /proc/spl/kmem/slab", "repeat": 3, "sleep": 1 },
{ "cmd": "cat /proc/spl/taskq", "repeat": 5, "sleep": 1 },
{ "cmd": "cat /proc/spl/kstat/zfs/arcstats", "repeat": 5, "sleep": 1 },
{ "cmd": "cat /proc/spl/kstat/zfs/zil", "repeat": 3, "sleep": 5 },
{ "cmd": "cat /proc/spl/kstat/zfs/dmu_tx", "repeat": 3, "sleep": 5 },
{ "cmd": "echo 100 > /sys/module/zfs/parameters/zfs_txg_history", "shell": 1 },
{ "cmd": "tail /proc/spl/kstat/zfs/dattoArray/txgs", "repeat": 3, "sleep": 5 }
]
},
{
"category": "perf",
"enabled": 1,
"cmds": [
{ "cmd": "uptime" },
{ "cmd": "dmesg -T" },
{ "cmd": "free -m" },
{ "cmd": "ps -ef" },
{ "cmd": "vmstat 1 3" },
{ "cmd": "mpstat -P ALL 1 3" },
{ "cmd": "pidstat 1 3" },
{ "cmd": "iostat -xz 5 3", "timeout": 20 },
{ "cmd": "sar -n DEV 1 3", "enabled": 1 },
{ "cmd": "sar -n TCP,ETCP 1 3", "enabled": 1 }
]
},
{
"category": "debug",
"enabled": 1,
"cmds": [
{ "cmd": "echo t > /proc/sysrq-trigger", "shell": 1 },
{ "cmd": "cat /var/log/kern.log" },
{ "cmd": "cat /var/log/syslog" },
{ "cmd": "cat /var/log/dpkg.log" }
]
},
{
"category": "zfs",
"enabled": 0,
"cmds": [
{ "cmd": "zfs get all dattoArray" },
{ "cmd": "zfs list -d 2 -r dattoArray", "timeout": 30 },
{ "cmd": "zpool history -i dattoArray", "timeout": 30 },
{ "cmd": "zpool events -v dattoArray" },
{ "cmd": "zpool get all dattoArray" },
{ "cmd": "zpool status dattoArray" },
{ "cmd": "zpool list -v dattoArray" },
{ "cmd": "zpool iostat dattoArray -v 5 3", "timeout": 20 },
{ "cmd": "zpool iostat dattoArray -vq 5 3", "timeout": 20 }
]
}
]"""
def usage():
    """Print the command-line synopsis to stdout and exit with status 1."""
    program = sys.argv[0]
    print("Usage:", program, " <-c enable_category1,enable_category2,...>")
    raise SystemExit(1)
# Read command line args.  The only option is "-c a,b,...", which names
# categories to run even though the configuration marks them disabled.
try:
    opts, args = getopt.getopt(sys.argv[1:], 'c:')
except getopt.GetoptError as err:
    print(str(err))
    usage()

force_enable_categories = []
for opt, arg in opts:
    if opt == '-c':
        force_enable_categories = str(arg).split(",")

# Per-run scratch directory under /tmp, named after the current timestamp.
# ":" is replaced by "." in the name, which is also used as the archive's
# base name and top-level entry.
ts = datetime.datetime.now().isoformat()
log_id = 'log_state_' + ts.replace(":", ".")
output_dir = '/tmp/' + log_id + "/"
os.makedirs(output_dir)

# Store a copy of the configuration inside the scratch directory, then parse
# that copy.  The context managers close the file even if the write or the
# JSON parse raises.
conf_filename = output_dir + 'log_state_conf.json'
with open(conf_filename, 'w') as conf_file:
    conf_file.write(default_conf)

with open(conf_filename, 'r') as json_file:
    config = json.load(json_file)
def create_tarfile(output_filename, source_dir):
    """Write source_dir into a gzip-compressed tar archive.

    output_filename is the path of the archive to create.  The directory is
    stored in the archive under the module-level ``log_id`` name instead of
    its on-disk path.
    """
    print('Creating archive:', output_filename)
    # Leaving the ``with`` block closes the archive; no explicit close needed.
    with tarfile.open(name=output_filename, mode='w:gz') as tar:
        tar.add(source_dir, arcname=log_id)
def signal_handler(signal, frame):
    """Handle SIGINT: archive what was collected so far, clean up, and exit.

    Both arguments are supplied by the interpreter when the signal is
    delivered and are not used.  The process exits with status 1.
    """
    print('You pressed Ctrl+C')
    archive_name = log_id + '.tar.gz'
    create_tarfile(archive_name, output_dir)
    # The scratch directory is removed once its contents are in the archive.
    shutil.rmtree(output_dir)
    sys.exit(1)


# Route Ctrl+C through the handler so an interrupted run still leaves an
# archive behind.
signal.signal(signal.SIGINT, signal_handler)
def _output_text(raw):
    """Return captured process output as text, tolerating None and bad bytes."""
    # TimeoutExpired.output is None when nothing was read before the
    # deadline, and log files may contain bytes that are not valid UTF-8.
    if raw is None:
        return ''
    return raw.decode(errors='replace')


def run_cmd(category, cmd, s, t):
    """Run one command and append its outcome to a per-command log file.

    Args:
        category: section name; the log is written into the sub-directory of
            ``output_dir`` with this name, which must already exist.
        cmd: the command as a list of words.
        s: truthy to pass the joined command line to the shell through
            ``os.system``; the command's output is not captured in that case
            and ``t`` is not applied.
        t: timeout in seconds for the non-shell case.

    Returns:
        0 on success.  In shell mode, whatever ``os.system`` returned.
        Otherwise the command's exit status if it failed, -5 if it timed
        out, or -1 if the executable was not found.
    """
    cmd_line = ' '.join(cmd)
    print('Running [', category, ']:', cmd_line)
    start_ts = datetime.datetime.now().isoformat()
    if s:
        output = 'Handed of to shell: ' + cmd_line
        err = os.system(cmd_line)
        if err != 0:
            print('SHELL EXECUTION FAILED:', cmd_line)
    else:
        try:
            raw = subprocess.check_output(cmd, stderr=subprocess.STDOUT, timeout=t)
            output = _output_text(raw)
            err = 0
        except subprocess.CalledProcessError as e:
            output = _output_text(e.output)
            err = e.returncode
            print('EXECUTION FAILED:', cmd_line)
        except subprocess.TimeoutExpired as e:
            output = _output_text(e.output)
            print(cmd, ' timed out, output:\n', output)
            err = -5
            print('TIMED OUT:', cmd_line)
        except FileNotFoundError:
            output = ''.join(cmd)
            err = -1
            print('CMD NOT FOUND:', cmd_line)
    end_ts = datetime.datetime.now().isoformat()
    # The file name is the command with spaces dropped and "/" turned into
    # "_".  Repeated runs of the same command append to the same file.
    out_file = output_dir + category + '/' + ''.join(cmd).replace("/", "_") + '.log'
    # Explicit UTF-8 so replacement characters from the decode step can
    # always be written, whatever the locale encoding is.
    with open(out_file, 'a', encoding='utf-8') as log_fd:
        log_fd.write('Started: ' + start_ts + '; Ended: ' + end_ts + '; ERROR: ' + str(err) + '\n')
        log_fd.write(output)
    return err
# Walk the configuration: every enabled (or force-enabled) section gets its
# own sub-directory, and each enabled command in it is run once, or "repeat"
# times in total with "sleep" seconds between runs.
for section in config:
    category = section['category']
    if section.get('enabled', 1) != 1 and category not in force_enable_categories:
        continue
    os.makedirs(output_dir + category)
    for entry in section['cmds']:
        if entry.get('enabled', 1) != 1:
            continue
        use_shell = entry.get('shell', False)
        if use_shell == 1:
            use_shell = True
        timeout = entry.get('timeout', default_timeout)
        words = entry['cmd'].split(' ')
        run_cmd(category, words, use_shell, timeout)
        if 'repeat' not in entry:
            continue
        pause = entry.get('sleep', 1)
        # The first run already happened above, hence "repeat" - 1 more.
        for _ in range(entry['repeat'] - 1):
            time.sleep(pause)
            run_cmd(category, words, use_shell, timeout)

# Bundle everything that was collected into an archive in the current
# working directory, then drop the scratch directory.
create_tarfile(log_id + '.tar.gz', output_dir)
shutil.rmtree(output_dir)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment