# asomers/zfsfrag.py

## zfsfrag.py
# Copyright (c) 2014-2015 Spectra Logic Corporation
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
# 1. Redistributions of source code must retain the above copyright
#    notice, this list of conditions, and the following disclaimer,
#    without modification.
# 2. Redistributions in binary form must reproduce at minimum a disclaimer
#    substantially similar to the "NO WARRANTY" disclaimer below
#    ("Disclaimer") and any redistribution must be conditioned upon
#    including a substantially similar Disclaimer requirement for further
#    binary redistribution.
#
# NO WARRANTY
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
# HOLDERS OR CONTRIBUTORS BE LIABLE FOR SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
# OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
# STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
# IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGES.
#
# Authors: Alan Somers

import argparse
import re
import sys

import numpy
import numpy.ma as ma

# Largest amount of padding that ZFS is assumed to insert between RAIDZ blocks
# that would otherwise be contiguous.  Gaps up to this size are not counted as
# fragmentation.
# TODO: figure out how to calculate this correctly for the particular zpool
# topology, instead of guessing it.  The 7680 guess is based on a 15 drive
# raidz2 pool.
MAX_PADDING = 7680

def analyze_file(pathname, l0_blocks):
    """Fold one file's block layout into the global fragmentation statistics.

    The file's blocks are grouped by vdev and every group is analyzed on its
    own, so each global list below gains one entry per (file, vdev) pair.

    Args:
        pathname: Path of the file being analyzed.  Currently unused by the
            computation itself.
        l0_blocks: 2-D numpy array with one ``(vdev, offset, asize)`` row per
            L0 block, in the order the blocks appear in the file.  An empty
            array is silently ignored.

    Side effects:
        Appends to the globals ``frags_per_file`` and ``ooo_frags_per_file``
        and rebinds ``fragsizes``, ``total_gaps`` and ``ooo_gaps`` to extended
        arrays.  All five must be initialized before the first call.
    """
    global frags_per_file
    global ooo_frags_per_file
    global total_gaps
    global ooo_gaps
    global fragsizes

    # Nothing to do for empty files
    if len(l0_blocks) == 0:
        return

    # numpy.unique yields the vdev ids in a deterministic (ascending) order.
    for stripe in numpy.unique(l0_blocks[:, 0]):
        stripe_blocks = l0_blocks[l0_blocks[:, 0] == stripe]
        block_sizes = stripe_blocks[:, 2]

        # First, count the contiguous ordered fragments.  A contiguous ordered
        # fragment is a chain of 1 or more L0 blocks that are adjacent on the
        # vdev and whose data are adjacent in the file.
        block_starts = stripe_blocks[:, 1]
        block_ends = block_starts + block_sizes
        # The gaps are the space between consecutive blocks; only gaps larger
        # than MAX_PADDING separate two fragments.
        gaps = numpy.abs(block_starts[1:] - block_ends[:-1])
        is_break = gaps > MAX_PADDING
        frags = 1 + int(is_break.sum())
        # Gaps that are mere padding stay in the array but are masked out.
        total_gaps = ma.concatenate(
            (total_gaps, ma.masked_array(gaps, mask=~is_break)))
        frags_per_file.append(frags)

        # Now determine the size of each frag, including RAID overhead.  A
        # fragment starts at block 0 and at every block that follows a break;
        # reduceat sums the block sizes between consecutive fragment starts.
        frag_starts = numpy.concatenate(([0], numpy.flatnonzero(is_break) + 1))
        fragsizes = numpy.concatenate(
            (fragsizes, numpy.add.reduceat(block_sizes, frag_starts)))

        # Next determine how many contiguous but out of order fragments make
        # up the file.  Whole rows must be reordered by on-disk offset:
        # numpy.sort(stripe_blocks, 0) would sort every column independently
        # and pair offsets with the wrong sizes.
        by_offset = numpy.argsort(block_starts, kind="mergesort")
        sorted_blocks = stripe_blocks[by_offset]
        sorted_block_starts = sorted_blocks[:, 1]
        sorted_block_ends = sorted_block_starts + sorted_blocks[:, 2]
        sorted_deltas = sorted_block_starts[1:] - sorted_block_ends[:-1]
        ooo_frags = 1 + int((sorted_deltas > MAX_PADDING).sum())
        sorted_gaps = numpy.abs(sorted_deltas)
        ooo_gaps = ma.concatenate(
            (ooo_gaps,
             ma.masked_array(sorted_gaps, mask=(sorted_gaps <= MAX_PADDING))))
        ooo_frags_per_file.append(ooo_frags)


def _print_stats(label, values):
    """Print one min/mean/max/stddev/count row of the summary table.

    Masked entries are ignored.  When no usable values remain, the row shows
    "n/a" placeholders and a count of 0 instead of raising on an empty array.
    """
    data = ma.asarray(values).compressed()
    if data.size == 0:
        print("%-10s%16s %11s %16s %11s %12d"
              % (label, "n/a", "n/a", "n/a", "n/a", 0))
        return
    print("%-10s%16d %11g %16d %11g %12d"
          % (label, data.min(), data.mean(), data.max(), data.std(),
             data.size))


def main(argv):
    """Summarize file fragmentation from saved ``zdb -ddddd OBJSET`` output.

    Resets the module-level statistics, feeds every selected plain file to
    ``analyze_file`` and prints a min/mean/max/stddev/count table to stdout.

    Args:
        argv: Full argument vector with the program name first (for example
            ``sys.argv``).  Recognized arguments are an optional
            ``-f/--filter`` regex selecting the pathnames to analyze and the
            name of the file holding the zdb output.

    Raises:
        SystemExit: Raised by argparse on invalid arguments.
        IOError/OSError: If the input file cannot be opened.
    """
    global frags_per_file
    global ooo_frags_per_file
    global total_gaps
    global ooo_gaps
    global fragsizes

    filesizes = []
    frags_per_file = []
    total_gaps = numpy.array([])
    fragsizes = numpy.array([])
    ooo_gaps = numpy.array([])
    ooo_frags_per_file = []

    parser = argparse.ArgumentParser()
    parser.add_argument("-f", "--filter",
            help="regex to select filenames to analyze", default="")
    parser.add_argument("filename", help="output from zdb -ddddd OBJSET")
    # Parse the vector we were given rather than implicitly reading sys.argv.
    args = parser.parse_args(argv[1:])

    # Compile the patterns once instead of on every input line.
    path_filter = re.compile(args.filter)
    l0_line = re.compile("^ +[0-9a-f]+ +L0 ")

    in_plain_file = False
    pathname = None
    l0_dvas = []
    with open(args.filename, 'r') as f:
        for line in f:
            if line.startswith("    Object"):
                # Header of the next object: flush the file collected so far.
                if in_plain_file:
                    analyze_file(pathname, numpy.array(l0_dvas))
                in_plain_file = False
                l0_dvas = []
            elif "ZFS plain file" in line:
                in_plain_file = True
            elif not in_plain_file:
                # Everything below only applies inside a selected plain file.
                continue
            elif line.startswith("\tpath"):
                pathname = line.replace("\tpath\t", "").strip()
                if not path_filter.search(pathname):
                    # Skip this file
                    in_plain_file = False
            elif line.startswith("\tsize"):
                filesizes.append(int(line.replace("\tsize\t", "").strip()))
            elif l0_line.search(line):
                # TODO: handle sparse files, which have L0 blocks with no
                # lsize, psize, or block fill.
                # Only the third field (the DVA) is needed, so any number of
                # trailing fields is tolerated.
                try:
                    (vdev, offset, asize) = line.split()[2].split(":")
                    l0_dvas.append(
                        [int(vdev, 16), int(offset, 16), int(asize, 16)])
                except (IndexError, ValueError):
                    sys.stderr.write("skipping unparseable L0 line: %s\n"
                                     % line.rstrip())
    # The last object in the file has no following header to flush it.
    if in_plain_file:
        analyze_file(pathname, numpy.array(l0_dvas))

    print("          %16s %11s %16s %11s %12s" % ("min", "mean", "max",
                                                  "stddev", "count"))
    _print_stats("filesize", filesizes)
    _print_stats("frags/file", frags_per_file)
    # Note: fragsizes includes RAID overhead
    _print_stats("fragsizes", fragsizes)
    _print_stats("gap sizes", total_gaps)
    _print_stats("OOO frags", ooo_frags_per_file)
    _print_stats("OOO gaps", ooo_gaps)

# Run the analysis only when executed as a script, so that importing this
# module does not parse the command line or print anything.
if __name__ == "__main__":
    main(sys.argv)
	# Copyright (c) 2014-2015 Spectra Logic Corporation
	# All rights reserved.
	#
	# Redistribution and use in source and binary forms, with or without
	# modification, are permitted provided that the following conditions
	# are met:
	# 1. Redistributions of source code must retain the above copyright
	# notice, this list of conditions, and the following disclaimer,
	# without modification.
	# 2. Redistributions in binary form must reproduce at minimum a disclaimer
	# substantially similar to the "NO WARRANTY" disclaimer below
	# ("Disclaimer") and any redistribution must be conditioned upon
	# including a substantially similar Disclaimer requirement for further
	# binary redistribution.
	#
	# NO WARRANTY
	# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
	# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
	# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR
	# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
	# HOLDERS OR CONTRIBUTORS BE LIABLE FOR SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
	# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
	# OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
	# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
	# STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
	# IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
	# POSSIBILITY OF SUCH DAMAGES.
	#
	# Authors: Alan Somers

	import argparse
	import re
	import sys

	import numpy
	import numpy.ma as ma

	# Largest amount of padding that ZFS is assumed to insert between RAIDZ
	# blocks that would otherwise be contiguous.  Gaps up to this size are not
	# counted as fragmentation.
	# TODO: figure out how to calculate this correctly for the particular zpool
	# topology, instead of guessing it. The 7680 guess is based on a 15 drive
	# raidz2 pool.
	MAX_PADDING = 7680

	def analyze_file(pathname, l0_blocks):
		"""Fold one file's block layout into the global fragmentation statistics.

		The file's blocks are grouped by vdev and every group is analyzed on
		its own, so each global list below gains one entry per (file, vdev)
		pair.

		Args:
			pathname: Path of the file being analyzed.  Currently unused by
				the computation itself.
			l0_blocks: 2-D numpy array with one ``(vdev, offset, asize)`` row
				per L0 block, in the order the blocks appear in the file.  An
				empty array is silently ignored.

		Side effects:
			Appends to the globals ``frags_per_file`` and
			``ooo_frags_per_file`` and rebinds ``fragsizes``, ``total_gaps``
			and ``ooo_gaps`` to extended arrays.  All five must be initialized
			before the first call.
		"""
		global frags_per_file
		global ooo_frags_per_file
		global total_gaps
		global ooo_gaps
		global fragsizes

		# Nothing to do for empty files
		if len(l0_blocks) == 0:
			return

		# numpy.unique yields the vdev ids in a deterministic (ascending) order.
		for stripe in numpy.unique(l0_blocks[:, 0]):
			stripe_blocks = l0_blocks[l0_blocks[:, 0] == stripe]
			block_sizes = stripe_blocks[:, 2]

			# First, count the contiguous ordered fragments.  A contiguous
			# ordered fragment is a chain of 1 or more L0 blocks that are
			# adjacent on the vdev and whose data are adjacent in the file.
			block_starts = stripe_blocks[:, 1]
			block_ends = block_starts + block_sizes
			# The gaps are the space between consecutive blocks; only gaps
			# larger than MAX_PADDING separate two fragments.
			gaps = numpy.abs(block_starts[1:] - block_ends[:-1])
			is_break = gaps > MAX_PADDING
			frags = 1 + int(is_break.sum())
			# Gaps that are mere padding stay in the array but are masked out.
			total_gaps = ma.concatenate(
				(total_gaps, ma.masked_array(gaps, mask=~is_break)))
			frags_per_file.append(frags)

			# Now determine the size of each frag, including RAID overhead.  A
			# fragment starts at block 0 and at every block that follows a
			# break; reduceat sums the block sizes between consecutive
			# fragment starts.
			frag_starts = numpy.concatenate(
				([0], numpy.flatnonzero(is_break) + 1))
			fragsizes = numpy.concatenate(
				(fragsizes, numpy.add.reduceat(block_sizes, frag_starts)))

			# Next determine how many contiguous but out of order fragments
			# make up the file.  Whole rows must be reordered by on-disk
			# offset: numpy.sort(stripe_blocks, 0) would sort every column
			# independently and pair offsets with the wrong sizes.
			by_offset = numpy.argsort(block_starts, kind="mergesort")
			sorted_blocks = stripe_blocks[by_offset]
			sorted_block_starts = sorted_blocks[:, 1]
			sorted_block_ends = sorted_block_starts + sorted_blocks[:, 2]
			sorted_deltas = sorted_block_starts[1:] - sorted_block_ends[:-1]
			ooo_frags = 1 + int((sorted_deltas > MAX_PADDING).sum())
			sorted_gaps = numpy.abs(sorted_deltas)
			ooo_gaps = ma.concatenate(
				(ooo_gaps,
				ma.masked_array(sorted_gaps, mask=(sorted_gaps <= MAX_PADDING))))
			ooo_frags_per_file.append(ooo_frags)


	def _print_stats(label, values):
		"""Print one min/mean/max/stddev/count row of the summary table.

		Masked entries are ignored.  When no usable values remain, the row
		shows "n/a" placeholders and a count of 0 instead of raising on an
		empty array.
		"""
		data = ma.asarray(values).compressed()
		if data.size == 0:
			print("%-10s%16s %11s %16s %11s %12d"
				% (label, "n/a", "n/a", "n/a", "n/a", 0))
			return
		print("%-10s%16d %11g %16d %11g %12d"
			% (label, data.min(), data.mean(), data.max(), data.std(),
				data.size))


	def main(argv):
		"""Summarize file fragmentation from saved ``zdb -ddddd OBJSET`` output.

		Resets the module-level statistics, feeds every selected plain file to
		``analyze_file`` and prints a min/mean/max/stddev/count table to
		stdout.

		Args:
			argv: Full argument vector with the program name first (for
				example ``sys.argv``).  Recognized arguments are an optional
				``-f/--filter`` regex selecting the pathnames to analyze and
				the name of the file holding the zdb output.

		Raises:
			SystemExit: Raised by argparse on invalid arguments.
			IOError/OSError: If the input file cannot be opened.
		"""
		global frags_per_file
		global ooo_frags_per_file
		global total_gaps
		global ooo_gaps
		global fragsizes

		filesizes = []
		frags_per_file = []
		total_gaps = numpy.array([])
		fragsizes = numpy.array([])
		ooo_gaps = numpy.array([])
		ooo_frags_per_file = []

		parser = argparse.ArgumentParser()
		parser.add_argument("-f", "--filter",
			help="regex to select filenames to analyze", default="")
		parser.add_argument("filename", help="output from zdb -ddddd OBJSET")
		# Parse the vector we were given rather than implicitly reading
		# sys.argv.
		args = parser.parse_args(argv[1:])

		# Compile the patterns once instead of on every input line.
		path_filter = re.compile(args.filter)
		l0_line = re.compile("^ +[0-9a-f]+ +L0 ")

		in_plain_file = False
		pathname = None
		l0_dvas = []
		with open(args.filename, 'r') as f:
			for line in f:
				# zdb indents the per-object header line by four spaces.
				if line.startswith("    Object"):
					# Header of the next object: flush the file collected so
					# far.
					if in_plain_file:
						analyze_file(pathname, numpy.array(l0_dvas))
					in_plain_file = False
					l0_dvas = []
				elif "ZFS plain file" in line:
					in_plain_file = True
				elif not in_plain_file:
					# Everything below only applies inside a selected plain
					# file.
					continue
				elif line.startswith("\tpath"):
					pathname = line.replace("\tpath\t", "").strip()
					if not path_filter.search(pathname):
						# Skip this file
						in_plain_file = False
				elif line.startswith("\tsize"):
					filesizes.append(
						int(line.replace("\tsize\t", "").strip()))
				elif l0_line.search(line):
					# TODO: handle sparse files, which have L0 blocks with no
					# lsize, psize, or block fill.
					# Only the third field (the DVA) is needed, so any number
					# of trailing fields is tolerated.
					try:
						(vdev, offset, asize) = line.split()[2].split(":")
						l0_dvas.append(
							[int(vdev, 16), int(offset, 16), int(asize, 16)])
					except (IndexError, ValueError):
						sys.stderr.write("skipping unparseable L0 line: %s\n"
							% line.rstrip())
		# The last object in the file has no following header to flush it.
		if in_plain_file:
			analyze_file(pathname, numpy.array(l0_dvas))

		print("          %16s %11s %16s %11s %12s" % ("min", "mean", "max",
			"stddev", "count"))
		_print_stats("filesize", filesizes)
		_print_stats("frags/file", frags_per_file)
		# Note: fragsizes includes RAID overhead
		_print_stats("fragsizes", fragsizes)
		_print_stats("gap sizes", total_gaps)
		_print_stats("OOO frags", ooo_frags_per_file)
		_print_stats("OOO gaps", ooo_gaps)

	# Run the analysis only when executed as a script, so that importing this
	# module does not parse the command line or print anything.
	if __name__ == "__main__":
		main(sys.argv)