Skip to content

Instantly share code, notes, and snippets.

@szaydel
Last active September 21, 2017 23:17
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save szaydel/1e68ec3584bb3ebd7ec1 to your computer and use it in GitHub Desktop.
Save szaydel/1e68ec3584bb3ebd7ec1 to your computer and use it in GitHub Desktop.
Dtrace snippets for observing behaviour of ZIL
dtrace -qn '
/* Add a number greater than 0 after the script on the command line, to
* tell the script how often to print results to screen. If no value is
* given, script will fail immediately. Value must be positive and is in
* seconds. Results will be printed "this many" seconds apart, but will
* be normalized to a per-second result.
*/
int ts;
BEGIN { ts = timestamp; }
::zfs_log_write:entry {
this->file_path = stringof(args[3]->z_vnode->v_path);
@[this->file_path] = quantize(arg5);
@cnt = count();
@bytes = sum(arg5);
bypass_log = `zfs_slog_bypass_ok == 0 ? "will not" : "will";
}
tick-$$1sec {
this->now = timestamp;
this->delta = (this->now - ts) / 1000000000;
/* Adjust data to reflect per second values */
normalize(@, this->delta);
normalize(@cnt, this->delta);
normalize(@bytes, this->delta);
printf("----------------------------------------\n");
printf("Summary:\n");
printf("IOs >%d(kb) %s bypass log\n",
`zfs_immediate_write_sz >> 10, bypass_log);
printa("IOs through ZIL/sec: %@d\nBytes through ZIL/sec: %@d\n",
@cnt, @bytes);
printf("----------------------------------------\n");
printa(" Filepath: %s\n IOsize\t\t\t\tIOs/sec %@d\n", @);
trunc(@); trunc(@cnt); trunc(@bytes);
ts = this->now;
}'
/* Observe size buckets of IOs through the zfs_log_write function and measure zil_commit latency. */
dtrace -qn '
::zil_commit:entry {
self->zilog = args[0] ;
self->spa = args[0]->zl_spa ;
self->t = timestamp ;
}
::zfs_log_write:entry {
this->spa = args[0]->zl_spa ;
this->sync = args[0]->zl_sync == 0 ? "Async" : "Sync" ;
@x4[this->spa->spa_name, this->sync] =
sum(args[5] <= 0x1000 ? 1 : 0) ;
@x8[this->spa->spa_name, this->sync] =
sum((args[5] <= 0x2000 && args[5] > 0x1000) ? 1 : 0) ;
@x16[this->spa->spa_name, this->sync] =
sum((args[5] <= 0x4000 && args[5] > 0x2000) ? 1 : 0) ;
@x32[this->spa->spa_name, this->sync] =
sum((args[5] <= 0x8000 && args[5] > 0x4000) ? 1 : 0) ;
@x64[this->spa->spa_name, this->sync] =
sum((args[5] <= 0x10000 && args[5] > 0x8000) ? 1 : 0) ;
@x128[this->spa->spa_name, this->sync] =
sum((args[5] <= 0x20000 && args[5] > 0x10000) ? 1 : 0) ;
@xulim[this->spa->spa_name, this->sync] =
sum(args[5] > 0x20000 ? 1 : 0) ;
}
::zil_commit:return /self->t/ {
this->sync = self->zilog == 0 ? "Async" : "Sync" ;
@lathist[self->spa->spa_name, this->sync] =
quantize((timestamp - self->t) / 1000);
self->t = 0 ; self->spa = 0 ; self->zilog = 0 ;
}
END {
printa("\n Pool [%s] zil_commit %s IO record latency%@d\n",
@lathist);
printf("IO Count by Size:\n") ;
printf("pool type %-8s %-8s %-8s %-8s %-8s %-8s %-8s\n",
"4kb","8kb","16kb","32kb","64kb","128kb",">128kb") ;
printa("%-8s %-8s %-8@d %-8@d %-8@d %-8@d %-8@d %-8@d %-8@d\n",
@x4, @x8, @x16, @x32, @x64, @x128, @xulim) ;
}'
/* Sizes of IOs through the zfs_log_write */
dtrace -qn '::zfs_log_write:entry {
this->ts = (walltimestamp / 1000000000) * 1000000000;
@sz[this->ts, args[5]] = count();
}
END {
printf("timestamp,io.size,count\n");
printa("%Y,%d,%@d\n", @sz);
}'
dtrace -qn '
/*
* Measure IO sizes and latency through the zil code. This script hides
* reality through effectively quantizing data to multiples of 4K
* numbers, where actual sizes may not be on those exact boundaries.
* We can get a sense for how much we are lied to by looking at the total
* count and comparing against the count given by base2 column, which counts
* each IO that is actually sized on the bucket boundaries.
*/
BEGIN {
printf("%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s\n",
"timestamp","sz.1k","sz.4k","sz.8k","sz.16k","sz.32k","sz.64k",
"sz.128k","sz.256k","sz.512k", "sz.1m","unk",
"l1k","l4k","l8k","l16k","l32k","l64k","l128k","l256k","l512k", "l1m","lunk", "mult.4k","total.ios", "throughput");
}
::zfs_log_write:entry {
/*
* this->wt reduces precision down to 1 second, to make result
* presentation a little bit easier.
*/
this->wt = (walltimestamp / 1000000000) * 1000000000;
self->start = timestamp;
this->sz = args[5];
this->m4k = this->sz%4096 == 0 ? 1 : 0;
this->l = this->sz > 0 && this->sz <= 0x400 ? "1k" :
this->sz > 0x400 && this->sz <= 0x1000 ? "4k" :
this->sz > 0x1000 && this->sz <= 0x2000 ? "8k" :
this->sz > 0x2000 && this->sz <= 0x4000 ? "16k" :
this->sz > 0x4000 && this->sz <= 0x8000 ? "32k" :
this->sz > 0x8000 && this->sz <= 0x10000 ? "64k" :
this->sz > 0x10000 && this->sz <= 0x20000 ? "128k" :
this->sz > 0x20000 && this->sz <= 0x40000 ? "256k" :
this->sz > 0x40000 && this->sz <= 0x80000 ? "512k" :
this->sz > 0x80000 && this->sz <= 0x100000 ? "1m" :
"unknown";
@sz1k[this->wt] = sum(this->l == "1k" ? 1 : 0);
@sz4k[this->wt] = sum(this->l == "4k" ? 1 : 0);
@sz8k[this->wt] = sum(this->l == "8k" ? 1 : 0);
@sz16k[this->wt] = sum(this->l == "16k" ? 1 : 0);
@sz32k[this->wt] = sum(this->l == "32k" ? 1 : 0);
@sz64k[this->wt] = sum(this->l == "64k" ? 1 : 0);
@sz128k[this->wt] = sum(this->l == "128k" ? 1 : 0);
@sz256k[this->wt] = sum(this->l == "256k" ? 1 : 0);
@sz512k[this->wt] = sum(this->l == "512k" ? 1 : 0);
@sz1m[this->wt] = sum(this->l == "1m" ? 1 : 0);
@szu[this->wt] = sum(this->l == "unknown" ? 1 : 0);
@bucksize[this->wt] = sum(this->m4k);
/* IOPS, assuming regular interval sampling */
@tot[this->wt] = count();
/* Throughput assuming regular interval sampling */
@tput[this->wt] = sum(this->sz);
}
::zfs_log_write:return /this->sz != NULL/ {
this->delta = (timestamp - self->start); /* note, this is nanoseconds */
@l1k[this->wt] = avg(this->l == "1k" ? this->delta : 0);
@l4k[this->wt] = avg(this->l == "4k" ? this->delta : 0);
@l8k[this->wt] = avg(this->l == "8k" ? this->delta : 0);
@l16k[this->wt] = avg(this->l == "16k" ? this->delta : 0);
@l32k[this->wt] = avg(this->l == "32k" ? this->delta : 0);
@l64k[this->wt] = avg(this->l == "64k" ? this->delta : 0);
@l128k[this->wt] = avg(this->l == "128k" ? this->delta : 0);
@l256k[this->wt] = avg(this->l == "256k" ? this->delta : 0);
@l512k[this->wt] = avg(this->l == "512k" ? this->delta : 0);
@l1m[this->wt] = avg(this->l == "1m" ? this->delta : 0);
@lu[this->wt] = sum(this->l == "unknown" ? this->delta : 0);
}
tick-1sec {
printa("%Y,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d\n",
@sz1k, @sz4k, @sz8k, @sz16k, @sz32k, @sz64k, @sz128k, @sz256k,
@sz512k, @sz1m, @szu, @l1k, @l4k, @l8k, @l16k, @l32k,
@l64k, @l128k, @l256k, @l512k, @l1m, @lu, @bucksize, @tot,
@tput);
trunc(@sz1k); trunc(@sz4k); trunc(@sz8k);
trunc(@sz16k); trunc(@sz32k); trunc(@sz64k); trunc(@sz128k); trunc(@sz256k);
trunc(@sz512k); trunc(@sz1m); trunc(@szu);
trunc(@l1k); trunc(@l4k); trunc(@l8k);
trunc(@l16k); trunc(@l32k); trunc(@l64k); trunc(@l128k); trunc(@l256k);
trunc(@l512k); trunc(@l1m); trunc(@lu);
trunc(@bucksize); trunc(@tot); trunc(@tput);
}'
dtrace -qn '
BEGIN {totalbytes=0; total = 0;}
::zfs_log_write:entry { totalbytes+=args[5]; total++; @ = quantize(args[5]); }
END {
printf("\tTotal IOs: %d Total KBytes: %d\n", total, totalbytes >> 10); printa("\tIO size (bytes): %@d\n", @); }'
// IO size through the zil by process ID and execname. Includes both the size of IO,
// and amount of work done per microsecond of time spent in the zfs_log_write function.
dtrace -qn '
self int64_t bytes;
self int64_t delta;
::zfs_log_write:entry {
@[pid, execname] = quantize(args[5]);
self->ts = timestamp;
self->bytes += args[5];
}
::zfs_log_write:return /(timestamp - self->ts) > 0/ {
self->delta += (timestamp - self->ts); /* measure delta */
}
::zfs_log_write:return /self->delta > 100000/ {
/* How many bytes per microsecond? */
this->delta_us = self->delta / 1000;
@bytes_us[pid, execname] = quantize(self->bytes/this->delta_us);
self->ts = 0; self->bytes = 0; self->delta = 0;
}
tick-10sec {
printa("IO Size\npid: %d, name: %s %@d\n", @);
printa("Bytes/us\npid: %d, name: %s %@d\n", @bytes_us);
trunc(@); trunc(@bytes_us);
}'
#!/usr/sbin/dtrace -Cs
#include <sys/fs/zfs.h>
#pragma D option quiet
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*
* Copyright (c) 2016, RackTop Systems.
*/
string logbias[zfs_logbias_op_t];
int x;
inline int BLKSZ_ABOVE_128K = -0xffff; /* Just a magic value */
/* This is a good way to play with this a little bit. Just copy and paste in
* a separate shell and let it roll. Make sure /bp/test exists.
while :; do dd if=/dev/urandom of=/bp/test/out.bin bs=4096 count=$RANDOM ;
dd if=/dev/urandom of=/bp/test/out.bin bs=0x2000 count=$RANDOM ;
dd if=/dev/urandom of=/bp/test/out.bin bs=0x4000 count=$RANDOM ;
sleep 1; done
*/
BEGIN {
printf("%s,%s,%s,%s,%s,%s,%s,%s,%s\n", "poolname", "bias",
"4K", "8K", "16K", "32K", "64K", "128K", "gt.128K"
);
logbias[ZFS_LOGBIAS_LATENCY] = "latency";
logbias[ZFS_LOGBIAS_THROUGHPUT] = "throughput";
}
::zil_lwb_write_start:entry {
this->pooln = stringof(args[0]->zl_dmu_pool->dp_spa->spa_name);
this->size = args[1]->lwb_sz;
this->lb = args[0]->zl_logbias;
/* We need to fit into one of these buckewith anything larger than
* 128K tagged with BLKSZ_ABOVE_128K, and ultimately resulting in
* being counted by @bgt128k aggregate.
*/
this->buck = 0x1000; /* We assume this is smallest possible IO size. */
this->buck = ( this->size > 0x1000 ) ? 8192 : this->buck ;
this->buck = ( this->size > 0x2000 ) ? 16384 : this->buck ;
this->buck = ( this->size >= 0x4000 ) ? 32768 : this->buck ;
this->buck = ( this->size >= 0x8000 ) ? 65536 : this->buck ;
this->buck = ( this->size > 0x10000 ) ? 131072 : this->buck ;
this->buck = ( this->size > 0x20000 ) ? BLKSZ_ABOVE_128K : this->buck ;
@b0x1000[this->pooln, logbias[this->lb]] =
sum(this->buck == 0x1000 ? 1 : 0);
@b0x2000[this->pooln, logbias[this->lb]] =
sum(this->buck == 0x2000 ? 1 : 0);
@b0x4000[this->pooln, logbias[this->lb]] =
sum(this->buck == 0x4000 ? 1 : 0);
@b0x8000[this->pooln, logbias[this->lb]] =
sum(this->buck == 0x8000 ? 1 : 0);
@b0x10000[this->pooln, logbias[this->lb]] =
sum(this->buck == 0x10000 ? 1 : 0);
@b0x20000[this->pooln, logbias[this->lb]] =
sum(this->buck == 0x20000 ? 1 : 0);
@bgt128k[this->pooln, logbias[this->lb]] =
sum(this->buck == BLKSZ_ABOVE_128K ? 1 : 0);
++x;
}
tick-1sec /x/ {
printa("%s,%s,%@d,%@d,%@d,%@d,%@d,%@d,%@d\n",
@b0x1000, @b0x2000, @b0x4000, @b0x8000, @b0x10000, @b0x20000, @bgt128k
);
trunc(@b0x1000); trunc(@b0x2000);
trunc(@b0x4000); trunc(@b0x8000); trunc(@b0x10000);
trunc(@b0x20000); trunc(@bgt128k);
x = 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment