Skip to content

Instantly share code, notes, and snippets.

@szaydel
Last active March 3, 2023 16:43
Show Gist options
  • Save szaydel/c5ca8ea20df235157b06 to your computer and use it in GitHub Desktop.
Save szaydel/c5ca8ea20df235157b06 to your computer and use it in GitHub Desktop.
[Dtrace snippets using io::: probes] One-liners and Scripts for observation at lower levels of the IO stack #tags: sd, io, io probes, stable probes, block device, dtrace, bandwidth, throughput, io errors, io retries
dtrace -qn '
BEGIN {
printf("timestamp(ns),latency(us),iotype,device,filename,blk.number,blk.count");
}
io:::start {
ts[args[0]->b_edev, args[0]->b_blkno] = timestamp;
}
io:::done /ts[args[0]->b_edev, args[0]->b_blkno] != 0/ {
this->lat_us = (timestamp - ts[args[0]->b_edev, args[0]->b_blkno])
/ 1000;
this->iotype = args[0]->b_flags & B_WRITE ? "WRITE" :
args[0]->b_flags & B_READ ? "READ" : "NULL";
this->device = args[1]->dev_statname;
this->fp = args[2]->fi_pathname == "<unknown>" ? "UNKNOWN" :
args[2]->fi_pathname == "<none>" ? "UNKNOWN" :
args[2]->fi_pathname;
this->blocks = args[0]->b_bcount / 512; /* Assuming a 512b formatted dev */
printf("%d,%d,%s,%s,%s,0x%x,%d\n", walltimestamp,
this->lat_us, this->iotype, this->device,
this->fp, args[0]->b_blkno, this->blocks);
}' 2> /dev/null
#!/usr/sbin/dtrace -Cs
#pragma D option quiet
#pragma D option dynvarsize=16M
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*
* Copyright (c) 2017 Sam Zaydel / RackTop Systems.
*
* bw-tput-iops-actv-time-hist-csv.d
* This script is meant to be a more informative replacement for the
* iostat utility. It measures some features that iostat in my opinion
* is not useful for. For example, iostat does not offer distributions
* at all, whereas here we plot distributions of IO rates as observed
* for each disk. Also, iostat does not offer operating ranges for
* measurements, which here we define as max - min of some measurement.
*
* Currently, script is limited to scsi_vhci devices, because they are
* ones we normally use for actual data storage. Anything else is either
* a locally attached device, or something not used by system, i.e.
* USB storage media, etc.
*/
unsigned long minlati[dev_t, int], maxlati[dev_t, int]; /* Latency min and max */
unsigned long minratei[dev_t], maxratei[dev_t]; /* IO Rate min and max */
unsigned long miniosz[dev_t], maxiosz[dev_t]; /* IO size min and max */
unsigned long iocnt[dev_t, int];
int pend[dev_t];
hrtime_t start[dev_t, uint64_t];
hrtime_t ival_timer[dev_t];
hrtime_t ticks;
inline const int NSEC_PER_SEC = 1000000000;
inline const int NSEC_PER_MSEC = 1000000;
inline const int NSEC_PER_USEC = 1000;
inline const int SPA_MAXBLOCKSIZE = 128 << 10;
inline const int R = 0;
inline const int W = 1;
/*
* These are the Output Parameter definitions for collected metrics
*
* sdname == Name of device, i.e. sd0
* mpxiowwn == mpxio device, like: 5000cca24500f698
* actvtm == amount of real, busy time spent processing IO
* rangerlat, rangewlat == latency range: max - min for Reads and Writes
* totbytes == total number of bytes Read and Written during sample interval
* tput,maxtput == mean and maximum or burst throughput
* ctrd,ctwr == Count of Reads and Writes over interval set in tick-Xsec clause
* aviosz,rangeiosz == IO size mean and range: max - min
* iops,maxiops == normalized IOPS mean and maximum
* avKBps,rangeKBps == normalized IO rate and IO rate range: max - min
* // Histogram of IO Rate distribution with 4 buckets //
* [ ratelt1MBps ] ==> 1 second interval with < 1,000KB/s bw
* [ rate10MBps ] ==> 1 second interval with < 10,000KB/s bw
* [ rate100MBps ] ==> 1 second interval with < 100,000KB/s bw
* [ rate1GBps ] ==> 1 second interval with < 1,000,000KB/s bw
* avtime,maxtime == average and maximum IO completion latency
* // Histogram of latency distribution with 6 buckets //
* [ timegt1000ms ] ==> >1 second (SLOW!)
* [ time100ms ] ==> >50ms && 100ms<= (not acceptable)
* [ time50ms ] ==> >25ms && 50ms<= (acceptable)
* [ time25ms ] ==> >10ms && 25ms<= (ok)
* [ time10ms ] ==> >1ms && 10ms<= (good)
* [ timelt1ms ] ==> 1ms< (probably cache)
* // Histogram of IO size distribution with 7 buckets //
* [ iosztiny ] ==> 4K<
* [ iosz4k ] ==> between 4K and 8K
* [ iosz8k ] ==> between 8K and 16K
* [ iosz16k ] ==> between 16K and 32K
* [ iosz32k ] ==> between 32K and 64K
* [ iosz64k ] ==> between 64K and 128K
* [ ioszbig ] ==> anything 128K or above
* avpending == Average measured IOs in processing per sample interval
* maxpending == Maximum measured IOs in processing per sample interval
* cterr == Counter tracking number of errors per sample interval
* ctretry == Counter tracking number of retries per sample interval
*/
BEGIN {
printf("sdname,mpxiowwn,actvtm,rangerlat,rangewlat,totbytes,tput,maxtput,ctrd,ctwr,aviosz,rangeiosz,iops,maxiops,avKBps,rangeKBps,ratelt1MBps,rate10MBps,rate100MBps,rate1GBps,avtime,maxtime,timegt1000ms,time100ms,time50ms,time25ms,time10ms,timelt1ms,iosztiny,iosz4k,iosz8k,iosz16k,iosz32k,iosz64k,ioszbig,avpending,maxpending,cterr,ctretry\n");
}
/*
* Set interval timer initial value once for a given device.
* Each subsequent update will happen roughly once per second,
* or if no IO, whenever there is some IO that triggers io:::done
* probe, at which point we determine that timer expired.
*/
io:::start
/ival_timer[args[0]->b_edev] == 0/ {
ival_timer[args[0]->b_edev] = timestamp;
}
io:::start {
start[args[0]->b_edev, args[0]->b_blkno] = timestamp;
/* Increment pending IOs by 1 */
pend[args[0]->b_edev]++;
}
io:::done
/start[args[0]->b_edev, args[0]->b_blkno]/ {
this->sn = args[1]->dev_statname;
this->p = substr(args[1]->dev_pathname, 25, 16);
this->p = (strstr(this->p, "disk@") == 0 ||
strstr(this->p, "disk@") == "") ? this->p : "NA";
this->x = args[0]->b_bcount * 976562;
this->elapsed = timestamp - start[args[0]->b_edev, args[0]->b_blkno];
start[args[0]->b_edev, args[0]->b_blkno] = 0;
/* Decrement pending IOs by 1, set to 0 if value < 0 */
pend[args[0]->b_edev]--;
pend[args[0]->b_edev] =
pend[args[0]->b_edev] > 0 ?
pend[args[0]->b_edev] : 0; /* avoid underflow */
/* Total Number of bytes per device */
@totbytes[this->sn, this->p] = sum(args[0]->b_bcount);
/* Total nanoseconds of active time per device */
@actv_tm[this->sn, this->p] = sum(this->elapsed);
/*
* Instead of converting nanoseconds to seconds we multiply
* the top by NSEC_PER_SEC and the divide by delta in nanoseconds.
* In extreme observations, i.e. stalled IO, we may have huge
* this->elapsed values, in which case result will be 0 KB/s, even
* if there in fact was a non-zero value of bytes transferred.
*/
this->b_nsec = args[0]->b_bcount * NSEC_PER_SEC;
this->kb_per_sec = (this->b_nsec / this->elapsed) >> 10;
/* Measure IO rate range in KB/s */
@rangeKBps[this->sn, this->p] =
max(maxratei[args[0]->b_edev] - minratei[args[0]->b_edev]);
/* IO Rate histogram base10, limited at 1000000 KB/s | 976 MB/s */
/* 1000KB/s <= */
@ratelt1MBps[this->sn, this->p] = sum(this->kb_per_sec <= 0x3e8 ? 1 : 0);
/* > 1000KB/s && 10,000KB/s <= */
@rate10MBps[this->sn, this->p] = sum(this->kb_per_sec > 0x3e8 &&
this->kb_per_sec <= 0x2710 ? 1 : 0);
/* > 10,000KB/s && 100,000KB/s <= */
@rate100MBps[this->sn, this->p] = sum(this->kb_per_sec > 0x2710 &&
this->kb_per_sec <= 0x186a0 ? 1 : 0);
/* > 100,000KB/s && 1,000,000 KB/s <= */
@rate1GBps[this->sn, this->p] = sum(this->kb_per_sec > 0x186a0 &&
this->kb_per_sec <= 0xf4240 ? 1 : 0);
/*
* Collect minimum and maximum observed rate for later measurement
* of range for this metric.
*/
minratei[args[0]->b_edev] = minratei[args[0]->b_edev] == 0 ?
this->kb_per_sec : minratei[args[0]->b_edev] < this->kb_per_sec ?
minratei[args[0]->b_edev] : this->kb_per_sec;
maxratei[args[0]->b_edev] = maxratei[args[0]->b_edev] == 0 ?
this->kb_per_sec : maxratei[args[0]->b_edev] > this->kb_per_sec ?
maxratei[args[0]->b_edev] : this->kb_per_sec;
/* Actual Kbytes/sec per device */
@avKBps[this->sn, this->p] = avg(this->kb_per_sec);
/* Average and Maximum Latency per device */
@avtime[this->sn, this->p] = avg(this->elapsed);
@maxtime[this->sn, this->p] = max(this->elapsed);
/*
* Latency histogram with buckets:
* >1000ms, >50 to 100ms, >25 to 50ms, >10 to 25ms, >1 to 10ms, 1ms<
*/
@timegt1000ms[this->sn, this->p] = sum(
this->elapsed >= 1 * NSEC_PER_SEC ? 1 : 0);
@time100ms[this->sn, this->p] = sum(
this->elapsed > 50 * NSEC_PER_MSEC &&
this->elapsed <= 100 * NSEC_PER_MSEC ? 1 : 0);
@time50ms[this->sn, this->p] = sum(
this->elapsed > 25 * NSEC_PER_MSEC &&
this->elapsed <= 50 * NSEC_PER_MSEC ? 1 : 0);
@time25ms[this->sn, this->p] = sum(
this->elapsed > 10 * NSEC_PER_MSEC &&
this->elapsed <= 25 * NSEC_PER_MSEC ? 1 : 0);
@time10ms[this->sn, this->p] = sum(
this->elapsed > 1 * NSEC_PER_MSEC &&
this->elapsed <= 10 * NSEC_PER_MSEC ? 1 : 0);
@timelt1ms[this->sn, this->p] = sum(
this->elapsed < 1 * NSEC_PER_MSEC ? 1 : 0);
/*
* Collect minimum and maximum io size for later measurement
* of range for this metric.
*/
miniosz[args[0]->b_edev] =
args[0]->b_bcount < miniosz[args[0]->b_edev] ?
args[0]->b_bcount : miniosz[args[0]->b_edev];
maxiosz[args[0]->b_edev] =
args[0]->b_bcount > maxiosz[args[0]->b_edev] ?
args[0]->b_bcount : maxiosz[args[0]->b_edev];
/* Measure IO size range in Kilobytes */
@rangeiosz[this->sn, this->p] =
max((maxiosz[args[0]->b_edev] - miniosz[args[0]->b_edev]) >> 10);
/*
* Convert from bytes and nanoseconds to KB/s with 976562 to obtain
* avg. effective throughput, and maximum effective throughput.
* Maximum throughput is likely measuring cache effects.
*/
@tput[this->sn, this->p] = avg(this->x / this->elapsed);
@maxtput[this->sn, this->p] = max(this->x / this->elapsed);
iocnt[args[0]->b_edev, R] += args[0]->b_flags & B_READ ? 1 : 0;
iocnt[args[0]->b_edev, W] += args[0]->b_flags & B_WRITE ? 1 : 0;
/* Count number of IOs by IO-type */
@ctrd[this->sn, this->p] = sum(args[0]->b_flags & B_READ ? 1 : 0);
@ctwr[this->sn, this->p] = sum(args[0]->b_flags & B_WRITE ? 1 : 0);
@iops[this->sn, this->p] =
count(); /* Normalized to per second in tick-X probe */
/* Maximum sampled IOPS per device */
@maxiops[this->sn, this->p] =
max(iocnt[args[0]->b_edev, R] + iocnt[args[0]->b_edev, W]);
/*
* Collect minimum and maximum latency for later measurement
* of range for this metric.
*/
minlati[args[0]->b_edev, R] =
args[0]->b_flags & B_READ &&
this->elapsed < minlati[args[0]->b_edev, R] ? this->elapsed :
minlati[args[0]->b_edev, R];
maxlati[args[0]->b_edev, R] =
args[0]->b_flags & B_READ &&
this->elapsed > maxlati[args[0]->b_edev, R] ? this->elapsed :
maxlati[args[0]->b_edev, R];
minlati[args[0]->b_edev, W] =
(args[0]->b_flags & B_READ) == 0 &&
this->elapsed < minlati[args[0]->b_edev, W] ? this->elapsed :
minlati[args[0]->b_edev, W];
maxlati[args[0]->b_edev, W] =
(args[0]->b_flags & B_READ) == 0 &&
this->elapsed > maxlati[args[0]->b_edev, W] ? this->elapsed :
maxlati[args[0]->b_edev, W];
/*
* IOsize distribution not grouped by direction, i.e. no distinction
* is made between reads and writes. IO buckets double in size from
* previous bucket. i.e. 4, 8, 16, 32...
*/
this->bs = args[0]->b_bcount ;
/* 4K< */
@iosztiny[this->sn, this->p] =
sum(this->bs < 0x1000 ? 1 : 0);
/* 4K to 8K< */
@iosz4k[this->sn, this->p] =
sum(this->bs >= 0x1000 && this->bs < 0x2000 ? 1 : 0);
/* 8K to 16K< */
@iosz8k[this->sn, this->p] =
sum(this->bs >= 0x2000 && this->bs < 0x4000 ? 1 : 0);
/* 16K to 32K< */
@iosz16k[this->sn, this->p] =
sum(this->bs >= 0x4000 && this->bs < 0x8000 ? 1 : 0);
/* 32K to 64K< */
@iosz32k[this->sn, this->p] =
sum(this->bs >= 0x8000 && this->bs < 0x10000 ? 1 : 0);
/* 64K to 128K< */
@iosz64k[this->sn, this->p] =
sum(this->bs >= 0x10000 && this->bs < 0x20000 ? 1 : 0);
/* >128K */
@ioszbig[this->sn, this->p] =
sum(this->bs >= 0x20000 ? 1 : 0);
/* Average IO size for given device */
@aviosz[this->sn, this->p] = avg(this->bs);
/*
* Each time we observe an error at completion through B_ERROR flag,
* increment count of errors for given device. This should always
* be zero, assuming healthy device.
*/
@cterr[this->sn, this->p] = sum(args[0]->b_flags & B_ERROR ? 1 : 0);
}
/*
* Entry controlled by timer. By design, each device will be registered
* here about once per second when there is even a litte bit of IO.
*/
io:::done
/ival_timer[args[0]->b_edev] > 0 &&
timestamp >= ival_timer[args[0]->b_edev] + NSEC_PER_SEC/ {
this->sn = args[1]->dev_statname;
this->p = substr(args[1]->dev_pathname, 25, 16);
this->p = (strstr(this->p, "disk@") == 0 ||
strstr(this->p, "disk@") == "") ? this->p : "NA";
/*
* Measure operating latency range in uS for Reads and Writes,
* storing largest observed difference.
*/
@rangerlat[this->sn, this->p] =
max((maxlati[args[0]->b_edev, R] -
minlati[args[0]->b_edev, R]) / NSEC_PER_USEC);
@rangewlat[this->sn, this->p] =
max((maxlati[args[0]->b_edev, W] -
minlati[args[0]->b_edev, W]) / NSEC_PER_USEC);
@avpending[this->sn, this->p] = avg(pend[args[0]->b_edev]);
@maxpending[this->sn, this->p] = max(pend[args[0]->b_edev]);
/* Reset various counters for next measurement period */
minlati[args[0]->b_edev, R] = 0;
maxlati[args[0]->b_edev, R] = 0;
minlati[args[0]->b_edev, W] = 0;
maxlati[args[0]->b_edev, W] = 0;
iocnt[args[0]->b_edev, R] = 0;
iocnt[args[0]->b_edev, W] = 0;
miniosz[args[0]->b_edev] = 0;
maxiosz[args[0]->b_edev] = 0;
minratei[args[0]->b_edev] = 0;
maxratei[args[0]->b_edev] = 0;
ival_timer[args[0]->b_edev] = timestamp;
}
/*
* Count number of retries issued to a disk. These are a good
* indicator of potentially failing, or borderline device.
* Under normal circumstances we should not expect
* this to be a positive value.
*/
::sd_set_retry_bp:entry
/xlate <devinfo_t *>(args[1])->dev_pathname != "<nfs>" &&
xlate <devinfo_t *>(args[1])->dev_pathname != "" / {
this->sn = xlate <devinfo_t *>(args[1])->dev_statname;
this->p = substr(xlate <devinfo_t *>(args[1])->dev_pathname, 25, 16);
this->p = (strstr(this->p, "disk@") == 0 ||
strstr(this->p, "disk@") == "") ? this->p : "NA";
@ctretry[this->sn, this->p] = count();
}
tick-10sec {
/* First time we enter this clause, ticks will be 0, so we just assume 10 seconds */
this->elapsed = ticks > 0 ? (timestamp - ticks) / NSEC_PER_SEC : 10 ;
/* Normalize Data for correct per second reporting of rates, like IOPS */
normalize(@actv_tm, NSEC_PER_MSEC); /* from nanoseconds to milliseconds */
normalize(@iops, this->elapsed);
printa("%s,%s,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d\n",
@actv_tm, @rangerlat, @rangewlat, @totbytes, @tput, @maxtput,
@ctrd, @ctwr, @aviosz, @rangeiosz, @iops, @maxiops, @avKBps, @rangeKBps,
@ratelt1MBps, @rate10MBps, @rate100MBps, @rate1GBps, @avtime, @maxtime,
@timegt1000ms, @time100ms, @time50ms, @time25ms, @time10ms, @timelt1ms,
@iosztiny, @iosz4k, @iosz8k, @iosz16k, @iosz32k, @iosz64k, @ioszbig,
@avpending, @maxpending, @cterr, @ctretry);
trunc(@actv_tm); trunc(@rangerlat); trunc(@rangewlat); trunc(@totbytes);
trunc(@tput); trunc(@maxtput); trunc(@ctrd); trunc(@ctwr); trunc(@aviosz);
trunc(@rangeiosz); trunc(@iops); trunc(@maxiops); trunc(@avKBps);
trunc(@rangeKBps); trunc(@ratelt1MBps); trunc(@rate10MBps); trunc(@rate100MBps); trunc(@rate1GBps); trunc(@avtime); trunc(@maxtime);
trunc(@iosztiny); trunc(@iosz4k); trunc(@iosz8k); trunc(@iosz16k);
trunc(@iosz32k); trunc(@iosz64k); trunc(@ioszbig);
trunc(@avpending); trunc(@maxpending); trunc(@cterr); trunc(@ctretry);
trunc(@timegt1000ms); trunc(@time100ms); trunc(@time50ms); trunc(@time25ms);
trunc(@time10ms); trunc(@timelt1ms);
ticks = timestamp ;
}
#!/usr/sbin/dtrace -Cs
#pragma D option quiet
#pragma D option dynvarsize=16M
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*
* Copyright (c) 2017 Sam Zaydel / RackTop Systems.
*
* bw-tput-iops-actv-time-hist-with-ts-csv.d
* This script is meant to be a more informative replacement for the
* iostat utility. It measures some features that iostat in my opinion
* is not useful for. For example, iostat does not offer distributions
* at all, whereas here we plot distributions of IO rates as observed
* for each disk. Also, iostat does not offer operating ranges for
* measurements, which here we define as max - min of some measurement.
*
* Currently, script is limited to scsi_vhci devices, because they are
* ones we normally use for actual data storage. Anything else is either
* a locally attached device, or something not used by system, i.e.
* USB storage media, etc.
*/
unsigned long minlati[dev_t, int], maxlati[dev_t, int]; /* Latency min and max */
unsigned long minratei[dev_t], maxratei[dev_t]; /* IO Rate min and max */
unsigned long miniosz[dev_t], maxiosz[dev_t]; /* IO size min and max */
unsigned long iocnt[dev_t, int];
int pend[dev_t];
hrtime_t start[dev_t, uint64_t];
hrtime_t ival_timer[dev_t];
hrtime_t ticks;
inline const int NSEC_PER_SEC = 1000000000;
inline const int NSEC_PER_MSEC = 1000000;
inline const int NSEC_PER_USEC = 1000;
inline const int SPA_MAXBLOCKSIZE = 128 << 10;
inline const int R = 0;
inline const int W = 1;
/*
* These are the Output Parameter definitions for collected metrics
*
* sdname == Name of device, i.e. sd0
* mpxiowwn == mpxio device, like: 5000cca24500f698
* actvtm == amount of real, busy time spent processing IO
* rangerlat, rangewlat == latency range: max - min for Reads and Writes
* totbytes == total number of bytes Read and Written during sample interval
* tput,maxtput == mean and maximum or burst throughput
* ctrd,ctwr == Count of Reads and Writes over interval set in tick-Xsec clause
* aviosz,rangeiosz == IO size mean and range: max - min
* iops,maxiops == normalized IOPS mean and maximum
* avKBps,rangeKBps == normalized IO rate and IO rate range: max - min
* // Histogram of IO Rate distribution with 4 buckets //
* [ ratelt1MBps ] ==> 1 second interval with < 1,000KB/s bw
* [ rate10MBps ] ==> 1 second interval with < 10,000KB/s bw
* [ rate100MBps ] ==> 1 second interval with < 100,000KB/s bw
* [ rate1GBps ] ==> 1 second interval with < 1,000,000KB/s bw
* avtime,maxtime == average and maximum IO completion latency
* // Histogram of latency distribution with 6 buckets //
* [ timegt1000ms ] ==> >1 second (SLOW!)
* [ time100ms ] ==> >50ms && 100ms<= (not acceptable)
* [ time50ms ] ==> >25ms && 50ms<= (acceptable)
* [ time25ms ] ==> >10ms && 25ms<= (ok)
* [ time10ms ] ==> >1ms && 10ms<= (good)
* [ timelt1ms ] ==> 1ms< (probably cache)
* // Histogram of IO size distribution with 7 buckets //
* [ iosztiny ] ==> 4K<
* [ iosz4k ] ==> between 4K and 8K
* [ iosz8k ] ==> between 8K and 16K
* [ iosz16k ] ==> between 16K and 32K
* [ iosz32k ] ==> between 32K and 64K
* [ iosz64k ] ==> between 64K and 128K
* [ ioszbig ] ==> anything 128K or above
* avpending == Average measured IOs in processing per sample interval
* maxpending == Maximum measured IOs in processing per sample interval
* cterr == Counter tracking number of errors per sample interval
* ctretry == Counter tracking number of retries per sample interval
*/
BEGIN {
printf("timestamp,sdname,mpxiowwn,actvtm,rangerlat,rangewlat,totbytes,tput,maxtput,ctrd,ctwr,aviosz,rangeiosz,iops,maxiops,avKBps,rangeKBps,ratelt1MBps,rate10MBps,rate100MBps,rate1GBps,avtime,maxtime,timegt1000ms,time100ms,time50ms,time25ms,time10ms,timelt1ms,iosztiny,iosz4k,iosz8k,iosz16k,iosz32k,iosz64k,ioszbig,avpending,maxpending,cterr,ctretry\n");
interval = walltimestamp - (walltimestamp%1000000000);
}
/*
* Set interval timer initial value once for a given device.
* Each subsequent update will happen roughly once per second,
* or if no IO, whenever there is some IO that triggers io:::done
* probe, at which point we determine that timer expired.
*/
io:::start
/ival_timer[args[0]->b_edev] == 0/ {
ival_timer[args[0]->b_edev] = timestamp;
}
io:::start {
start[args[0]->b_edev, args[0]->b_blkno] = timestamp;
/* Increment pending IOs by 1 */
pend[args[0]->b_edev]++;
}
io:::done
/start[args[0]->b_edev, args[0]->b_blkno]/ {
this->sn = args[1]->dev_statname;
this->p = substr(args[1]->dev_pathname, 25, 16);
this->p = (strstr(this->p, "disk@") == 0 ||
strstr(this->p, "disk@") == "") ? this->p : "NA";
this->x = args[0]->b_bcount * 976562;
this->elapsed = timestamp - start[args[0]->b_edev, args[0]->b_blkno];
start[args[0]->b_edev, args[0]->b_blkno] = 0;
/* Decrement pending IOs by 1, set to 0 if value < 0 */
pend[args[0]->b_edev]--;
pend[args[0]->b_edev] =
pend[args[0]->b_edev] > 0 ?
pend[args[0]->b_edev] : 0; /* avoid underflow */
/* Total Number of bytes per device */
@totbytes[interval, this->sn, this->p] = sum(args[0]->b_bcount);
/* Total nanoseconds of active time per device */
@actv_tm[interval, this->sn, this->p] = sum(this->elapsed);
/*
* Instead of converting nanoseconds to seconds we multiply
* the top by NSEC_PER_SEC and the divide by delta in nanoseconds.
* In extreme observations, i.e. stalled IO, we may have huge
* this->elapsed values, in which case result will be 0 KB/s, even
* if there in fact was a non-zero value of bytes transferred.
*/
this->b_nsec = args[0]->b_bcount * NSEC_PER_SEC;
this->kb_per_sec = (this->b_nsec / this->elapsed) >> 10;
/* Measure IO rate range in KB/s */
@rangeKBps[interval, this->sn, this->p] =
max(maxratei[args[0]->b_edev] - minratei[args[0]->b_edev]);
/* IO Rate histogram base10, limited at 1000000 KB/s | 976 MB/s */
/* 1000KB/s <= */
@ratelt1MBps[interval, this->sn, this->p] = sum(this->kb_per_sec <= 0x3e8 ? 1 : 0);
/* > 1000KB/s && 10,000KB/s <= */
@rate10MBps[interval, this->sn, this->p] = sum(this->kb_per_sec > 0x3e8 &&
this->kb_per_sec <= 0x2710 ? 1 : 0);
/* > 10,000KB/s && 100,000KB/s <= */
@rate100MBps[interval, this->sn, this->p] = sum(this->kb_per_sec > 0x2710 &&
this->kb_per_sec <= 0x186a0 ? 1 : 0);
/* > 100,000KB/s && 1,000,000 KB/s <= */
@rate1GBps[interval, this->sn, this->p] = sum(this->kb_per_sec > 0x186a0 &&
this->kb_per_sec <= 0xf4240 ? 1 : 0);
/*
* Collect minimum and maximum observed rate for later measurement
* of range for this metric.
*/
minratei[args[0]->b_edev] = minratei[args[0]->b_edev] == 0 ?
this->kb_per_sec : minratei[args[0]->b_edev] < this->kb_per_sec ?
minratei[args[0]->b_edev] : this->kb_per_sec;
maxratei[args[0]->b_edev] = maxratei[args[0]->b_edev] == 0 ?
this->kb_per_sec : maxratei[args[0]->b_edev] > this->kb_per_sec ?
maxratei[args[0]->b_edev] : this->kb_per_sec;
/* Actual Kbytes/sec per device */
@avKBps[interval, this->sn, this->p] = avg(this->kb_per_sec);
/* Average and Maximum Latency per device */
@avtime[interval, this->sn, this->p] = avg(this->elapsed);
@maxtime[interval, this->sn, this->p] = max(this->elapsed);
/*
* Latency histogram with buckets:
* >1000ms, >50 to 100ms, >25 to 50ms, >10 to 25ms, >1 to 10ms, 1ms<
*/
@timegt1000ms[interval, this->sn, this->p] = sum(
this->elapsed >= 1 * NSEC_PER_SEC ? 1 : 0);
@time100ms[interval, this->sn, this->p] = sum(
this->elapsed > 50 * NSEC_PER_MSEC &&
this->elapsed <= 100 * NSEC_PER_MSEC ? 1 : 0);
@time50ms[interval, this->sn, this->p] = sum(
this->elapsed > 25 * NSEC_PER_MSEC &&
this->elapsed <= 50 * NSEC_PER_MSEC ? 1 : 0);
@time25ms[interval, this->sn, this->p] = sum(
this->elapsed > 10 * NSEC_PER_MSEC &&
this->elapsed <= 25 * NSEC_PER_MSEC ? 1 : 0);
@time10ms[interval, this->sn, this->p] = sum(
this->elapsed > 1 * NSEC_PER_MSEC &&
this->elapsed <= 10 * NSEC_PER_MSEC ? 1 : 0);
@timelt1ms[interval, this->sn, this->p] = sum(
this->elapsed < 1 * NSEC_PER_MSEC ? 1 : 0);
/*
* Collect minimum and maximum io size for later measurement
* of range for this metric.
*/
miniosz[args[0]->b_edev] =
args[0]->b_bcount < miniosz[args[0]->b_edev] ?
args[0]->b_bcount : miniosz[args[0]->b_edev];
maxiosz[args[0]->b_edev] =
args[0]->b_bcount > maxiosz[args[0]->b_edev] ?
args[0]->b_bcount : maxiosz[args[0]->b_edev];
/* Measure IO size range in Kilobytes */
@rangeiosz[interval, this->sn, this->p] =
max((maxiosz[args[0]->b_edev] - miniosz[args[0]->b_edev]) >> 10);
/*
* Convert from bytes and nanoseconds to KB/s with 976562 to obtain
* avg. effective throughput, and maximum effective throughput.
* Maximum throughput is likely measuring cache effects.
*/
@tput[interval, this->sn, this->p] = avg(this->x / this->elapsed);
@maxtput[interval, this->sn, this->p] = max(this->x / this->elapsed);
iocnt[args[0]->b_edev, R] += args[0]->b_flags & B_READ ? 1 : 0;
iocnt[args[0]->b_edev, W] += args[0]->b_flags & B_WRITE ? 1 : 0;
/* Count number of IOs by IO-type */
@ctrd[interval, this->sn, this->p] = sum(args[0]->b_flags & B_READ ? 1 : 0);
@ctwr[interval, this->sn, this->p] = sum(args[0]->b_flags & B_WRITE ? 1 : 0);
@iops[interval, this->sn, this->p] =
count(); /* Normalized to per second in tick-X probe */
/* Maximum sampled IOPS per device */
@maxiops[interval, this->sn, this->p] =
max(iocnt[args[0]->b_edev, R] + iocnt[args[0]->b_edev, W]);
/*
* Collect minimum and maximum latency for later measurement
* of range for this metric.
*/
minlati[args[0]->b_edev, R] =
args[0]->b_flags & B_READ &&
this->elapsed < minlati[args[0]->b_edev, R] ? this->elapsed :
minlati[args[0]->b_edev, R];
maxlati[args[0]->b_edev, R] =
args[0]->b_flags & B_READ &&
this->elapsed > maxlati[args[0]->b_edev, R] ? this->elapsed :
maxlati[args[0]->b_edev, R];
minlati[args[0]->b_edev, W] =
(args[0]->b_flags & B_READ) == 0 &&
this->elapsed < minlati[args[0]->b_edev, W] ? this->elapsed :
minlati[args[0]->b_edev, W];
maxlati[args[0]->b_edev, W] =
(args[0]->b_flags & B_READ) == 0 &&
this->elapsed > maxlati[args[0]->b_edev, W] ? this->elapsed :
maxlati[args[0]->b_edev, W];
/*
* IOsize distribution not grouped by direction, i.e. no distinction
* is made between reads and writes. IO buckets double in size from
* previous bucket. i.e. 4, 8, 16, 32...
*/
this->bs = args[0]->b_bcount ;
/* 4K< */
@iosztiny[interval, this->sn, this->p] =
sum(this->bs < 0x1000 ? 1 : 0);
/* 4K to 8K< */
@iosz4k[interval, this->sn, this->p] =
sum(this->bs >= 0x1000 && this->bs < 0x2000 ? 1 : 0);
/* 8K to 16K< */
@iosz8k[interval, this->sn, this->p] =
sum(this->bs >= 0x2000 && this->bs < 0x4000 ? 1 : 0);
/* 16K to 32K< */
@iosz16k[interval, this->sn, this->p] =
sum(this->bs >= 0x4000 && this->bs < 0x8000 ? 1 : 0);
/* 32K to 64K< */
@iosz32k[interval, this->sn, this->p] =
sum(this->bs >= 0x8000 && this->bs < 0x10000 ? 1 : 0);
/* 64K to 128K< */
@iosz64k[interval, this->sn, this->p] =
sum(this->bs >= 0x10000 && this->bs < 0x20000 ? 1 : 0);
/* >128K */
@ioszbig[interval, this->sn, this->p] =
sum(this->bs >= 0x20000 ? 1 : 0);
/* Average IO size for given device */
@aviosz[interval, this->sn, this->p] = avg(this->bs);
/*
* Each time we observe an error at completion through B_ERROR flag,
* increment count of errors for given device. This should always
* be zero, assuming healthy device.
*/
@cterr[interval, this->sn, this->p] = sum(args[0]->b_flags & B_ERROR ? 1 : 0);
}
/*
* Entry controlled by timer. By design, each device will be registered
* here about once per second when there is even a litte bit of IO.
*/
io:::done
/ival_timer[args[0]->b_edev] > 0 &&
timestamp >= ival_timer[args[0]->b_edev] + NSEC_PER_SEC/ {
this->sn = args[1]->dev_statname;
this->p = substr(args[1]->dev_pathname, 25, 16);
this->p = (strstr(this->p, "disk@") == 0 ||
strstr(this->p, "disk@") == "") ? this->p : "NA";
/*
* Measure operating latency range in uS for Reads and Writes,
* storing largest observed difference.
*/
@rangerlat[interval, this->sn, this->p] =
max((maxlati[args[0]->b_edev, R] -
minlati[args[0]->b_edev, R]) / NSEC_PER_USEC);
@rangewlat[interval, this->sn, this->p] =
max((maxlati[args[0]->b_edev, W] -
minlati[args[0]->b_edev, W]) / NSEC_PER_USEC);
@avpending[interval, this->sn, this->p] = avg(pend[args[0]->b_edev]);
@maxpending[interval, this->sn, this->p] = max(pend[args[0]->b_edev]);
/* Reset various counters for next measurement period */
minlati[args[0]->b_edev, R] = 0;
maxlati[args[0]->b_edev, R] = 0;
minlati[args[0]->b_edev, W] = 0;
maxlati[args[0]->b_edev, W] = 0;
iocnt[args[0]->b_edev, R] = 0;
iocnt[args[0]->b_edev, W] = 0;
miniosz[args[0]->b_edev] = 0;
maxiosz[args[0]->b_edev] = 0;
minratei[args[0]->b_edev] = 0;
maxratei[args[0]->b_edev] = 0;
ival_timer[args[0]->b_edev] = timestamp;
}
/*
* Count number of retries issued to a disk. These are a good
* indicator of potentially failing, or borderline device.
* Under normal circumstances we should not expect
* this to be a positive value.
*/
::sd_set_retry_bp:entry
/xlate <devinfo_t *>(args[1])->dev_pathname != "<nfs>" &&
xlate <devinfo_t *>(args[1])->dev_pathname != "" / {
this->sn = xlate <devinfo_t *>(args[1])->dev_statname;
this->p = substr(xlate <devinfo_t *>(args[1])->dev_pathname, 25, 16);
this->p = (strstr(this->p, "disk@") == 0 ||
strstr(this->p, "disk@") == "") ? this->p : "NA";
@ctretry[interval, this->sn, this->p] = count();
}
tick-10sec {
/* First time we enter this clause, ticks will be 0, so we just assume 10 seconds */
this->elapsed = ticks > 0 ? (timestamp - ticks) / NSEC_PER_SEC : 10 ;
/* Normalize Data for correct per second reporting of rates, like IOPS */
normalize(@actv_tm, NSEC_PER_MSEC); /* from nanoseconds to milliseconds */
normalize(@iops, this->elapsed);
printa("%ld,%s,%s,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d\n",
@actv_tm, @rangerlat, @rangewlat, @totbytes, @tput, @maxtput,
@ctrd, @ctwr, @aviosz, @rangeiosz, @iops, @maxiops, @avKBps, @rangeKBps,
@ratelt1MBps, @rate10MBps, @rate100MBps, @rate1GBps, @avtime, @maxtime,
@timegt1000ms, @time100ms, @time50ms, @time25ms, @time10ms, @timelt1ms,
@iosztiny, @iosz4k, @iosz8k, @iosz16k, @iosz32k, @iosz64k, @ioszbig,
@avpending, @maxpending, @cterr, @ctretry);
trunc(@actv_tm); trunc(@rangerlat); trunc(@rangewlat); trunc(@totbytes);
trunc(@tput); trunc(@maxtput); trunc(@ctrd); trunc(@ctwr); trunc(@aviosz);
trunc(@rangeiosz); trunc(@iops); trunc(@maxiops); trunc(@avKBps);
trunc(@rangeKBps); trunc(@ratelt1MBps); trunc(@rate10MBps); trunc(@rate100MBps); trunc(@rate1GBps); trunc(@avtime); trunc(@maxtime);
trunc(@iosztiny); trunc(@iosz4k); trunc(@iosz8k); trunc(@iosz16k);
trunc(@iosz32k); trunc(@iosz64k); trunc(@ioszbig);
trunc(@avpending); trunc(@maxpending); trunc(@cterr); trunc(@ctretry);
trunc(@timegt1000ms); trunc(@time100ms); trunc(@time50ms); trunc(@time25ms);
trunc(@time10ms); trunc(@timelt1ms);
ticks = timestamp ;
interval = walltimestamp - (walltimestamp%1000000000);
}
#!/usr/sbin/dtrace -Cs
#pragma D option quiet
#pragma D option dynvarsize=10M
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*
* Copyright (c) 2017 Sam Zaydel / RackTop Systems.
*
* bw-tput-iops-actv-time-hist.d
* This script is meant to be a more informative replacement for the
* iostat utility. It measures some features that iostat in my opinion
* is not useful for. For example, iostat does not offer distributions
* at all, whereas here we plot distributions of bandwidth as observed
* for each disk. Also, iostat does not offer operating ranges for
* measurements, which here we define as max - min of some measurement.
*/
unsigned long minlati[dev_t, int], maxlati[dev_t, int]; /* Latency min and max */
unsigned long miniosz[dev_t], maxiosz[dev_t]; /* IO size min and max */
unsigned long long bufcnt[dev_t], ts[dev_t];
unsigned long long iocnt[dev_t, int];
self unsigned long int tottime[dev_t];
inline const int NSEC_PER_SEC = 1000000000;
inline const int SPA_MAXBLOCKSIZE = 128 << 10;
inline const int R = 0;
inline const int W = 1;
io:::start /ts[args[0]->b_edev] == 0/ {
ts[args[0]->b_edev] = timestamp;
/*
* We need to seed minimum value with something other than zero.
* Because this value is never signed, zero is by definition smallest
* value will result in range potentially spanning 0 => SPA_MAXBLOCKSIZE.
* As soon as any IO SPA_MAXBLOCKSIZE< is encountered, this value will
* update accordingly.
*/
miniosz[args[0]->b_edev] = SPA_MAXBLOCKSIZE;
}
io:::start {
start[args[0]->b_edev, args[0]->b_blkno] = timestamp;
}
io:::done /start[args[0]->b_edev, args[0]->b_blkno]/ {
this->sn = args[1]->dev_statname;
this->p = args[1]->dev_pathname;
this->x = args[0]->b_bcount * 976562;
this->elapsed = timestamp - start[args[0]->b_edev, args[0]->b_blkno];
bufcnt[args[0]->b_edev] += args[0]->b_bcount ; /* sum of bytes */
self->tottime[args[0]->b_edev] += this->elapsed;
start[args[0]->b_edev, args[0]->b_blkno] = 0;
/*
* Collect minimum and maximum io size for later measurement
* of range for this metric.
*/
miniosz[args[0]->b_edev] =
args[0]->b_bcount < miniosz[args[0]->b_edev] ?
args[0]->b_bcount : miniosz[args[0]->b_edev];
maxiosz[args[0]->b_edev] =
args[0]->b_bcount > maxiosz[args[0]->b_edev] ?
args[0]->b_bcount : maxiosz[args[0]->b_edev];
/*
* Convert from bytes and nanoseconds to KB/s with 976562 to obtain
* avg. effective throughput, and maximum effective throughput.
* Maximum throughput is likely measuring cache effects.
*/
@tput[this->sn, this->p] = avg(this->x / this->elapsed);
@maxtput[this->sn, this->p] = max(this->x / this->elapsed);
iocnt[args[0]->b_edev, R] += args[0]->b_flags & B_READ ? 1 : 0;
iocnt[args[0]->b_edev, W] += args[0]->b_flags & B_READ ? 0 : 1;
/*
* Collect minimum and maximum latency for later measurement
* of range for this metric.
*/
minlati[args[0]->b_edev, R] =
args[0]->b_flags & B_READ &&
this->elapsed < minlati[args[0]->b_edev, R] ? this->elapsed :
minlati[args[0]->b_edev, R];
maxlati[args[0]->b_edev, R] =
args[0]->b_flags & B_READ &&
this->elapsed > maxlati[args[0]->b_edev, R] ? this->elapsed :
maxlati[args[0]->b_edev, R];
minlati[args[0]->b_edev, W] =
(args[0]->b_flags & B_READ) == 0 &&
this->elapsed < minlati[args[0]->b_edev, W] ? this->elapsed :
minlati[args[0]->b_edev, W];
maxlati[args[0]->b_edev, W] =
(args[0]->b_flags & B_READ) == 0 &&
this->elapsed > maxlati[args[0]->b_edev, W] ? this->elapsed :
maxlati[args[0]->b_edev, W];
}
io:::done
/timestamp >= ts[args[0]->b_edev] + NSEC_PER_SEC/ {
this->sn = args[1]->dev_statname;
this->p = args[1]->dev_pathname;
this->delta = timestamp - ts[args[0]->b_edev];
/*
* Instead of converting nanoseconds to seconds we multiply
* the top by NSEC_PER_SEC and the divide by delta in nanoseconds.
*/
this->b_nsec = (bufcnt[args[0]->b_edev]) * NSEC_PER_SEC;
this->kb_per_sec = (this->b_nsec / this->delta) >> 10;
this->ioct = iocnt[args[0]->b_edev, R] + iocnt[args[0]->b_edev, W];
this->iops = (this->ioct * NSEC_PER_SEC) / this->delta;
/* Actual Kbytes/sec per device */
@bw[this->sn, this->p] = quantize(this->kb_per_sec);
@avbw[this->sn, this->p] = avg(this->kb_per_sec);
@maxbw[this->sn, this->p] = max(this->kb_per_sec);
/* Total Number of bytes per device */
@totbytes[this->sn, this->p] = sum(bufcnt[args[0]->b_edev]);
/* Total nanoseconds of active time per device */
@actv_tm[this->sn, this->p] = sum(self->tottime[args[0]->b_edev]);
/* Average IOPS per device */
@aviops[this->sn, this->p] = avg(this->iops);
@maxiops[this->sn, this->p] = max(this->iops);
/* Average IO size per device */
@aviosz[this->sn, this->p] = avg(args[0]->b_bcount);
/* Count number of IOs by IO-type */
@ctrd[this->sn, this->p] = sum(iocnt[args[0]->b_edev, R]);
@ctwr[this->sn, this->p] = sum(iocnt[args[0]->b_edev, W]);
/* Measure operating latency range in uS for Reads and Writes */
@rangerlat[this->sn, this->p] =
max((maxlati[args[0]->b_edev, R] -
minlati[args[0]->b_edev, R]) / 1000); /* largest difference stored */
@rangewlat[this->sn, this->p] =
max((maxlati[args[0]->b_edev, W] -
minlati[args[0]->b_edev, W]) / 1000); /* largest difference stored */
@rangeiosz[this->sn, this->p] =
max((maxiosz[args[0]->b_edev] - miniosz[args[0]->b_edev]) >> 10);
/* Reset various counters for next measurement period */
minlati[args[0]->b_edev, R] = 0;
maxlati[args[0]->b_edev, R] = 0;
minlati[args[0]->b_edev, W] = 0;
maxlati[args[0]->b_edev, W] = 0;
iocnt[args[0]->b_edev, R] = 0;
iocnt[args[0]->b_edev, W] = 0;
bufcnt[args[0]->b_edev] = 0;
self->tottime[args[0]->b_edev] = 0;
ts[args[0]->b_edev] = timestamp;
}
END {
normalize(@actv_tm, 1000000); /* from nanoseconds to milliseconds */
normalize(@totbytes, 1024); /* from bytes to kilobytes */
printa("\t %s (%s)\n\t Active Time: %@d(ms) | Lat. Range[max-min] Read: %@d(uS) / Write: %@d(uS)\n\t Volume: %@d(KB) | Tput Avg: %@d(KB/s) / Max: %@d(KB/s)\n\t IOs Read: %@d / Write: %@d | IO Size Avg: %@d(b) / Range[max-min]: %@d(KB)\n\t IOPS Avg: %@d | Max: %@d | Bandwidth Avg: %@d(KB/s) | Max: %@d(KB/s)\n\t Bandwidth Histogram(KB/s)%@d\n",
@actv_tm, @rangerlat, @rangewlat, @totbytes, @tput, @maxtput,
@ctrd, @ctwr, @aviosz, @rangeiosz, @aviops, @maxiops, @avbw, @maxbw, @bw);
}
// Identify devices to which SYNC CACHE commands are being sent
dtrace -qn '::sd_send_scsi_SYNCHRONIZE_CACHE:entry {print(stringof(args[0]->un_sd->sd_inq->inq_vid))}'
#pragma D option quiet
BEGIN
{
start = timestamp;
}
io:::start
{
ts[args[0]->b_edev, args[0]->b_lblkno] = timestamp;
}
io:::done
/ts[args[0]->b_edev, args[0]->b_lblkno]/
{
this->delta = (timestamp - ts[args[0]->b_edev, args[0]->b_lblkno]) / 1000;
this->name = (args[0]->b_flags & (B_READ | B_WRITE)) == B_READ ?
"read " : "write ";
@q[this->name] = quantize(this->delta);
@a[this->name] = avg(this->delta);
@v[this->name] = stddev(this->delta);
@i[this->name] = count();
@b[this->name] = sum(args[0]->b_bcount);
ts[args[0]->b_edev, args[0]->b_lblkno] = 0;
}
END
{
printa(@q);
normalize(@i, (timestamp - start) / 1000000000);
normalize(@b, (timestamp - start) / 1000000000 * 1024);
printf("%-30s %11s %11s %11s %11s\n", "", "avg latency", "stddev",
"iops", "throughput");
printa("%-30s %@9uus %@9uus %@9u/s %@8uk/s\n", @a, @v, @i, @b);
}
/*
# dtrace -s rw.d -c 'sleep 60'
read
value ------------- Distribution ------------- count
32 | 0
64 | 23
128 |@ 655
256 |@@@@ 1638
512 |@@ 743
1024 |@ 380
2048 |@@@ 1341
4096 |@@@@@@@@@@@@ 5295
8192 |@@@@@@@@@@@ 5033
16384 |@@@ 1297
32768 |@@ 684
65536 |@ 400
131072 | 225
262144 | 206
524288 | 127
1048576 | 19
2097152 | 0
write
value ------------- Distribution ------------- count
32 | 0
64 | 47
128 | 469
256 | 591
512 | 327
1024 | 924
2048 |@ 6734
4096 |@@@@@@@ 43416
8192 |@@@@@@@@@@@@@@@@@ 102013
16384 |@@@@@@@@@@ 60992
32768 |@@@ 20312
65536 |@ 6789
131072 | 860
262144 | 208
524288 | 153
1048576 | 36
2097152 | 0
avg latency stddev iops throughput
write 19442us 32468us 4064/s 261889k/s
read 23733us 88206us 301/s 13113k/s
*/
# Get a quick sense of the size(s) of IO, quantized, grouped by (device, Read/Write Ops).
dtrace -n 'io:::start {@[args[1]->dev_pathname, args[0]->b_flags & B_READ ? "READ" : "WRITE"] = quantize(args[0]->b_bcount);}'
dtrace -qn '
int activ[string];
BEGIN {
start = timestamp;
}
io:::start {
ts[args[0]->b_edev, args[0]->b_lblkno] = timestamp;
}
io:::done
/ts[args[0]->b_edev, args[0]->b_lblkno]/ {
this->delta =
(timestamp - ts[args[0]->b_edev, args[0]->b_lblkno]) / 1000;
this->name =
(args[0]->b_flags & (B_READ | B_WRITE)) == B_READ ?
"read" : "write";
/* Convert bytes to KB, to measure microseconds per KB */
this->kb = (args[0]->b_bcount>>10) ? (args[0]->b_bcount>>10) > 0 : 1;
@q[this->name] = quantize(this->delta);
/* Above graph should support this mean value. Largest count on histogram
* is expected to hover in the three buckets closest to the mean value.
*/
@a[this->name] = avg(this->delta);
@v[this->name] = stddev(this->delta);
@i[this->name] = count();
@b[this->name] = sum(args[0]->b_bcount);
/* Time spent per KB of IO. This is an unusual metric, but may prove
* useful at some point.
*/
@kbt[this->name] = quantize(this->delta / this->kb);
ts[args[0]->b_edev, args[0]->b_lblkno] = 0;
}
END {
this->elapsed = (timestamp - start) / 1000000000;
printa("\tOP: %s\n\ttime spent/KB (us): %@d\n", @kbt);
printa("\tOP: %s\n\tlatency (us): %@d\n", @q);
/* When there is very little IO of a given type, normalization can create
* an unacceptable degree of error. We are normalizing over some period of
* time, and if we have lots of IO for the given command over a short
* interval but the script runs for a much longer period of time,
* we will normalize over that time period, which can create incorrect
* and perhaps confusing results.
*/
normalize(@i, this->elapsed);
normalize(@b, this->elapsed * 1024);
printf("%-30s %11s %11s %11s %11s\n", "", "avg.latency", "stddev",
"IOPS", "throughput");
printa("%-30s %@9uus %@9uus %@9u/s %@8uKB/s\n", @a, @v, @i, @b);
}'
// Is IO aligned on 4K boundary? This is critical for SSDs, and could be impacting
// performance to a lesser degree of mechanical devices.
dtrace -qn '
io:::start {
@[args[0]->b_flags & B_WRITE ? "W" : "R"] = quantize(args[0]->b_blkno % 8);
}
END {
printa("OP: (%s) %@d\n", @)
}'
dtrace -qn 'BEGIN {
x[123] = 0;
}
io:::start {
x[123] = 1;
start[arg0] = timestamp;
}
io:::done /this->start = start[arg0]/ {
@delta[args[0]->b_flags & B_WRITE ? "WRITE" : "READ", args[1]->dev_pathname] = quantize((timestamp - this->start) / 1000); self->ts = 0;
start[arg0] = 0;
}
tick-30sec /x[123]/ {
printa("%12s (us) Device: %s %@d\n", @delta);
trunc(@delta);
x[123] = 0;
}'
// Measure throughput per device and get some useful statistics
// in a form of a summary that looks like this:
// +=======================================+
// |=============== Summary ===============|
// +=======================================+
// Start Time: 2017 Jul 13 18:35:46
// Human Time: 3.972 seconds
// Active (All drives): 5.460 seconds
// Spent 0% of elapsed time Reading
// Spent 100% of elapsed time Writing
// Average Write throughput: 76993KB/sec
// Burst Write throughput: 468351KB/sec
// =========================================
//
dtrace -qn '
inline const int NSEC_IN_SEC = 1000000000;
BEGIN {
starttime = walltimestamp;
runtime = timestamp;
busytime = 0;
readtime = 0;
writetime = 0;
}
io:::start {
start[args[0]->b_edev, args[0]->b_blkno] = timestamp;
}
io:::done
/start[args[0]->b_edev, args[0]->b_blkno]/ {
this->devp = strtok(args[1]->dev_pathname, ":");
this->elapsed = timestamp -
start[args[0]->b_edev, args[0]->b_blkno];
/*
* Odd looking calculation below is to go from bytes and nanoseconds
* to kilobytes(KB) and seconds. bytes/ns => KB/sec.
*/
@cumtp[args[0]->b_flags & B_READ ? "Read" : "Write"] =
avg((args[0]->b_bcount * 976562) / this->elapsed);
@cumtp_max[args[0]->b_flags & B_READ ? "Read" : "Write"] =
max((args[0]->b_bcount * 976562) / this->elapsed);
@tp[args[1]->dev_statname, this->devp] =
quantize((args[0]->b_bcount * 976562) / this->elapsed);
readtime += args[0]->b_flags & B_READ ? this->elapsed : 0;
writetime += args[0]->b_flags & B_WRITE ? this->elapsed : 0;
start[args[0]->b_edev, args[0]->b_blkno] = 0;
}
END {
runtime = timestamp - runtime;
/*
* Small amount of rounding-off error is possible here,
* so the two percentages may not add-up to 100% exactly.
*/
pctrd = readtime * 100 / (readtime + writetime);
pctwr = writetime * 100 / (readtime + writetime);
printf("+=======================================+\n");
printf("|=============== Summary ===============|\n");
printf("+=======================================+\n");
printf("Start Time: %Y\n", starttime);
printf("Human Time: %3d.%03d seconds\n",
runtime / NSEC_IN_SEC,
(runtime / 1000) % 1000);
printf("Active (All drives): %3d.%03d seconds\n",
(readtime+writetime) / NSEC_IN_SEC,
((readtime+writetime) / 1000) % 1000);
printf("Spent %3d%% of elapsed time Reading\n", pctrd);
printf("Spent %3d%% of elapsed time Writing\n", pctwr);
printa("Average %-5s throughput: %@dKB/sec\n", @cumtp);
printa("Burst %-5s throughput: %@dKB/sec\n", @cumtp_max);
printf("=========================================\n\n");
printa(" %s (%s)\n\t KB/s%@d\n", @tp);
}'
#!/usr/sbin/dtrace -s
#pragma D option quiet
BEGIN
{
start = timestamp;
}
io:::start
{
ts[args[0]->b_edev, args[0]->b_lblkno] = timestamp;
}
io:::done
/ts[args[0]->b_edev, args[0]->b_lblkno]/
{
this->delta = (timestamp - ts[args[0]->b_edev, args[0]->b_lblkno]) / 1000;
this->name = (args[0]->b_flags & (B_READ | B_WRITE)) == B_READ ?
"read " : "write ";
@q[this->name] = quantize(this->delta);
@a[this->name] = avg(this->delta);
@v[this->name] = stddev(this->delta);
@i[this->name] = count();
@b[this->name] = sum(args[0]->b_bcount);
ts[args[0]->b_edev, args[0]->b_lblkno] = 0;
}
END
{
printa(@q);
normalize(@i, (timestamp - start) / 1000000000);
normalize(@b, (timestamp - start) / 1000000000 * 1024);
printf("%-30s %11s %11s %11s %11s\n", "", "avg latency", "stddev",
"iops", "throughput");
printa("%-30s %@9uus %@9uus %@9u/s %@8uk/s\n", @a, @v, @i, @b);
}
#!/usr/sbin/dtrace -s
/*
* sdretry.d
*
* Example script from Chapter 4 of the book: DTrace: Dynamic Tracing in
* Oracle Solaris, Mac OS X, and FreeBSD", by Brendan Gregg and Jim Mauro,
* Prentice Hall, 2011. ISBN-10: 0132091518. http://dtracebook.com.
*
* See the book for the script description and warnings. Many of these are
* provided as example solutions, and will need changes to work on your OS.
*/
#pragma D option quiet
dtrace:::BEGIN
{
printf("Tracing... output every 10 seconds.\n");
}
fbt::sd_set_retry_bp:entry
{
@[xlate <devinfo_t *>(args[1])->dev_statname,
xlate <devinfo_t *>(args[1])->dev_major,
xlate <devinfo_t *>(args[1])->dev_minor] = count();
}
tick-10sec
{
printf("\n%Y:\n", walltimestamp);
printf("%28s %-3s,%-4s %s\n", "DEVICE", "MAJ", "MIN", "RETRIES");
printa("%28s %-03d,%-4d %@d\n", @);
trunc(@);
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment