@szaydel
Last active January 2, 2019 16:07
Observations on OpenZFS write throttling and on the ZIO pipeline more generally. The new write throttle in OpenZFS is well explained in this post by Adam Leventhal: http://blog.delphix.com/ahl/2014/tuning-openzfs-write-throttle/. These snippets borrow from the ideas described there.
#!/usr/sbin/dtrace -qCs

#define UPPER_THRESHOLD 100

/* See zio.h, line 147 or thereabouts. */
enum zio_priority {
	ZIO_PRIORITY_SYNC_READ,
	ZIO_PRIORITY_SYNC_WRITE, /* ZIL */
	ZIO_PRIORITY_ASYNC_READ, /* prefetch */
	ZIO_PRIORITY_ASYNC_WRITE, /* spa_sync() */
	ZIO_PRIORITY_SCRUB, /* asynchronous scrub/resilver reads */
	ZIO_PRIORITY_NUM_QUEUEABLE,
	ZIO_PRIORITY_NOW /* non-queued i/os (e.g. free) */
};

BEGIN {
	qnames[ZIO_PRIORITY_SYNC_READ] = "Sync Read";
	qnames[ZIO_PRIORITY_SYNC_WRITE] = "Sync Write";
	qnames[ZIO_PRIORITY_ASYNC_READ] = "Async Read";
	qnames[ZIO_PRIORITY_ASYNC_WRITE] = "Async Write";
	qnames[ZIO_PRIORITY_SCRUB] = "Scrub";
	qnames[ZIO_PRIORITY_NUM_QUEUEABLE] = "Not Queueable";
	qnames[ZIO_PRIORITY_NOW] = "Not Yet Queued";
}

::vdev_queue_io_add:entry {
	this->pri = args[1]->io_priority; /* one of zio_priority above */
	this->spa = args[1]->io_spa;
	/*
	 * For the queue class this I/O is being added to, how many I/Os
	 * are already active? Grouped by pool and class name.
	 */
	@lq[stringof(this->spa->spa_name), qnames[this->pri]] =
	    lquantize(args[0]->vq_class[this->pri].vqc_active, 0, UPPER_THRESHOLD);
}

END {
	printa("\t %s | [%s]%@d\n", @lq);
}
dtrace -qn '
/* See zio.h, line 147 or thereabouts. */
enum zio_priority {
	ZIO_PRIORITY_SYNC_READ,
	ZIO_PRIORITY_SYNC_WRITE, /* ZIL */
	ZIO_PRIORITY_ASYNC_READ, /* prefetch */
	ZIO_PRIORITY_ASYNC_WRITE, /* spa_sync() */
	ZIO_PRIORITY_SCRUB, /* asynchronous scrub/resilver reads */
	ZIO_PRIORITY_NUM_QUEUEABLE,
	ZIO_PRIORITY_NOW /* non-queued i/os (e.g. free) */
};

BEGIN {
	qnames[ZIO_PRIORITY_SYNC_READ] = "Sync Read";
	qnames[ZIO_PRIORITY_SYNC_WRITE] = "Sync Write";
	qnames[ZIO_PRIORITY_ASYNC_READ] = "Async Read";
	qnames[ZIO_PRIORITY_ASYNC_WRITE] = "Async Write";
	qnames[ZIO_PRIORITY_SCRUB] = "Scrub";
	qnames[ZIO_PRIORITY_NUM_QUEUEABLE] = "Not Queueable";
	qnames[ZIO_PRIORITY_NOW] = "Not Yet Queued";
}

::vdev_queue_io_add:entry {
	this->pri = args[1]->io_priority; /* one of zio_priority above */
	/*
	 * For the queue class this I/O is being added to, how many I/Os
	 * are already active?
	 */
	@lq[qnames[this->pri]] =
	    lquantize(args[0]->vq_class[this->pri].vqc_active, 0, 100);
}'
dtrace -qn '
/*
 * Measure the amount of dirty data and whether or not I/O is going to be
 * delayed by some fraction of time.
 * Increasing the amount of dirty data allowed before the throttle kicks in
 * will likely increase latency, but pushes the disks harder, which may
 * increase throughput.
 * To adjust, tune zfs_dirty_data_max, the upper bound on how much dirty
 * data is permitted, e.g.:
 * # echo zfs_dirty_data_max/Z0x400000000|mdb -kw
 */
::dsl_pool_need_dirty_delay:entry {
	self->spa = args[0]->dp_spa;
	self->dirty = args[0]->dp_dirty_total >> 20; /* in megabytes */
}

::dsl_pool_need_dirty_delay:return /self->spa/ {
	this->res = args[1] > 0 ? "Yes" : "No";
	@[stringof(self->spa->spa_name), this->res] =
	    quantize(self->dirty);
	self->spa = 0;
	self->dirty = 0;
}

END { printa(" [%s] | Throttle [%s] %@d\n", @); }
'
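Before raising zfs_dirty_data_max as shown in the comment above, it may be worth recording the current settings first. A minimal sketch, assuming an illumos-derived kernel where zfs_dirty_data_max is a 64-bit integer and zfs_delay_min_dirty_percent a 32-bit integer:
# echo zfs_dirty_data_max/E | mdb -k           # current limit, in bytes
# echo zfs_delay_min_dirty_percent/D | mdb -k  # percent of the limit at which delays begin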
// Print the name of each pool and a distribution of the amount of dirty data (in MB) for that pool.
dtrace -qn '::dsl_pool_need_dirty_delay:entry {@[args[0]->dp_spa->spa_name] = quantize(args[0]->dp_dirty_total >> 20);}'
dtrace -qn '
/*
 * Report average and maximum dirty data (MB) per pool every 5 seconds,
 * skipping the pool named "bp" (adjust that name to taste).
 */
::dsl_pool_need_dirty_delay:entry
/stringof(args[0]->dp_spa->spa_name) != "bp"/ {
	@av[args[0]->dp_spa->spa_name] = avg(args[0]->dp_dirty_total >> 20);
	@m[args[0]->dp_spa->spa_name] = max(args[0]->dp_dirty_total >> 20);
}

tick-5sec {
	printa("%s,%@dMB,%@dMB\n", @av, @m);
	trunc(@av);
	trunc(@m);
}

END {
	printa("%s,%@dMB,%@dMB\n", @av, @m);
}'
dtrace -qn '
int ct, slowct, xslowct;
inline int LIMIT = 1000000; /* threshold in nanoseconds (1 ms) */

::dmu_tx_wait:entry {
	self->spa = args[0]->tx_pool->dp_spa;
	self->txbirth = args[0]->tx_start;
}

::dmu_tx_wait:return /self->txbirth/ {
	this->delta = timestamp - self->txbirth;
	this->slow = this->delta >= LIMIT;
	this->xslow = this->delta >= LIMIT * 10;
	this->sn = (char *)self->spa->spa_name;
	@hist[stringof(this->sn)] = quantize(this->delta);
	@avdelay[stringof(this->sn)] = avg(this->delta);
	@maxdelay[stringof(this->sn)] = max(this->delta);
	@sddelay[stringof(this->sn)] = stddev(this->delta);
	++ct;
	slowct += this->slow ? 1 : 0;
	xslowct += this->xslow ? 1 : 0;
	self->txbirth = 0;
	self->spa = 0;
}

END /ct > 0/ {
	printf(" %% TX(s) delayed by < %d(us): %d\n",
	    LIMIT / 1000, 100 - (100 * slowct) / ct);
	printf(" %% TX(s) delayed by >= %d(ms): %d\n",
	    LIMIT / 100000, (100 * xslowct) / ct);
	printa(" [%s] Delay [Mean|Max|SD](ns): %@d|%@d|%@d %@d",
	    @avdelay, @maxdelay, @sddelay, @hist);
}'
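The two percentages reported at the end are measured against LIMIT and LIMIT*10 (here 1 ms and 10 ms); the per-pool lines give the mean, maximum and standard deviation of the time elapsed since each transaction's tx_start, followed by a histogram. To measure against a different threshold, edit the inline definition, e.g. (an arbitrary example) inline int LIMIT = 10000000; for a 10 ms threshold.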
dtrace -qn '
::dmu_tx_wait:entry {
	self->spa = args[0]->tx_pool->dp_spa;
	self->s = timestamp;
	self->wasdelayed = args[0]->tx_waited;
}

::dmu_tx_wait:return /self->s/ {
	this->unlucky = self->wasdelayed > 0 ? "Yes" : "No";
	this->sn = (char *)self->spa->spa_name;
	@[stringof(this->sn), this->unlucky] = quantize(timestamp - self->s);
	self->s = 0;
	self->spa = 0;
	self->wasdelayed = 0;
}

END { printa(" [%s] | TX delayed more than once [%s] %@d\n", @); }
'
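As far as I can tell, tx_waited is set once a transaction has already been made to wait, so the "Yes" histograms show the additional wait times of transactions being delayed repeatedly, while "No" covers first-time waits.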
#!/usr/sbin/dtrace -s -C

#pragma D option quiet

/*
 * See /usr/src/uts/common/fs/zfs/sys/zio_impl.h
 * for more details about this enum.
 */
enum zio_stage {
	ZIO_STAGE_OPEN = 1 << 0, /* RWFCI */
	ZIO_STAGE_READ_BP_INIT = 1 << 1, /* R---- */
	ZIO_STAGE_WRITE_BP_INIT = 1 << 2, /* -W--- */
	ZIO_STAGE_FREE_BP_INIT = 1 << 3, /* --F-- */
	ZIO_STAGE_ISSUE_ASYNC = 1 << 4, /* RWF-- */
	ZIO_STAGE_WRITE_COMPRESS = 1 << 5, /* -W--- */
	ZIO_STAGE_CHECKSUM_GENERATE = 1 << 6, /* -W--- */
	ZIO_STAGE_NOP_WRITE = 1 << 7, /* -W--- */
	ZIO_STAGE_DDT_READ_START = 1 << 8, /* R---- */
	ZIO_STAGE_DDT_READ_DONE = 1 << 9, /* R---- */
	ZIO_STAGE_DDT_WRITE = 1 << 10, /* -W--- */
	ZIO_STAGE_DDT_FREE = 1 << 11, /* --F-- */
	ZIO_STAGE_GANG_ASSEMBLE = 1 << 12, /* RWFC- */
	ZIO_STAGE_GANG_ISSUE = 1 << 13, /* RWFC- */
	ZIO_STAGE_DVA_THROTTLE = 1 << 14, /* -W--- */
	ZIO_STAGE_DVA_ALLOCATE = 1 << 15, /* -W--- */
	ZIO_STAGE_DVA_FREE = 1 << 16, /* --F-- */
	ZIO_STAGE_DVA_CLAIM = 1 << 17, /* ---C- */
	ZIO_STAGE_READY = 1 << 18, /* RWFCI */
	ZIO_STAGE_VDEV_IO_START = 1 << 19, /* RW--I */
	ZIO_STAGE_VDEV_IO_DONE = 1 << 20, /* RW--I */
	ZIO_STAGE_VDEV_IO_ASSESS = 1 << 21, /* RW--I */
	ZIO_STAGE_CHECKSUM_VERIFY = 1 << 22, /* R---- */
	ZIO_STAGE_DONE = 1 << 23 /* RWFCI */
};
BEGIN {
	s[NULL] = "null";
	s[ZIO_STAGE_OPEN] = "open";
	s[ZIO_STAGE_READ_BP_INIT] = "read bp init";
	s[ZIO_STAGE_WRITE_BP_INIT] = "write bp init";
	s[ZIO_STAGE_FREE_BP_INIT] = "free bp init";
	s[ZIO_STAGE_ISSUE_ASYNC] = "issue async";
	s[ZIO_STAGE_WRITE_COMPRESS] = "write compress";
	s[ZIO_STAGE_CHECKSUM_GENERATE] = "checksum generate";
	s[ZIO_STAGE_NOP_WRITE] = "nop write";
	s[ZIO_STAGE_DDT_READ_START] = "ddt read start";
	s[ZIO_STAGE_DDT_READ_DONE] = "ddt read done";
	s[ZIO_STAGE_DDT_WRITE] = "ddt write";
	s[ZIO_STAGE_DDT_FREE] = "ddt free";
	s[ZIO_STAGE_GANG_ASSEMBLE] = "gang assemble";
	s[ZIO_STAGE_GANG_ISSUE] = "gang issue";
	s[ZIO_STAGE_DVA_THROTTLE] = "dva throttle";
	s[ZIO_STAGE_DVA_ALLOCATE] = "dva alloc";
	s[ZIO_STAGE_DVA_FREE] = "dva free";
	s[ZIO_STAGE_DVA_CLAIM] = "dva claim";
	s[ZIO_STAGE_READY] = "ready";
	s[ZIO_STAGE_VDEV_IO_START] = "vdev io start";
	s[ZIO_STAGE_VDEV_IO_DONE] = "vdev io done";
	s[ZIO_STAGE_VDEV_IO_ASSESS] = "vdev io assess";
	s[ZIO_STAGE_CHECKSUM_VERIFY] = "chksum verify";
	s[ZIO_STAGE_DONE] = "done";
}
::zio_execute:entry {
	self->zio = args[0];
}

::zio_execute:return /self->zio/ {
	/*
	 * Count completed zio_execute() calls by whether the zio has a
	 * waiter, an executor, a vdev, and by the stage it stopped at.
	 * (Use a thread-local, not clause-local, variable here: this->
	 * values do not carry over from the entry probe to the return probe.)
	 */
	this->w = self->zio->io_waiter != NULL ? 1 : 0;
	this->e = self->zio->io_executor != NULL ? 1 : 0;
	this->st = self->zio->io_stage;
	this->vd = self->zio->io_vd != NULL ? 1 : 0;
	@[this->w, this->e, this->vd, s[this->st]] = count();
	self->zio = 0;
}
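Each aggregation key is a tuple of: waiter present (1/0), executor present (1/0), vdev present (1/0), and the name of the stage the zio was in when zio_execute() returned, with a count of how often each combination occurred. With no END clause, the aggregation is printed automatically when the script is interrupted.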
dtrace -qn '
BEGIN {
	/*
	 * This is the limit on dirty data before the throttle is engaged.
	 * Dirty bytes above this limit cause the throttle mechanism to start
	 * delaying writes, gently at first and increasingly as the amount of
	 * dirty data grows.
	 */
	dirty_no_throttle_limit =
	    (`zfs_delay_min_dirty_percent * `zfs_dirty_data_max) / 100;
	printf("timestamp,pool,mb.dirty,max.mb.dirty,pct.dirty.used,throttle\n");
}

::txg_sync_thread:txg-syncing {
	this->dp = (dsl_pool_t *)arg0;
	this->spa = (char *)this->dp->dp_spa->spa_name;
}

::txg_sync_thread:txg-syncing
/this->dp->dp_spa->spa_name != NULL &&
 stringof(this->dp->dp_spa->spa_name) != "bp"/ {
	this->thr =
	    (this->dp->dp_dirty_total >= dirty_no_throttle_limit) ? 1 : 0;
	this->dd_actual_mb = this->dp->dp_dirty_total >> 20;
	this->dd_limit_mb = `zfs_dirty_data_max >> 20;
	this->dd_pct = (100 * this->dd_actual_mb) / this->dd_limit_mb;
	printf("%Y,%s,%d,%d,%d,%d\n",
	    walltimestamp,
	    stringof(this->spa),
	    this->dd_actual_mb,
	    this->dd_limit_mb,
	    this->dd_pct,
	    this->thr);
}'
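As a worked example (the defaults below are assumptions about a typical illumos system, not values read from this one): with zfs_delay_min_dirty_percent = 60 and zfs_dirty_data_max capped at 4 GiB, dirty_no_throttle_limit = 60 * 4096 MB / 100 ≈ 2458 MB, so the throttle column flips to 1 once a pool is syncing roughly 2.4 GB of dirty data.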
dtrace -qn '
#pragma D option dynvarsize=8M

/*
 * Time spent in a single zio_execute() call, i.e. the portion of the ZIO
 * pipeline executed synchronously by this thread.
 */
::zio_execute:entry {
	self->t = timestamp;
}

::zio_execute:return /self->t/ {
	@ = quantize(timestamp - self->t);
	self->t = 0;
}'
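This quantizes the wall-clock duration of individual zio_execute() calls. Since zio_execute() returns as soon as a stage hands the zio off (for example to a taskq or to vdev I/O), the distribution roughly reflects the synchronously executed portion of the pipeline rather than end-to-end zio latency.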