Last active
January 2, 2019 16:07
-
-
Save szaydel/77d40f65658f051567be to your computer and use it in GitHub Desktop.
OpenZFS write throttling observations and other observations around the ZIO pipeline. New write throttle in OpenZFS is well explained in this post by Adam: http://blog.delphix.com/ahl/2014/tuning-openzfs-write-throttle/. These snippets are borrowing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/sbin/dtrace -qCs | |
#define UPPER_THRESHOLD 100 | |
/* See zio.h line 147 or thereabout */ | |
enum zio_priority { | |
ZIO_PRIORITY_SYNC_READ, | |
ZIO_PRIORITY_SYNC_WRITE, /* ZIL */ | |
ZIO_PRIORITY_ASYNC_READ, /* prefetch */ | |
ZIO_PRIORITY_ASYNC_WRITE, /* spa_sync() */ | |
ZIO_PRIORITY_SCRUB, /* asynchronous scrub/resilver reads */ | |
ZIO_PRIORITY_NUM_QUEUEABLE, | |
ZIO_PRIORITY_NOW /* non-queued i/os (e.g. free) */ | |
}; | |
BEGIN { | |
qnames[ZIO_PRIORITY_SYNC_READ] = "Sync Read"; | |
qnames[ZIO_PRIORITY_SYNC_WRITE] = "Sync Write"; | |
qnames[ZIO_PRIORITY_ASYNC_READ] = "Async Read"; | |
qnames[ZIO_PRIORITY_ASYNC_WRITE] = "Async Write"; | |
qnames[ZIO_PRIORITY_SCRUB] = "Scrub"; | |
qnames[ZIO_PRIORITY_NUM_QUEUEABLE] = "Not Queueable"; | |
qnames[ZIO_PRIORITY_NOW] = "Not Yet Queued"; | |
} | |
::vdev_queue_io_add:entry { | |
this->pri = args[1]->io_priority; /* one of zio_priority above */ | |
this->spa = args[1]->io_spa; | |
/* | |
* For queue to which this IO is being queued, | |
* how many IOs are already active, grouped-by pool? | |
*/ | |
@lq[stringof(this->spa->spa_name), qnames[this->pri]] = | |
lquantize(args[0]->vq_class[this->pri].vqc_active, 0, UPPER_THRESHOLD); | |
} | |
END { | |
printa("\t %s | [%s]%@d\n", @lq); | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
dtrace -qn ' | |
/* See zio.h line 147 or thereabout */ | |
enum zio_priority { | |
ZIO_PRIORITY_SYNC_READ, | |
ZIO_PRIORITY_SYNC_WRITE, /* ZIL */ | |
ZIO_PRIORITY_ASYNC_READ, /* prefetch */ | |
ZIO_PRIORITY_ASYNC_WRITE, /* spa_sync() */ | |
ZIO_PRIORITY_SCRUB, /* asynchronous scrub/resilver reads */ | |
ZIO_PRIORITY_NUM_QUEUEABLE, | |
ZIO_PRIORITY_NOW /* non-queued i/os (e.g. free) */ | |
}; | |
BEGIN { | |
qnames[ZIO_PRIORITY_SYNC_READ] = "Sync Read"; | |
qnames[ZIO_PRIORITY_SYNC_WRITE] = "Sync Write"; | |
qnames[ZIO_PRIORITY_ASYNC_READ] = "Async Read"; | |
qnames[ZIO_PRIORITY_ASYNC_WRITE] = "Async Write"; | |
qnames[ZIO_PRIORITY_SCRUB] = "Scrub"; | |
qnames[ZIO_PRIORITY_NUM_QUEUEABLE] = "Not Queueable"; | |
qnames[ZIO_PRIORITY_NOW] = "Not Yet Queued"; | |
} | |
::vdev_queue_io_add:entry { | |
this->pri = args[1]->io_priority; /* one of zio_priority above */ | |
/* | |
* For queue to which this IO is being queued, | |
* how many IOs are already active? | |
*/ | |
@lq[qnames[this->pri]] = lquantize(args[0]->vq_class[this->pri].vqc_active, 0, 100); | |
}' |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
dtrace -qn ' | |
/* Measure amount of dirty data and whether or not IO is going to be | |
* delayed by some fraction of time. | |
* Adjusting amount of allowed dirty data before throttle kicks in | |
* will likely increase latency, and would result in pushing disks harder | |
* which potentially is going to increase throughput. | |
* To adjust, tune zfs_dirty_data_max, which is the upper bound for how | |
* much dirty data is permitted. | |
* # echo zfs_dirty_data_max/Z0x400000000|mdb -kw | |
*/ | |
::dsl_pool_need_dirty_delay:entry { | |
self->spa = args[0]->dp_spa ; | |
self->dirty = args[0]->dp_dirty_total >> 20 ; /* In Megabytes */ | |
} | |
::dsl_pool_need_dirty_delay:return /self->spa/ { | |
this->res = args[1] > 0 ? "Yes" : "No" ; | |
@[stringof(self->spa->spa_name), this->res] = | |
quantize(self->dirty) ; | |
self->spa = 0 ; | |
self->dirty = 0; | |
} | |
END { printa(" [%s] | Throttle [%s] %@d\n", @) ; } | |
' |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Print name of pool and amount of dirty data for that pool. | |
dtrace -qn '::dsl_pool_need_dirty_delay:entry {@[args[0]->dp_spa->spa_name] = quantize(args[0]->dp_dirty_total >> 20);}' | |
dtrace -qn ' | |
::dsl_pool_need_dirty_delay:entry /args[0]->dp_spa->spa_name != "bp"/ { | |
@av[args[0]->dp_spa->spa_name] = avg(args[0]->dp_dirty_total >> 20); | |
@m[args[0]->dp_spa->spa_name] = max(args[0]->dp_dirty_total >> 20); | |
} | |
tick-5sec { | |
printa("%s,%@dMB,%@dMB\n", @av,@m); trunc(@av); trunc(@m); | |
} | |
END { | |
printa("%s,%@dMB,%@dMB\n", @av,@m); | |
}' |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
dtrace -qn ' | |
int limit, ct, slowct, xslowct ; | |
inline int LIMIT = 1000000 ; /* limit in nanoseconds */ | |
::dmu_tx_wait:entry { | |
self->spa = args[0]->tx_pool->dp_spa ; | |
self->txbirth = args[0]->tx_start ; | |
} | |
::dmu_tx_wait:return /self->txbirth/ { | |
this->delta = timestamp - self->txbirth ; | |
this->slow = this->delta >= LIMIT ; | |
this->xslow = this->delta >= LIMIT*10 ; | |
this->sn = (char *)self->spa->spa_name ; | |
@hist[stringof(this->sn)] = quantize(this->delta) ; | |
@avdelay[stringof(this->sn)] = avg(this->delta) ; | |
@maxdelay[stringof(this->sn)] = max(this->delta) ; | |
@sddelay[stringof(this->sn)] = stddev(this->delta) ; | |
++ct ; | |
slowct += this->slow ? 1 : 0 ; | |
xslowct += this->xslow ? 1 : 0 ; | |
self->txbirth = 0 ; self->spa = 0 ; | |
} | |
END /ct > 0/ { | |
printf(" %% TX(s) delayed by <= %d(us): %d\n", | |
LIMIT/1000, 100 - (100*slowct)/ct) ; | |
printf(" %% TX(s) delayed by >= %d(ms): %d\n", | |
LIMIT/100000, (100*xslowct)/ct) ; | |
printa(" [%s] Delay [Mean|Max|SD](ns): %@d|%@d|%@d %@d", | |
@avdelay, @maxdelay, @sddelay, @hist) ; | |
}' |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
dtrace -qn ' | |
::dmu_tx_wait:entry { | |
self->spa = args[0]->tx_pool->dp_spa ; | |
self->s = timestamp ; | |
self->wasdelayed = args[0]->tx_waited ; | |
} | |
::dmu_tx_wait:return /self->s/ { | |
this->unlucky = self->wasdelayed > 0 ? "Yes" : "No" ; | |
this->sn = (char *)self->spa->spa_name ; | |
@[stringof(this->sn), this->unlucky] = quantize(timestamp - self->s) ; | |
self->s = 0 ; self->spa = 0 ; self->wasdelayed = 0 ; | |
} | |
END { printa(" [%s] | TX with >1 Delay [%s] %@d\n", @) ; } | |
' |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/sbin/dtrace -s -C | |
#pragma D option quiet | |
/* | |
* See /usr/src/uts/common/fs/zfs/sys/zio_impl.h | |
* for more details about this enum. | |
*/ | |
enum zio_stage { | |
ZIO_STAGE_OPEN = 1 << 0, /* RWFCI */ | |
ZIO_STAGE_READ_BP_INIT = 1 << 1, /* R---- */ | |
ZIO_STAGE_WRITE_BP_INIT = 1 << 2, /* -W--- */ | |
ZIO_STAGE_FREE_BP_INIT = 1 << 3, /* --F-- */ | |
ZIO_STAGE_ISSUE_ASYNC = 1 << 4, /* RWF-- */ | |
ZIO_STAGE_WRITE_COMPRESS = 1 << 5, /* -W--- */ | |
ZIO_STAGE_CHECKSUM_GENERATE = 1 << 6, /* -W--- */ | |
ZIO_STAGE_NOP_WRITE = 1 << 7, /* -W--- */ | |
ZIO_STAGE_DDT_READ_START = 1 << 8, /* R---- */ | |
ZIO_STAGE_DDT_READ_DONE = 1 << 9, /* R---- */ | |
ZIO_STAGE_DDT_WRITE = 1 << 10, /* -W--- */ | |
ZIO_STAGE_DDT_FREE = 1 << 11, /* --F-- */ | |
ZIO_STAGE_GANG_ASSEMBLE = 1 << 12, /* RWFC- */ | |
ZIO_STAGE_GANG_ISSUE = 1 << 13, /* RWFC- */ | |
ZIO_STAGE_DVA_THROTTLE = 1 << 14, /* -W--- */ | |
ZIO_STAGE_DVA_ALLOCATE = 1 << 15, /* -W--- */ | |
ZIO_STAGE_DVA_FREE = 1 << 16, /* --F-- */ | |
ZIO_STAGE_DVA_CLAIM = 1 << 17, /* ---C- */ | |
ZIO_STAGE_READY = 1 << 18, /* RWFCI */ | |
ZIO_STAGE_VDEV_IO_START = 1 << 19, /* RW--I */ | |
ZIO_STAGE_VDEV_IO_DONE = 1 << 20, /* RW--I */ | |
ZIO_STAGE_VDEV_IO_ASSESS = 1 << 21, /* RW--I */ | |
ZIO_STAGE_CHECKSUM_VERIFY = 1 << 22, /* R---- */ | |
ZIO_STAGE_DONE = 1 << 23 /* RWFCI */ | |
}; | |
BEGIN { | |
s[NULL] = "null"; | |
s[ZIO_STAGE_OPEN] = "open"; | |
s[ZIO_STAGE_READ_BP_INIT] = "read bp init"; | |
s[ZIO_STAGE_WRITE_BP_INIT] = "write bp init"; | |
s[ZIO_STAGE_FREE_BP_INIT] = "free bp init"; | |
s[ZIO_STAGE_ISSUE_ASYNC] = "issue async"; | |
s[ZIO_STAGE_WRITE_COMPRESS] = "write compress"; | |
s[ZIO_STAGE_CHECKSUM_GENERATE] = "checksum generate"; | |
s[ZIO_STAGE_NOP_WRITE] = "nop write"; | |
s[ZIO_STAGE_DDT_READ_START] = "ddt read start"; | |
s[ZIO_STAGE_DDT_READ_DONE] = "ddt read done"; | |
s[ZIO_STAGE_DDT_WRITE] = "ddt write"; | |
s[ZIO_STAGE_DDT_FREE] = "ddt free"; | |
s[ZIO_STAGE_GANG_ASSEMBLE] = "gang assemble"; | |
s[ZIO_STAGE_GANG_ISSUE] = "gang issue"; | |
s[ZIO_STAGE_DVA_THROTTLE] = "dva throttle"; | |
s[ZIO_STAGE_DVA_ALLOCATE] = "dva alloc"; | |
s[ZIO_STAGE_DVA_FREE] = "dva free"; | |
s[ZIO_STAGE_DVA_CLAIM] = "dva claim"; | |
s[ZIO_STAGE_READY] = "ready"; | |
s[ZIO_STAGE_VDEV_IO_START] = "vdev io start"; | |
s[ZIO_STAGE_VDEV_IO_DONE] = "vdev io done"; | |
s[ZIO_STAGE_VDEV_IO_ASSESS] = "vdev io assess"; | |
s[ZIO_STAGE_CHECKSUM_VERIFY] = "chksum verify"; | |
s[ZIO_STAGE_DONE] = "done"; | |
} | |
::zio_execute:entry { | |
this->args = args[0]; | |
} | |
::zio_execute:return /this->args/ { | |
this->w = this->args->io_waiter != NULL ? 1 : 0; | |
this->e = this->args->io_executor != NULL ? 1 : 0; | |
this->st = this->args->io_stage; | |
this->vd = this->args->io_vd != NULL ? 1 : 0; | |
@[this->w, this->e, this->vd, s[this->st]] = count(); | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
dtrace -qn ' | |
BEGIN { | |
/* | |
This is the limit for dirty data before throttle is engaged. | |
Dirty bytes above this limit will cause throttle mechanism to | |
start slowly and gradually increase as the amount of dirty data | |
increases. | |
*/ | |
dirty_no_throttle_limit = | |
(`zfs_delay_min_dirty_percent * `zfs_dirty_data_max) / 100; | |
printf("timestamp,pool,mb.dirty,max.mb.dirty,pct.dirty.used,throttle\n"); | |
} | |
::txg_sync_thread:txg-syncing { | |
this->dp = (dsl_pool_t *)arg0; | |
this->spa = (char *)this->dp->dp_spa->spa_name; | |
} | |
::txg_sync_thread:txg-syncing | |
/this->dp->dp_spa->spa_name != NULL && | |
stringof(this->dp->dp_spa->spa_name) != "bp"/ { | |
this->thr = | |
(this->dp->dp_dirty_total >= dirty_no_throttle_limit) ? 1 : 0; | |
this->dd_actual_mb = this->dp->dp_dirty_total >> 20; | |
this->dd_limit_mb = `zfs_dirty_data_max >> 20; | |
this->dd_pct = (100 * this->dd_actual_mb) / this->dd_limit_mb; | |
printf("%Y,%s,%d,%d,%d,%d\n", | |
walltimestamp, | |
stringof(this->spa), | |
this->dd_actual_mb, | |
this->dd_limit_mb, | |
this->dd_pct, | |
this->thr); | |
}' |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
dtrace -qn ' | |
#pragma D option dynvarsize=8M | |
::zio_execute:entry { | |
self->ziop = args[0]; | |
self->t = timestamp; | |
} | |
::zio_execute:return /self->t/ { | |
@ = quantize(timestamp - self->t); | |
self->t = 0; | |
self->ziop = 0; | |
}' |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment