I've had a chance to test these suggestions again on an EC2 i2.4xlarge instance. atime=off made no difference, and zfs_prefetch_disable=1 made no difference either.
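(In case the mechanism matters: zfs_prefetch_disable was toggled via the module parameter, which I believe is the usual route on ZFS on Linux and does not persist across reboots; atime=off was set on the dataset itself, as shown in the setup below.)
$ echo 1 | sudo tee /sys/module/zfs/parameters/zfs_prefetch_disable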
Running iotop, I am not seeing substantial read workloads; it's all writes, as expected. I still can't get more than ~65K IOPS.

For lack of anything else to try, here are the details of the most recent test I ran. I set up the test with these commands:
$ sudo zpool create testpool xvdb xvdc xvdd xvde -o ashift=12 -f
$ sudo zfs set recordsize=4k testpool
$ sudo zfs create testpool/testfs -o atime=off
$ sudo fio --name randwrite --ioengine=libaio --iodepth=8 --rw=randwrite --bs=4k --size=398G --numjobs=8 --runtime=300 --group_reporting --fallocate=none --filename=/testpool/testfs/testfile
[snipped]
write: io=83097MB, bw=283635KB/s, iops=70908, runt=300001msec
slat (usec): min=12, max=760093, avg=105.69, stdev=1301.80
clat (usec): min=2, max=760359, avg=791.50, stdev=3756.16
lat (usec): min=160, max=760402, avg=898.42, stdev=4014.92
[snipped]
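As a sanity check on that number: 83097 MB written in 4 KB blocks over 300 s works out to 83097 * 1024 / 4 / 300 ≈ 70.9K IOPS, which agrees with the iops=70908 figure above.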
It was suggested that perhaps there's some issue with arc_meta being larger than arc_p. That seems not to be the case:
$ sudo grep arc_meta_used /proc/spl/kstat/zfs/arcstats
arc_meta_used 4 22214205408
$ sudo grep "^p\s" /proc/spl/kstat/zfs/arcstats
p 4 56332704256
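(For anyone wanting to eyeball both values in one go, a one-liner along these lines should do it:)
$ awk '$1 == "arc_meta_used" || $1 == "p" {print $1, $3}' /proc/spl/kstat/zfs/arcstats
p 56332704256
arc_meta_used 22214205408
So arc_meta_used (~22.2G) is comfortably below p (~56.3G).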
In case it's of interest, here are the full contents of the arcstats file:
$ sudo cat /proc/spl/kstat/zfs/arcstats
6 1 0x01 91 4368 3134408873 1202064674663
name type data
hits 4 2678466
misses 4 3356
demand_data_hits 4 2675829
demand_data_misses 4 0
demand_metadata_hits 4 2637
demand_metadata_misses 4 3337
prefetch_data_hits 4 0
prefetch_data_misses 4 0
prefetch_metadata_hits 4 0
prefetch_metadata_misses 4 19
mru_hits 4 1341160
mru_ghost_hits 4 0
mfu_hits 4 1337306
mfu_ghost_hits 4 0
deleted 4 7451397
mutex_miss 4 7377
evict_skip 4 13
evict_not_enough 4 99
evict_l2_cached 4 0
evict_l2_eligible 4 39389438464
evict_l2_ineligible 4 137216
evict_l2_skip 4 0
hash_elements 4 13272618
hash_elements_max 4 13696621
hash_collisions 4 15807102
hash_chains 4 3301851
hash_chain_max 4 9
p 4 56332704256
c 4 64462960640
c_min 4 33554432
c_max 4 64462960640
size 4 64306323424
hdr_size 4 5419828504
data_size 4 42092118016
metadata_size 4 13506312704
other_size 4 3288064200
anon_size 4 16384
anon_evictable_data 4 0
anon_evictable_metadata 4 0
mru_size 4 55598348800
mru_evictable_data 4 42092118016
mru_evictable_metadata 4 43360768
mru_ghost_size 4 8864609792
mru_ghost_evictable_data 4 8774275072
mru_ghost_evictable_metadata 4 90334720
mfu_size 4 65536
mfu_evictable_data 4 0
mfu_evictable_metadata 4 65536
mfu_ghost_size 4 3478016
mfu_ghost_evictable_data 4 233472
mfu_ghost_evictable_metadata 4 3244544
l2_hits 4 0
l2_misses 4 0
l2_feeds 4 0
l2_rw_clash 4 0
l2_read_bytes 4 0
l2_write_bytes 4 0
l2_writes_sent 4 0
l2_writes_done 4 0
l2_writes_error 4 0
l2_writes_lock_retry 4 0
l2_evict_lock_retry 4 0
l2_evict_reading 4 0
l2_evict_l1cached 4 0
l2_free_on_write 4 0
l2_cdata_free_on_write 4 0
l2_abort_lowmem 4 0
l2_cksum_bad 4 0
l2_io_error 4 0
l2_size 4 0
l2_asize 4 0
l2_hdr_size 4 0
l2_compress_successes 4 0
l2_compress_zeros 4 0
l2_compress_failures 4 0
memory_throttle_count 4 0
duplicate_buffers 4 0
duplicate_buffers_size 4 0
duplicate_reads 4 0
memory_direct_count 4 0
memory_indirect_count 4 0
arc_no_grow 4 0
arc_tempreserve 4 0
arc_loaned_bytes 4 0
arc_prune 4 0
arc_meta_used 4 22214205408
arc_meta_limit 4 48347220480
arc_meta_max 4 22512226424
arc_meta_min 4 16777216
arc_need_free 4 0
arc_sys_free 4 2014466048
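As a cross-check on those numbers: if I understand the ZoL accounting correctly, arc_meta_used is the sum of hdr_size + metadata_size + other_size + l2_hdr_size, and indeed 5419828504 + 13506312704 + 3288064200 + 0 = 22214205408 matches arc_meta_used exactly. So about a third of the ~64G ARC is metadata, well under the ~48G arc_meta_limit.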
And here are the I/O stats for the pool after the test:
$ sudo zpool iostat -v
capacity operations bandwidth
pool alloc free read write read write
---------- ----- ----- ----- ----- ----- -----
testpool 80.9G 2.83T 0 4.47K 1.26K 143M
xvdb 20.3G 724G 0 1.07K 322 35.8M
xvdc 20.2G 724G 0 1.09K 322 35.8M
xvdd 20.0G 724G 0 1.13K 322 35.7M
xvde 20.4G 724G 0 1.17K 322 36.0M
---------- ----- ----- ----- ----- ----- -----
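One thing that stands out: each vdev is averaging ~1.1K write ops/s at ~35.8 MB/s, i.e. roughly 33 KB per device write, so ZFS appears to be aggregating the 4K records into much larger device I/Os. (Note also that zpool iostat without an interval reports averages since import; to sample live rates during a run, something like the following works:)
$ sudo zpool iostat -v testpool 5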
And the pool layout and capacity:
$ sudo zpool list -v
NAME SIZE ALLOC FREE EXPANDSZ FRAG CAP DEDUP HEALTH ALTROOT
testpool 2.91T 80.9G 2.83T - 4% 2% 1.00x ONLINE -
xvdb 744G 20.3G 724G - 5% 2%
xvdc 744G 20.2G 724G - 5% 2%
xvdd 744G 20.0G 724G - 4% 2%
xvde 744G 20.4G 724G - 4% 2%
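(For what it's worth, FRAG is only 4-5% and the pool is just 2% full, so fragmentation and free-space pressure presumably aren't factors here.)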
Here are all of the ZFS properties:
$ sudo zfs get all
NAME PROPERTY VALUE SOURCE
testpool type filesystem -
testpool creation Mon Apr 11 21:30 2016 -
testpool used 80.9G -
testpool available 2.74T -
testpool referenced 96K -
testpool compressratio 1.00x -
testpool mounted yes -
testpool quota none default
testpool reservation none default
testpool recordsize 4K local
testpool mountpoint /testpool default
testpool sharenfs off default
testpool checksum on default
testpool compression off default
testpool atime on default
testpool devices on default
testpool exec on default
testpool setuid on default
testpool readonly off default
testpool zoned off default
testpool snapdir hidden default
testpool aclinherit restricted default
testpool canmount on default
testpool xattr on default
testpool copies 1 default
testpool version 5 -
testpool utf8only off -
testpool normalization none -
testpool casesensitivity sensitive -
testpool vscan off default
testpool nbmand off default
testpool sharesmb off default
testpool refquota none default
testpool refreservation none default
testpool primarycache all default
testpool secondarycache all default
testpool usedbysnapshots 0 -
testpool usedbydataset 96K -
testpool usedbychildren 80.9G -
testpool usedbyrefreservation 0 -
testpool logbias latency default
testpool dedup off default
testpool mlslabel none default
testpool sync standard default
testpool refcompressratio 1.00x -
testpool written 96K -
testpool logicalused 77.5G -
testpool logicalreferenced 40K -
testpool filesystem_limit none default
testpool snapshot_limit none default
testpool filesystem_count none default
testpool snapshot_count none default
testpool snapdev hidden default
testpool acltype off default
testpool context none default
testpool fscontext none default
testpool defcontext none default
testpool rootcontext none default
testpool relatime on temporary
testpool redundant_metadata all default
testpool overlay off default
testpool/testfs type filesystem -
testpool/testfs creation Mon Apr 11 21:31 2016 -
testpool/testfs used 80.5G -
testpool/testfs available 2.74T -
testpool/testfs referenced 80.5G -
testpool/testfs compressratio 1.00x -
testpool/testfs mounted yes -
testpool/testfs quota none default
testpool/testfs reservation none default
testpool/testfs recordsize 4K inherited from testpool
testpool/testfs mountpoint /testpool/testfs default
testpool/testfs sharenfs off default
testpool/testfs checksum on default
testpool/testfs compression off default
testpool/testfs atime off local
testpool/testfs devices on default
testpool/testfs exec on default
testpool/testfs setuid on default
testpool/testfs readonly off default
testpool/testfs zoned off default
testpool/testfs snapdir hidden default
testpool/testfs aclinherit restricted default
testpool/testfs canmount on default
testpool/testfs xattr on default
testpool/testfs copies 1 default
testpool/testfs version 5 -
testpool/testfs utf8only off -
testpool/testfs normalization none -
testpool/testfs casesensitivity sensitive -
testpool/testfs vscan off default
testpool/testfs nbmand off default
testpool/testfs sharesmb off default
testpool/testfs refquota none default
testpool/testfs refreservation none default
testpool/testfs primarycache all default
testpool/testfs secondarycache all default
testpool/testfs usedbysnapshots 0 -
testpool/testfs usedbydataset 80.5G -
testpool/testfs usedbychildren 0 -
testpool/testfs usedbyrefreservation 0 -
testpool/testfs logbias latency default
testpool/testfs dedup off default
testpool/testfs mlslabel none default
testpool/testfs sync standard default
testpool/testfs refcompressratio 1.00x -
testpool/testfs written 80.5G -
testpool/testfs logicalused 77.4G -
testpool/testfs logicalreferenced 77.4G -
testpool/testfs filesystem_limit none default
testpool/testfs snapshot_limit none default
testpool/testfs filesystem_count none default
testpool/testfs snapshot_count none default
testpool/testfs snapdev hidden default
testpool/testfs acltype off default
testpool/testfs context none default
testpool/testfs fscontext none default
testpool/testfs defcontext none default
testpool/testfs rootcontext none default
testpool/testfs relatime off default
testpool/testfs redundant_metadata all default
testpool/testfs overlay off default