While investigating neondatabase/neon#11446 I learned a lot about the exact conditions in which io_uring punts to async workers (io-wq
).
Specifically, I was surprised that, on Debian Bookworm 6.1.0-32-amd64
, all Direct IO writes on ext4 would get punted to async workers.
Even fallocate
ing the space upfront didn't help.
I wrote a reproducer app (see appendix) and used bpftrace
+ light kernel patching to triangulate in which cases we punt, and why.
The gist is: on mainline kernel 6.12.25, fallocate()
before you write, and it should work.
Example Rust app, using https://github.com/neondatabase/tokio-epoll-uring to perform an O_DIRECT write of an 8k buffer using io_uring. A CLI flag specifies whether and how to fallocate before the write. (Code: see appendix)
Bpftrace script to monitor for punting (tracepoint:io_uring:io_uring_queue_async_work
) and a kretprobe on ext4_file_write_iter
which is the entrypoint into ext4 for our O_DIRECT write that will decide whether to punt or not.
sudo bpftrace -v -e '
// Mark the current thread for as long as it is executing inside io_write.
kfunc:io_write { @in_io_uring[tid] = true; }
// Clear the mark again when io_write returns.
kretfunc:io_write { delete(@in_io_uring[tid]); }
// On return from ext4_file_write_iter, print the return value, the probe name
// and the kernel stack, but only for threads that are marked as being inside
// io_write (all other callers are skipped).
kretfunc:ext4_file_write_iter {
if (!@in_io_uring[tid]) { return; }
printf("%d %s\n%s\n\n", retval, probe, kstack);
}
// Print the probe name and the kernel stack every time this tracepoint fires.
tracepoint:io_uring:io_uring_queue_async_work { printf("%s\n%s\n\n", probe, kstack); }
'
There are two patterns:
- The `io_uring_enter` submission path bails because `ext4_file_write_iter` bails with EAGAIN (-11). We punt (`io_uring_queue_async_work`) and the submission is retried from an `io_wqe_worker`, this time successfully.
- The `io_uring_enter` submission path gets -529 (EIOCBQUEUED) back from `ext4_file_write_iter`. This means the ext4 write path was able to issue the IO directly to the block device.
$ ./target/debug/examples/direct_io_write no-fallocate
-11 kretfunc:ext4:ext4_file_write_iter
bpf_prog_140abba06862ddbe_kretfunc_ext4_ext4_file_write_iter_3+215
bpf_prog_140abba06862ddbe_kretfunc_ext4_ext4_file_write_iter_3+215
bpf_trampoline_307090262685+105
ext4_file_write_iter+5
io_write+353
bpf_trampoline_6442501221+118
io_write+5
io_issue_sqe+102
io_submit_sqes+508
__do_sys_io_uring_enter+961
do_syscall_64+85
entry_SYSCALL_64_after_hwframe+110
tracepoint:io_uring:io_uring_queue_async_work
io_queue_iowq+245
io_queue_iowq+245
io_queue_async+153
io_submit_sqes+1264
__do_sys_io_uring_enter+961
do_syscall_64+85
entry_SYSCALL_64_after_hwframe+110
8192 kretfunc:ext4:ext4_file_write_iter
bpf_prog_140abba06862ddbe_kretfunc_ext4_ext4_file_write_iter_3+215
bpf_prog_140abba06862ddbe_kretfunc_ext4_ext4_file_write_iter_3+215
bpf_trampoline_307090262685+105
ext4_file_write_iter+5
io_write+353
bpf_trampoline_6442501221+118
io_write+5
io_issue_sqe+102
io_wq_submit_work+132
io_worker_handle_work+395
io_wqe_worker+293
ret_from_fork+31
$ ./target/debug/examples/direct_io_write fallocate-0
-11 kretfunc:ext4:ext4_file_write_iter
bpf_prog_140abba06862ddbe_kretfunc_ext4_ext4_file_write_iter_3+215
bpf_prog_140abba06862ddbe_kretfunc_ext4_ext4_file_write_iter_3+215
bpf_trampoline_307090262685+105
ext4_file_write_iter+5
io_write+353
bpf_trampoline_6442501221+118
io_write+5
io_issue_sqe+102
io_submit_sqes+508
__do_sys_io_uring_enter+961
do_syscall_64+85
entry_SYSCALL_64_after_hwframe+110
tracepoint:io_uring:io_uring_queue_async_work
io_queue_iowq+245
io_queue_iowq+245
io_queue_async+153
io_submit_sqes+1264
__do_sys_io_uring_enter+961
do_syscall_64+85
entry_SYSCALL_64_after_hwframe+110
-529 kretfunc:ext4:ext4_file_write_iter
bpf_prog_140abba06862ddbe_kretfunc_ext4_ext4_file_write_iter_3+215
bpf_prog_140abba06862ddbe_kretfunc_ext4_ext4_file_write_iter_3+215
bpf_trampoline_307090262685+105
ext4_file_write_iter+5
io_write+353
bpf_trampoline_6442501221+118
io_write+5
io_issue_sqe+102
io_wq_submit_work+132
io_worker_handle_work+395
io_wqe_worker+293
ret_from_fork+31
$ ./target/debug/examples/direct_io_write fallocate-keep-size
-11 kretfunc:ext4:ext4_file_write_iter
bpf_prog_140abba06862ddbe_kretfunc_ext4_ext4_file_write_iter_3+215
bpf_prog_140abba06862ddbe_kretfunc_ext4_ext4_file_write_iter_3+215
bpf_trampoline_307090262685+105
ext4_file_write_iter+5
io_write+353
bpf_trampoline_6442501221+118
io_write+5
io_issue_sqe+102
io_submit_sqes+508
__do_sys_io_uring_enter+961
do_syscall_64+85
entry_SYSCALL_64_after_hwframe+110
tracepoint:io_uring:io_uring_queue_async_work
io_queue_iowq+245
io_queue_iowq+245
io_queue_async+153
io_submit_sqes+1264
__do_sys_io_uring_enter+961
do_syscall_64+85
entry_SYSCALL_64_after_hwframe+110
8192 kretfunc:ext4:ext4_file_write_iter
bpf_prog_140abba06862ddbe_kretfunc_ext4_ext4_file_write_iter_3+215
bpf_prog_140abba06862ddbe_kretfunc_ext4_ext4_file_write_iter_3+215
bpf_trampoline_307090262685+105
ext4_file_write_iter+5
io_write+353
bpf_trampoline_6442501221+118
io_write+5
io_issue_sqe+102
io_wq_submit_work+132
io_worker_handle_work+395
io_wqe_worker+293
ret_from_fork+31
root@crashandburn:~# /tmp/direct_io_write no-fallocate
-11 fexit:vmlinux:ext4_file_write_iter
fexit_vmlinux_ext4_file_write_iter_3+382
fexit_vmlinux_ext4_file_write_iter_3+382
bpf_trampoline_6442622353+108
ext4_file_write_iter+9
bpf_trampoline_6442524098+121
io_write+9
io_submit_sqe+389
io_submit_sqes+229
__se_sys_io_uring_enter+281
__x64_sys_io_uring_enter+45
x64_sys_call+8811
do_syscall_64+192
entry_SYSCALL_64_after_hwframe+103
tracepoint:io_uring:io_uring_queue_async_work
io_queue_iowq+380
io_queue_iowq+380
io_queue_async+264
io_submit_sqe+953
io_submit_sqes+229
__se_sys_io_uring_enter+281
__x64_sys_io_uring_enter+45
x64_sys_call+8811
do_syscall_64+192
entry_SYSCALL_64_after_hwframe+103
8192 fexit:vmlinux:ext4_file_write_iter
fexit_vmlinux_ext4_file_write_iter_3+382
fexit_vmlinux_ext4_file_write_iter_3+382
bpf_trampoline_6442622353+108
ext4_file_write_iter+9
bpf_trampoline_6442524098+121
io_write+9
io_wq_submit_work+440
io_worker_handle_work+449
io_wq_worker+310
ret_from_fork+63
ret_from_fork_asm+17
root@crashandburn:~# /tmp/direct_io_write fallocate-0
-529 fexit:vmlinux:ext4_file_write_iter
fexit_vmlinux_ext4_file_write_iter_3+382
fexit_vmlinux_ext4_file_write_iter_3+382
bpf_trampoline_6442622353+108
ext4_file_write_iter+9
bpf_trampoline_6442524098+121
io_write+9
io_submit_sqe+389
io_submit_sqes+229
__se_sys_io_uring_enter+281
__x64_sys_io_uring_enter+45
x64_sys_call+8811
do_syscall_64+192
entry_SYSCALL_64_after_hwframe+103
root@crashandburn:~# /tmp/direct_io_write fallocate-keep-size
-11 fexit:vmlinux:ext4_file_write_iter
fexit_vmlinux_ext4_file_write_iter_3+382
fexit_vmlinux_ext4_file_write_iter_3+382
bpf_trampoline_6442622353+108
ext4_file_write_iter+9
bpf_trampoline_6442524098+121
io_write+9
io_submit_sqe+389
io_submit_sqes+229
__se_sys_io_uring_enter+281
__x64_sys_io_uring_enter+45
x64_sys_call+8811
do_syscall_64+192
entry_SYSCALL_64_after_hwframe+103
tracepoint:io_uring:io_uring_queue_async_work
io_queue_iowq+380
io_queue_iowq+380
io_queue_async+264
io_submit_sqe+953
io_submit_sqes+229
__se_sys_io_uring_enter+281
__x64_sys_io_uring_enter+45
x64_sys_call+8811
do_syscall_64+192
entry_SYSCALL_64_after_hwframe+103
8192 fexit:vmlinux:ext4_file_write_iter
fexit_vmlinux_ext4_file_write_iter_3+382
fexit_vmlinux_ext4_file_write_iter_3+382
bpf_trampoline_6442622353+108
ext4_file_write_iter+9
bpf_trampoline_6442524098+121
io_write+9
io_wq_submit_work+440
io_worker_handle_work+449
io_wq_worker+310
ret_from_fork+63
ret_from_fork_asm+17
| | 6.1.0-32-amd64 | 6.12.25 mainline |
|---|---|---|
| no-fallocate | punts with EAGAIN (-11) | punts with EAGAIN (-11) |
| fallocate-0 | punts with EAGAIN (-11) | no punting, issued directly, EIOCBQUEUED (-529) |
| fallocate-keep-size | punts with EAGAIN (-11) | punts with EAGAIN (-11) |
So, tl;dr: we need a more recent kernel, and we need to fallocate the space we overwrite without FALLOC_FL_KEEP_SIZE, so that the file size already covers the write.
Let's see where exactly the 6.12.25 kernel punts.
I compile 6.12.25
and use qemu+gdb, which allows me to figure out that ext4_dio_write_checks
is what returns the EAGAIN
.
I then sprinkle a few pr_debug statements (see the patch in Appendix).
Reboot into this kernel and enable the new debug messages via dynamic debug:
echo 'file fs/ext4/file.c +p' > /sys/kernel/debug/dynamic_debug/control
Both no-fallocate
and fallocate-keep-size
punt in ext4_dio_write_checks
returning EAGAIN because unaligned_io=0 extend=1
i.e., here:
if (!*ilock_shared && (unaligned_io || *extend)) {
if (iocb->ki_flags & IOCB_NOWAIT) {
The extend=1
was set by
*extend = ext4_extending_io(inode, offset, count);
Looking at what that function does
static bool
ext4_extending_io(struct inode *inode, loff_t offset, size_t len)
{
if (offset + len > i_size_read(inode) ||
offset + len > EXT4_I(inode)->i_disksize)
return true;
return false;
}
Ok, so, that explains it: our write operation has to fit within i_size_read
AND i_disksize
.
Otherwise it's treated as an extending IO and we punt because extend=1
.
The i_size_read
is the user-visible size.
But what is i_disksize
?
/*
* i_disksize keeps track of what the inode size is ON DISK, not
* in memory. During truncate, i_size is set to the new size by
* the VFS prior to calling ext4_truncate(), but the filesystem won't
* set i_disksize to 0 until the truncate is actually under way.
*
* The intent is that i_disksize always represents the blocks which
* are used by this file. This allows recovery to restart truncate
* on orphans if we crash during truncate. We actually write i_disksize
* into the on-disk inode when writing inodes out, instead of i_size.
*
* The only time when i_disksize and i_size may be different is when
* a truncate is in progress. The only things which change i_disksize
* are ext4_get_block (growth) and ext4_truncate (shrinkth).
*/
loff_t i_disksize;
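Ok, so, that explains the behavior:
- `no-fallocate`: `i_size` and `i_disksize` are both zero, so the write is an extending IO.
- `fallocate-0`: it changes both `i_size` and `i_disksize`, so the write fits within both.
- `fallocate-keep-size`: it changes neither `i_size` nor `i_disksize`, so the write is still an extending IO.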
This here is the code that exempts size field updates if FALLOC_FL_KEEP_SIZE
is set (ext4_fallocate
)
if (!(mode & FALLOC_FL_KEEP_SIZE) &&
(offset + len > inode->i_size ||
offset + len > EXT4_I(inode)->i_disksize)) {
new_size = offset + len;
ret = inode_newsize_ok(inode, new_size);
if (ret)
goto out;
}
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index f14aed14b9cf..e08d0c211946 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -456,6 +456,8 @@ static ssize_t ext4_dio_write_checks(struct kiocb *iocb, struct iov_iter *from,
((!IS_NOSEC(inode) || *extend || !overwrite ||
(unaligned_io && *unwritten)))) {
if (iocb->ki_flags & IOCB_NOWAIT) {
+ pr_debug("returning EAGAIN because extend=%d overwrite=%d unaligned_io=%d\n",
+ *extend, overwrite, unaligned_io);
ret = -EAGAIN;
goto out;
}
@@ -474,6 +476,8 @@ static ssize_t ext4_dio_write_checks(struct kiocb *iocb, struct iov_iter *from,
*/
if (!*ilock_shared && (unaligned_io || *extend)) {
if (iocb->ki_flags & IOCB_NOWAIT) {
+ pr_debug("returning EAGAIN because unaligned_io=%d extend=%d\n",
+ unaligned_io, *extend);
ret = -EAGAIN;
goto out;
}
@@ -495,7 +499,7 @@ static ssize_t ext4_dio_write_checks(struct kiocb *iocb, struct iov_iter *from,
return ret;
}
-static ssize_t ext4_dio_write_iter(struct kiocb *iocb, struct iov_iter *from)
+static noinline ssize_t ext4_dio_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
ssize_t ret;
handle_t *handle;
use nix::{fcntl::FallocateFlags, libc::O_DIRECT};
use std::os::unix::fs::OpenOptionsExt;
#[tokio::main]
async fn main() {
let fallocate: Option<FallocateFlags> = {
let arg = std::env::args().nth(1).unwrap_or_else(|| {
panic!("missing first argument, must be one of fallocate-keep-size, fallocate-0, no-fallocate")
});
match arg.as_str() {
"fallocate-keep-size" => Some(FallocateFlags::FALLOC_FL_KEEP_SIZE),
"fallocate-0" => Some(FallocateFlags::empty()),
"no-fallocate" => None,
_ => panic!("invalid argument"),
}
};
let system = tokio_epoll_uring::System::launch().await.unwrap();
let file = "testfile.data";
match std::fs::remove_file(file) {
Ok(_) => println!("File removed successfully"),
Err(e) if e.kind() == std::io::ErrorKind::NotFound => {
println!("File not found, proceeding to create a new one");
}
Err(e) => {
panic!("{e}");
}
}
println!("creating file");
let file = std::fs::OpenOptions::new()
.write(true)
.create_new(true)
.custom_flags(O_DIRECT)
.open(file)
.unwrap();
if let Some(flags) = fallocate {
println!("fallocating");
use std::os::fd::AsRawFd;
nix::fcntl::fallocate(file.as_raw_fd(), flags, 0, 8192).unwrap();
} else {
println!("skipping fallocate");
}
println!("issuing the write");
let fd: std::os::fd::OwnedFd = file.into();
let buf =
unsafe { std::alloc::alloc(std::alloc::Layout::from_size_align(8192, 8192).unwrap()) };
if buf.is_null() {
panic!("Failed to allocate buffer");
}
let mut vec = unsafe { Vec::from_raw_parts(buf, 8192, 8192) };
vec.fill(1);
let (_, res) = system.write(fd, 0, vec).await;
let written = res.unwrap();
assert_eq!(written, 8192, "not expecting short write");
}