Skip to content

Instantly share code, notes, and snippets.

@problame
Last active April 29, 2025 12:49
Show Gist options
  • Save problame/ed876bea40b915ba53267b8265e99352 to your computer and use it in GitHub Desktop.
Save problame/ed876bea40b915ba53267b8265e99352 to your computer and use it in GitHub Desktop.
io_uring, ext4, and O_DIRECT writes - how to make it issue directly to disk, without punting

Summary

While investigating neondatabase/neon#11446 I learned a lot about the exact conditions in which io_uring punts to async workers (io-wq).

Specifically, I was surprised that, on Debian Bookworm 6.1.0-32-amd64, all Direct IO writes on ext4 would get punted to async workers. Even fallocating the space upfront didn't help.

I wrote a reproducer app (see appendix) and used bpftrace + light kernel patching to triangulate in which cases we punt, and why.

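The gist is: on mainline kernel 6.12.25, fallocate() the range before you write (without FALLOC_FL_KEEP_SIZE, so that the file size actually covers the range), and the write should be issued directly instead of being punted.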

Experiments

Example Rust app, using https://github.com/neondatabase/tokio-epoll-uring to perform an O_DIRECT write of an 8k buffer using io_uring. A CLI flag specifies whether and how to fallocate before the write. (Code: see appendix)

Bpftrace script to monitor for punting (tracepoint:io_uring:io_uring_queue_async_work), plus a kretfunc probe on ext4_file_write_iter, which is the entry point into ext4 for our O_DIRECT write and whose return value tells us whether the write was issued directly or has to be punted.

sudo bpftrace -v -e '
kfunc:io_write { @in_io_uring[tid] = true; }
kretfunc:io_write { delete(@in_io_uring[tid]); }
kretfunc:ext4_file_write_iter {
    if (!@in_io_uring[tid]) { return; }
    printf("%d %s\n%s\n\n", retval, probe, kstack);
}
tracepoint:io_uring:io_uring_queue_async_work { printf("%s\n%s\n\n", probe, kstack);  }
' 

There are two patterns:

  1. The io_uring_enter submission path bails because ext4_file_write_iter bails with EAGAIN (-11). We punt (io_uring_queue_async_work) and the submission is retried from an io_wqe_worker, this time successfully.
  2. The io_uring_enter submission path gets -529 (EIOCBQUEUED) back from ext4_file_write_iter. This means the ext4 write path was able to issue the IO directly to the block device.

Debian Bookworm 6.1.0-32-amd64

$ ./target/debug/examples/direct_io_write no-fallocate

-11 kretfunc:ext4:ext4_file_write_iter

        bpf_prog_140abba06862ddbe_kretfunc_ext4_ext4_file_write_iter_3+215
        bpf_prog_140abba06862ddbe_kretfunc_ext4_ext4_file_write_iter_3+215
        bpf_trampoline_307090262685+105
        ext4_file_write_iter+5
        io_write+353
        bpf_trampoline_6442501221+118
        io_write+5
        io_issue_sqe+102
        io_submit_sqes+508
        __do_sys_io_uring_enter+961
        do_syscall_64+85
        entry_SYSCALL_64_after_hwframe+110


tracepoint:io_uring:io_uring_queue_async_work

        io_queue_iowq+245
        io_queue_iowq+245
        io_queue_async+153
        io_submit_sqes+1264
        __do_sys_io_uring_enter+961
        do_syscall_64+85
        entry_SYSCALL_64_after_hwframe+110


8192 kretfunc:ext4:ext4_file_write_iter

        bpf_prog_140abba06862ddbe_kretfunc_ext4_ext4_file_write_iter_3+215
        bpf_prog_140abba06862ddbe_kretfunc_ext4_ext4_file_write_iter_3+215
        bpf_trampoline_307090262685+105
        ext4_file_write_iter+5
        io_write+353
        bpf_trampoline_6442501221+118
        io_write+5
        io_issue_sqe+102
        io_wq_submit_work+132
        io_worker_handle_work+395
        io_wqe_worker+293
        ret_from_fork+31

$ ./target/debug/examples/direct_io_write fallocate-0
-11 kretfunc:ext4:ext4_file_write_iter

        bpf_prog_140abba06862ddbe_kretfunc_ext4_ext4_file_write_iter_3+215
        bpf_prog_140abba06862ddbe_kretfunc_ext4_ext4_file_write_iter_3+215
        bpf_trampoline_307090262685+105
        ext4_file_write_iter+5
        io_write+353
        bpf_trampoline_6442501221+118
        io_write+5
        io_issue_sqe+102
        io_submit_sqes+508
        __do_sys_io_uring_enter+961
        do_syscall_64+85
        entry_SYSCALL_64_after_hwframe+110


tracepoint:io_uring:io_uring_queue_async_work

        io_queue_iowq+245
        io_queue_iowq+245
        io_queue_async+153
        io_submit_sqes+1264
        __do_sys_io_uring_enter+961
        do_syscall_64+85
        entry_SYSCALL_64_after_hwframe+110


-529 kretfunc:ext4:ext4_file_write_iter

        bpf_prog_140abba06862ddbe_kretfunc_ext4_ext4_file_write_iter_3+215
        bpf_prog_140abba06862ddbe_kretfunc_ext4_ext4_file_write_iter_3+215
        bpf_trampoline_307090262685+105
        ext4_file_write_iter+5
        io_write+353
        bpf_trampoline_6442501221+118
        io_write+5
        io_issue_sqe+102
        io_wq_submit_work+132
        io_worker_handle_work+395
        io_wqe_worker+293
        ret_from_fork+31
$ ./target/debug/examples/direct_io_write fallocate-keep-size
-11 kretfunc:ext4:ext4_file_write_iter

        bpf_prog_140abba06862ddbe_kretfunc_ext4_ext4_file_write_iter_3+215
        bpf_prog_140abba06862ddbe_kretfunc_ext4_ext4_file_write_iter_3+215
        bpf_trampoline_307090262685+105
        ext4_file_write_iter+5
        io_write+353
        bpf_trampoline_6442501221+118
        io_write+5
        io_issue_sqe+102
        io_submit_sqes+508
        __do_sys_io_uring_enter+961
        do_syscall_64+85
        entry_SYSCALL_64_after_hwframe+110


tracepoint:io_uring:io_uring_queue_async_work

        io_queue_iowq+245
        io_queue_iowq+245
        io_queue_async+153
        io_submit_sqes+1264
        __do_sys_io_uring_enter+961
        do_syscall_64+85
        entry_SYSCALL_64_after_hwframe+110


8192 kretfunc:ext4:ext4_file_write_iter

        bpf_prog_140abba06862ddbe_kretfunc_ext4_ext4_file_write_iter_3+215
        bpf_prog_140abba06862ddbe_kretfunc_ext4_ext4_file_write_iter_3+215
        bpf_trampoline_307090262685+105
        ext4_file_write_iter+5
        io_write+353
        bpf_trampoline_6442501221+118
        io_write+5
        io_issue_sqe+102
        io_wq_submit_work+132
        io_worker_handle_work+395
        io_wqe_worker+293
        ret_from_fork+31

6.12.25 mainline

root@crashandburn:~# /tmp/direct_io_write  no-fallocate

-11 fexit:vmlinux:ext4_file_write_iter

        fexit_vmlinux_ext4_file_write_iter_3+382
        fexit_vmlinux_ext4_file_write_iter_3+382
        bpf_trampoline_6442622353+108
        ext4_file_write_iter+9
        bpf_trampoline_6442524098+121
        io_write+9
        io_submit_sqe+389
        io_submit_sqes+229
        __se_sys_io_uring_enter+281
        __x64_sys_io_uring_enter+45
        x64_sys_call+8811
        do_syscall_64+192
        entry_SYSCALL_64_after_hwframe+103


tracepoint:io_uring:io_uring_queue_async_work

        io_queue_iowq+380
        io_queue_iowq+380
        io_queue_async+264
        io_submit_sqe+953
        io_submit_sqes+229
        __se_sys_io_uring_enter+281
        __x64_sys_io_uring_enter+45
        x64_sys_call+8811
        do_syscall_64+192
        entry_SYSCALL_64_after_hwframe+103



8192 fexit:vmlinux:ext4_file_write_iter

        fexit_vmlinux_ext4_file_write_iter_3+382
        fexit_vmlinux_ext4_file_write_iter_3+382
        bpf_trampoline_6442622353+108
        ext4_file_write_iter+9
        bpf_trampoline_6442524098+121
        io_write+9
        io_wq_submit_work+440
        io_worker_handle_work+449
        io_wq_worker+310
        ret_from_fork+63
        ret_from_fork_asm+17

root@crashandburn:~# /tmp/direct_io_write  fallocate-0

-529 fexit:vmlinux:ext4_file_write_iter

        fexit_vmlinux_ext4_file_write_iter_3+382
        fexit_vmlinux_ext4_file_write_iter_3+382
        bpf_trampoline_6442622353+108
        ext4_file_write_iter+9
        bpf_trampoline_6442524098+121
        io_write+9
        io_submit_sqe+389
        io_submit_sqes+229
        __se_sys_io_uring_enter+281
        __x64_sys_io_uring_enter+45
        x64_sys_call+8811
        do_syscall_64+192
        entry_SYSCALL_64_after_hwframe+103
root@crashandburn:~# /tmp/direct_io_write  fallocate-keep-size

-11 fexit:vmlinux:ext4_file_write_iter

        fexit_vmlinux_ext4_file_write_iter_3+382
        fexit_vmlinux_ext4_file_write_iter_3+382
        bpf_trampoline_6442622353+108
        ext4_file_write_iter+9
        bpf_trampoline_6442524098+121
        io_write+9
        io_submit_sqe+389
        io_submit_sqes+229
        __se_sys_io_uring_enter+281
        __x64_sys_io_uring_enter+45
        x64_sys_call+8811
        do_syscall_64+192
        entry_SYSCALL_64_after_hwframe+103


tracepoint:io_uring:io_uring_queue_async_work

        io_queue_iowq+380
        io_queue_iowq+380
        io_queue_async+264
        io_submit_sqe+953
        io_submit_sqes+229
        __se_sys_io_uring_enter+281
        __x64_sys_io_uring_enter+45
        x64_sys_call+8811
        do_syscall_64+192
        entry_SYSCALL_64_after_hwframe+103


8192 fexit:vmlinux:ext4_file_write_iter

        fexit_vmlinux_ext4_file_write_iter_3+382
        fexit_vmlinux_ext4_file_write_iter_3+382
        bpf_trampoline_6442622353+108
        ext4_file_write_iter+9
        bpf_trampoline_6442524098+121
        io_write+9
        io_wq_submit_work+440
        io_worker_handle_work+449
        io_wq_worker+310
        ret_from_fork+63
        ret_from_fork_asm+17

Experiments Summary & Take-aways

| mode | 6.1.0-32-amd64 | 6.12.25 mainline |
|---|---|---|
| no-fallocate | punts with EAGAIN (-11) | punts with EAGAIN (-11) |
| fallocate-0 | punts with EAGAIN (-11) | no punting, issue directly, EIOCBQUEUED (-529) |
| fallocate-keep-size | punts with EAGAIN (-11) | punts with EAGAIN (-11) |

So, tl;dr: we need a more recent kernel, and we need to fallocate the space we overwrite (without FALLOC_FL_KEEP_SIZE).

Understanding Why

Let's see where exactly the 6.12.25 kernel punts.

I compile 6.12.25 and use qemu+gdb, which allows me to figure out that ext4_dio_write_checks is what returns the EAGAIN.

I then sprinkle a few pr_debug statements (see the patch in Appendix).

Reboot into this kernel

echo 'file fs/ext4/file.c  +p' > /sys/kernel/debug/dynamic_debug/control 

Both no-fallocate and fallocate-keep-size punt in ext4_dio_write_checks

returning EAGAIN because unaligned_io=0 extend=1

i.e., here:

        if (!*ilock_shared && (unaligned_io || *extend)) {
		if (iocb->ki_flags & IOCB_NOWAIT) {

The extend=1 was set by

	*extend = ext4_extending_io(inode, offset, count);

Looking at what that function does

static bool
ext4_extending_io(struct inode *inode, loff_t offset, size_t len)
{
	if (offset + len > i_size_read(inode) ||
	    offset + len > EXT4_I(inode)->i_disksize)
		return true;
	return false;
}

Ok, so, that explains it: our write operation has to fit within i_size_read AND i_disksize. Otherwise it's treated as an extending IO and we punt because extend=1.

The i_size_read is the user-visible size. But what is i_disksize?

/*
	 * i_disksize keeps track of what the inode size is ON DISK, not
	 * in memory.  During truncate, i_size is set to the new size by
	 * the VFS prior to calling ext4_truncate(), but the filesystem won't
	 * set i_disksize to 0 until the truncate is actually under way.
	 *
	 * The intent is that i_disksize always represents the blocks which
	 * are used by this file.  This allows recovery to restart truncate
	 * on orphans if we crash during truncate.  We actually write i_disksize
	 * into the on-disk inode when writing inodes out, instead of i_size.
	 *
	 * The only time when i_disksize and i_size may be different is when
	 * a truncate is in progress.  The only things which change i_disksize
	 * are ext4_get_block (growth) and ext4_truncate (shrinkth).
	 */
	loff_t	i_disksize;

Ok, so, that explains the behavior:

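  • no-fallocate: i_size and i_disksize are both zero, so the 8k write at offset 0 extends the file (extend=1)
  • fallocate-0: it changes i_size and i_disksize to cover the fallocated range, so the write fits within both (extend=0)
  • fallocate-keep-size: it changes neither i_size nor i_disksize, so the write is still treated as extending (extend=1)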

This here is the code that exempts size field updates if FALLOC_FL_KEEP_SIZE is set (ext4_fallocate)

	if (!(mode & FALLOC_FL_KEEP_SIZE) &&
	    (offset + len > inode->i_size ||
	     offset + len > EXT4_I(inode)->i_disksize)) {
		new_size = offset + len;
		ret = inode_newsize_ok(inode, new_size);
		if (ret)
			goto out;
	}

Appendix

pr_debug Kernel Patch against 6.12.25 mainline

diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index f14aed14b9cf..e08d0c211946 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -456,6 +456,8 @@ static ssize_t ext4_dio_write_checks(struct kiocb *iocb, struct iov_iter *from,
            ((!IS_NOSEC(inode) || *extend || !overwrite ||
             (unaligned_io && *unwritten)))) {
                if (iocb->ki_flags & IOCB_NOWAIT) {
+                       pr_debug("returning EAGAIN because extend=%d overwrite=%d unaligned_io=%d\n",
+                                *extend, overwrite, unaligned_io);
                        ret = -EAGAIN;
                        goto out;
                }
@@ -474,6 +476,8 @@ static ssize_t ext4_dio_write_checks(struct kiocb *iocb, struct iov_iter *from,
         */
        if (!*ilock_shared && (unaligned_io || *extend)) {
                if (iocb->ki_flags & IOCB_NOWAIT) {
+                       pr_debug("returning EAGAIN because unaligned_io=%d extend=%d\n",
+                                unaligned_io, *extend);
                        ret = -EAGAIN;
                        goto out;
                }
@@ -495,7 +499,7 @@ static ssize_t ext4_dio_write_checks(struct kiocb *iocb, struct iov_iter *from,
        return ret;
 }
 
-static ssize_t ext4_dio_write_iter(struct kiocb *iocb, struct iov_iter *from)
+static noinline ssize_t ext4_dio_write_iter(struct kiocb *iocb, struct iov_iter *from)
 {
        ssize_t ret;
        handle_t *handle;

Repro App

use nix::{fcntl::FallocateFlags, libc::O_DIRECT};
use std::os::unix::fs::OpenOptionsExt;

/// Reproducer: performs one 8192-byte `O_DIRECT` write at offset 0 of a freshly
/// created `testfile.data` through `tokio_epoll_uring`.
///
/// The first CLI argument selects how the file is prepared before the write:
///
/// * `no-fallocate`: do not preallocate anything.
/// * `fallocate-0`: `fallocate` 8192 bytes with no flags.
/// * `fallocate-keep-size`: `fallocate` 8192 bytes with `FALLOC_FL_KEEP_SIZE`.
///
/// # Panics
///
/// Panics if the argument is missing or unknown, if any file system operation
/// fails, or if the write is short.
#[tokio::main]
async fn main() {
    // `None` means "skip fallocate"; `Some(flags)` are the flags passed to it.
    let fallocate: Option<FallocateFlags> = {
        let arg = std::env::args().nth(1).expect(
            "missing first argument, must be one of fallocate-keep-size, fallocate-0, no-fallocate",
        );
        match arg.as_str() {
            "fallocate-keep-size" => Some(FallocateFlags::FALLOC_FL_KEEP_SIZE),
            "fallocate-0" => Some(FallocateFlags::empty()),
            "no-fallocate" => None,
            _ => panic!("invalid argument"),
        }
    };

    let system = tokio_epoll_uring::System::launch().await.unwrap();

    // Always start from a non-existent file so every run observes the same
    // initial state; a missing file is not an error.
    let file = "testfile.data";
    match std::fs::remove_file(file) {
        Ok(_) => println!("File removed successfully"),
        Err(e) if e.kind() == std::io::ErrorKind::NotFound => {
            println!("File not found, proceeding to create a new one");
        }
        Err(e) => {
            panic!("{e}");
        }
    }

    println!("creating file");
    let file = std::fs::OpenOptions::new()
        .write(true)
        .create_new(true)
        .custom_flags(O_DIRECT)
        .open(file)
        .unwrap();

    if let Some(flags) = fallocate {
        println!("fallocating");
        use std::os::fd::AsRawFd;
        // Preallocate exactly the range the write below covers (offset 0, 8192 bytes).
        nix::fcntl::fallocate(file.as_raw_fd(), flags, 0, 8192).unwrap();
    } else {
        println!("skipping fallocate");
    }

    println!("issuing the write");
    let fd: std::os::fd::OwnedFd = file.into();

    // The buffer is allocated by hand to get 8192-byte alignment, which a
    // plain `Vec<u8>` allocation does not guarantee.
    let layout = std::alloc::Layout::from_size_align(8192, 8192).unwrap();
    // SAFETY: `layout` has a non-zero size. `alloc_zeroed` (rather than `alloc`)
    // is used so that all 8192 bytes are initialized before they become `Vec`
    // elements below.
    let buf = unsafe { std::alloc::alloc_zeroed(layout) };
    if buf.is_null() {
        panic!("Failed to allocate buffer");
    }
    // SAFETY: `buf` is non-null, comes from the global allocator, spans 8192
    // initialized bytes, and length == capacity == 8192.
    // The one `from_raw_parts` requirement that is NOT met is the alignment
    // one: `Vec<u8>` would deallocate with align 1, but the memory was
    // allocated with align 8192. That requirement exists for deallocation
    // only, so this `Vec` must never be dropped or reallocated; see the
    // `mem::forget` below.
    let mut vec = unsafe { Vec::from_raw_parts(buf, 8192, 8192) };
    vec.fill(1);

    let (resources, res) = system.write(fd, 0, vec).await;
    // `resources` presumably hands back the fd and the buffer (confirm against
    // the tokio-epoll-uring API). Dropping it would free the buffer with the
    // wrong layout (undefined behavior), so leak it on purpose: the process
    // exits right after and the OS reclaims both the memory and the fd.
    std::mem::forget(resources);
    let written = res.unwrap();
    assert_eq!(written, 8192, "not expecting short write");
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment