# ext4 deadlock issue
## 0. Background
This gist is about an ext4 deadlock issue that may happen when the OOM
killer is triggered on CentOS 7 with kernel versions after 3.10.0-862.el7.
See https://bugzilla.redhat.com/show_bug.cgi?id=1911392 for more details.
## 1. How to reproduce this issue?
Download ext4-repro-with-multiple-cgroup.sh and the ext4-repro-with-write binary, and run
$ bash ext4-repro-with-multiple-cgroup.sh
It does the following:
1. Prepares the test binary ext4-repro-with-write, which opens a file
(located on an ext4 fs) and keeps writing to it; see ext4-repro-with-write.go;
2. Creates 10 cgroups, each with a 100 MB memory limit;
3. For each cgroup, creates 30 services that run the ext4-repro-with-write binary;
4. The deadlock can usually be reproduced within 10 minutes: the ext4 fs hangs, and other
tasks doing I/O on the same ext4 fs hang as well; a quick check is shown below.
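To confirm the hang, one option (an illustrative check, not part of the original scripts) is to look for tasks stuck in uninterruptible sleep and for hung-task warnings from the kernel:
$ ps -eo pid,stat,wchan:32,comm | awk '$2 ~ /^D/'
$ dmesg | grep -i "blocked for more than"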
To stop the test, run
$ systemctl stop ext4-repro-cgroup*-*
## 2. How to fix it?
We proposed two patches to fix this issue; see below.
The final solution will be updated here once we get confirmation from Red Hat.
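The patches are in git format-patch form, so they can be applied on top of a matching 3.10.0 kernel source tree before rebuilding. A minimal sketch, assuming the two patch files have been saved locally (the file names below are illustrative):
$ cd /path/to/kernel-source   # tree matching 3.10.0-862.el7 or later
$ patch -p1 < 0001-buffer-crossport-GFP_NOFAIL.patch
$ patch -p1 < 0002-fs-mm-avoid-unexpected-NOFAIL-removal.patch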
#!/usr/bin/bash
echo "clean up"
systemctl stop ext4-repro-cgroup*-*
# prepare the test binary and the data directory used by the writers
mkdir -p /usr/share/ext4-repro/
cp ext4-repro-with-write /tmp/ext4-repro-with-write
chmod +x /tmp/ext4-repro-with-write
for i in `seq 1 10`;do
echo "prepare cgroup"
cat << EOF > /etc/systemd/system/system-fio${i}.slice
[Unit]
Description=zbs Slice
Before=slices.target
Wants=system.slice
After=system.slice
EOF
mkdir -p /etc/systemd/system/system-fio${i}.slice.d
cat << EOF > /etc/systemd/system/system-fio${i}.slice.d/50-MemoryLimit.conf
[Slice]
MemoryLimit=104857600
EOF
systemctl daemon-reload
systemctl restart system-fio${i}.slice
systemctl status system-fio${i}.slice
echo "prepare fio data path"
TEST_PATH="/usr/share/ext4-repro"
mkdir -p "$TEST_PATH"
for j in `seq 1 30`;do
mkdir -p /etc/systemd/system/ext4-repro-cgroup${i}-${j}.service.d/
cat << EOF > /etc/systemd/system/ext4-repro-cgroup${i}-${j}.service.d/cgroup.conf
[Service]
Slice=system-fio${i}.slice
EOF
cat << EOF > /usr/lib/systemd/system/ext4-repro-cgroup${i}-${j}.service
[Unit]
After=network.target
[Service]
ExecStart=/tmp/ext4-repro-with-write
Restart=always
RestartSec=3s
[Install]
WantedBy=multi-user.target
EOF
done
# let the shell expand the new unit file names in the unit directory
pushd /usr/lib/systemd/system
systemctl daemon-reload
systemctl restart ext4-repro-cgroup${i}-*.service
popd
done
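While the test runs, the memory pressure in one of the test slices can be watched. A rough sketch, assuming the default CentOS 7 cgroup-v1 layout where system-fioN.slice is nested under system.slice (the path may differ on your system):
$ systemctl status system-fio1.slice
$ cat /sys/fs/cgroup/memory/system.slice/system-fio1.slice/memory.usage_in_bytes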
package main

import (
	"flag"
	"fmt"
	"math/rand"
	"os"
	"strconv"
	"time"
)

const letterBytes = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"

// Default data directory; it must match the directory created by
// ext4-repro-with-multiple-cgroup.sh.
const defaultPath = "/usr/share/ext4-repro/"

// RandStringBytes returns a random string of n letters.
func RandStringBytes(n int) string {
	b := make([]byte, n)
	for i := range b {
		b[i] = letterBytes[rand.Intn(len(letterBytes))]
	}
	return string(b)
}

// createFileName builds a file name from the hostname, the current unix
// time and a random suffix.
func createFileName(path string) string {
	name, _ := os.Hostname()
	return path + name + strconv.Itoa(int(time.Now().Unix())) + RandStringBytes(8)
}

// w opens a file under path and keeps appending and syncing random data
// to it forever, generating constant dirty-page and I/O pressure.
func w(path string) {
	fn := createFileName(path)
	fmt.Println(fn)
	f, err := os.OpenFile(fn, os.O_APPEND|os.O_CREATE|os.O_WRONLY, 0644)
	if err != nil {
		fmt.Println("open failed:", err)
		return
	}
	defer os.Remove(fn)
	defer f.Close()
	for {
		s := RandStringBytes(10)
		_, _ = f.WriteString(s)
		f.Sync()
	}
}

func main() {
	rand.Seed(time.Now().UnixNano())
	path := flag.String("path", defaultPath, "directory to write test files into")
	flag.Parse()
	fmt.Println("path:", *path)
	go w(*path)
	// Keep the main goroutine alive while the writer runs.
	for {
		time.Sleep(time.Second)
	}
}
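For completeness, the ext4-repro-with-write binary expected by the reproduce script can be built from this source with a standard Go toolchain; a minimal sketch:
$ go build -o ext4-repro-with-write ext4-repro-with-write.go
$ bash ext4-repro-with-multiple-cgroup.sh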
From 4b64f5a2fdeee9589572de9ddd44b8f8134777e4 Mon Sep 17 00:00:00 2001
From: Jiewei Ke <jiewei@smartx.com>
Date: Mon, 21 Dec 2020 20:57:27 -0500
Subject: [PATCH 1/2] buffer: crossport patches to ensure grow_dev_page() will
always use GFP_NOFAIL
The following upstream patches are crossported to 3.10.0:
bc48f001de12 buffer: eliminate the need to call free_more_memory() in getblk_slow()
94dc24c0c59a buffer: grow_dev_page() should use GFP_NOFAIL for all cases
640ab98fb362 buffer: have alloc_page_buffers() use __GFP_NOFAIL
---
drivers/md/md-bitmap.c | 2 +-
fs/buffer.c | 62 ++++++++-------------------------------------
fs/ntfs/aops.c | 2 +-
fs/ntfs/mft.c | 2 +-
include/linux/buffer_head.h | 2 +-
5 files changed, 15 insertions(+), 55 deletions(-)
diff --git a/drivers/md/md-bitmap.c b/drivers/md/md-bitmap.c
index f0b5deb..aee6b00 100644
--- a/drivers/md/md-bitmap.c
+++ b/drivers/md/md-bitmap.c
@@ -351,7 +351,7 @@ static int read_page(struct file *file, unsigned long index,
pr_debug("read bitmap file (%dB @ %llu)\n", (int)PAGE_SIZE,
(unsigned long long)index << PAGE_SHIFT);
- bh = alloc_page_buffers(page, 1<<inode->i_blkbits, 0);
+ bh = alloc_page_buffers(page, 1<<inode->i_blkbits, false);
if (!bh) {
ret = -ENOMEM;
goto out;
diff --git a/fs/buffer.c b/fs/buffer.c
index ccb6790..0cda816 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -254,27 +254,6 @@ out:
}
/*
- * Kick the writeback threads then try to free up some ZONE_NORMAL memory.
- */
-static void free_more_memory(void)
-{
- struct zone *zone;
- int nid;
-
- wakeup_flusher_threads(1024, WB_REASON_FREE_MORE_MEM);
- yield();
-
- for_each_online_node(nid) {
- (void)first_zones_zonelist(node_zonelist(nid, GFP_NOFS),
- gfp_zone(GFP_NOFS), NULL,
- &zone);
- if (zone)
- try_to_free_pages(node_zonelist(nid, GFP_NOFS), 0,
- GFP_NOFS, NULL);
- }
-}
-
-/*
* I/O completion handler for block_read_full_page() - pages
* which come unlocked at the end of I/O.
*/
@@ -829,16 +808,19 @@ int remove_inode_buffers(struct inode *inode)
* which may not fail from ordinary buffer allocations.
*/
struct buffer_head *alloc_page_buffers(struct page *page, unsigned long size,
- int retry)
+ bool retry)
{
struct buffer_head *bh, *head;
+ gfp_t gfp = GFP_NOFS;
long offset;
-try_again:
+ if (retry)
+ gfp |= __GFP_NOFAIL;
+
head = NULL;
offset = PAGE_SIZE;
while ((offset -= size) >= 0) {
- bh = alloc_buffer_head(GFP_NOFS);
+ bh = alloc_buffer_head(gfp);
if (!bh)
goto no_grow;
@@ -864,23 +846,7 @@ no_grow:
} while (head);
}
- /*
- * Return failure for non-async IO requests. Async IO requests
- * are not allowed to fail, so we have to wait until buffer heads
- * become available. But we don't want tasks sleeping with
- * partially complete buffers, so all were released above.
- */
- if (!retry)
- return NULL;
-
- /* We're _really_ low on memory. Now we just
- * wait for old buffer heads to become free due to
- * finishing IO. Since this is an async request and
- * the reserve list is empty, we're sure there are
- * async buffer heads in use.
- */
- free_more_memory();
- goto try_again;
+ return NULL;
}
EXPORT_SYMBOL_GPL(alloc_page_buffers);
@@ -969,8 +935,6 @@ grow_dev_page(struct block_device *bdev, sector_t block,
gfp_mask |= __GFP_NOFAIL;
page = find_or_create_page(inode->i_mapping, index, gfp_mask);
- if (!page)
- return ret;
BUG_ON(!PageLocked(page));
@@ -988,9 +952,7 @@ grow_dev_page(struct block_device *bdev, sector_t block,
/*
* Allocate some buffers for this page
*/
- bh = alloc_page_buffers(page, size, 0);
- if (!bh)
- goto failed;
+ bh = alloc_page_buffers(page, size, true);
/*
* Link the page to the buffers and initialise them. Take the
@@ -1070,8 +1032,6 @@ __getblk_slow(struct block_device *bdev, sector_t block, int size)
ret = grow_buffers(bdev, block, size);
if (ret < 0)
return NULL;
- if (ret == 0)
- free_more_memory();
}
}
@@ -1537,7 +1497,7 @@ void create_empty_buffers(struct page *page,
{
struct buffer_head *bh, *head, *tail;
- head = alloc_page_buffers(page, blocksize, 1);
+ head = alloc_page_buffers(page, blocksize, true);
bh = head;
do {
bh->b_state |= b_state;
@@ -2586,7 +2546,7 @@ int nobh_write_begin(struct address_space *mapping,
* Be careful: the buffer linked list is a NULL terminated one, rather
* than the circular one we're used to.
*/
- head = alloc_page_buffers(page, blocksize, 0);
+ head = alloc_page_buffers(page, blocksize, false);
if (!head) {
ret = -ENOMEM;
goto out_release;
@@ -2866,7 +2826,7 @@ int block_truncate_page(struct address_space *mapping,
length = blocksize - length;
iblock = (sector_t)index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
-
+
page = grab_cache_page(mapping, index);
err = -ENOMEM;
if (!page)
diff --git a/fs/ntfs/aops.c b/fs/ntfs/aops.c
index fa9c05f..9cb0166 100644
--- a/fs/ntfs/aops.c
+++ b/fs/ntfs/aops.c
@@ -1599,7 +1599,7 @@ void mark_ntfs_record_dirty(struct page *page, const unsigned int ofs) {
spin_lock(&mapping->private_lock);
if (unlikely(!page_has_buffers(page))) {
spin_unlock(&mapping->private_lock);
- bh = head = alloc_page_buffers(page, bh_size, 1);
+ bh = head = alloc_page_buffers(page, bh_size, true);
spin_lock(&mapping->private_lock);
if (likely(!page_has_buffers(page))) {
struct buffer_head *tail;
diff --git a/fs/ntfs/mft.c b/fs/ntfs/mft.c
index 3014a36..9db0e31 100644
--- a/fs/ntfs/mft.c
+++ b/fs/ntfs/mft.c
@@ -506,7 +506,7 @@ int ntfs_sync_mft_mirror(ntfs_volume *vol, const unsigned long mft_no,
if (unlikely(!page_has_buffers(page))) {
struct buffer_head *tail;
- bh = head = alloc_page_buffers(page, blocksize, 1);
+ bh = head = alloc_page_buffers(page, blocksize, true);
do {
set_buffer_uptodate(bh);
tail = bh;
diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h
index ab09b3c..d470df0 100644
--- a/include/linux/buffer_head.h
+++ b/include/linux/buffer_head.h
@@ -157,7 +157,7 @@ void set_bh_page(struct buffer_head *bh,
struct page *page, unsigned long offset);
int try_to_free_buffers(struct page *);
struct buffer_head *alloc_page_buffers(struct page *page, unsigned long size,
- int retry);
+ bool retry);
void create_empty_buffers(struct page *, unsigned long,
unsigned long b_state);
void end_buffer_read_sync(struct buffer_head *bh, int uptodate);
--
2.11.0
(The second patch below is truncated in this view; see the raw file in the gist for the full patch.)
From 7dc5fb5fd28afbc09736025ca42f0e15aefe719f Mon Sep 17 00:00:00 2001
From: Jiewei Ke <jiewei@smartx.com>
Date: Mon, 28 Dec 2020 01:56:41 -0500
Subject: [PATCH 2/2] fs, mm: avoid unexpected NOFAIL removal in
__add_to_page_cache_locked
The NOFAIL flag may be unexpectedly removed after mapping_gfp_constraint() is
called in __add_to_page_cache_locked(), which may cause critical issues
since the upper layer cannot handle the resulting failure.
For example, a deadlock in ext4 may happen like this: when memory usage
is above the cgroup limit, ext4 may loop forever in __getblk_slow()
waiting for free memory during transaction commit and block all other
transactions; meanwhile, in order to release some memory, the OOM killer
may try to kill a task that is waiting for the transaction and has become
uninterruptible.
This patch fixes it by adding back the NOFAIL flag after
mapping_gfp_constraint() if it was specified by the upper layer. For the
ext4 case, this leaves grow_dev_page() with no failure paths caused by a
failed memory allocation.
Also add return value checks for find_or_create_page() and
alloc_page_buffers() for safety, even though the NOFAIL flag has been
specified.
---
fs/buffer.c | 4 ++++
mm/filemap.c | 5 +++++
2 files changed, 9 insertions(+)
diff --git a/fs/buffer.c b/fs/buffer.c
index 0cda816..af3bbcb 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -935,6 +935,8 @@ grow_dev_page(struct block_device *bdev, sector_t block,
gfp_mask |= __GFP_NOFAIL;