# ext4 deadlock issue
## 0. Background
This gist is about an ext4 deadlock issue that may happen when the OOM
killer is triggered on CentOS 7 with kernel versions after 3.10.0-862.el7.
See https://bugzilla.redhat.com/show_bug.cgi?id=1911392 for more details.
## 1. How to reproduce this issue?
Download ext4-repro-with-multiple-cgroup.sh and the ext4-repro-with-write binary, and run
$ bash ext4-repro-with-multiple-cgroup.sh
It does the following:
1. Prepares the test binary ext4-repro-with-write, which opens a file
(located on an ext4 fs) and keeps writing to it; see ext4-repro-with-write.go;
2. Creates 10 cgroups, each with a 100 MB memory limit;
3. For each cgroup, creates 30 services that run the ext4-repro-with-write binary;
4. The deadlock can usually be reproduced within 10 minutes: the ext4 fs hangs, and other
tasks doing I/O on the same ext4 fs hang as well; a quick check is shown below.
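To confirm the hang, one option (an illustrative check, not part of the original scripts) is to look for tasks stuck in uninterruptible sleep and for hung-task warnings from the kernel:
$ ps -eo pid,stat,wchan:32,comm | awk '$2 ~ /^D/'
$ dmesg | grep -i "blocked for more than"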
To stop the test, run
$ systemctl stop ext4-repro-cgroup*-*
## 2. How to fix it?
We proposed two patches to fix this issue; see below.
The final solution will be updated here once we get confirmation from Red Hat.
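The patches are in git format-patch form, so they can be applied on top of a matching 3.10.0 kernel source tree before rebuilding. A minimal sketch, assuming the two patch files have been saved locally (the file names below are illustrative):
$ cd /path/to/kernel-source   # tree matching 3.10.0-862.el7 or later
$ patch -p1 < 0001-buffer-crossport-GFP_NOFAIL.patch
$ patch -p1 < 0002-fs-mm-avoid-unexpected-NOFAIL-removal.patch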
#!/usr/bin/bash
echo "clean up"
systemctl stop ext4-repro-cgroup*-*
# prepare the test binary and the data directory used by the writers
mkdir -p /usr/share/ext4-repro/
cp ext4-repro-with-write /tmp/ext4-repro-with-write
chmod +x /tmp/ext4-repro-with-write
for i in `seq 1 10`;do
echo "prepare cgroup"
cat << EOF > /etc/systemd/system/system-fio${i}.slice
[Unit]
Description=zbs Slice
Before=slices.target
Wants=system.slice
After=system.slice
EOF
mkdir -p /etc/systemd/system/system-fio${i}.slice.d
cat << EOF > /etc/systemd/system/system-fio${i}.slice.d/50-MemoryLimit.conf
[Slice]
MemoryLimit=104857600
EOF
systemctl daemon-reload
systemctl restart system-fio${i}.slice
systemctl status system-fio${i}.slice
echo "prepare fio data path"
TEST_PATH="/usr/share/ext4-repro"
mkdir -p "$TEST_PATH"
for j in `seq 1 30`;do
mkdir -p /etc/systemd/system/ext4-repro-cgroup${i}-${j}.service.d/
cat << EOF > /etc/systemd/system/ext4-repro-cgroup${i}-${j}.service.d/cgroup.conf
[Service]
Slice=system-fio${i}.slice
EOF
cat << EOF > /usr/lib/systemd/system/ext4-repro-cgroup${i}-${j}.service
[Unit]
After=network.target
[Service]
ExecStart=/tmp/ext4-repro-with-write
Restart=always
RestartSec=3s
[Install]
WantedBy=multi-user.target
EOF
done
# let the shell expand the new unit file names in the unit directory
pushd /usr/lib/systemd/system
systemctl daemon-reload
systemctl restart ext4-repro-cgroup${i}-*.service
popd
done
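While the test runs, the memory pressure in one of the test slices can be watched. A rough sketch, assuming the default CentOS 7 cgroup-v1 layout where system-fioN.slice is nested under system.slice (the path may differ on your system):
$ systemctl status system-fio1.slice
$ cat /sys/fs/cgroup/memory/system.slice/system-fio1.slice/memory.usage_in_bytes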
package main

import (
	"flag"
	"fmt"
	"math/rand"
	"os"
	"strconv"
	"time"
)

const letterBytes = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"

// Default data directory; it must match the directory created by
// ext4-repro-with-multiple-cgroup.sh.
const defaultPath = "/usr/share/ext4-repro/"

// RandStringBytes returns a random string of n letters.
func RandStringBytes(n int) string {
	b := make([]byte, n)
	for i := range b {
		b[i] = letterBytes[rand.Intn(len(letterBytes))]
	}
	return string(b)
}

// createFileName builds a file name from the hostname, the current unix
// time and a random suffix.
func createFileName(path string) string {
	name, _ := os.Hostname()
	return path + name + strconv.Itoa(int(time.Now().Unix())) + RandStringBytes(8)
}

// w opens a file under path and keeps appending and syncing random data
// to it forever, generating constant dirty-page and I/O pressure.
func w(path string) {
	fn := createFileName(path)
	fmt.Println(fn)
	f, err := os.OpenFile(fn, os.O_APPEND|os.O_CREATE|os.O_WRONLY, 0644)
	if err != nil {
		fmt.Println("open failed:", err)
		return
	}
	defer os.Remove(fn)
	defer f.Close()
	for {
		s := RandStringBytes(10)
		_, _ = f.WriteString(s)
		f.Sync()
	}
}

func main() {
	rand.Seed(time.Now().UnixNano())
	path := flag.String("path", defaultPath, "directory to write test files into")
	flag.Parse()
	fmt.Println("path:", *path)
	go w(*path)
	// Keep the main goroutine alive while the writer runs.
	for {
		time.Sleep(time.Second)
	}
}
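For completeness, the ext4-repro-with-write binary expected by the reproduce script can be built from this source with a standard Go toolchain; a minimal sketch:
$ go build -o ext4-repro-with-write ext4-repro-with-write.go
$ bash ext4-repro-with-multiple-cgroup.sh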
From 4b64f5a2fdeee9589572de9ddd44b8f8134777e4 Mon Sep 17 00:00:00 2001
From: Jiewei Ke <jiewei@smartx.com>
Date: Mon, 21 Dec 2020 20:57:27 -0500
Subject: [PATCH 1/2] buffer: crossport patches to ensure grow_dev_page() will
always use GFP_NOFAIL
The following upstream patches are crossported to 3.10.0:
bc48f001de12 buffer: eliminate the need to call free_more_memory() in getblk_slow()
94dc24c0c59a buffer: grow_dev_page() should use GFP_NOFAIL for all cases
640ab98fb362 buffer: have alloc_page_buffers() use __GFP_NOFAIL
---
drivers/md/md-bitmap.c | 2 +-
fs/buffer.c | 62 ++++++++-------------------------------------
fs/ntfs/aops.c | 2 +-
fs/ntfs/mft.c | 2 +-
include/linux/buffer_head.h | 2 +-
5 files changed, 15 insertions(+), 55 deletions(-)
diff --git a/drivers/md/md-bitmap.c b/drivers/md/md-bitmap.c
index f0b5deb..aee6b00 100644
--- a/drivers/md/md-bitmap.c
+++ b/drivers/md/md-bitmap.c
@@ -351,7 +351,7 @@ static int read_page(struct file *file, unsigned long index,
pr_debug("read bitmap file (%dB @ %llu)\n", (int)PAGE_SIZE,
(unsigned long long)index << PAGE_SHIFT);
- bh = alloc_page_buffers(page, 1<<inode->i_blkbits, 0);
+ bh = alloc_page_buffers(page, 1<<inode->i_blkbits, false);
if (!bh) {
ret = -ENOMEM;
goto out;
diff --git a/fs/buffer.c b/fs/buffer.c
index ccb6790..0cda816 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -254,27 +254,6 @@ out:
}
/*
- * Kick the writeback threads then try to free up some ZONE_NORMAL memory.
- */
-static void free_more_memory(void)
-{
- struct zone *zone;
- int nid;
-
- wakeup_flusher_threads(1024, WB_REASON_FREE_MORE_MEM);
- yield();
-
- for_each_online_node(nid) {
- (void)first_zones_zonelist(node_zonelist(nid, GFP_NOFS),
- gfp_zone(GFP_NOFS), NULL,
- &zone);
- if (zone)
- try_to_free_pages(node_zonelist(nid, GFP_NOFS), 0,
- GFP_NOFS, NULL);
- }
-}
-
-/*
* I/O completion handler for block_read_full_page() - pages
* which come unlocked at the end of I/O.
*/
@@ -829,16 +808,19 @@ int remove_inode_buffers(struct inode *inode)
* which may not fail from ordinary buffer allocations.
*/
struct buffer_head *alloc_page_buffers(struct page *page, unsigned long size,
- int retry)
+ bool retry)
{
struct buffer_head *bh, *head;
+ gfp_t gfp = GFP_NOFS;
long offset;
-try_again:
+ if (retry)
+ gfp |= __GFP_NOFAIL;
+
head = NULL;
offset = PAGE_SIZE;
while ((offset -= size) >= 0) {
- bh = alloc_buffer_head(GFP_NOFS);
+ bh = alloc_buffer_head(gfp);
if (!bh)
goto no_grow;
@@ -864,23 +846,7 @@ no_grow:
} while (head);
}
- /*
- * Return failure for non-async IO requests. Async IO requests
- * are not allowed to fail, so we have to wait until buffer heads
- * become available. But we don't want tasks sleeping with
- * partially complete buffers, so all were released above.
- */
- if (!retry)
- return NULL;
-
- /* We're _really_ low on memory. Now we just
- * wait for old buffer heads to become free due to
- * finishing IO. Since this is an async request and
- * the reserve list is empty, we're sure there are
- * async buffer heads in use.
- */
- free_more_memory();
- goto try_again;
+ return NULL;
}
EXPORT_SYMBOL_GPL(alloc_page_buffers);
@@ -969,8 +935,6 @@ grow_dev_page(struct block_device *bdev, sector_t block,
gfp_mask |= __GFP_NOFAIL;
page = find_or_create_page(inode->i_mapping, index, gfp_mask);
- if (!page)
- return ret;
BUG_ON(!PageLocked(page));
@@ -988,9 +952,7 @@ grow_dev_page(struct block_device *bdev, sector_t block,
/*
* Allocate some buffers for this page
*/
- bh = alloc_page_buffers(page, size, 0);
- if (!bh)
- goto failed;
+ bh = alloc_page_buffers(page, size, true);
/*
* Link the page to the buffers and initialise them. Take the
@@ -1070,8 +1032,6 @@ __getblk_slow(struct block_device *bdev, sector_t block, int size)
ret = grow_buffers(bdev, block, size);
if (ret < 0)
return NULL;
- if (ret == 0)
- free_more_memory();
}
}
@@ -1537,7 +1497,7 @@ void create_empty_buffers(struct page *page,
{
struct buffer_head *bh, *head, *tail;
- head = alloc_page_buffers(page, blocksize, 1);
+ head = alloc_page_buffers(page, blocksize, true);
bh = head;
do {
bh->b_state |= b_state;
@@ -2586,7 +2546,7 @@ int nobh_write_begin(struct address_space *mapping,
* Be careful: the buffer linked list is a NULL terminated one, rather
* than the circular one we're used to.
*/
- head = alloc_page_buffers(page, blocksize, 0);
+ head = alloc_page_buffers(page, blocksize, false);
if (!head) {
ret = -ENOMEM;
goto out_release;
@@ -2866,7 +2826,7 @@ int block_truncate_page(struct address_space *mapping,
length = blocksize - length;
iblock = (sector_t)index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
-
+
page = grab_cache_page(mapping, index);
err = -ENOMEM;
if (!page)
diff --git a/fs/ntfs/aops.c b/fs/ntfs/aops.c
index fa9c05f..9cb0166 100644
--- a/fs/ntfs/aops.c
+++ b/fs/ntfs/aops.c
@@ -1599,7 +1599,7 @@ void mark_ntfs_record_dirty(struct page *page, const unsigned int ofs) {
spin_lock(&mapping->private_lock);
if (unlikely(!page_has_buffers(page))) {
spin_unlock(&mapping->private_lock);
- bh = head = alloc_page_buffers(page, bh_size, 1);
+ bh = head = alloc_page_buffers(page, bh_size, true);
spin_lock(&mapping->private_lock);
if (likely(!page_has_buffers(page))) {
struct buffer_head *tail;
diff --git a/fs/ntfs/mft.c b/fs/ntfs/mft.c
index 3014a36..9db0e31 100644
--- a/fs/ntfs/mft.c
+++ b/fs/ntfs/mft.c
@@ -506,7 +506,7 @@ int ntfs_sync_mft_mirror(ntfs_volume *vol, const unsigned long mft_no,
if (unlikely(!page_has_buffers(page))) {
struct buffer_head *tail;
- bh = head = alloc_page_buffers(page, blocksize, 1);
+ bh = head = alloc_page_buffers(page, blocksize, true);
do {
set_buffer_uptodate(bh);
tail = bh;
diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h
index ab09b3c..d470df0 100644
--- a/include/linux/buffer_head.h
+++ b/include/linux/buffer_head.h
@@ -157,7 +157,7 @@ void set_bh_page(struct buffer_head *bh,
struct page *page, unsigned long offset);
int try_to_free_buffers(struct page *);
struct buffer_head *alloc_page_buffers(struct page *page, unsigned long size,
- int retry);
+ bool retry);
void create_empty_buffers(struct page *, unsigned long,
unsigned long b_state);
void end_buffer_read_sync(struct buffer_head *bh, int uptodate);
--
2.11.0
(The second patch below is truncated in this view; see the raw file in the gist for the full patch.)
From 7dc5fb5fd28afbc09736025ca42f0e15aefe719f Mon Sep 17 00:00:00 2001
From: Jiewei Ke <jiewei@smartx.com>
Date: Mon, 28 Dec 2020 01:56:41 -0500
Subject: [PATCH 2/2] fs, mm: avoid unexpected NOFAIL removal in
__add_to_page_cache_locked
The NOFAIL flag may be unexpectedly removed after mapping_gfp_constraint() is
called in __add_to_page_cache_locked(), which may cause critical issues
since the upper layer cannot handle the resulting failure.
For example, a deadlock in ext4 may happen like this: when memory usage
is above the cgroup limit, ext4 may loop forever in __getblk_slow()
waiting for free memory during transaction commit and block all other
transactions; meanwhile, in order to release some memory, the OOM killer
may try to kill a task that is waiting for the transaction and has become
uninterruptible.
This patch fixes it by adding back the NOFAIL flag after
mapping_gfp_constraint() if it was specified by the upper layer. For the
ext4 case, this leaves grow_dev_page() with no failure paths caused by a
failed memory allocation.
Also add return value checks for find_or_create_page() and
alloc_page_buffers() for safety, even though the NOFAIL flag has been
specified.
---
fs/buffer.c | 4 ++++
mm/filemap.c | 5 +++++
2 files changed, 9 insertions(+)
diff --git a/fs/buffer.c b/fs/buffer.c
index 0cda816..af3bbcb 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -935,6 +935,8 @@ grow_dev_page(struct block_device *bdev, sector_t block,
gfp_mask |= __GFP_NOFAIL;