
@kyujin-cho
Last active March 30, 2023 03:34
MLNX_OFED 4.9-4.1.7.0 on Debian 11

kernel-5.10.patch
diff -Naur /home/kyujin/mlnx-ofed-patch/mlnx-ofed-kernel-4.9-orig/include/linux/sched/mm.h include/linux/sched/mm.h
--- a/include/linux/sched/mm.h 2021-12-08 23:08:07.000000000 +0900
+++ b/include/linux/sched/mm.h 2022-08-05 17:28:35.539301521 +0900
@@ -35,18 +35,18 @@
#if !defined (HAVE_MMGET_STILL_VALID) && !defined(HAVE_MMGET_STILL_VALID_IN_SCHED_H) && !defined(HAVE_MMGET_STILL_VALID_IN_MM_H)
/*
* This has to be called after a get_task_mm()/mmget_not_zero()
- * followed by taking the mmap_sem for writing before modifying the
+ * followed by taking the mmap_lock for writing before modifying the
* vmas or anything the coredump pretends not to change from under it.
*
* NOTE: find_extend_vma() called from GUP context is the only place
- * that can modify the "mm" (notably the vm_start/end) under mmap_sem
+ * that can modify the "mm" (notably the vm_start/end) under mmap_lock
* for reading and outside the context of the process, so it is also
- * the only case that holds the mmap_sem for reading that must call
- * this function. Generally if the mmap_sem is hold for reading
+ * the only case that holds the mmap_lock for reading that must call
+ * this function. Generally if the mmap_lock is hold for reading
* there's no need of this check after get_task_mm()/mmget_not_zero().
*
* This function can be obsoleted and the check can be removed, after
- * the coredump code will hold the mmap_sem for writing before
+ * the coredump code will hold the mmap_lock for writing before
* invoking the ->core_dump methods.
*/
static inline bool mmget_still_valid(struct mm_struct *mm)
diff -Naur /home/kyujin/mlnx-ofed-patch/mlnx-ofed-kernel-4.9-orig/include/rdma/ib_verbs.h include/rdma/ib_verbs.h
--- a/include/rdma/ib_verbs.h 2022-08-05 17:32:01.725190251 +0900
+++ b/include/rdma/ib_verbs.h 2022-08-05 18:09:39.200671886 +0900
@@ -78,6 +78,10 @@
#include <uapi/rdma/ib_user_ioctl_verbs.h>
#define IB_FW_VERSION_NAME_MAX ETHTOOL_FWVERS_LEN
+#ifndef uninitialized_var
+#define uninitialized_var(x) x = x
+#endif
+
struct ib_umem_odp;
struct ib_uqp_object;
struct ib_usrq_object;
diff -Naur /home/kyujin/mlnx-ofed-patch/mlnx-ofed-kernel-4.9-orig/drivers/infiniband/core/cmem.c drivers/infiniband/core/cmem.c
--- a/drivers/infiniband/core/cmem.c 2022-08-05 17:32:01.685190660 +0900
+++ b/drivers/infiniband/core/cmem.c 2022-08-05 17:28:35.543301480 +0900
@@ -22,7 +22,7 @@
}
/* no locking is needed:
* ib_cmem_release is called from vm_close which is always called
- * with mm->mmap_sem held for writing.
+ * with mm->mmap_lock held for writing.
* The only exception is when the process shutting down but in that case
* counter not relevant any more.
*/
@@ -58,7 +58,7 @@
ib_cmem = (struct ib_cmem *)(area->vm_private_data);
- /* vm_open and vm_close are always called with mm->mmap_sem held for
+ /* vm_open and vm_close are always called with mm->mmap_lock held for
* writing. The only exception is when the process is shutting down, at
* which point vm_close is called with no locks held, but since it is
* after the VMAs have been detached, it is impossible that vm_open will
@@ -190,7 +190,7 @@
*/
ntotal_pages = PAGE_ALIGN(total_size) >> PAGE_SHIFT;
/* ib_cmem_alloc_contiguous_pages is called as part of mmap
- * with mm->mmap_sem held for writing.
+ * with mm->mmap_lock held for writing.
* No need to lock
*/
#ifdef HAVE_PINNED_VM
diff -Naur /home/kyujin/mlnx-ofed-patch/mlnx-ofed-kernel-4.9-orig/drivers/infiniband/core/umem.c drivers/infiniband/core/umem.c
--- a/drivers/infiniband/core/umem.c 2022-08-05 17:32:01.709190415 +0900
+++ b/drivers/infiniband/core/umem.c 2022-08-05 17:28:35.543301480 +0900
@@ -460,7 +460,7 @@
new_pinned = atomic64_add_return(npages, &mm->pinned_vm);
if (new_pinned > lock_limit && !capable(CAP_IPC_LOCK)) {
#else
- down_write(&mm->mmap_sem);
+ down_write(&mm->mmap_lock);
#ifdef HAVE_PINNED_VM
if (check_add_overflow(mm->pinned_vm, npages, &new_pinned) ||
(new_pinned > lock_limit && !capable(CAP_IPC_LOCK))) {
@@ -473,7 +473,7 @@
#ifdef HAVE_ATOMIC_PINNED_VM
atomic64_sub(npages, &mm->pinned_vm);
#else
- up_write(&mm->mmap_sem);
+ up_write(&mm->mmap_lock);
#ifdef HAVE_PINNED_VM
pr_debug("%s: requested to lock(%lu) while limit is(%lu)\n",
__func__, new_pinned, lock_limit);
@@ -489,7 +489,7 @@
#ifdef HAVE_PINNED_VM
mm->pinned_vm = new_pinned;
#endif /* HAVE_PINNED_VM */
- up_write(&mm->mmap_sem);
+ up_write(&mm->mmap_lock);
#endif /* HAVE_ATOMIC_PINNED_VM */
cur_base = addr & PAGE_MASK;
@@ -508,7 +508,7 @@
sg = umem->sg_head.sgl;
while (npages) {
- down_read(&mm->mmap_sem);
+ down_read(&mm->mmap_lock);
#ifdef HAVE_FOLL_LONGTERM
ret = get_user_pages(cur_base,
min_t(unsigned long, npages,
@@ -552,7 +552,7 @@
PAGE_SIZE / sizeof(struct page *)));
#endif
- up_read(&mm->mmap_sem);
+ up_read(&mm->mmap_lock);
goto umem_release;
}
@@ -563,7 +563,7 @@
dma_get_max_seg_size(context->device->dma_device),
&umem->sg_nents);
#if !defined(HAVE_FOLL_LONGTERM) && !defined(HAVE_GET_USER_PAGES_LONGTERM)
- /* Continue to hold the mmap_sem as vma_list access
+ /* Continue to hold the mmap_lock as vma_list access
* * needs to be protected.
* */
for (i = 0; i < ret && umem->hugetlb; i++) {
@@ -571,7 +571,7 @@
umem->hugetlb = 0;
}
#endif
- up_read(&mm->mmap_sem);
+ up_read(&mm->mmap_lock);
}
sg_mark_end(sg);
@@ -610,13 +610,13 @@
#ifdef HAVE_ATOMIC_PINNED_VM
atomic64_sub(ib_umem_num_pages(umem), &mm->pinned_vm);
#else
- down_write(&mm->mmap_sem);
+ down_write(&mm->mmap_lock);
#ifdef HAVE_PINNED_VM
mm->pinned_vm -= ib_umem_num_pages(umem);
#else
mm->locked_vm -= ib_umem_num_pages(umem);
#endif /* HAVE_PINNED_VM */
- up_write(&mm->mmap_sem);
+ up_write(&mm->mmap_lock);
#endif /* HAVE_ATOMIC_PINNED_VM */
out:
#if !defined(HAVE_FOLL_LONGTERM) && !defined(HAVE_GET_USER_PAGES_LONGTERM)
@@ -681,13 +681,13 @@
#ifdef HAVE_ATOMIC_PINNED_VM
atomic64_sub(ib_umem_num_pages(umem), &umem->owning_mm->pinned_vm);
#else
- down_write(&umem->owning_mm->mmap_sem);
+ down_write(&umem->owning_mm->mmap_lock);
#ifdef HAVE_PINNED_VM
umem->owning_mm->pinned_vm -= ib_umem_num_pages(umem);
#else
umem->owning_mm->locked_vm -= ib_umem_num_pages(umem);
#endif /* HAVE_PINNED_VM */
- up_write(&umem->owning_mm->mmap_sem);
+ up_write(&umem->owning_mm->mmap_lock);
#endif /*HAVE_ATOMIC_PINNED_VM*/
__ib_umem_release_tail(umem);
diff -Naur /home/kyujin/mlnx-ofed-patch/mlnx-ofed-kernel-4.9-orig/drivers/infiniband/core/umem_odp.c drivers/infiniband/core/umem_odp.c
--- a/drivers/infiniband/core/umem_odp.c 2022-08-05 17:32:01.713190374 +0900
+++ b/drivers/infiniband/core/umem_odp.c 2022-08-05 18:01:24.459140610 +0900
@@ -571,15 +571,15 @@
struct vm_area_struct *vma;
struct hstate *h;
- down_read(&mm->mmap_sem);
+ down_read(&mm->mmap_lock);
vma = find_vma(mm, ib_umem_start(umem));
if (!vma || !is_vm_hugetlb_page(vma)) {
- up_read(&mm->mmap_sem);
+ up_read(&mm->mmap_lock);
return -EINVAL;
}
h = hstate_vma(vma);
umem->page_shift = huge_page_shift(h);
- up_read(&mm->mmap_sem);
+ up_read(&mm->mmap_lock);
#if !defined(HAVE_FOLL_LONGTERM) && !defined(HAVE_GET_USER_PAGES_LONGTERM)
umem->hugetlb = 1;
} else {
@@ -816,7 +816,7 @@
(bcnt + BIT(page_shift) - 1) >> page_shift,
PAGE_SIZE / sizeof(struct page *));
- down_read(&owning_mm->mmap_sem);
+ down_read(&owning_mm->mmap_lock);
/*
* Note: this might result in redundent page getting. We can
* avoid this by checking dma_list to be 0 before calling
@@ -824,6 +824,12 @@
* complex (and doesn't gain us much performance in most use
* cases).
*/
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 8, 0)
+ npages = get_user_pages_remote(owning_mm,
+ user_virt, gup_num_pages,
+ access_mask & ODP_WRITE_ALLOWED_BIT,
+ local_page_list, NULL, NULL);
+#else
#if defined(HAVE_GET_USER_PAGES_REMOTE_8_PARAMS) || defined(HAVE_GET_USER_PAGES_REMOTE_7_PARAMS) || defined(HAVE_GET_USER_PAGES_REMOTE_8_PARAMS_W_LOCKED)
npages = get_user_pages_remote(owning_process, owning_mm,
user_virt, gup_num_pages,
@@ -832,11 +838,13 @@
flags, local_page_list, NULL, NULL);
#else
flags, local_page_list, NULL);
-#endif
+#endif /* HAVE_GET_USER_PAGES_REMOTE_8_PARAMS_W_LOCKED */
#else
access_mask & ODP_WRITE_ALLOWED_BIT, 0,
local_page_list, NULL);
-#endif
+#endif /* HAVE_GET_USER_PAGES_GUP_FLAGS */
+#else
+#ifdef HAVE_GET_USER_PAGES_5_PARAMS
#else
npages = get_user_pages(owning_process, owning_mm,
user_virt, gup_num_pages,
@@ -845,9 +853,11 @@
#else
access_mask & ODP_WRITE_ALLOWED_BIT,
0, local_page_list, NULL);
-#endif
-#endif
- up_read(&owning_mm->mmap_sem);
+#endif /* HAVE_GET_USER_PAGES_7_PARAMS */
+#endif /* HAVE_GET_USER_PAGES_5_PARAMS */
+#endif /* defined(...) */
+#endif /* LINUX_VERSION_CODE >= KERNEL_VERSION(5, 8, 0) */
+ up_read(&owning_mm->mmap_lock);
if (npages < 0) {
if (npages != -EAGAIN)
diff -Naur /home/kyujin/mlnx-ofed-patch/mlnx-ofed-kernel-4.9-orig/drivers/infiniband/core/uverbs_cmd.c drivers/infiniband/core/uverbs_cmd.c
--- a/drivers/infiniband/core/uverbs_cmd.c 2022-08-05 17:32:01.717190333 +0900
+++ b/drivers/infiniband/core/uverbs_cmd.c 2022-08-05 18:09:48.096560360 +0900
@@ -49,7 +49,6 @@
#include "core_priv.h"
#include "uverbs_exp.h"
-
/*
* Copy a response to userspace. If the provided 'resp' is larger than the
* user buffer it is silently truncated. If the user provided a larger buffer
diff -Naur /home/kyujin/mlnx-ofed-patch/mlnx-ofed-kernel-4.9-orig/drivers/infiniband/core/uverbs_main.c drivers/infiniband/core/uverbs_main.c
--- a/drivers/infiniband/core/uverbs_main.c 2022-08-05 17:32:01.717190333 +0900
+++ b/drivers/infiniband/core/uverbs_main.c 2022-08-05 17:28:35.543301480 +0900
@@ -1018,12 +1018,12 @@
return;
/*
- * The umap_lock is nested under mmap_sem since it used within
+ * The umap_lock is nested under mmap_lock since it used within
* the vma_ops callbacks, so we have to clean the list one mm
* at a time to get the lock ordering right. Typically there
* will only be one mm, so no big deal.
*/
- down_read(&mm->mmap_sem);
+ down_read(&mm->mmap_lock);
if (!mmget_still_valid(mm))
goto skip_mm;
mutex_lock(&ufile->umap_lock);
@@ -1050,7 +1050,7 @@
}
mutex_unlock(&ufile->umap_lock);
skip_mm:
- up_read(&mm->mmap_sem);
+ up_read(&mm->mmap_lock);
mmput(mm);
}
}
diff -Naur /home/kyujin/mlnx-ofed-patch/mlnx-ofed-kernel-4.9-orig/drivers/infiniband/hw/mlx4/mr.c drivers/infiniband/hw/mlx4/mr.c
--- a/drivers/infiniband/hw/mlx4/mr.c 2021-12-08 23:08:07.000000000 +0900
+++ b/drivers/infiniband/hw/mlx4/mr.c 2022-08-05 17:28:35.543301480 +0900
@@ -380,7 +380,7 @@
if (!ib_access_writable(access_flags)) {
struct vm_area_struct *vma;
- down_read(&current->mm->mmap_sem);
+ down_read(&current->mm->mmap_lock);
/*
* FIXME: Ideally this would iterate over all the vmas that
* cover the memory, but for now it requires a single vma to
@@ -395,7 +395,7 @@
access_flags |= IB_ACCESS_LOCAL_WRITE;
}
- up_read(&current->mm->mmap_sem);
+ up_read(&current->mm->mmap_lock);
}
return ib_umem_get(udata, start, length, access_flags, 0, peer_mem_flags);
diff -Naur /home/kyujin/mlnx-ofed-patch/mlnx-ofed-kernel-4.9-orig/drivers/infiniband/hw/mlx5/main_exp.c drivers/infiniband/hw/mlx5/main_exp.c
--- a/drivers/infiniband/hw/mlx5/main_exp.c 2022-08-05 17:32:01.797189514 +0900
+++ b/drivers/infiniband/hw/mlx5/main_exp.c 2022-08-05 17:28:35.547301439 +0900
@@ -2546,7 +2546,7 @@
dm->size = act_size;
if (context) {
- down_read(&current->mm->mmap_sem);
+ down_read(&current->mm->mmap_lock);
vma = find_vma(current->mm, uaddr & PAGE_MASK);
if (!vma || (vma->vm_end - vma->vm_start < map_size)) {
ret = -EINVAL;
@@ -2568,7 +2568,7 @@
goto err_vma;
}
- up_read(&current->mm->mmap_sem);
+ up_read(&current->mm->mmap_lock);
} else {
dm->dm_base_addr = ioremap(memic_addr, length);
if (!dm->dm_base_addr) {
@@ -2583,7 +2583,7 @@
return &dm->ibdm;
err_vma:
- up_read(&current->mm->mmap_sem);
+ up_read(&current->mm->mmap_lock);
err_map:
mlx5_cmd_dealloc_memic(dm_db, memic_addr, act_size);
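
The script below automates the whole rebuild. If you want to confirm the patch above still applies cleanly before repacking the deb, GNU patch's --dry-run flag makes for a cheap check (a sketch; run it from inside the unpacked mlnx-ofed-kernel source tree, wherever dpkg-deb left it; see the path notes in the comments at the bottom):

# From inside the unpacked mlnx-ofed-kernel source tree:
curl -L https://gist.github.com/kyujin-cho/6c41f0c4be920e9b3c5183857fd4f4b3/raw/kernel-5.10.patch | patch -p1 --dry-run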
patch-ofed-kernel-dkms.sh

#!/bin/bash
if [ -z "$1" ]; then
    echo "Usage: ./patch-ofed-kernel-dkms.sh <Path to original mlnx-ofed-kernel-dkms_4.9-OFED.4.9.4.1.7.1_all.deb file>"
    exit 1
fi
dpkg-deb -R "$1" tmp  # unpack mlnx-ofed-kernel-dkms
cd tmp
./configure  # load backport patches into the source
# Credit: https://forums.developer.nvidia.com/t/is-debian-11-bulleyes-not-supported-yet-on-the-latest-mlnx-ofed-4-9-lts-driver/219606
# Apply the patch written by @firemeteor on the NVIDIA forum
curl -L https://pastebin.com/raw/e09pFd0h | tail -n '+3' | patch -p1
cd compat; ./autogen.sh; cd ..  # run autogen.sh
# Apply the kernel-5.10 patches above to build on Linux 5.10
curl -L https://gist.github.com/kyujin-cho/6c41f0c4be920e9b3c5183857fd4f4b3/raw/kernel-5.10.patch | patch -p1
cd ..
# Pack the modified sources into a new deb
dpkg-deb -b tmp mlnx-ofed-kernel-dkms_4.9-OFED.4.9.4.1.7.1_all.deb
# Replace the unpatched deb with the patched one
mv mlnx-ofed-kernel-dkms_4.9-OFED.4.9.4.1.7.1_all.deb "$1"
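
For reference, a typical invocation mirrors the usage string above (the filename is taken from that string; the actual path is wherever your MLNX_OFED 4.9 bundle put the deb):

./patch-ofed-kernel-dkms.sh ./mlnx-ofed-kernel-dkms_4.9-OFED.4.9.4.1.7.1_all.deb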
@mzpqnxow

Hey, thanks for this

I think it's slightly off, though; I needed this:

#! /bin/bash
if [ -z $1 ]; then
    echo "Usage: ./patch-ofed-kernel-dkms.sh <Path to original mlnx-ofed-kernel-dkms_4.9-OFED.4.9.4.1.7.1_all.deb file>"
    exit 1
fi
dpkg-deb -R $1 tmp  # unpack mlnx-ofed-kernel-dkms
- cd tmp
+ cd tmp/usr/src/mlnx-ofed-kernel-4.9
...

Thanks again for publishing this; I wasn't having fun trawling through the Mellanox (err, Nvidia, I guess) forums

@nblattmann-sys

Hello,

Thank you for this; just a little modification too:

...
curl -L https://pastebin.com/raw/e09pFd0h | tail -n '+3' | patch -p1
cd compat; ./autogen.sh; cd ..  # run autogen.sh
# Apply patches to run on Linux 5.10
curl -L https://gist.github.com/kyujin-cho/6c41f0c4be920e9b3c5183857fd4f4b3/raw/kernel-5.10.patch | patch -p1
cd ../../../..
# Pack modified sources to new deb
...

Thx!
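
Taken together, the two comments above amount to this corrected flow (a sketch only; the tmp/usr/src/mlnx-ofed-kernel-4.9 path and the matching four-level cd ../../../.. are as the commenters report, not independently verified here):

dpkg-deb -R "$1" tmp                    # unpack mlnx-ofed-kernel-dkms
cd tmp/usr/src/mlnx-ofed-kernel-4.9     # per @mzpqnxow: the DKMS source lives here, not in tmp/ itself
./configure
curl -L https://pastebin.com/raw/e09pFd0h | tail -n '+3' | patch -p1
cd compat; ./autogen.sh; cd ..
curl -L https://gist.github.com/kyujin-cho/6c41f0c4be920e9b3c5183857fd4f4b3/raw/kernel-5.10.patch | patch -p1
cd ../../../..                          # per @nblattmann-sys: back up to the directory that holds tmp/
dpkg-deb -b tmp mlnx-ofed-kernel-dkms_4.9-OFED.4.9.4.1.7.1_all.deb
mv mlnx-ofed-kernel-dkms_4.9-OFED.4.9.4.1.7.1_all.deb "$1"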
