DecksLabs/pinning-hook-genric.sh

## pinning-hook-genric.sh
#!/bin/bash
# Strict mode: stop on a failing command ("-e" and "-o errexit" request the same
# option), on a failing pipeline stage (pipefail) and on use of an unset variable (nounset)
set -e -o errexit -o pipefail -o nounset

###################################
# This script can be used by itself, but it's recommended that you read
# a tutorial on Proxmox forum first: https://forum.proxmox.com/threads/hey-proxmox-community-lets-talk-about-resources-isolation.124256/
###################################

# Do not modify these variables (set by Proxmox when calling the script)
vmId="$1"        # 1st argument: ID of the VM the hook runs for
runPhase="$2"    # 2nd argument: matched below against pre-start/post-start/pre-stop/post-stop
# Defaults; idleVm, idleVmId, vmCpus and cpuGovernor are overwritten by parseConfig
idleVmId="990"
idleVm="no"
# NOTE(review): hostAllowedCpus and qemuAllowedCpus are not referenced anywhere
# else in the visible part of this file
hostAllowedCpus=""
qemuAllowedCpus=""
vmCpus=""
cpuGovernor=""


echo "Running $runPhase on VM=$vmId"
#!/bin/bash
# NOTE(review): this header block is a verbatim repeat of the one directly above it.
# Running it again only re-applies the same shell options, re-assigns the same
# values and prints the "Running ..." banner a second time.
# Strict mode: stop on a failing command ("-e" and "-o errexit" request the same
# option), on a failing pipeline stage (pipefail) and on use of an unset variable (nounset)
set -e -o errexit -o pipefail -o nounset

###################################
# This script can be used by itself, but it's recommended that you read
# a tutorial on Proxmox forum first: https://forum.proxmox.com/threads/hey-proxmox-community-lets-talk-about-resources-isolation.124256/
###################################

# Do not modify these variables (set by Proxmox when calling the script)
vmId="$1"        # 1st argument: ID of the VM the hook runs for
runPhase="$2"    # 2nd argument: matched below against pre-start/post-start/pre-stop/post-stop
# Defaults; idleVm, idleVmId, vmCpus and cpuGovernor are overwritten by parseConfig
idleVmId="990"
idleVm="no"
# NOTE(review): hostAllowedCpus and qemuAllowedCpus are not referenced anywhere
# else in the visible part of this file
hostAllowedCpus=""
qemuAllowedCpus=""
vmCpus=""
cpuGovernor=""


echo "Running $runPhase on VM=$vmId"

# vCPU pinning should be done 1:1 between guest and host, especially on systems using NUMA and/or CCDs.
# On 5900x the core config, as seen in lscpu -e, looks like the following:
# CCX #0:
#  - NUMA: node 0
#  - CPU: 0-5, 12-17 (SMT threads/host CPU#)
#  - CORE: 0-5
# CCX #1:
#  - NUMA: node 1
#  - CPU: 6-11, 18-23
#  - CORE: 6-11
# "lstopo" shouldn't be used here, as it has a bug when RAM is not NUMA but L3 is: https://github.com/open-mpi/hwloc/issues/430
#
# Pinning can be semi-automated with scripts taking NUMA etc. into account, but every system is different,
# so it's better to consciously tune it by hand. Some scripts are here: https://github.com/64kramsystem/qemu-pinning#one-vcpus-per-corethread-except-one-core
# There are some unexplored ideas also at https://github.com/rokups/rokups.github.io/blob/master/pages/gaming-vm-performance.md
#
# Useful commands while debugging this code:
#  List running tasks with their affinity as of now: (the "]" filters out kthreads)
#  ps -T -e -o psr,pid,ppid,pgid,sid,comm,cmd | grep -P '^\s+(6|7|8|9|10|11|18|19|20|21|22|23)' | grep -v -P '\]$' | sort | cut -c-$COLUMNS
#  Track cgroups resources usage: systemd-cgtop
#  See tree of cgroups: systemd-cgls


# Gets QEMU parent process PID for the current VM
#
# Globals: vmId (read)
# Outputs: the PID on stdout. Diagnostics go to stderr, so a caller capturing
#          the output with $(getQemuPID) never mistakes an error text for a PID
# Returns: 0 on success, 1 when the PID file is missing, unreadable or empty
getQemuPID () {
    local pidFile="/run/qemu-server/$vmId.pid"
    local qemuParentPid=""

    if [[ -r "$pidFile" ]]; then
        qemuParentPid=$(< "$pidFile")
    fi
    if [[ -z "$qemuParentPid" ]]; then
        echo "ERROR: failed to get QEMU parent PID for VM=$vmId" >&2
        return 1
    fi

    echo "$qemuParentPid"
}

# Prints the index of the last logical CPU (thread) of the system, i.e. the
# count reported by "nproc --all" minus one
# Params: <none>
getLastCpu () {
    local cpuTotal=$(nproc --all)
    echo "$(( cpuTotal - 1 ))"
}

# Pin vCPU to a host logical CPU (thread)
# The thread SHOULD be a single one, but it can be any taskset list
#
# Since cgroups v2 (used by Proxmox) does NOT allow moving tasks/threads to
# sub-scopes, affinity has to be set per-task with taskset here.
#
# Globals: vmId (read)
# Params: vCPU# hostThread#orList
# Returns: 0 when the affinity was set; 1 when the QEMU PID or the vCPU task
#          cannot be found; otherwise the exit status of taskset
pinVCpu () {
    local vCpuNum=$1
    local hostThreadNum="$2"
    local qemuParentPid=""
    local vCpuTaskPid=""

    # Assign separately from "local" so a getQemuPID failure is not masked
    if ! qemuParentPid=$(getQemuPID); then
        echo "ERROR: cannot pin vCPU $vCpuNum: QEMU parent PID for VM=$vmId is unknown" >&2
        return 1
    fi

    # -H forces the "<path>:" prefix even when the glob expands to a single
    # file; the TID is cut out of that path (5th "/"-separated field).
    # "|| true": no match is reported below instead of tripping errexit/pipefail
    vCpuTaskPid=$(grep -H "^CPU $vCpuNum/KVM\$" "/proc/$qemuParentPid/task"/*/comm | cut -d '/' -f5) || true
    if [[ -z "$vCpuTaskPid" ]]; then
        echo "ERROR: failed to get Task PID for vCPU $vCpuNum" >&2
        return 1
    fi

    echo "Pinning VM $vmId (PPID=$qemuParentPid) vCPU $vCpuNum (TPID=$vCpuTaskPid) to host thread(s) $hostThreadNum"
    taskset --cpu-list --pid "$hostThreadNum" "$vCpuTaskPid"
}


# Pins all non-vCPU QEMU threads (io, emulator, rcu) to host logical CPU(s)
# Unlike in pinVCpu, the target SHOULD probably be a list of threads
#
# Since cgroups v2 (used by Proxmox) does NOT allow moving tasks/threads to
# sub-scopes, affinity has to be set per-task with taskset here.
#
# Globals: vmId (read)
# Params: hostThread#orList
# Returns: 0 normally (individual taskset failures are tolerated), 1 when the
#          QEMU parent PID cannot be determined
pinNonVCpuTasks () {
    local hostThreadNum="$1"
    local qemuParentPid=""
    local commFile=""
    local tpid=""
    local taskComm=""

    # Assign separately from "local" so a getQemuPID failure is not masked
    if ! qemuParentPid=$(getQemuPID); then
        echo "ERROR: cannot pin non-vCPU tasks: QEMU parent PID for VM=$vmId is unknown" >&2
        return 1
    fi

    for commFile in "/proc/$qemuParentPid/task"/*/comm; do
        # A thread may exit between glob expansion and the read; skip it then
        taskComm=""
        { IFS= read -r taskComm < "$commFile"; } 2>/dev/null || continue

        # vCPU threads are named "CPU <n>/KVM"; they are handled by pinVCpu
        if [[ "$taskComm" =~ ^CPU\ [0-9] ]]; then continue; fi

        # /proc/<pid>/task/<tid>/comm -> <tid>
        tpid="${commFile%/comm}"
        tpid="${tpid##*/}"

        echo "Pinning VM $vmId (PPID=$qemuParentPid) non-vCPU task \"$taskComm\" (TPID=$tpid) to host thread(s) $hostThreadNum"
        # Best effort: the task may be gone already
        taskset --cpu-list --pid "$hostThreadNum" "$tpid" || true
    done

    return 0
}

# Kernel threads (so-called "kthreads") aren't grouped under any of the cgroups. Thus
# to control their affinity manual pinning is needed.
# There are hacky ways to identify kthreads like parsing "ps", but the proper way to do
# that is to actually check the thread type. All kernel threads are marked with the PF_KTHREAD
# mask (see https://elixir.bootlin.com/linux/v6.3-rc6/source/include/linux/sched.h#L1740)
#
# /proc/<pid>/stat is "<pid> (<comm>) <state> <ppid> <pgrp> <session> <tty_nr> <tpgid> <flags> ...".
# <comm> may itself contain spaces and parentheses, so the line is split around the
# FIRST "(" and the LAST ")" instead of on whitespace; otherwise the fields shift and
# an unrelated number would be interpreted as the flags.
#
# Params: hostThread#orList
pinKthreads () {
    local hostThreadNum="$1"
    local statFile=""
    local statLine=""
    local pid=""
    local comm=""
    local flags=""
    local -a statRest=()

    echo "Attempting to pin all kthreads to $hostThreadNum..."
    for statFile in /proc/[0-9]*/stat; do
        # This CAN sometimes fail due to TOC-TOU (process exited meanwhile)
        statLine=""
        { IFS= read -r statLine < "$statFile"; } 2>/dev/null || true
        if [[ -z "$statLine" ]]; then continue; fi

        pid="${statLine%% *}"
        comm="${statLine#*(}"
        comm="${comm%)*}"
        # Everything after the last ") ": index 0 is <state>, index 6 is <flags>
        read -r -a statRest <<< "${statLine##*) }"
        flags="${statRest[6]:-}"
        if [[ ! "$flags" =~ ^[0-9]+$ ]]; then continue; fi

        # Ignore not kthreads (PF_KTHREAD == 0x00200000)
        if (( (flags & 0x00200000) != 0x00200000 )); then continue; fi

        # This CAN fail for some kthreads that are needed on specific CPUs
        if taskset --cpu-list --pid "$hostThreadNum" "$pid" > /dev/null 2>&1; then
            echo "Pinned kthread \"$comm\" (PID=$pid) to host thread(s) $hostThreadNum"
        fi
    done
}

# Most IRQs can be moved away from the threads running vCPUs, where they can cause
# jitter when rescheduled. This function is not perfect as it doesn't set a mask
# for not-yet-triggered IRQs (/proc/irq/default_smp_affinity). However, this shouldn't
# be needed: if the VM isn't started on boot, most if not all busy IRQs would have
# been triggered by now.
#
# Every /proc/irq/<n>/smp_affinity_list is written with the requested list; IRQs
# that refuse the write are skipped without a message.
#
# Params: hostThread#orList
pinIrqs () {
    local hostThreadNum="$1"
    local affinityFile=""
    local irqNum=""

    echo "Pinning IRQs to host thread(s) $hostThreadNum..."
    for affinityFile in /proc/irq/*/smp_affinity_list; do
        # /proc/irq/<n>/smp_affinity_list -> <n>
        irqNum="${affinityFile%/smp_affinity_list}"
        irqNum="${irqNum##*/}"

        if echo "$hostThreadNum" > "$affinityFile" 2> /dev/null; then
           echo "Pinned IRQ $irqNum to host thread(s) $hostThreadNum"
        fi
    done
}

# Set governor/scaling for a host logical CPU (thread)
# Reads the current governor from sysfs and only writes when it differs.
# Params: hostThread# desiredGovernor
# Returns: 1 when the current governor cannot be read, otherwise the status of
#          the last command executed
setGovernor () {
    local hostCpu=$1
    local reqGov="$2"
    local govFile="/sys/devices/system/cpu/cpu$hostCpu/cpufreq/scaling_governor"
    local curGov=$(cat "$govFile")

    if [[ -z "$curGov" ]]; then
        echo "ERROR: failed to query governor for CPU $hostCpu"
        return 1
    fi

    if [[ "$reqGov" != "$curGov" ]]; then
        echo "CPU $hostCpu: changing governor from $curGov to $reqGov"
        echo "$reqGov" > "$govFile"
        return
    fi

    echo "CPU $hostCpu: requested governor $reqGov - it is already set"
}

# Sets governor/scaling on a range of host CPUs (threads). Range is inclusive.
# The loop counter is local so that a caller's own "i" is never clobbered.
# Params: hostThreadFrom# hostThreadTo# desiredGovernor
setGovernorRange () {
    local i
    for (( i=$1; i<=$2; i++ )); do
        setGovernor "$i" "$3"
    done
}

# Resets governor/scaling to default state by restarting the "cpufrequtils" service
# Params: <none>
resetGovernor () {
    printf '%s\n' "Resetting CPU governor to default"
    service cpufrequtils restart
}

# Put host CPU (thread) into offline or online state
# Params: hostThread# desiredState{0,1}
# Returns: 0 when the CPU is (now) in the requested state, 1 when the current
#          state cannot be read or the state change is rejected
setCpuState () {
    local hostCpu=$1
    local reqState=$2
    local onlineFile="/sys/devices/system/cpu/cpu$hostCpu/online"
    local curState=""

    if [[ -r "$onlineFile" ]]; then
        curState=$(< "$onlineFile")
    fi
    if [[ -z "$curState" ]]; then
        echo "ERROR: failed to query online status for CPU $hostCpu" >&2
        return 1
    fi
    if [[ "$reqState" == "$curState" ]]; then
        echo "CPU $hostCpu: requested state $reqState - it is already set"
        return
    fi

    echo -n "CPU $hostCpu: changing state from $curState to $reqState... "
    # The write is tested directly: with errexit active a separate "$?" check
    # after a failed write would never be reached
    if echo "$reqState" > "$onlineFile"; then
        echo "[OK]"
    else
        echo "[FAILED]"
        return 1
    fi
}

# Put host CPU (thread) range into offline or online state. Range is inclusive.
# The loop counter is local so that a caller's own "i" is never clobbered.
# Params: hostThreadFrom# hostThreadTo# desiredState{0,1}
setCpuStateRange () {
    local i
    for (( i=$1; i<=$2; i++ )); do
        setCpuState "$i" "$3"
    done
}

# Flushes pending writes with "sync", then writes 3 to /proc/sys/vm/drop_caches
# and 1 to /proc/sys/vm/compact_memory (the pre-start phase uses this to drop
# caches and compact memory for hugepages)
# Params: <none>
tidyCaches () {
    printf '%s' "Tidying caches... "
    sync
    echo 3 > /proc/sys/vm/drop_caches
    echo 1 > /proc/sys/vm/compact_memory
    printf '%s\n' "[OK]"
}

# Sets cgroup slice or scope CPU isolation through the systemd "AllowedCPUs"
# property; "--runtime" is passed to systemctl set-property
# Params: sliceOrScopeName hostThreadsList (e.g. 11,12,13-19)
setCgroupAllowedCpus () {
    local entity="$1"
    local allowedCpus="$2"
    local -a propertyCmd=(systemctl set-property --runtime -- "$entity" "AllowedCPUs=$allowedCpus")

    printf 'Forcing "%s" cgroup to only use CPU(s) %s\n' "$entity" "$allowedCpus"
    "${propertyCmd[@]}"
}

# Sets logical CPUs (threads) which can be used by processes on the host.
# Applies the list to init.scope, system.slice and user.slice, in that order.
# Params: hostThreadsList (e.g. 11,12,13-19)
setHostAllowedCpus () {
    local cpuList="$1"
    local unit=""

    echo "Setting host userland CPU constrain to $cpuList"
    for unit in init.scope system.slice user.slice; do
        setCgroupAllowedCpus "$unit" "$cpuList"
    done
}

# Sets logical CPUs (threads) which can be used by QEMU processes, by applying
# the list to qemu.slice
# Params: hostThreadsList (e.g. 11,12,13-19)
setQemuAllowedCpus () {
    local cpuList="$1"

    echo "Setting QEMU CPU default constrain to $cpuList"
    setCgroupAllowedCpus "qemu.slice" "$cpuList"
}

# Makes sure that a decoupled slice for some QEMU VMs exists
# This only does something the first time a VM starts; once the cgroup
# directory is present the function returns immediately.
# Params: <none>
ensureQemuDecoupledSlice () {
    local slicePath="/sys/fs/cgroup/qemu-decoupled.slice"

    if [[ ! -d "$slicePath" ]]; then
        echo "Creating decoupled QEMU cgroup"
        mkdir "$slicePath"

        # The slice itself MUST be allowed to run on ALL CPUs. The reason
        # for that is we will move vCPUs to an isolated set of cores BUT
        # put emulator and iothread(s) on the shared CPUs. Since cgroups v2
        # doesn't allow a thread/task to be in a different cgroup than the
        # parent, these tasks must stay in the qemu-decoupled.slice but with
        # different affinity
        local lastCPU=$(getLastCpu)
        setCgroupAllowedCpus "qemu-decoupled.slice" "0-$lastCPU"
    fi
}

# Moves the VM to an isolated cgroup, outside of the OS user/system/init groups, as well
# as away from the standard qemu.slice used by Proxmox; see systemd-cgls
#
# All processes from host run under system.slice and user.slice, while all QEMU machines run
# under qemu.slice. Proxmox actually hardcodes that slice in their startup code:
# https://github.com/proxmox/qemu-server/blob/79f5ca393ab3608ff2e82c929167f079f964a505/PVE/QemuServer.pm#L5892-L5893
# This means that setting "setQemuAllowedCpus" to 1st CCX makes it impossible to pin vCPU
# threads to the 2nd CCX (taskset will fail), as the parent slice where the thread/service is
# running will enforce 1st CCX only AllowedCPUs. The only way around this I found is to migrate
# the VM scope (each one gets a separate one named <VMID>.scope) to a different scope which isn't
# under any of the standard slices. However, this is not supported by systemd, as confirmed by one
# of the systemd authors: https://www.spinics.net/lists/systemd-devel/msg04072.html but cgroups can
# be used directly (albeit without warranties).
#
# Globals: vmId (read)
# Params: <none>
# Returns: 1 when the QEMU parent PID cannot be determined (nothing is created
#          or migrated in that case), otherwise the status of the cgroup.procs write
decoupleQemuVm () {
    local vmScope="/sys/fs/cgroup/qemu-decoupled.slice/$vmId.scope"
    local qemuParentPid=""

    ensureQemuDecoupledSlice

    # Resolve the PID first and separately from "local": a masked failure would
    # end up writing an error text into cgroup.procs
    if ! qemuParentPid=$(getQemuPID); then
        echo "ERROR: cannot decouple VMID=$vmId: QEMU parent PID is unknown" >&2
        return 1
    fi

    if [[ ! -d "$vmScope" ]]; then
        echo "Creating cgroups scope for VMID=$vmId at $vmScope"
        mkdir "$vmScope"
    fi

    echo "Migrating VMID=$vmId PPID=$qemuParentPid to scope $vmScope"
    echo "$qemuParentPid" > "$vmScope/cgroup.procs"
}

# Starts/stops the "idle" windows VM to force very low GPU power states
# Globals: idleVmId (read)
# Params: qmAction - passed to "qm" as its sub-command (callers use start / shutdown)
setIdleVm () {
    local qmAction="$1"

    printf 'Setting idle VM to %s\n' "$qmAction"
    qm "$qmAction" "$idleVmId"
}

# Since updates around 2023/03/20-22 GPUs and some other PCIe devices will only work once.
# When the VM is turned off and on again it will just black-screen and never boot. As a
# workaround every hostpci device of the VM is removed from the PCI bus and the bus is
# re-scanned afterwards.
#
# Globals: vmId (read)
# Params: <none>
# Returns: 0 on success (including "nothing to reset"), 1 when a removed
#          function does not disappear from sysfs within the wait limit
resetVmPciDevices () {
    echo "Resetting VM PCI devices..."

    local vmConf="/etc/pve/qemu-server/$vmId.conf"
    local maxWaitSecs=60
    local vmPciDevices=""
    local pciAddr=""
    local pciAddrFunRm=""
    local pciAddrFun=""
    local waitedSecs=0

    # grep exits non-zero when nothing matches or the config is unreadable;
    # both simply mean there is nothing to reset ("|| true" keeps errexit quiet)
    vmPciDevices=$(grep -E '^hostpci[0-9]+:' "$vmConf" 2> /dev/null | grep -o -E '[0-9a-f]+:[0-9a-f]+:[0-9a-f]+(\.[0-9]*)?') || true
    if [[ -z "$vmPciDevices" ]]; then
        echo "No hostpci devices found for VM=$vmId, nothing to reset"
        return 0
    fi

    while IFS= read -r pciAddr; do
        # Single function (mostly SR-IOV or vGPU) device
        if [[ "$pciAddr" == *.* ]]; then
            echo "Removing PCI device function at $pciAddr"
            echo 1 > "/sys/bus/pci/devices/$pciAddr/remove" || true
            continue
        fi

        # Whole device specified => remove all functions
        for pciAddrFunRm in "/sys/bus/pci/devices/$pciAddr".*/remove; do
            # An unmatched glob stays literal; there is nothing to remove then
            if [[ ! -e "$pciAddrFunRm" ]]; then continue; fi

            # /sys/bus/pci/devices/<addr>.<fn>/remove -> .<fn>
            pciAddrFun="${pciAddrFunRm%/remove}"
            pciAddrFun=".${pciAddrFun##*.}"
            echo "Removing PCI device $pciAddr function $pciAddrFun"
            echo 1 > "$pciAddrFunRm" || true

            # This is absolutely required. Attempting to remove one function CAN
            # remove all of them but it's not instantaneous. However, if you hit
            # such a case and try to manually do /remove on another function while
            # the first is being removed a "general protection fault" will happen
            # in the subsequent "pci_stop_and_remove_bus_device_locked()".
            # The wait is bounded so that a removal that never completes cannot
            # hang the hook forever.
            waitedSecs=0
            while [[ -f "$pciAddrFunRm" ]]; do
                if (( waitedSecs >= maxWaitSecs )); then
                    echo "ERROR: $pciAddrFunRm still present after ${maxWaitSecs}s, giving up" >&2
                    return 1
                fi
                sleep 1
                waitedSecs=$(( waitedSecs + 1 ))
                echo "Still waiting for $pciAddrFunRm..."
            done
        done
    done <<< "$vmPciDevices"

    echo "Re-scanning PCI devices..."
    echo 1 > /sys/bus/pci/rescan
    # rescan is asynchronous; if we wanted to be 100% correct here we should wait
    # for /sys entries to appear, but 2 seconds delay is good enough
    sleep 2
}

# Reads the hook settings idleVm, miscCpus, vmCpus and cpuGovernor from the VM
# config. For each key the first line containing "<key>=" is used and the
# value is everything after the last "=" on that line, with trailing whitespace removed.
# A key that is missing (or an unreadable config) keeps the variable's current
# value instead of aborting the script under errexit/pipefail.
#
# Globals: vmId (read); idleVm, idleVmId, miscCpus, vmCpus, cpuGovernor (written)
# Params: [configFile] - optional, defaults to /etc/pve/qemu-server/<vmId>.conf
parseConfig () {
    local confFile="${1:-/etc/pve/qemu-server/${vmId}.conf}"
    local key=""
    local value=""

    echo "Parsing config"

    # Make sure every setting exists so later reads are safe under "nounset"
    idleVm="${idleVm:-no}"
    miscCpus="${miscCpus:-}"
    vmCpus="${vmCpus:-}"
    cpuGovernor="${cpuGovernor:-}"

    for key in idleVm miscCpus vmCpus cpuGovernor; do
        # "|| true": grep exits non-zero when the key is absent
        value=$(grep -m 1 -- "${key}=" "$confFile" 2> /dev/null | sed -e 's/.*=//' -e 's/[[:space:]]*$//') || true
        if [[ -n "$value" ]]; then
            printf -v "$key" '%s' "$value"
            # The idleVm setting doubles as the ID of the idle VM
            if [[ "$key" == "idleVm" ]]; then
                idleVmId="$value"
            fi
        fi
        echo "${key}=${!key}"
    done

    echo "Finished parsing config"
}

# Load idleVm/miscCpus/vmCpus/cpuGovernor from the VM config before dispatching
parseConfig

# Dispatch on the hook phase passed as the 2nd script argument
case "$runPhase" in
    pre-start)
        # Stop idle VM, drop caches & compact memory for hugepages
	if [[ ${idleVm} != "no" ]]; then
            setIdleVm shutdown
        fi

        tidyCaches
        resetVmPciDevices
    ;;

    # Designate the CPUs listed in vmCpus to the VM and those in miscCpus to host/housekeeping stuff
    # (originally written for a 5900x: 2nd CCD, core 6-11, thread 6-11+18-23, for the VM and 1st CCD for the host).
    # All modifications should be done in post-start as doing them in pre-start will execute them even
    #  if the VM fails to start (and thus post-stop will never be called)
    post-start)
        # This will inform cgroups via systemd to not use 2nd CCX, effectively constraining host to 1st CCX.
        # This isn't perfect as it will not stop kthreads. "cset" used to mostly work for kthreads (except like docker &
        # ZFS), but it doesn't work with cgroups v2: https://forum.proxmox.com/threads/cset-failing-pve7.95613/
        # I have no idea about any alternatives besides CPU hotplug hack (see below)
        # WARNING: THIS MUST BE DONE BEFORE ANY OTHER PINNING. Manipulating slice/scope CPU lists will reset
        # any manual pinning due to a systemd bug/design choice: https://github.com/systemd/systemd/issues/23748
        # The "setQemuAllowedCpus" will be overwritten for just this VM by "decoupleQemuVm" later.
        setHostAllowedCpus "${miscCpus}"
        setQemuAllowedCpus "${miscCpus}"

        # Forcefully move all tasks (user space & kthreads) off the 2nd CCX by offlining them temporarily
        # vmCpus is split on commas; each element is passed on as a single CPU number
        echo "Offlining to-be pinned CPUs to move tasks away..."
        for cpu in ${vmCpus//,/ }
        do
            setCpuState ${cpu} 0
        done

        # Move kernel threads & IRQs away from vCPU threads
        # Doing this when CPUs are offlined makes it easier as
        # nothing is running on these CPUs actively
        pinIrqs "${miscCpus}"
        pinKthreads "${miscCpus}"

        # Bring second CCX online - nothing should be scheduled on it due to host & QEMU constraints from above
        echo "Onlineing to-be pinned CPUs..."
        for cpu in ${vmCpus//,/ }
        do
            setCpuState ${cpu} 1
        done

        # Set frequency scaling of the VM CPUs to the governor configured as cpuGovernor
        for cpu in ${vmCpus//,/ }
        do
            setGovernor ${cpu} ${cpuGovernor}
        done

        # Stats generation causes jitter in VR
        sysctl vm.stat_interval=120

        # Migrate this VM to a separate isolation group (TLDR: see systemd-cgls)
        # An alternative hacky way to do that would be to iterate over all currently running VMs and
        # taskset their affinity to 1st CCX, but a new VM starting while this one is running will
        # break this. So, it's better to isolate the whole qemu.slice with exception of this VM. That
        # requires the VM process to be moved to a non-qemu.slice
        decoupleQemuVm

        # Pin vCPUs to correct threads - this is crucial.
        # Since SMT/HT is enabled and proper SMT is passed to the guest, the vCPUs need to be pinned
        # to correct host logical CPUs. QEMU assigns vCPUs sequentially; i.e. vCPU0 == 1st thread of
        # first vCPU, vCPU1 == 2nd thread of first vCPU, vCPU2 == 1st thread of second vCPU etc.
        # In Linux (at least this one according to lscpu -e) CPU0 is a 1st thread of first core, with
        # CPU12 being the 2nd/SMT thread of first core. For the 2nd CCX it's a 6+18, 7+19, 8+20, etc
        # mapping.
        # The n-th entry of vmCpus is therefore pinned to vCPU n-1, so the order of vmCpus matters.
        vCpu=0
        for cpu in ${vmCpus//,/ }
        do
            pinVCpu ${vCpu} ${cpu}
            ((vCpu=vCpu+1))
        done

        # Move all QEMU threads (emulator, iothread) of this VM to 1st CCX. This is pretty dumb. IOThread should
        # probably be pinned to a single core, but we're counting on host scheduler being smart.
        # To do static pinning here QMP needs to be used to query types of threads:
        # https://wiki.qemu.org/Documentation/QMP
        pinNonVCpuTasks "${miscCpus}"
      ;;

    # Nothing to do before the VM stops
    pre-stop)
      ;;
    post-stop)
      # Only restore host-wide defaults when no QEMU PID file of any VM is left
      if ! compgen -G "/run/qemu-server/*.pid" > /dev/null; then
          echo "No other pinned VM runnig, restoring defaults"

          lastCpu=$(getLastCpu)
          # Allow kthreads, IRQs, host & QEMU to use all CPUs again
          pinKthreads "0-$lastCpu"
          pinIrqs "0-$lastCpu"
          setHostAllowedCpus "0-$lastCpu"
          setQemuAllowedCpus "0-$lastCpu"

          # Restore default scaling
          resetGovernor

          # Restore default virtual mem stats frequency
          sysctl vm.stat_interval=1
      fi

      # Reset the passed-through PCI devices, then start the idle VM (if one is configured)
      resetVmPciDevices

      if [[ ${idleVm} != "no" ]]; then
          setIdleVm start
      fi

      ;;
    # Any other phase is only reported; the script still continues to the final message
    *)
      echo "Unknown run phase \"$runPhase\"!"
      ;;
esac
echo "Finished $runPhase on VM=$vmId"


# vCPU pinning should be done 1:1 between guest and host, especially on systems using NUMA and/or CCDs.
# On 5900x the core config, as seen in lscpu -e, looks like the following:
# CCX #0:
#  - NUMA: node 0
#  - CPU: 0-5, 12-17 (SMT threads/host CPU#)
#  - CORE: 0-5
# CCX #1:
#  - NUMA: node 1
#  - CPU: 6-11, 18-23
#  - CORE: 6-11
# "lstopo" shouldn't be used here, as it has a bug when RAM is not NUMA but L3 is: https://github.com/open-mpi/hwloc/issues/430
#
# Pinning can be semi-automated with scripts taking NUMA etc. into account, but every system is different,
# so it's better to consciously tune it by hand. Some scripts are here: https://github.com/64kramsystem/qemu-pinning#one-vcpus-per-corethread-except-one-core
# There are some unexplored ideas also at https://github.com/rokups/rokups.github.io/blob/master/pages/gaming-vm-performance.md
#
# Useful commands while debugging this code:
#  List running tasks with their affinity as of now: (the "]" filters out kthreads)
#  ps -T -e -o psr,pid,ppid,pgid,sid,comm,cmd | grep -P '^\s+(6|7|8|9|10|11|18|19|20|21|22|23)' | grep -v -P '\]$' | sort | cut -c-$COLUMNS
#  Track cgroups resources usage: systemd-cgtop
#  See tree of cgroups: systemd-cgls


# Gets QEMU parent process PID for the current VM
#
# Globals: vmId (read)
# Outputs: the PID on stdout. Diagnostics go to stderr, so a caller capturing
#          the output with $(getQemuPID) never mistakes an error text for a PID
# Returns: 0 on success, 1 when the PID file is missing, unreadable or empty
getQemuPID () {
    local pidFile="/run/qemu-server/$vmId.pid"
    local qemuParentPid=""

    if [[ -r "$pidFile" ]]; then
        qemuParentPid=$(< "$pidFile")
    fi
    if [[ -z "$qemuParentPid" ]]; then
        echo "ERROR: failed to get QEMU parent PID for VM=$vmId" >&2
        return 1
    fi

    echo "$qemuParentPid"
}

# Prints the index of the last logical CPU (thread) of the system, i.e. the
# count reported by "nproc --all" minus one
# Params: <none>
getLastCpu () {
    local cpuTotal=$(nproc --all)
    echo "$(( cpuTotal - 1 ))"
}

# Pin vCPU to a host logical CPU (thread)
# The thread SHOULD be a single one, but it can be any taskset list
#
# Since cgroups v2 (used by Proxmox) does NOT allow moving tasks/threads to
# sub-scopes, affinity has to be set per-task with taskset here.
#
# Globals: vmId (read)
# Params: vCPU# hostThread#orList
# Returns: 0 when the affinity was set; 1 when the QEMU PID or the vCPU task
#          cannot be found; otherwise the exit status of taskset
pinVCpu () {
    local vCpuNum=$1
    local hostThreadNum="$2"
    local qemuParentPid=""
    local vCpuTaskPid=""

    # Assign separately from "local" so a getQemuPID failure is not masked
    if ! qemuParentPid=$(getQemuPID); then
        echo "ERROR: cannot pin vCPU $vCpuNum: QEMU parent PID for VM=$vmId is unknown" >&2
        return 1
    fi

    # -H forces the "<path>:" prefix even when the glob expands to a single
    # file; the TID is cut out of that path (5th "/"-separated field).
    # "|| true": no match is reported below instead of tripping errexit/pipefail
    vCpuTaskPid=$(grep -H "^CPU $vCpuNum/KVM\$" "/proc/$qemuParentPid/task"/*/comm | cut -d '/' -f5) || true
    if [[ -z "$vCpuTaskPid" ]]; then
        echo "ERROR: failed to get Task PID for vCPU $vCpuNum" >&2
        return 1
    fi

    echo "Pinning VM $vmId (PPID=$qemuParentPid) vCPU $vCpuNum (TPID=$vCpuTaskPid) to host thread(s) $hostThreadNum"
    taskset --cpu-list --pid "$hostThreadNum" "$vCpuTaskPid"
}


# Pins all non-vCPU QEMU threads (io, emulator, rcu) to host logical CPU(s)
# Unlike in pinVCpu, the target SHOULD probably be a list of threads
#
# Since cgroups v2 (used by Proxmox) does NOT allow moving tasks/threads to
# sub-scopes, affinity has to be set per-task with taskset here.
#
# Globals: vmId (read)
# Params: hostThread#orList
# Returns: 0 normally (individual taskset failures are tolerated), 1 when the
#          QEMU parent PID cannot be determined
pinNonVCpuTasks () {
    local hostThreadNum="$1"
    local qemuParentPid=""
    local commFile=""
    local tpid=""
    local taskComm=""

    # Assign separately from "local" so a getQemuPID failure is not masked
    if ! qemuParentPid=$(getQemuPID); then
        echo "ERROR: cannot pin non-vCPU tasks: QEMU parent PID for VM=$vmId is unknown" >&2
        return 1
    fi

    for commFile in "/proc/$qemuParentPid/task"/*/comm; do
        # A thread may exit between glob expansion and the read; skip it then
        taskComm=""
        { IFS= read -r taskComm < "$commFile"; } 2>/dev/null || continue

        # vCPU threads are named "CPU <n>/KVM"; they are handled by pinVCpu
        if [[ "$taskComm" =~ ^CPU\ [0-9] ]]; then continue; fi

        # /proc/<pid>/task/<tid>/comm -> <tid>
        tpid="${commFile%/comm}"
        tpid="${tpid##*/}"

        echo "Pinning VM $vmId (PPID=$qemuParentPid) non-vCPU task \"$taskComm\" (TPID=$tpid) to host thread(s) $hostThreadNum"
        # Best effort: the task may be gone already
        taskset --cpu-list --pid "$hostThreadNum" "$tpid" || true
    done

    return 0
}

# Kernel threads (so-called "kthreads") aren't grouped under any of the cgroups. Thus
# to control their affinity manual pinning is needed.
# There are hacky ways to identify kthreads like parsing "ps", but the proper way to do
# that is to actually check the thread type. All kernel threads are marked with the PF_KTHREAD
# mask (see https://elixir.bootlin.com/linux/v6.3-rc6/source/include/linux/sched.h#L1740)
#
# /proc/<pid>/stat is "<pid> (<comm>) <state> <ppid> <pgrp> <session> <tty_nr> <tpgid> <flags> ...".
# <comm> may itself contain spaces and parentheses, so the line is split around the
# FIRST "(" and the LAST ")" instead of on whitespace; otherwise the fields shift and
# an unrelated number would be interpreted as the flags.
#
# Params: hostThread#orList
pinKthreads () {
    local hostThreadNum="$1"
    local statFile=""
    local statLine=""
    local pid=""
    local comm=""
    local flags=""
    local -a statRest=()

    echo "Attempting to pin all kthreads to $hostThreadNum..."
    for statFile in /proc/[0-9]*/stat; do
        # This CAN sometimes fail due to TOC-TOU (process exited meanwhile)
        statLine=""
        { IFS= read -r statLine < "$statFile"; } 2>/dev/null || true
        if [[ -z "$statLine" ]]; then continue; fi

        pid="${statLine%% *}"
        comm="${statLine#*(}"
        comm="${comm%)*}"
        # Everything after the last ") ": index 0 is <state>, index 6 is <flags>
        read -r -a statRest <<< "${statLine##*) }"
        flags="${statRest[6]:-}"
        if [[ ! "$flags" =~ ^[0-9]+$ ]]; then continue; fi

        # Ignore not kthreads (PF_KTHREAD == 0x00200000)
        if (( (flags & 0x00200000) != 0x00200000 )); then continue; fi

        # This CAN fail for some kthreads that are needed on specific CPUs
        if taskset --cpu-list --pid "$hostThreadNum" "$pid" > /dev/null 2>&1; then
            echo "Pinned kthread \"$comm\" (PID=$pid) to host thread(s) $hostThreadNum"
        fi
    done
}

# Most IRQs can be moved away from the threads running vCPUs, where they can cause
# jitter when rescheduled. This function is not perfect as it doesn't set a mask
# for not-yet-triggered IRQs (/proc/irq/default_smp_affinity). However, this shouldn't
# be needed: if the VM isn't started on boot, most if not all busy IRQs would have
# been triggered by now.
#
# Every /proc/irq/<n>/smp_affinity_list is written with the requested list; IRQs
# that refuse the write are skipped without a message.
#
# Params: hostThread#orList
pinIrqs () {
    local hostThreadNum="$1"
    local affinityFile=""
    local irqNum=""

    echo "Pinning IRQs to host thread(s) $hostThreadNum..."
    for affinityFile in /proc/irq/*/smp_affinity_list; do
        # /proc/irq/<n>/smp_affinity_list -> <n>
        irqNum="${affinityFile%/smp_affinity_list}"
        irqNum="${irqNum##*/}"

        if echo "$hostThreadNum" > "$affinityFile" 2> /dev/null; then
           echo "Pinned IRQ $irqNum to host thread(s) $hostThreadNum"
        fi
    done
}

# Set governor/scaling for a host logical CPU (thread)
# Reads the current governor from sysfs and only writes when it differs.
# Params: hostThread# desiredGovernor
# Returns: 1 when the current governor cannot be read, otherwise the status of
#          the last command executed
setGovernor () {
    local hostCpu=$1
    local reqGov="$2"
    local govFile="/sys/devices/system/cpu/cpu$hostCpu/cpufreq/scaling_governor"
    local curGov=$(cat "$govFile")

    if [[ -z "$curGov" ]]; then
        echo "ERROR: failed to query governor for CPU $hostCpu"
        return 1
    fi

    if [[ "$reqGov" != "$curGov" ]]; then
        echo "CPU $hostCpu: changing governor from $curGov to $reqGov"
        echo "$reqGov" > "$govFile"
        return
    fi

    echo "CPU $hostCpu: requested governor $reqGov - it is already set"
}

# Sets governor/scaling on a range of host CPUs (threads). Range is inclusive.
# The loop counter is local so that a caller's own "i" is never clobbered.
# Params: hostThreadFrom# hostThreadTo# desiredGovernor
setGovernorRange () {
    local i
    for (( i=$1; i<=$2; i++ )); do
        setGovernor "$i" "$3"
    done
}

# Resets governor/scaling to default state by restarting the "cpufrequtils" service
# Params: <none>
resetGovernor () {
    printf '%s\n' "Resetting CPU governor to default"
    service cpufrequtils restart
}

# Put host CPU (thread) into offline or online state
# Params: hostThread# desiredState{0,1}
# Returns: 0 when the CPU is (now) in the requested state, 1 when the current
#          state cannot be read or the state change is rejected
setCpuState () {
    local hostCpu=$1
    local reqState=$2
    local onlineFile="/sys/devices/system/cpu/cpu$hostCpu/online"
    local curState=""

    if [[ -r "$onlineFile" ]]; then
        curState=$(< "$onlineFile")
    fi
    if [[ -z "$curState" ]]; then
        echo "ERROR: failed to query online status for CPU $hostCpu" >&2
        return 1
    fi
    if [[ "$reqState" == "$curState" ]]; then
        echo "CPU $hostCpu: requested state $reqState - it is already set"
        return
    fi

    echo -n "CPU $hostCpu: changing state from $curState to $reqState... "
    # The write is tested directly: with errexit active a separate "$?" check
    # after a failed write would never be reached
    if echo "$reqState" > "$onlineFile"; then
        echo "[OK]"
    else
        echo "[FAILED]"
        return 1
    fi
}

# Put host CPU (thread) range into offline or online state. Range is inclusive.
# The loop counter is local so that a caller's own "i" is never clobbered.
# Params: hostThreadFrom# hostThreadTo# desiredState{0,1}
setCpuStateRange () {
    local i
    for (( i=$1; i<=$2; i++ )); do
        setCpuState "$i" "$3"
    done
}

# Flushes pending writes with "sync", then writes 3 to /proc/sys/vm/drop_caches
# and 1 to /proc/sys/vm/compact_memory (the pre-start phase uses this to drop
# caches and compact memory for hugepages)
# Params: <none>
tidyCaches () {
    printf '%s' "Tidying caches... "
    sync
    echo 3 > /proc/sys/vm/drop_caches
    echo 1 > /proc/sys/vm/compact_memory
    printf '%s\n' "[OK]"
}

# Sets cgroup slice or scope CPU isolation through the systemd "AllowedCPUs"
# property; "--runtime" is passed to systemctl set-property
# Params: sliceOrScopeName hostThreadsList (e.g. 11,12,13-19)
setCgroupAllowedCpus () {
    local entity="$1"
    local allowedCpus="$2"
    local -a propertyCmd=(systemctl set-property --runtime -- "$entity" "AllowedCPUs=$allowedCpus")

    printf 'Forcing "%s" cgroup to only use CPU(s) %s\n' "$entity" "$allowedCpus"
    "${propertyCmd[@]}"
}

# Sets logical CPUs (threads) which can be used by processes on the host.
# Applies the list to init.scope, system.slice and user.slice, in that order.
# Params: hostThreadsList (e.g. 11,12,13-19)
setHostAllowedCpus () {
    local cpuList="$1"
    local unit=""

    echo "Setting host userland CPU constrain to $cpuList"
    for unit in init.scope system.slice user.slice; do
        setCgroupAllowedCpus "$unit" "$cpuList"
    done
}

# Sets logical CPUs (threads) which can be used by QEMU processes, by applying
# the list to qemu.slice
# Params: hostThreadsList (e.g. 11,12,13-19)
setQemuAllowedCpus () {
    local cpuList="$1"

    echo "Setting QEMU CPU default constrain to $cpuList"
    setCgroupAllowedCpus "qemu.slice" "$cpuList"
}

# Makes sure that a decoupled slice for some QEMU VMs exists
# This only does something the first time a VM starts; once the cgroup
# directory is present the function returns immediately.
# Params: <none>
ensureQemuDecoupledSlice () {
    local slicePath="/sys/fs/cgroup/qemu-decoupled.slice"

    if [[ ! -d "$slicePath" ]]; then
        echo "Creating decoupled QEMU cgroup"
        mkdir "$slicePath"

        # The slice itself MUST be allowed to run on ALL CPUs. The reason
        # for that is we will move vCPUs to an isolated set of cores BUT
        # put emulator and iothread(s) on the shared CPUs. Since cgroups v2
        # doesn't allow a thread/task to be in a different cgroup than the
        # parent, these tasks must stay in the qemu-decoupled.slice but with
        # different affinity
        local lastCPU=$(getLastCpu)
        setCgroupAllowedCpus "qemu-decoupled.slice" "0-$lastCPU"
    fi
}

# Moves the VM to an isolated cgroup, outside of the OS user/system/init groups, as well
# as away from the standard qemu.slice used by Proxmox; see systemd-cgls
#
# All processes from host run under system.slice and user.slice, while all QEMU machines run
# under qemu.slice. Proxmox actually hardcodes that slice in their startup code:
# https://github.com/proxmox/qemu-server/blob/79f5ca393ab3608ff2e82c929167f079f964a505/PVE/QemuServer.pm#L5892-L5893
# This means that setting "setQemuAllowedCpus" to 1st CCX makes it impossible to pin vCPU
# threads to the 2nd CCX (taskset will fail), as the parent slice where the thread/service is
# running will enforce 1st CCX only AllowedCPUs. The only way around this I found is to migrate
# the VM scope (each one gets a separate one named <VMID>.scope) to a different scope which isn't
# under any of the standard slices. However, this is not supported by systemd, as confirmed by one
# of the systemd authors: https://www.spinics.net/lists/systemd-devel/msg04072.html but cgroups can
# be used directly (albeit without warranties).
#
# Globals: vmId (read)
# Params: <none>
# Returns: 1 when the QEMU parent PID cannot be determined (nothing is created
#          or migrated in that case), otherwise the status of the cgroup.procs write
decoupleQemuVm () {
    local vmScope="/sys/fs/cgroup/qemu-decoupled.slice/$vmId.scope"
    local qemuParentPid=""

    ensureQemuDecoupledSlice

    # Resolve the PID first and separately from "local": a masked failure would
    # end up writing an error text into cgroup.procs
    if ! qemuParentPid=$(getQemuPID); then
        echo "ERROR: cannot decouple VMID=$vmId: QEMU parent PID is unknown" >&2
        return 1
    fi

    if [[ ! -d "$vmScope" ]]; then
        echo "Creating cgroups scope for VMID=$vmId at $vmScope"
        mkdir "$vmScope"
    fi

    echo "Migrating VMID=$vmId PPID=$qemuParentPid to scope $vmScope"
    echo "$qemuParentPid" > "$vmScope/cgroup.procs"
}

# Starts/stops the "idle" windows VM to force very low GPU power states
# Globals: idleVmId (read)
# Params: qmAction - passed to "qm" as its sub-command (callers use start / shutdown)
setIdleVm () {
    local qmAction="$1"

    printf 'Setting idle VM to %s\n' "$qmAction"
    qm "$qmAction" "$idleVmId"
}

# Since updates around 2023/03/20-22 GPUs and some other PCIe devices will only work once.
# When the VM is turned off and on again it will just black-screen and never boot. As a
# workaround every hostpci device of the VM is removed from the PCI bus and the bus is
# re-scanned afterwards.
#
# Globals: vmId (read)
# Params: <none>
# Returns: 0 on success (including "nothing to reset"), 1 when a removed
#          function does not disappear from sysfs within the wait limit
resetVmPciDevices () {
    echo "Resetting VM PCI devices..."

    local vmConf="/etc/pve/qemu-server/$vmId.conf"
    local maxWaitSecs=60
    local vmPciDevices=""
    local pciAddr=""
    local pciAddrFunRm=""
    local pciAddrFun=""
    local waitedSecs=0

    # grep exits non-zero when nothing matches or the config is unreadable;
    # both simply mean there is nothing to reset ("|| true" keeps errexit quiet)
    vmPciDevices=$(grep -E '^hostpci[0-9]+:' "$vmConf" 2> /dev/null | grep -o -E '[0-9a-f]+:[0-9a-f]+:[0-9a-f]+(\.[0-9]*)?') || true
    if [[ -z "$vmPciDevices" ]]; then
        echo "No hostpci devices found for VM=$vmId, nothing to reset"
        return 0
    fi

    while IFS= read -r pciAddr; do
        # Single function (mostly SR-IOV or vGPU) device
        if [[ "$pciAddr" == *.* ]]; then
            echo "Removing PCI device function at $pciAddr"
            echo 1 > "/sys/bus/pci/devices/$pciAddr/remove" || true
            continue
        fi

        # Whole device specified => remove all functions
        for pciAddrFunRm in "/sys/bus/pci/devices/$pciAddr".*/remove; do
            # An unmatched glob stays literal; there is nothing to remove then
            if [[ ! -e "$pciAddrFunRm" ]]; then continue; fi

            # /sys/bus/pci/devices/<addr>.<fn>/remove -> .<fn>
            pciAddrFun="${pciAddrFunRm%/remove}"
            pciAddrFun=".${pciAddrFun##*.}"
            echo "Removing PCI device $pciAddr function $pciAddrFun"
            echo 1 > "$pciAddrFunRm" || true

            # This is absolutely required. Attempting to remove one function CAN
            # remove all of them but it's not instantaneous. However, if you hit
            # such a case and try to manually do /remove on another function while
            # the first is being removed a "general protection fault" will happen
            # in the subsequent "pci_stop_and_remove_bus_device_locked()".
            # The wait is bounded so that a removal that never completes cannot
            # hang the hook forever.
            waitedSecs=0
            while [[ -f "$pciAddrFunRm" ]]; do
                if (( waitedSecs >= maxWaitSecs )); then
                    echo "ERROR: $pciAddrFunRm still present after ${maxWaitSecs}s, giving up" >&2
                    return 1
                fi
                sleep 1
                waitedSecs=$(( waitedSecs + 1 ))
                echo "Still waiting for $pciAddrFunRm..."
            done
        done
    done <<< "$vmPciDevices"

    echo "Re-scanning PCI devices..."
    echo 1 > /sys/bus/pci/rescan
    # rescan is asynchronous; if we wanted to be 100% correct here we should wait
    # for /sys entries to appear, but 2 seconds delay is good enough
    sleep 2
}

# Reads the hook settings idleVm, miscCpus, vmCpus and cpuGovernor from the VM
# config. For each key the first line containing "<key>=" is used and the
# value is everything after the last "=" on that line, with trailing whitespace removed.
# A key that is missing (or an unreadable config) keeps the variable's current
# value instead of aborting the script under errexit/pipefail.
#
# Globals: vmId (read); idleVm, idleVmId, miscCpus, vmCpus, cpuGovernor (written)
# Params: [configFile] - optional, defaults to /etc/pve/qemu-server/<vmId>.conf
parseConfig () {
    local confFile="${1:-/etc/pve/qemu-server/${vmId}.conf}"
    local key=""
    local value=""

    echo "Parsing config"

    # Make sure every setting exists so later reads are safe under "nounset"
    idleVm="${idleVm:-no}"
    miscCpus="${miscCpus:-}"
    vmCpus="${vmCpus:-}"
    cpuGovernor="${cpuGovernor:-}"

    for key in idleVm miscCpus vmCpus cpuGovernor; do
        # "|| true": grep exits non-zero when the key is absent
        value=$(grep -m 1 -- "${key}=" "$confFile" 2> /dev/null | sed -e 's/.*=//' -e 's/[[:space:]]*$//') || true
        if [[ -n "$value" ]]; then
            printf -v "$key" '%s' "$value"
            # The idleVm setting doubles as the ID of the idle VM
            if [[ "$key" == "idleVm" ]]; then
                idleVmId="$value"
            fi
        fi
        echo "${key}=${!key}"
    done

    echo "Finished parsing config"
}

# Load idleVm, miscCpus, vmCpus and cpuGovernor from the VM config before
# dispatching on the phase, as every phase below relies on them
parseConfig

# Dispatch on the hook phase received as the 2nd script argument
case "$runPhase" in
    pre-start)
        # Stop idle VM (if one is configured), drop caches & compact memory for hugepages
	if [[ ${idleVm} != "no" ]]; then
            setIdleVm shutdown
        fi

        tidyCaches
        # Work around passed-through PCI devices that only work once per host boot
        resetVmPciDevices
    ;;

    post-start)
        # This will inform cgroups via systemd to only use miscCpus (e.g. not the 2nd CCX), effectively
        # constraining the host to them (e.g. to the 1st CCX).
        # This isn't perfect as it will not stop kthreads. "cset" used to mostly work for kthreads (except like docker &
        # ZFS), but it doesn't work with cgroups v2: https://forum.proxmox.com/threads/cset-failing-pve7.95613/
        # I have no idea about any alternatives besides CPU hotplug hack (see below)
        # WARNING: THIS MUST BE DONE BEFORE ANY OTHER PINNING. Manipulating slice/scope CPU lists will reset
        # any manual pinning due to a systemd bug/design choice: https://github.com/systemd/systemd/issues/23748
        # The "setQemuAllowedCpus" will be overwritten for just this VM by "decoupleQemuVm" later.
        setHostAllowedCpus "${miscCpus}"
        setQemuAllowedCpus "${miscCpus}"

        # Forcefully move all tasks (user space & kthreads) off the VM CPUs (e.g. the 2nd CCX) by offlining
        # them temporarily. vmCpus is a comma-separated list; it is split into words here.
        echo "Offlining to-be pinned CPUs to move tasks away..."
        for cpu in ${vmCpus//,/ }
        do
            setCpuState ${cpu} 0
        done

        # Move kernel threads & IRQs away from vCPU threads
        # Doing this when CPUs are offlined makes it easier as
        # nothing is running on these CPUs actively
        pinIrqs "${miscCpus}"
        pinKthreads "${miscCpus}"

        # Bring the VM CPUs back online - nothing should be scheduled on them due to host & QEMU constraints from above
        echo "Onlineing to-be pinned CPUs..."
        for cpu in ${vmCpus//,/ }
        do
            setCpuState ${cpu} 1
        done

        # Set frequency scaling of the VM CPUs to the configured governor (e.g. performance mode)
        for cpu in ${vmCpus//,/ }
        do
            setGovernor ${cpu} ${cpuGovernor}
        done

        # Stats generation causes jitter in VR
        sysctl vm.stat_interval=120

        # Migrate this VM to a separate isolation group (TLDR: see systemd-cgls)
        # An alternative hacky way to do that would be to iterate over all currently running VMs and
        # taskset their affinity to 1st CCX, but a new VM starting while this one is running will
        # break this. So, it's better to isolate the whole qemu.slice with exception of this VM. That
        # requires the VM process to be moved to a non-qemu.slice
        decoupleQemuVm

        # Pin vCPUs to correct threads - this is crucial.
        # Since SMT/HT is enabled and proper SMT is passed to the guest, the vCPUs need to be pinned
        # to correct host logical CPUs. QEMU assigns vCPUs sequentially; i.e. vCPU0 == 1st thread of
        # first core, vCPU1 == 2nd thread of first core, vCPU2 == 1st thread of second core etc.
        # In Linux (at least this one according to lscpu -e) CPU0 is a 1st thread of first core, with
        # CPU12 being the 2nd/SMT thread of first core. For the 2nd CCX it's a 6+18, 7+19, 8+20, etc
        # mapping.
        # The n-th entry of vmCpus is therefore the host CPU that vCPU n gets pinned to.
        vCpu=0
        for cpu in ${vmCpus//,/ }
        do
            pinVCpu ${vCpu} ${cpu}
            # The result is never 0 here, so this arithmetic command cannot trip errexit
            ((vCpu=vCpu+1))
        done

        # Move all QEMU threads (emulator, iothread) of this VM to miscCpus (e.g. the 1st CCX). This is pretty
        # dumb. IOThread should probably be pinned to a single core, but we're counting on host scheduler
        # being smart.
        # To do static pinning here QMP needs to be used to query types of threads:
        # https://wiki.qemu.org/Documentation/QMP
        pinNonVCpuTasks "${miscCpus}"
      ;;

    # Nothing to do before the VM is stopped
    pre-stop)
      ;;
    post-stop)
      # Only undo the host-wide isolation when no VM is left running.
      # NOTE: this looks at every *.pid file in /run/qemu-server, i.e. at any running VM,
      # not only at VMs that use this hook script.
      if ! compgen -G "/run/qemu-server/*.pid" > /dev/null; then
          echo "No other pinned VM runnig, restoring defaults"

          lastCpu=$(getLastCpu)
          # Allow kthreads, IRQs, host & QEMU to use all CPUs again
          pinKthreads "0-$lastCpu"
          pinIrqs "0-$lastCpu"
          setHostAllowedCpus "0-$lastCpu"
          setQemuAllowedCpus "0-$lastCpu"

          # Restore default scaling
          resetGovernor

          # Restore default virtual mem stats frequency
          sysctl vm.stat_interval=1
      fi

      # Reset the passed-through PCI devices so they are usable by the next VM start
      resetVmPciDevices

      # Start idle VM (if one is configured)
      if [[ ${idleVm} != "no" ]]; then
          setIdleVm start
      fi

      ;;
    # Any other phase is only reported; the script still finishes normally
    *)
      echo "Unknown run phase \"$runPhase\"!"
      ;;
esac
echo "Finished $runPhase on VM=$vmId"