Skip to content

Instantly share code, notes, and snippets.

@DecksLabs
Created May 18, 2023 15:11
Show Gist options
  • Save DecksLabs/5fc8b5b944c3142fd636940beb59e37a to your computer and use it in GitHub Desktop.
Save DecksLabs/5fc8b5b944c3142fd636940beb59e37a to your computer and use it in GitHub Desktop.
#!/bin/bash
# Abort on any unhandled error, on a failing pipeline stage and on use of an unset variable
set -o errexit -o pipefail -o nounset
###################################
# This script can be used by itself, but it's recommended that you read
# a tutorial on Proxmox forum first: https://forum.proxmox.com/threads/hey-proxmox-community-lets-talk-about-resources-isolation.124256/
###################################
# Fail with a usage hint instead of a cryptic "unbound variable" error when
# the script is started by hand without the two hook arguments
if (( $# < 2 )); then
  echo "Usage: $0 <vmId> <pre-start|post-start|pre-stop|post-stop>" >&2
  exit 2
fi
# Do not modify these variables (set by Proxmox when calling the script)
vmId="$1"
runPhase="$2"
# Defaults for the per-VM settings; parseConfig overwrites them from the VM config
idleVmId="990"
idleVm="no"
hostAllowedCpus=""
qemuAllowedCpus=""
miscCpus=""
vmCpus=""
cpuGovernor=""
echo "Running $runPhase on VM=$vmId"
#!/bin/bash
# Abort on any unhandled error, on a failing pipeline stage and on use of an unset variable
set -o errexit -o pipefail -o nounset
###################################
# This script can be used by itself, but it's recommended that you read
# a tutorial on Proxmox forum first: https://forum.proxmox.com/threads/hey-proxmox-community-lets-talk-about-resources-isolation.124256/
###################################
# Fail with a usage hint instead of a cryptic "unbound variable" error when
# the script is started by hand without the two hook arguments
if (( $# < 2 )); then
  echo "Usage: $0 <vmId> <pre-start|post-start|pre-stop|post-stop>" >&2
  exit 2
fi
# Do not modify these variables (set by Proxmox when calling the script)
vmId="$1"
runPhase="$2"
# Defaults for the per-VM settings; parseConfig overwrites them from the VM config
idleVmId="990"
idleVm="no"
hostAllowedCpus=""
qemuAllowedCpus=""
miscCpus=""
vmCpus=""
cpuGovernor=""
echo "Running $runPhase on VM=$vmId"
# vCPU pinning should be done 1:1 between guest and host, especially on systems using NUMA and/or CCDs.
# On 5900x the core config, as seen in lscpu -e, looks like the following:
# CCX #0:
# - NUMA: node 0
# - CPU: 0-5, 12-17 (SMT threads/host CPU#)
# - CORE: 0-5
# CCX #1:
# - NUMA: node 1
# - CPU: 6-11, 18-23
# - CORE: 6-11
# "lstopo" shouldn't be used here, as it has a bug when RAM is not NUMA but L3 is: https://github.com/open-mpi/hwloc/issues/430
#
# Pinning can be semi-automated with scripts taking NUMA etc. into account, but every system is different,
# so it's better to consciously tune it. Some scripts are here: https://github.com/64kramsystem/qemu-pinning#one-vcpus-per-corethread-except-one-core
# There are some unexplored ideas also at https://github.com/rokups/rokups.github.io/blob/master/pages/gaming-vm-performance.md
#
# Useful commands while debugging this code:
# List running tasks with their affinity as of now: (the "]" filters out kthreads)
# ps -T -e -o psr,pid,ppid,pgid,sid,comm,cmd | grep -P '^\s+(6|7|8|9|10|11|18|19|20|21|22|23)' | grep -v -P '\]$' | sort | cut -c-$COLUMNS
# Track cgroups resources usage: systemd-cgtop
# See tree of cgroups: systemd-cgls
# Gets QEMU parent process PID for the current VM
# Prints the PID of the main QEMU process of the current VM, read from
# /run/qemu-server/<vmId>.pid.
#
# Globals: vmId (read)
# Outputs: the PID on stdout. Diagnostics go to stderr so that callers using
#          $(getQemuPID) never capture an error message as if it were a PID.
# Returns: 0 on success, 1 when the PID file is missing, unreadable or empty
getQemuPID () {
  local pidFile="/run/qemu-server/$vmId.pid"
  local qemuParentPid=""
  if [[ -r "$pidFile" ]]; then
    # read returns non-zero when the file lacks a trailing newline but still
    # fills the variable, hence the "|| true"
    read -r qemuParentPid < "$pidFile" || true
  fi
  if [[ -z "$qemuParentPid" ]]; then
    echo "ERROR: failed to get QEMU parent PID for VM=$vmId" >&2
    return 1
  fi
  echo "$qemuParentPid"
}
# Gets the last logical CPU (thread) of the system
# Prints the index of the highest-numbered logical CPU, i.e. the count
# reported by "nproc --all" minus one.
getLastCpu () {
  local -i lastIdx=$(( $(nproc --all) - 1 ))
  printf '%d\n' "$lastIdx"
}
# Pin vCPU to a host logic CPU (thread)
# The thread SHOULD be a single one, but it can be any taskset list
#
# Since cgroups v2 (used by Proxmox) does NOT allow moving tasks/thread to
# sub-scopes, affinity has to be set per-process with taskset here.
#
# Params: vCPU# hostThread#orList
# Pins one vCPU thread of the current VM to the given host CPU list.
#
# The vCPU thread is located by its comm name "CPU <n>/KVM" among the tasks
# of the QEMU process.
#
# Globals: vmId (read)
# Params:  $1 - vCPU number
#          $2 - host thread number or any taskset CPU list
# Returns: 0 on success; 1 when the QEMU PID or the vCPU task cannot be
#          found; otherwise the exit status of taskset
pinVCpu () {
  local vCpuNum="$1"
  local hostThreadNum="$2"
  local qemuParentPid=""
  local vCpuTaskPid=""
  # Assign separately from "local" so a getQemuPID failure is not masked, and
  # refuse anything that is not a plain number before building /proc paths
  if ! qemuParentPid=$(getQemuPID) || [[ ! "$qemuParentPid" =~ ^[0-9]+$ ]]; then
    echo "ERROR: cannot pin vCPU $vCpuNum: QEMU PID for VM=$vmId is unknown" >&2
    return 1
  fi
  # -H forces the "path:" prefix even if only one file matches the glob, so
  # the 5th "/"-separated field is always the task ID. "|| true" keeps a
  # no-match from tripping errexit/pipefail; the empty result is handled below.
  vCpuTaskPid=$(grep -H -x -F "CPU $vCpuNum/KVM" "/proc/$qemuParentPid/task/"*/comm 2>/dev/null \
    | cut -d '/' -f5 || true)
  if [[ -z "$vCpuTaskPid" ]]; then
    echo "ERROR: failed to get Task PID for vCPU $vCpuNum" >&2
    return 1
  fi
  echo "Pinning VM $vmId (PPID=$qemuParentPid) vCPU $vCpuNum (TPID=$vCpuTaskPid) to host thread(s) $hostThreadNum"
  taskset --cpu-list --pid "$hostThreadNum" "$vCpuTaskPid"
}
# Pins all non-vCPU QEMU threads (io, emulator, rcu) to a host logic CPU(s)
# The thread SHOULD probably be a list, unlike pinVCpu
#
# Since cgroups v2 (used by Proxmox) does NOT allow moving tasks/thread to
# sub-scopes, affinity has to be set per-process with taskset here.
#
# Params: hostThread#orList
# Pins every task of the current VM's QEMU process whose comm name does not
# start with "CPU <digit>" (i.e. everything except the vCPU threads) to the
# given host CPU list. Pinning an individual task is best-effort.
#
# Globals: vmId (read)
# Params:  $1 - host thread number or any taskset CPU list
# Returns: 1 when the QEMU PID cannot be determined, 0 otherwise
pinNonVCpuTasks () {
  local hostThreadNum="$1"
  local qemuParentPid=""
  local nonVCpuTaskPids=""
  local tpid=""
  local taskComm=""
  # Assign separately from "local" so a getQemuPID failure is not masked
  if ! qemuParentPid=$(getQemuPID) || [[ ! "$qemuParentPid" =~ ^[0-9]+$ ]]; then
    echo "ERROR: cannot pin non-vCPU tasks: QEMU PID for VM=$vmId is unknown" >&2
    return 1
  fi
  # -H forces the "path:" prefix even if the glob matches a single file, so
  # the 5th "/"-separated field is always the task ID
  nonVCpuTaskPids=$(grep -H -v -E '^CPU [0-9]' "/proc/$qemuParentPid/task/"*/comm 2>/dev/null \
    | cut -d '/' -f5 || true)
  while IFS= read -r tpid; do
    # The here-string yields one empty line when nothing was found
    if [[ -z "$tpid" ]]; then continue; fi
    # A task may exit between listing and pinning; just skip it
    taskComm=$(cat "/proc/$qemuParentPid/task/$tpid/comm" 2>/dev/null) || continue
    echo "Pinning VM $vmId (PPID=$qemuParentPid) non-vCPU task \"$taskComm\" (TPID=$tpid) to host thread(s) $hostThreadNum"
    taskset --cpu-list --pid "$hostThreadNum" "$tpid" || true
  done <<< "$nonVCpuTaskPids"
}
# Kernel threads (so-called "kthreads") aren't grouped under any of the cgroups. Thus
# to control their affinity manual pinning is needed.
# There are hacky ways to identify kthreads like parsing "ps", but the proper way to
# do that is to actually check the thread type. All kernel threads are marked with the PF_KTHREAD
# mask (see https://elixir.bootlin.com/linux/v6.3-rc6/source/include/linux/sched.h#L1740)
#
# Params: hostThread#orList
# Tries to set the CPU affinity of every kernel thread to the given CPU list.
#
# Kernel threads are recognised by the 0x00200000 bit in the flags field of
# /proc/<pid>/stat. The comm field (2nd field, in parentheses) may itself
# contain spaces or parentheses, so the line is split after the LAST ")"
# instead of on whitespace; otherwise every later field shifts and e.g.
# tpgid (-1 for processes without a terminal) is misread as the flags value.
#
# Params:  $1 - host thread number or any taskset CPU list
# Outputs: one line per successfully pinned kthread
pinKthreads () {
  local hostThreadNum="$1"
  echo "Attempting to pin all kthreads to $hostThreadNum..."
  local statFile=""
  local statLine=""
  local pid=""
  local comm=""
  local flags=""
  local -a fields=()
  for statFile in /proc/[0-9]*/stat; do
    # This CAN sometimes fail due to TOC-TOU (process exited meanwhile)
    statLine=""
    IFS= read -r statLine 2>/dev/null < "$statFile" || true
    if [[ -z "$statLine" ]]; then continue; fi
    pid="${statLine%% *}"
    # comm is everything between the first "(" and the last ")"
    comm="${statLine#*(}"
    comm="${comm%)*}"
    # Fields after "pid (comm) ": state ppid pgrp session tty_nr tpgid flags ...
    fields=()
    read -r -a fields <<< "${statLine##*) }"
    flags="${fields[6]:-}"
    if [[ ! "$flags" =~ ^[0-9]+$ ]]; then continue; fi
    # Ignore not kthreads
    if (( (flags & 0x00200000) == 0 )); then continue; fi
    # This CAN fail for some kthreads that are needed on specific CPUs
    if taskset --cpu-list --pid "$hostThreadNum" "$pid" > /dev/null 2>&1; then
      echo "Pinned kthread \"$comm\" (PID=$pid) to host thread(s) $hostThreadNum"
    fi
  done
}
# Most IRQs can be moved away from the threads running vCPUs, that can cause jitter
# when these are rescheduled. This function is not perfect as it doesn't set a mask
# for not-yet-triggered IRQs (/proc/irq/default_smp_affinity). However, this shouldn't
# be needed as if the VM isn't started on boot most if not all busy IRQs would have
# been triggered by now.
#
# Params: hostThread#orList
# Writes the given CPU list to /proc/irq/<n>/smp_affinity_list of every IRQ.
# IRQs that reject the write are skipped silently; only successful changes
# are reported.
#
# Params: $1 - host thread number or CPU list
pinIrqs () {
  local hostThreadNum="$1"
  echo "Pinning IRQs to host thread(s) $hostThreadNum..."
  local irqAffLst=""
  local irqNum=""
  for irqAffLst in /proc/irq/*/smp_affinity_list; do
    # IRQ number is the path component right after /proc/irq/
    irqNum="${irqAffLst#/proc/irq/}"
    irqNum="${irqNum%%/*}"
    if echo "$hostThreadNum" > "$irqAffLst" 2> /dev/null; then
      echo "Pinned IRQ $irqNum to host thread(s) $hostThreadNum"
    fi
  done
}
# Set governor/scaling for a host logic CPU (thread)
# Params: hostThread# desiredGovernor
# Switches the cpufreq scaling governor of one host CPU, doing nothing when
# the requested governor is already active.
#
# Params:  $1 - host thread number
#          $2 - desired governor name
# Returns: 1 when the current governor cannot be read
setGovernor () {
  local hostCpu=$1
  local reqGov="$2"
  local governorFile="/sys/devices/system/cpu/cpu$hostCpu/cpufreq/scaling_governor"
  local curGov
  # A failing read just leaves the value empty; it is reported below
  curGov=$(cat "$governorFile") || true
  case "$curGov" in
    "")
      echo "ERROR: failed to query governor for CPU $hostCpu"
      return 1
      ;;
    "$reqGov")
      echo "CPU $hostCpu: requested governor $reqGov - it is already set"
      return
      ;;
  esac
  echo "CPU $hostCpu: changing governor from $curGov to $reqGov"
  echo "$reqGov" > "$governorFile"
}
# Sets governor/scaling on a range of host CPUs (threads). Range is inclusive.
# Params: hostThreadFrom# hostThreadTo# desiredGovernor
# Applies setGovernor to every host CPU in an inclusive numeric range.
#
# Params: $1 - first host thread number
#         $2 - last host thread number
#         $3 - desired governor name
setGovernorRange () {
  local rangeStart="$1"
  local rangeEnd="$2"
  local governor="$3"
  # Loop counter is local so it cannot clobber a caller's variable
  local cpu
  for (( cpu = rangeStart; cpu <= rangeEnd; cpu++ )); do
    setGovernor "$cpu" "$governor"
  done
}
# Resets governor/scaling to default state
# Restores the default governor configuration by restarting the
# "cpufrequtils" service.
resetGovernor () {
  printf '%s\n' "Resetting CPU governor to default"
  service cpufrequtils restart
}
# Put host CPU (thread) into offline or online state
# Params: hostThread# desiredState{0,1}
# Puts one host CPU online or offline through its sysfs "online" file, doing
# nothing when it is already in the requested state.
#
# Params:  $1 - host thread number
#          $2 - desired state: 0 (offline) or 1 (online)
# Returns: 0 on success or when nothing had to be done; 1 when the CPU does
#          not exist, its state cannot be read or the state change fails
setCpuState () {
  local hostCpu="$1"
  local reqState="$2"
  local cpuDir="/sys/devices/system/cpu/cpu$hostCpu"
  local onlineFile="$cpuDir/online"
  local curState=""
  if [[ ! -d "$cpuDir" ]]; then
    echo "ERROR: CPU $hostCpu does not exist" >&2
    return 1
  fi
  # A CPU without an "online" file cannot be hot-plugged (commonly the boot
  # CPU, cpu0). Treat it as permanently online and skip it rather than fail:
  # a failure here aborts callers running under errexit, which can leave
  # other, already offlined, CPUs offline.
  if [[ ! -e "$onlineFile" ]]; then
    if [[ "$reqState" == "1" ]]; then
      echo "CPU $hostCpu: requested state $reqState - it is always online (no hotplug control)"
    else
      echo "WARNING: CPU $hostCpu has no hotplug control and cannot be taken offline, skipping" >&2
    fi
    return 0
  fi
  read -r curState < "$onlineFile" || true
  if [[ -z "$curState" ]]; then
    echo "ERROR: failed to query online status for CPU $hostCpu" >&2
    return 1
  fi
  if [[ "$reqState" == "$curState" ]]; then
    echo "CPU $hostCpu: requested state $reqState - it is already set"
    return 0
  fi
  echo -n "CPU $hostCpu: changing state from $curState to $reqState... "
  # Test the write directly: checking $? on a separate line is unreachable
  # under errexit because the failing write would already have exited
  if echo "$reqState" > "$onlineFile"; then
    echo "[OK]"
  else
    echo "[FAILED]"
    return 1
  fi
}
# Put host CPU (thread) range into offline or online state. Range is inclusive.
# Params: hostThreadFrom# hostThreadTo# desiredState{0,1}
# Applies setCpuState to every host CPU in an inclusive numeric range.
#
# Params: $1 - first host thread number
#         $2 - last host thread number
#         $3 - desired state: 0 (offline) or 1 (online)
setCpuStateRange () {
  local rangeStart="$1"
  local rangeEnd="$2"
  local state="$3"
  # Loop counter is local so it cannot clobber a caller's variable
  local cpu
  for (( cpu = rangeStart; cpu <= rangeEnd; cpu++ )); do
    setCpuState "$cpu" "$state"
  done
}
# Flushes dirty data to disk, then asks the kernel to drop its caches
# (drop_caches=3) and to compact memory (compact_memory=1).
tidyCaches () {
  printf '%s' "Tidying caches... "
  sync
  printf '%s\n' 3 > /proc/sys/vm/drop_caches
  printf '%s\n' 1 > /proc/sys/vm/compact_memory
  printf '%s\n' "[OK]"
}
# Sets cgroup slice or scope cpu isolation
# Params: sliceOrScopeName hostThreadsList (e.g. 11,12,13-19)
# Sets the runtime-only AllowedCPUs property of a systemd slice or scope.
#
# Params: $1 - slice or scope name
#         $2 - host threads list (e.g. 11,12,13-19)
setCgroupAllowedCpus () {
  local unitName="$1"
  local cpuList="$2"
  printf 'Forcing "%s" cgroup to only use CPU(s) %s\n' "$unitName" "$cpuList"
  local -a propArgs=(set-property --runtime -- "$unitName" "AllowedCPUs=$cpuList")
  systemctl "${propArgs[@]}"
}
# Sets logical CPUs (threads) which can be used by processes on the host
# Params: hostThreadsList (e.g. 11,12,13-19)
# Restricts the host's init.scope, system.slice and user.slice cgroups to
# the given CPU list.
#
# Params: $1 - host threads list (e.g. 11,12,13-19)
setHostAllowedCpus () {
  echo "Setting host userland CPU constrain to $1"
  local hostUnit
  for hostUnit in "init.scope" "system.slice" "user.slice"; do
    setCgroupAllowedCpus "$hostUnit" "$1"
  done
}
# Sets logical CPUs (threads) which can be QEMU processes
# Params: hostThreadsList (e.g. 11,12,13-19)
# Restricts the qemu.slice cgroup to the given CPU list.
#
# Params: $1 - host threads list (e.g. 11,12,13-19)
setQemuAllowedCpus () {
  local cpuList="$1"
  printf 'Setting QEMU CPU default constrain to %s\n' "$cpuList"
  setCgroupAllowedCpus "qemu.slice" "$cpuList"
}
# Makes sure that a decoupled slice for some QEMU VMs exists
# This will only do something the first time a VM starts
# Params: <none>
# Creates the /sys/fs/cgroup/qemu-decoupled.slice cgroup if it is missing and
# allows it to use every CPU (0 .. last CPU). Does nothing when the cgroup
# directory already exists.
#
# Params: <none>
ensureQemuDecoupledSlice () {
  local sliceDir="/sys/fs/cgroup/qemu-decoupled.slice"
  [[ ! -d "$sliceDir" ]] || return 0
  echo "Creating decoupled QEMU cgroup"
  mkdir "$sliceDir"
  # The slice itself MUST be allowed to run on ALL CPUs: vCPUs are moved to an
  # isolated set of cores while emulator and iothread(s) go to the shared
  # CPUs. cgroups v2 doesn't allow a thread/task to be in a different cgroup
  # than its parent, so all of them stay in this slice and only differ in
  # affinity.
  setCgroupAllowedCpus "qemu-decoupled.slice" "0-$(getLastCpu)"
}
# Moves the VM to an isolated cgroup, outside of the OS user/system/init groups, as well
# as away from the standard qemu.slice used by Proxmox; see systemd-cgls
#
# All processes from host run under system.slice and user.slice, while all QEMU machines run
# under qemu.slice. Proxmox actually hardcodes that slice in their startup code:
# https://github.com/proxmox/qemu-server/blob/79f5ca393ab3608ff2e82c929167f079f964a505/PVE/QemuServer.pm#L5892-L5893
# This means that setting "setQemuAllowedCpus" to 1st CCX makes it impossible to pin vCPU
# threads to the 2nd CCX (taskset will fail), as the parent slice where the thread/service is
# running will enforce 1st CCX only AllowedCPUs. The only way around this I found is to migrate
# the VM scope (each one gets a separate one named <VMID>.scope) to a different scope which isn't
# under any of the standard slices. However, this is not supported by systemd, as confirmed by one
# of the systemd authors: https://www.spinics.net/lists/systemd-devel/msg04072.html but cgroups can
# be used directly (albeit without warranties).
#
# Params: <none>
# Moves the QEMU process of the current VM into its own cgroup
# /sys/fs/cgroup/qemu-decoupled.slice/<vmId>.scope, creating the slice and
# the scope directory when needed.
#
# Globals: vmId (read)
# Params:  <none>
# Returns: 1 when the QEMU PID cannot be determined (nothing is created or
#          written in that case); otherwise the status of the final write
decoupleQemuVm () {
  local qemuParentPid=""
  # Resolve the PID first and separately from "local": a masked failure would
  # otherwise end up writing an error message into cgroup.procs
  if ! qemuParentPid=$(getQemuPID) || [[ ! "$qemuParentPid" =~ ^[0-9]+$ ]]; then
    echo "ERROR: cannot decouple VMID=$vmId: QEMU PID is unknown" >&2
    return 1
  fi
  ensureQemuDecoupledSlice
  local vmScope="/sys/fs/cgroup/qemu-decoupled.slice/$vmId.scope"
  if [[ ! -d "$vmScope" ]]; then
    echo "Creating cgroups scope for VMID=$vmId at $vmScope"
    mkdir "$vmScope"
  fi
  echo "Migrating VMID=$vmId PPID=$qemuParentPid to scope $vmScope"
  echo "$qemuParentPid" > "$vmScope/cgroup.procs"
}
# Starts/stops the "idle" windows VM to force very low GPU power states
# Runs the given "qm" sub-command (e.g. start, shutdown) on the idle VM.
#
# Globals: idleVmId (read)
# Params:  $1 - qm action to perform
setIdleVm () {
  local action="$1"
  printf 'Setting idle VM to %s\n' "$action"
  qm "$action" "$idleVmId"
}
# Since updates around 2023/03/20-22 GPUs and some other PCIe devices will only work once.
# When VM is turned off and on it will just black-screen and the VM never boots. This is a
# workaround for that issue.
#
# Params: <none>
# Removes every PCI device listed in the VM's "hostpciN:" config lines from
# the PCI bus and then triggers a bus rescan.
#
# Addresses with a function suffix (e.g. 0000:07:00.1) remove only that
# function; addresses without one remove all functions of the device, one at
# a time. Addresses written without a PCI domain (e.g. 07:00) are assumed to
# be in domain 0000.
#
# Globals: vmId (read)
# Params:  <none>
resetVmPciDevices () {
  echo "Resetting VM PCI devices..."
  local confFile="/etc/pve/qemu-server/$vmId.conf"
  local vmPciDevices=""
  local pciAddr=""
  local pciAddrFun=""
  local pciAddrFunRm=""
  local waited=0
  # Upper bound (seconds) for one function to disappear, so a removal that
  # never completes cannot hang the hook forever
  local -r maxWait=30
  # Drop the "hostpciN:" key, then pull out the addresses. "|| true": a VM
  # without passthrough devices is not an error.
  vmPciDevices=$(sed -n -E 's/^hostpci[0-9]+:[[:space:]]*//p' "$confFile" \
    | grep -o -E '([0-9a-f]+:)?[0-9a-f]+:[0-9a-f]+(\.[0-9]*)?' || true)
  while IFS= read -r pciAddr; do
    # The here-string yields one empty line when there are no devices
    if [[ -z "$pciAddr" ]]; then continue; fi
    # sysfs names always carry the domain; add the default one if omitted
    if [[ "$pciAddr" != *:*:* ]]; then
      pciAddr="0000:$pciAddr"
    fi
    # Single function (mostly SR-IOV or vGPU) device
    if [[ "$pciAddr" == *.* ]]; then
      echo "Removing PCI device function at $pciAddr"
      echo 1 > "/sys/bus/pci/devices/$pciAddr/remove" || true
      continue
    fi
    # Whole device specified => remove all functions
    for pciAddrFunRm in "/sys/bus/pci/devices/$pciAddr".*/remove; do
      # Skips an unmatched glob (left as a literal pattern) and functions that
      # already vanished because removing a sibling took them along
      if [[ ! -e "$pciAddrFunRm" ]]; then continue; fi
      pciAddrFun="${pciAddrFunRm%/remove}"
      pciAddrFun=".${pciAddrFun##*.}"
      echo "Removing PCI device $pciAddr function $pciAddrFun"
      if ! echo 1 > "$pciAddrFunRm"; then
        echo "WARNING: failed to remove PCI device $pciAddr function $pciAddrFun" >&2
        continue
      fi
      # This is absolutely required. Attempting to remove one function CAN
      # remove all of them but it's not instantaneous. However, if you hit
      # such a case and try to manually do /remove on another function while
      # the first is being removed a "general protection fault" will happen
      # in the subsequent "pci_stop_and_remove_bus_device_locked()"
      waited=0
      while [[ -f "$pciAddrFunRm" ]]; do
        if (( waited >= maxWait )); then
          echo "WARNING: $pciAddrFunRm still present after ${maxWait}s, skipping the rest of $pciAddr" >&2
          # Leave the remaining functions alone: removing them while this one
          # is still going away is exactly the dangerous case described above
          break 2
        fi
        sleep 1
        waited=$(( waited + 1 ))
        echo "Still waiting for $pciAddrFunRm..."
      done
    done
  done <<< "$vmPciDevices"
  echo "Re-scanning PCI devices..."
  echo 1 > /sys/bus/pci/rescan
  # rescan is asynchronous; if we wanted to be 100% correct here we should wait
  # for /sys entries to appear, but 2 seconds delay is good enough
  sleep 2
}
# Reads the script's per-VM settings from the VM config file into the global
# variables idleVm, idleVmId, miscCpus, vmCpus and cpuGovernor.
#
# A setting is any line containing "<name>=" (typically kept in the VM notes,
# e.g. "#vmCpus=6,18,7,19"); its value is the text after the last "=" on that
# line. Only the first matching line of each setting is used.
#
# Globals: vmId (read); idleVm, idleVmId, miscCpus, vmCpus, cpuGovernor (written)
# Params:  $1 - optional path of the config file to parse
#               (default: /etc/pve/qemu-server/<vmId>.conf)
# Returns: 1 when the file is unreadable or when miscCpus, vmCpus or
#          cpuGovernor is missing; a missing idleVm defaults to "no"
parseConfig () {
  local confFile="${1:-/etc/pve/qemu-server/${vmId}.conf}"
  local key=""
  local value=""
  echo "Parsing config"
  if [[ ! -r "$confFile" ]]; then
    echo "ERROR: cannot read VM config $confFile" >&2
    return 1
  fi
  for key in idleVm miscCpus vmCpus cpuGovernor; do
    # sed never fails on "no match", unlike grep, which under pipefail +
    # errexit would silently kill the whole script. "q" stops at the first hit.
    value=$(sed -n "/${key}=/{s/.*=//p;q;}" "$confFile")
    value="${value%$'\r'}"
    if [[ -z "$value" ]]; then
      if [[ "$key" == "idleVm" ]]; then
        value="no"
      else
        echo "ERROR: required setting \"${key}=\" not found in $confFile" >&2
        return 1
      fi
    fi
    # Assign to the global variable named by $key
    printf -v "$key" '%s' "$value"
    if [[ "$key" == "idleVm" ]]; then
      idleVmId="$value"
    fi
    echo "${key}=${value}"
  done
  echo "Finished parsing config"
}
# Load the per-VM settings, then run the actions of the hook phase that
# Proxmox passed in $runPhase.
parseConfig
case "$runPhase" in
  pre-start)
    # Stop idle VM, drop caches & compact memory for hugepages
    if [[ "${idleVm}" != "no" ]]; then
      setIdleVm shutdown
    fi
    tidyCaches
    resetVmPciDevices
    ;;
  # Designate 2nd CCD (core 6-11, thread 6-11+18-23) to the VM and 1st CCD to host/housekeeping stuff
  # All modifications should be done in post-start as doing them in pre-start will execute them even
  # if the VM fails to start (and thus post-stop will never be called)
  post-start)
    # This will inform cgroups via systemd to not use 2nd CCX, effectively constraining host to 1st CCX.
    # This isn't perfect as it will not stop kthreads. "cset" used to mostly work for kthreads (except like docker &
    # ZFS), but it doesn't work with cgroups v2: https://forum.proxmox.com/threads/cset-failing-pve7.95613/
    # I have no idea about any alternatives besides CPU hotplug hack (see below)
    # WARNING: THIS MUST BE DONE BEFORE ANY OTHER PINNING. Manipulating slice/scope CPU lists will reset
    # any manual pinning due to a systemd bug/design choice: https://github.com/systemd/systemd/issues/23748
    # The "setQemuAllowedCpus" will be overwritten for just this VM by "decoupleQemuVm" later.
    setHostAllowedCpus "${miscCpus}"
    setQemuAllowedCpus "${miscCpus}"
    # Forcefully move all tasks (user space & kthreads) off the 2nd CCX by offlining them temporarily.
    # Offlining is only a trick to evacuate the CPUs, so a CPU that cannot be offlined is skipped with a
    # warning: aborting here (errexit) would leave the CPUs offlined so far offline until reboot.
    # NOTE: ${vmCpus//,/ } is intentionally unquoted - the comma-separated list is split into words.
    echo "Offlining to-be pinned CPUs to move tasks away..."
    for cpu in ${vmCpus//,/ }; do
      if ! setCpuState "${cpu}" 0; then
        echo "WARNING: could not offline CPU ${cpu}; tasks may still be running on it" >&2
      fi
    done
    # Move kernel threads & IRQs away from vCPU threads
    # Doing this when CPUs are offlined makes it easier as
    # nothing is running on these CPUs actively
    pinIrqs "${miscCpus}"
    pinKthreads "${miscCpus}"
    # Bring second CCX online - nothing should be scheduled on it due to host & QEMU constraints from above.
    # Every CPU is attempted before giving up, so one failure cannot leave the remaining CPUs offline.
    echo "Onlineing to-be pinned CPUs..."
    onlineFailed=0
    for cpu in ${vmCpus//,/ }; do
      setCpuState "${cpu}" 1 || onlineFailed=1
    done
    if (( onlineFailed )); then
      echo "ERROR: some CPUs could not be brought back online" >&2
      exit 1
    fi
    # Set frequency scaling to the configured governor
    for cpu in ${vmCpus//,/ }; do
      setGovernor "${cpu}" "${cpuGovernor}"
    done
    # Stats generation causes jitter in VR
    sysctl vm.stat_interval=120
    # Migrate this VM to a separate isolation group (TLDR: see systemd-cgls)
    # An alternative hacky way to do that would be to iterate over all currently running VMs and
    # taskset their affinity to 1st CCX, but a new VM starting while this one is running will
    # break this. So, it's better to isolate the whole qemu.slice with exception of this VM. That
    # requires the VM process to be moved to a non-qemu.slice
    decoupleQemuVm
    # Pin vCPUs to correct threads - this is crucial.
    # Since SMT/HT is enabled and proper SMT is passed to the guest, the vCPUs need to be pinned
    # to correct host logical CPUs. QEMU assigns vCPUs sequentially; i.e. vCPU0 == 1st thread of
    # first core, vCPU1 == 2nd thread of first core, vCPU2 == 1st thread of second core etc.
    # In Linux (at least this one according to lscpu -e) CPU0 is a 1st thread of first core, with
    # CPU12 being the 2nd/SMT thread of first core. For the 2nd CCX it's a 6+18, 7+19, 8+20, etc
    # mapping. The n-th entry of vmCpus is the host thread for vCPU n.
    vCpu=0
    for cpu in ${vmCpus//,/ }; do
      pinVCpu "${vCpu}" "${cpu}"
      vCpu=$(( vCpu + 1 ))
    done
    # Move all QEMU threads (emulator, iothread) of this VM to 1st CCX. This is pretty dumb. IOThread should
    # probably be pinned to a single core, but we're counting on host scheduler being smart.
    # To do static pinning here QMP needs to be used to query types of threads:
    # https://wiki.qemu.org/Documentation/QMP
    pinNonVCpuTasks "${miscCpus}"
    ;;
  pre-stop)
    ;;
  post-stop)
    # Only undo the host-wide isolation when no QEMU PID file is left,
    # i.e. no other VM is still running
    if ! compgen -G "/run/qemu-server/*.pid" > /dev/null; then
      echo "No other pinned VM runnig, restoring defaults"
      lastCpu=$(getLastCpu)
      # Allow kthreads, IRQs, host & QEMU to use all CPUs again
      pinKthreads "0-$lastCpu"
      pinIrqs "0-$lastCpu"
      setHostAllowedCpus "0-$lastCpu"
      setQemuAllowedCpus "0-$lastCpu"
      # Restore default scaling
      resetGovernor
      # Restore default virtual mem stats frequency
      sysctl vm.stat_interval=1
    fi
    # Start idle VM
    resetVmPciDevices
    if [[ "${idleVm}" != "no" ]]; then
      setIdleVm start
    fi
    ;;
  *)
    echo "Unknown run phase \"$runPhase\"!"
    ;;
esac
echo "Finished $runPhase on VM=$vmId"
# vCPU pinning should be done 1:1 between guest and host, especially on systems using NUMA and/or CCDs.
# On 5900x the core config, as seen in lscpu -e, looks like the following:
# CCX #0:
# - NUMA: node 0
# - CPU: 0-5, 12-17 (SMT threads/host CPU#)
# - CORE: 0-5
# CCX #1:
# - NUMA: node 1
# - CPU: 6-11, 18-23
# - CORE: 6-11
# "lstopo" shouldn't be used here, as it has a bug when RAM is not NUMA but L3 is: https://github.com/open-mpi/hwloc/issues/430
#
# Pinning can be semi-automated with scripts taking NUMA etc. into account, but every system is different,
# so it's better to consciously tune it. Some scripts are here: https://github.com/64kramsystem/qemu-pinning#one-vcpus-per-corethread-except-one-core
# There are some unexplored ideas also at https://github.com/rokups/rokups.github.io/blob/master/pages/gaming-vm-performance.md
#
# Useful commands while debugging this code:
# List running tasks with their affinity as of now: (the "]" filters out kthreads)
# ps -T -e -o psr,pid,ppid,pgid,sid,comm,cmd | grep -P '^\s+(6|7|8|9|10|11|18|19|20|21|22|23)' | grep -v -P '\]$' | sort | cut -c-$COLUMNS
# Track cgroups resources usage: systemd-cgtop
# See tree of cgroups: systemd-cgls
# Gets QEMU parent process PID for the current VM
# Prints the PID of the main QEMU process of the current VM, read from
# /run/qemu-server/<vmId>.pid.
#
# Globals: vmId (read)
# Outputs: the PID on stdout. Diagnostics go to stderr so that callers using
#          $(getQemuPID) never capture an error message as if it were a PID.
# Returns: 0 on success, 1 when the PID file is missing, unreadable or empty
getQemuPID () {
  local pidFile="/run/qemu-server/$vmId.pid"
  local qemuParentPid=""
  if [[ -r "$pidFile" ]]; then
    # read returns non-zero when the file lacks a trailing newline but still
    # fills the variable, hence the "|| true"
    read -r qemuParentPid < "$pidFile" || true
  fi
  if [[ -z "$qemuParentPid" ]]; then
    echo "ERROR: failed to get QEMU parent PID for VM=$vmId" >&2
    return 1
  fi
  echo "$qemuParentPid"
}
# Gets the last logical CPU (thread) of the system
# Prints the index of the highest-numbered logical CPU, i.e. the count
# reported by "nproc --all" minus one.
getLastCpu () {
  local -i lastIdx=$(( $(nproc --all) - 1 ))
  printf '%d\n' "$lastIdx"
}
# Pin vCPU to a host logic CPU (thread)
# The thread SHOULD be a single one, but it can be any taskset list
#
# Since cgroups v2 (used by Proxmox) does NOT allow moving tasks/thread to
# sub-scopes, affinity has to be set per-process with taskset here.
#
# Params: vCPU# hostThread#orList
# Pins one vCPU thread of the current VM to the given host CPU list.
#
# The vCPU thread is located by its comm name "CPU <n>/KVM" among the tasks
# of the QEMU process.
#
# Globals: vmId (read)
# Params:  $1 - vCPU number
#          $2 - host thread number or any taskset CPU list
# Returns: 0 on success; 1 when the QEMU PID or the vCPU task cannot be
#          found; otherwise the exit status of taskset
pinVCpu () {
  local vCpuNum="$1"
  local hostThreadNum="$2"
  local qemuParentPid=""
  local vCpuTaskPid=""
  # Assign separately from "local" so a getQemuPID failure is not masked, and
  # refuse anything that is not a plain number before building /proc paths
  if ! qemuParentPid=$(getQemuPID) || [[ ! "$qemuParentPid" =~ ^[0-9]+$ ]]; then
    echo "ERROR: cannot pin vCPU $vCpuNum: QEMU PID for VM=$vmId is unknown" >&2
    return 1
  fi
  # -H forces the "path:" prefix even if only one file matches the glob, so
  # the 5th "/"-separated field is always the task ID. "|| true" keeps a
  # no-match from tripping errexit/pipefail; the empty result is handled below.
  vCpuTaskPid=$(grep -H -x -F "CPU $vCpuNum/KVM" "/proc/$qemuParentPid/task/"*/comm 2>/dev/null \
    | cut -d '/' -f5 || true)
  if [[ -z "$vCpuTaskPid" ]]; then
    echo "ERROR: failed to get Task PID for vCPU $vCpuNum" >&2
    return 1
  fi
  echo "Pinning VM $vmId (PPID=$qemuParentPid) vCPU $vCpuNum (TPID=$vCpuTaskPid) to host thread(s) $hostThreadNum"
  taskset --cpu-list --pid "$hostThreadNum" "$vCpuTaskPid"
}
# Pins all non-vCPU QEMU threads (io, emulator, rcu) to a host logic CPU(s)
# The thread SHOULD probably be a list, unlike pinVCpu
#
# Since cgroups v2 (used by Proxmox) does NOT allow moving tasks/thread to
# sub-scopes, affinity has to be set per-process with taskset here.
#
# Params: hostThread#orList
# Pins every task of the current VM's QEMU process whose comm name does not
# start with "CPU <digit>" (i.e. everything except the vCPU threads) to the
# given host CPU list. Pinning an individual task is best-effort.
#
# Globals: vmId (read)
# Params:  $1 - host thread number or any taskset CPU list
# Returns: 1 when the QEMU PID cannot be determined, 0 otherwise
pinNonVCpuTasks () {
  local hostThreadNum="$1"
  local qemuParentPid=""
  local nonVCpuTaskPids=""
  local tpid=""
  local taskComm=""
  # Assign separately from "local" so a getQemuPID failure is not masked
  if ! qemuParentPid=$(getQemuPID) || [[ ! "$qemuParentPid" =~ ^[0-9]+$ ]]; then
    echo "ERROR: cannot pin non-vCPU tasks: QEMU PID for VM=$vmId is unknown" >&2
    return 1
  fi
  # -H forces the "path:" prefix even if the glob matches a single file, so
  # the 5th "/"-separated field is always the task ID
  nonVCpuTaskPids=$(grep -H -v -E '^CPU [0-9]' "/proc/$qemuParentPid/task/"*/comm 2>/dev/null \
    | cut -d '/' -f5 || true)
  while IFS= read -r tpid; do
    # The here-string yields one empty line when nothing was found
    if [[ -z "$tpid" ]]; then continue; fi
    # A task may exit between listing and pinning; just skip it
    taskComm=$(cat "/proc/$qemuParentPid/task/$tpid/comm" 2>/dev/null) || continue
    echo "Pinning VM $vmId (PPID=$qemuParentPid) non-vCPU task \"$taskComm\" (TPID=$tpid) to host thread(s) $hostThreadNum"
    taskset --cpu-list --pid "$hostThreadNum" "$tpid" || true
  done <<< "$nonVCpuTaskPids"
}
# Kernel threads (so-called "kthreads") aren't grouped under any of the cgroups. Thus
# to control their affinity manual pinning is needed.
# There are hacky ways to identify kthreads like parsing "ps", but the proper way to
# do that is to actually check the thread type. All kernel threads are marked with the PF_KTHREAD
# mask (see https://elixir.bootlin.com/linux/v6.3-rc6/source/include/linux/sched.h#L1740)
#
# Params: hostThread#orList
# Tries to set the CPU affinity of every kernel thread to the given CPU list.
#
# Kernel threads are recognised by the 0x00200000 bit in the flags field of
# /proc/<pid>/stat. The comm field (2nd field, in parentheses) may itself
# contain spaces or parentheses, so the line is split after the LAST ")"
# instead of on whitespace; otherwise every later field shifts and e.g.
# tpgid (-1 for processes without a terminal) is misread as the flags value.
#
# Params:  $1 - host thread number or any taskset CPU list
# Outputs: one line per successfully pinned kthread
pinKthreads () {
  local hostThreadNum="$1"
  echo "Attempting to pin all kthreads to $hostThreadNum..."
  local statFile=""
  local statLine=""
  local pid=""
  local comm=""
  local flags=""
  local -a fields=()
  for statFile in /proc/[0-9]*/stat; do
    # This CAN sometimes fail due to TOC-TOU (process exited meanwhile)
    statLine=""
    IFS= read -r statLine 2>/dev/null < "$statFile" || true
    if [[ -z "$statLine" ]]; then continue; fi
    pid="${statLine%% *}"
    # comm is everything between the first "(" and the last ")"
    comm="${statLine#*(}"
    comm="${comm%)*}"
    # Fields after "pid (comm) ": state ppid pgrp session tty_nr tpgid flags ...
    fields=()
    read -r -a fields <<< "${statLine##*) }"
    flags="${fields[6]:-}"
    if [[ ! "$flags" =~ ^[0-9]+$ ]]; then continue; fi
    # Ignore not kthreads
    if (( (flags & 0x00200000) == 0 )); then continue; fi
    # This CAN fail for some kthreads that are needed on specific CPUs
    if taskset --cpu-list --pid "$hostThreadNum" "$pid" > /dev/null 2>&1; then
      echo "Pinned kthread \"$comm\" (PID=$pid) to host thread(s) $hostThreadNum"
    fi
  done
}
# Most IRQs can be moved away from the threads running vCPUs, that can cause jitter
# when these are rescheduled. This function is not perfect as it doesn't set a mask
# for not-yet-triggered IRQs (/proc/irq/default_smp_affinity). However, this shouldn't
# be needed as if the VM isn't started on boot most if not all busy IRQs would have
# been triggered by now.
#
# Params: hostThread#orList
# Writes the given CPU list to /proc/irq/<n>/smp_affinity_list of every IRQ.
# IRQs that reject the write are skipped silently; only successful changes
# are reported.
#
# Params: $1 - host thread number or CPU list
pinIrqs () {
  local hostThreadNum="$1"
  echo "Pinning IRQs to host thread(s) $hostThreadNum..."
  local irqAffLst=""
  local irqNum=""
  for irqAffLst in /proc/irq/*/smp_affinity_list; do
    # IRQ number is the path component right after /proc/irq/
    irqNum="${irqAffLst#/proc/irq/}"
    irqNum="${irqNum%%/*}"
    if echo "$hostThreadNum" > "$irqAffLst" 2> /dev/null; then
      echo "Pinned IRQ $irqNum to host thread(s) $hostThreadNum"
    fi
  done
}
# Set governor/scaling for a host logic CPU (thread)
# Params: hostThread# desiredGovernor
# Switches the cpufreq scaling governor of one host CPU, doing nothing when
# the requested governor is already active.
#
# Params:  $1 - host thread number
#          $2 - desired governor name
# Returns: 1 when the current governor cannot be read
setGovernor () {
  local hostCpu=$1
  local reqGov="$2"
  local governorFile="/sys/devices/system/cpu/cpu$hostCpu/cpufreq/scaling_governor"
  local curGov
  # A failing read just leaves the value empty; it is reported below
  curGov=$(cat "$governorFile") || true
  case "$curGov" in
    "")
      echo "ERROR: failed to query governor for CPU $hostCpu"
      return 1
      ;;
    "$reqGov")
      echo "CPU $hostCpu: requested governor $reqGov - it is already set"
      return
      ;;
  esac
  echo "CPU $hostCpu: changing governor from $curGov to $reqGov"
  echo "$reqGov" > "$governorFile"
}
# Sets governor/scaling on a range of host CPUs (threads). Range is inclusive.
# Params: hostThreadFrom# hostThreadTo# desiredGovernor
# Applies setGovernor to every host CPU in an inclusive numeric range.
#
# Params: $1 - first host thread number
#         $2 - last host thread number
#         $3 - desired governor name
setGovernorRange () {
  local rangeStart="$1"
  local rangeEnd="$2"
  local governor="$3"
  # Loop counter is local so it cannot clobber a caller's variable
  local cpu
  for (( cpu = rangeStart; cpu <= rangeEnd; cpu++ )); do
    setGovernor "$cpu" "$governor"
  done
}
# Resets governor/scaling to default state
# Restores the default governor configuration by restarting the
# "cpufrequtils" service.
resetGovernor () {
  printf '%s\n' "Resetting CPU governor to default"
  service cpufrequtils restart
}
# Put host CPU (thread) into offline or online state
# Params: hostThread# desiredState{0,1}
# Puts one host CPU online or offline through its sysfs "online" file, doing
# nothing when it is already in the requested state.
#
# Params:  $1 - host thread number
#          $2 - desired state: 0 (offline) or 1 (online)
# Returns: 0 on success or when nothing had to be done; 1 when the CPU does
#          not exist, its state cannot be read or the state change fails
setCpuState () {
  local hostCpu="$1"
  local reqState="$2"
  local cpuDir="/sys/devices/system/cpu/cpu$hostCpu"
  local onlineFile="$cpuDir/online"
  local curState=""
  if [[ ! -d "$cpuDir" ]]; then
    echo "ERROR: CPU $hostCpu does not exist" >&2
    return 1
  fi
  # A CPU without an "online" file cannot be hot-plugged (commonly the boot
  # CPU, cpu0). Treat it as permanently online and skip it rather than fail:
  # a failure here aborts callers running under errexit, which can leave
  # other, already offlined, CPUs offline.
  if [[ ! -e "$onlineFile" ]]; then
    if [[ "$reqState" == "1" ]]; then
      echo "CPU $hostCpu: requested state $reqState - it is always online (no hotplug control)"
    else
      echo "WARNING: CPU $hostCpu has no hotplug control and cannot be taken offline, skipping" >&2
    fi
    return 0
  fi
  read -r curState < "$onlineFile" || true
  if [[ -z "$curState" ]]; then
    echo "ERROR: failed to query online status for CPU $hostCpu" >&2
    return 1
  fi
  if [[ "$reqState" == "$curState" ]]; then
    echo "CPU $hostCpu: requested state $reqState - it is already set"
    return 0
  fi
  echo -n "CPU $hostCpu: changing state from $curState to $reqState... "
  # Test the write directly: checking $? on a separate line is unreachable
  # under errexit because the failing write would already have exited
  if echo "$reqState" > "$onlineFile"; then
    echo "[OK]"
  else
    echo "[FAILED]"
    return 1
  fi
}
# Put host CPU (thread) range into offline or online state. Range is inclusive.
# Params: hostThreadFrom# hostThreadTo# desiredState{0,1}
# Applies setCpuState to every host CPU in an inclusive numeric range.
#
# Params: $1 - first host thread number
#         $2 - last host thread number
#         $3 - desired state: 0 (offline) or 1 (online)
setCpuStateRange () {
  local rangeStart="$1"
  local rangeEnd="$2"
  local state="$3"
  # Loop counter is local so it cannot clobber a caller's variable
  local cpu
  for (( cpu = rangeStart; cpu <= rangeEnd; cpu++ )); do
    setCpuState "$cpu" "$state"
  done
}
# Flushes dirty data to disk, then asks the kernel to drop its caches
# (drop_caches=3) and to compact memory (compact_memory=1).
tidyCaches () {
  printf '%s' "Tidying caches... "
  sync
  printf '%s\n' 3 > /proc/sys/vm/drop_caches
  printf '%s\n' 1 > /proc/sys/vm/compact_memory
  printf '%s\n' "[OK]"
}
# Sets cgroup slice or scope cpu isolation
# Params: sliceOrScopeName hostThreadsList (e.g. 11,12,13-19)
# Sets the runtime-only AllowedCPUs property of a systemd slice or scope.
#
# Params: $1 - slice or scope name
#         $2 - host threads list (e.g. 11,12,13-19)
setCgroupAllowedCpus () {
  local unitName="$1"
  local cpuList="$2"
  printf 'Forcing "%s" cgroup to only use CPU(s) %s\n' "$unitName" "$cpuList"
  local -a propArgs=(set-property --runtime -- "$unitName" "AllowedCPUs=$cpuList")
  systemctl "${propArgs[@]}"
}
# Sets logical CPUs (threads) which can be used by processes on the host
# Params: hostThreadsList (e.g. 11,12,13-19)
# Restricts the host's init.scope, system.slice and user.slice cgroups to
# the given CPU list.
#
# Params: $1 - host threads list (e.g. 11,12,13-19)
setHostAllowedCpus () {
  echo "Setting host userland CPU constrain to $1"
  local hostUnit
  for hostUnit in "init.scope" "system.slice" "user.slice"; do
    setCgroupAllowedCpus "$hostUnit" "$1"
  done
}
# Sets logical CPUs (threads) which can be QEMU processes
# Params: hostThreadsList (e.g. 11,12,13-19)
# Restricts the qemu.slice cgroup to the given CPU list.
#
# Params: $1 - host threads list (e.g. 11,12,13-19)
setQemuAllowedCpus () {
  local cpuList="$1"
  printf 'Setting QEMU CPU default constrain to %s\n' "$cpuList"
  setCgroupAllowedCpus "qemu.slice" "$cpuList"
}
# Makes sure that a decoupled slice for some QEMU VMs exists
# This will only do something the first time a VM starts
# Params: <none>
# Creates the /sys/fs/cgroup/qemu-decoupled.slice cgroup if it is missing and
# allows it to use every CPU (0 .. last CPU). Does nothing when the cgroup
# directory already exists.
#
# Params: <none>
ensureQemuDecoupledSlice () {
  local sliceDir="/sys/fs/cgroup/qemu-decoupled.slice"
  [[ ! -d "$sliceDir" ]] || return 0
  echo "Creating decoupled QEMU cgroup"
  mkdir "$sliceDir"
  # The slice itself MUST be allowed to run on ALL CPUs: vCPUs are moved to an
  # isolated set of cores while emulator and iothread(s) go to the shared
  # CPUs. cgroups v2 doesn't allow a thread/task to be in a different cgroup
  # than its parent, so all of them stay in this slice and only differ in
  # affinity.
  setCgroupAllowedCpus "qemu-decoupled.slice" "0-$(getLastCpu)"
}
# Moves the VM to an isolated cgroup, outside of the OS user/system/init groups, as well
# as away from the standard qemu.slice used by Proxmox; see systemd-cgls
#
# All processes from host run under system.slice and user.slice, while all QEMU machines run
# under qemu.slice. Proxmox actually hardcodes that slice in their startup code:
# https://github.com/proxmox/qemu-server/blob/79f5ca393ab3608ff2e82c929167f079f964a505/PVE/QemuServer.pm#L5892-L5893
# This means that setting "setQemuAllowedCpus" to 1st CCX makes it impossible to pin vCPU
# threads to the 2nd CCX (taskset will fail), as the parent slice where the thread/service is
# running will enforce 1st CCX only AllowedCPUs. The only way around this I found is to migrate
# the VM scope (each one gets a separate one named <VMID>.scope) to a different scope which isn't
# under any of the standard slices. However, this is not supported by systemd, as confirmed by one
# of the systemd authors: https://www.spinics.net/lists/systemd-devel/msg04072.html but cgroups can
# be used directly (albeit without warranties).
#
# Params: <none>
# Moves the QEMU process of the current VM into its own cgroup
# /sys/fs/cgroup/qemu-decoupled.slice/<vmId>.scope, creating the slice and
# the scope directory when needed.
#
# Globals: vmId (read)
# Params:  <none>
# Returns: 1 when the QEMU PID cannot be determined (nothing is created or
#          written in that case); otherwise the status of the final write
decoupleQemuVm () {
  local qemuParentPid=""
  # Resolve the PID first and separately from "local": a masked failure would
  # otherwise end up writing an error message into cgroup.procs
  if ! qemuParentPid=$(getQemuPID) || [[ ! "$qemuParentPid" =~ ^[0-9]+$ ]]; then
    echo "ERROR: cannot decouple VMID=$vmId: QEMU PID is unknown" >&2
    return 1
  fi
  ensureQemuDecoupledSlice
  local vmScope="/sys/fs/cgroup/qemu-decoupled.slice/$vmId.scope"
  if [[ ! -d "$vmScope" ]]; then
    echo "Creating cgroups scope for VMID=$vmId at $vmScope"
    mkdir "$vmScope"
  fi
  echo "Migrating VMID=$vmId PPID=$qemuParentPid to scope $vmScope"
  echo "$qemuParentPid" > "$vmScope/cgroup.procs"
}
# Starts/stops the "idle" windows VM to force very low GPU power states
# Runs the given "qm" sub-command (e.g. start, shutdown) on the idle VM.
#
# Globals: idleVmId (read)
# Params:  $1 - qm action to perform
setIdleVm () {
  local action="$1"
  printf 'Setting idle VM to %s\n' "$action"
  qm "$action" "$idleVmId"
}
# Since updates around 2023/03/20-22 GPUs and some other PCIe devices will only work once.
# When VM is turned off and on it will just black-screen and the VM never boots. This is a
# workaround for that issue.
#
# Params: <none>
# Removes every PCI device listed in the VM's "hostpciN:" config lines from
# the PCI bus and then triggers a bus rescan.
#
# Addresses with a function suffix (e.g. 0000:07:00.1) remove only that
# function; addresses without one remove all functions of the device, one at
# a time. Addresses written without a PCI domain (e.g. 07:00) are assumed to
# be in domain 0000.
#
# Globals: vmId (read)
# Params:  <none>
resetVmPciDevices () {
  echo "Resetting VM PCI devices..."
  local confFile="/etc/pve/qemu-server/$vmId.conf"
  local vmPciDevices=""
  local pciAddr=""
  local pciAddrFun=""
  local pciAddrFunRm=""
  local waited=0
  # Upper bound (seconds) for one function to disappear, so a removal that
  # never completes cannot hang the hook forever
  local -r maxWait=30
  # Drop the "hostpciN:" key, then pull out the addresses. "|| true": a VM
  # without passthrough devices is not an error.
  vmPciDevices=$(sed -n -E 's/^hostpci[0-9]+:[[:space:]]*//p' "$confFile" \
    | grep -o -E '([0-9a-f]+:)?[0-9a-f]+:[0-9a-f]+(\.[0-9]*)?' || true)
  while IFS= read -r pciAddr; do
    # The here-string yields one empty line when there are no devices
    if [[ -z "$pciAddr" ]]; then continue; fi
    # sysfs names always carry the domain; add the default one if omitted
    if [[ "$pciAddr" != *:*:* ]]; then
      pciAddr="0000:$pciAddr"
    fi
    # Single function (mostly SR-IOV or vGPU) device
    if [[ "$pciAddr" == *.* ]]; then
      echo "Removing PCI device function at $pciAddr"
      echo 1 > "/sys/bus/pci/devices/$pciAddr/remove" || true
      continue
    fi
    # Whole device specified => remove all functions
    for pciAddrFunRm in "/sys/bus/pci/devices/$pciAddr".*/remove; do
      # Skips an unmatched glob (left as a literal pattern) and functions that
      # already vanished because removing a sibling took them along
      if [[ ! -e "$pciAddrFunRm" ]]; then continue; fi
      pciAddrFun="${pciAddrFunRm%/remove}"
      pciAddrFun=".${pciAddrFun##*.}"
      echo "Removing PCI device $pciAddr function $pciAddrFun"
      if ! echo 1 > "$pciAddrFunRm"; then
        echo "WARNING: failed to remove PCI device $pciAddr function $pciAddrFun" >&2
        continue
      fi
      # This is absolutely required. Attempting to remove one function CAN
      # remove all of them but it's not instantaneous. However, if you hit
      # such a case and try to manually do /remove on another function while
      # the first is being removed a "general protection fault" will happen
      # in the subsequent "pci_stop_and_remove_bus_device_locked()"
      waited=0
      while [[ -f "$pciAddrFunRm" ]]; do
        if (( waited >= maxWait )); then
          echo "WARNING: $pciAddrFunRm still present after ${maxWait}s, skipping the rest of $pciAddr" >&2
          # Leave the remaining functions alone: removing them while this one
          # is still going away is exactly the dangerous case described above
          break 2
        fi
        sleep 1
        waited=$(( waited + 1 ))
        echo "Still waiting for $pciAddrFunRm..."
      done
    done
  done <<< "$vmPciDevices"
  echo "Re-scanning PCI devices..."
  echo 1 > /sys/bus/pci/rescan
  # rescan is asynchronous; if we wanted to be 100% correct here we should wait
  # for /sys entries to appear, but 2 seconds delay is good enough
  sleep 2
}
# Reads the script's per-VM settings from the VM config file into the global
# variables idleVm, idleVmId, miscCpus, vmCpus and cpuGovernor.
#
# A setting is any line containing "<name>=" (typically kept in the VM notes,
# e.g. "#vmCpus=6,18,7,19"); its value is the text after the last "=" on that
# line. Only the first matching line of each setting is used.
#
# Globals: vmId (read); idleVm, idleVmId, miscCpus, vmCpus, cpuGovernor (written)
# Params:  $1 - optional path of the config file to parse
#               (default: /etc/pve/qemu-server/<vmId>.conf)
# Returns: 1 when the file is unreadable or when miscCpus, vmCpus or
#          cpuGovernor is missing; a missing idleVm defaults to "no"
parseConfig () {
  local confFile="${1:-/etc/pve/qemu-server/${vmId}.conf}"
  local key=""
  local value=""
  echo "Parsing config"
  if [[ ! -r "$confFile" ]]; then
    echo "ERROR: cannot read VM config $confFile" >&2
    return 1
  fi
  for key in idleVm miscCpus vmCpus cpuGovernor; do
    # sed never fails on "no match", unlike grep, which under pipefail +
    # errexit would silently kill the whole script. "q" stops at the first hit.
    value=$(sed -n "/${key}=/{s/.*=//p;q;}" "$confFile")
    value="${value%$'\r'}"
    if [[ -z "$value" ]]; then
      if [[ "$key" == "idleVm" ]]; then
        value="no"
      else
        echo "ERROR: required setting \"${key}=\" not found in $confFile" >&2
        return 1
      fi
    fi
    # Assign to the global variable named by $key
    printf -v "$key" '%s' "$value"
    if [[ "$key" == "idleVm" ]]; then
      idleVmId="$value"
    fi
    echo "${key}=${value}"
  done
  echo "Finished parsing config"
}
# Load the per-VM settings, then run the actions of the hook phase that
# Proxmox passed in $runPhase.
parseConfig
case "$runPhase" in
  pre-start)
    # Stop idle VM, drop caches & compact memory for hugepages
    if [[ "${idleVm}" != "no" ]]; then
      setIdleVm shutdown
    fi
    tidyCaches
    resetVmPciDevices
    ;;
  post-start)
    # This will inform cgroups via systemd to not use 2nd CCX, effectively constraining host to 1st CCX.
    # This isn't perfect as it will not stop kthreads. "cset" used to mostly work for kthreads (except like docker &
    # ZFS), but it doesn't work with cgroups v2: https://forum.proxmox.com/threads/cset-failing-pve7.95613/
    # I have no idea about any alternatives besides CPU hotplug hack (see below)
    # WARNING: THIS MUST BE DONE BEFORE ANY OTHER PINNING. Manipulating slice/scope CPU lists will reset
    # any manual pinning due to a systemd bug/design choice: https://github.com/systemd/systemd/issues/23748
    # The "setQemuAllowedCpus" will be overwritten for just this VM by "decoupleQemuVm" later.
    setHostAllowedCpus "${miscCpus}"
    setQemuAllowedCpus "${miscCpus}"
    # Forcefully move all tasks (user space & kthreads) off the 2nd CCX by offlining them temporarily.
    # Offlining is only a trick to evacuate the CPUs, so a CPU that cannot be offlined is skipped with a
    # warning: aborting here (errexit) would leave the CPUs offlined so far offline until reboot.
    # NOTE: ${vmCpus//,/ } is intentionally unquoted - the comma-separated list is split into words.
    echo "Offlining to-be pinned CPUs to move tasks away..."
    for cpu in ${vmCpus//,/ }; do
      if ! setCpuState "${cpu}" 0; then
        echo "WARNING: could not offline CPU ${cpu}; tasks may still be running on it" >&2
      fi
    done
    # Move kernel threads & IRQs away from vCPU threads
    # Doing this when CPUs are offlined makes it easier as
    # nothing is running on these CPUs actively
    pinIrqs "${miscCpus}"
    pinKthreads "${miscCpus}"
    # Bring second CCX online - nothing should be scheduled on it due to host & QEMU constraints from above.
    # Every CPU is attempted before giving up, so one failure cannot leave the remaining CPUs offline.
    echo "Onlineing to-be pinned CPUs..."
    onlineFailed=0
    for cpu in ${vmCpus//,/ }; do
      setCpuState "${cpu}" 1 || onlineFailed=1
    done
    if (( onlineFailed )); then
      echo "ERROR: some CPUs could not be brought back online" >&2
      exit 1
    fi
    # Set frequency scaling to the configured governor
    for cpu in ${vmCpus//,/ }; do
      setGovernor "${cpu}" "${cpuGovernor}"
    done
    # Stats generation causes jitter in VR
    sysctl vm.stat_interval=120
    # Migrate this VM to a separate isolation group (TLDR: see systemd-cgls)
    # An alternative hacky way to do that would be to iterate over all currently running VMs and
    # taskset their affinity to 1st CCX, but a new VM starting while this one is running will
    # break this. So, it's better to isolate the whole qemu.slice with exception of this VM. That
    # requires the VM process to be moved to a non-qemu.slice
    decoupleQemuVm
    # Pin vCPUs to correct threads - this is crucial.
    # Since SMT/HT is enabled and proper SMT is passed to the guest, the vCPUs need to be pinned
    # to correct host logical CPUs. QEMU assigns vCPUs sequentially; i.e. vCPU0 == 1st thread of
    # first core, vCPU1 == 2nd thread of first core, vCPU2 == 1st thread of second core etc.
    # In Linux (at least this one according to lscpu -e) CPU0 is a 1st thread of first core, with
    # CPU12 being the 2nd/SMT thread of first core. For the 2nd CCX it's a 6+18, 7+19, 8+20, etc
    # mapping. The n-th entry of vmCpus is the host thread for vCPU n.
    vCpu=0
    for cpu in ${vmCpus//,/ }; do
      pinVCpu "${vCpu}" "${cpu}"
      vCpu=$(( vCpu + 1 ))
    done
    # Move all QEMU threads (emulator, iothread) of this VM to 1st CCX. This is pretty dumb. IOThread should
    # probably be pinned to a single core, but we're counting on host scheduler being smart.
    # To do static pinning here QMP needs to be used to query types of threads:
    # https://wiki.qemu.org/Documentation/QMP
    pinNonVCpuTasks "${miscCpus}"
    ;;
  pre-stop)
    ;;
  post-stop)
    # Only undo the host-wide isolation when no QEMU PID file is left,
    # i.e. no other VM is still running
    if ! compgen -G "/run/qemu-server/*.pid" > /dev/null; then
      echo "No other pinned VM runnig, restoring defaults"
      lastCpu=$(getLastCpu)
      # Allow kthreads, IRQs, host & QEMU to use all CPUs again
      pinKthreads "0-$lastCpu"
      pinIrqs "0-$lastCpu"
      setHostAllowedCpus "0-$lastCpu"
      setQemuAllowedCpus "0-$lastCpu"
      # Restore default scaling
      resetGovernor
      # Restore default virtual mem stats frequency
      sysctl vm.stat_interval=1
    fi
    # Start idle VM
    resetVmPciDevices
    if [[ "${idleVm}" != "no" ]]; then
      setIdleVm start
    fi
    ;;
  *)
    echo "Unknown run phase \"$runPhase\"!"
    ;;
esac
echo "Finished $runPhase on VM=$vmId"
@arminask
Copy link

arminask commented Jan 10, 2024

Hello, I have a Windows VM with GPU passthrough that I want to utilise for gaming. How would I go about using this script? My goal is to have 10 threads for my VM.

My CPU is AMD Ryzen 5 1600 (6 cores 12 threads)
Output of lscpu -e:

image

I tried to do this myself, but everytime I start or stop the Windows VM, my other specific VM stops working and shows "internal error", and htop/lscpu reports some of my cores offline, so I had to reboot the proxmox node for it to bring the cores back online.

This is my Windows VM config:

root@proxmox:~# cat /etc/pve/qemu-server/102.conf
#idleVm=no
#vmCpus=2,2,3,3,4,4,5,5,0,6,1,7,2,8,3,9,4,10,5,11
#miscCpus=0,1,8,9
#cpuGovernor=performance
agent: 1
args: -cpu 'host,topoext=on' -smp '10,sockets=1,cores=5,threads=2,maxcpus=10' hv_vendor_id=GIGABYTE,+pdpe1gb' -smbios type=0,version=UX305UA.201 -smbios type=1,manufacturer=GIGABYTE,product=UX305UA,version=2021.1 -smbios type=2,manufacturer=AMD,version=2021.5,product='AMD Ryzen 5 1600' -smbios type=3,manufacturer=XBZJ -smbios type=17,manufacturer=KINGSTON,loc_pfx=DDR4,speed=3200,serial=114514,part=FF63 -smbios type=4,manufacturer=AMD,max-speed=4800,current-speed=3200
audio0: device=ich9-intel-hda,driver=none
balloon: 0
bios: ovmf
boot: order=scsi2;scsi1;net0
cores: 12
cpu: host,hidden=1
cpuunits: 10000
efidisk0: local:102/vm-102-disk-0.raw,efitype=4m,pre-enrolled-keys=1,size=528K
hookscript: local:snippets/proxmox-hook.sh
hostpci0: 0000:07:00,pcie=1,x-vga=1
hotplug: usb
hugepages: 1024
machine: pc-q35-8.1
memory: 24576
meta: creation-qemu=8.1.2,ctime=1704182629
name: wintest
net0: virtio=E8:2A:EA:9F:8A:1A,bridge=vmbr0,firewall=1
numa: 1
ostype: win10
scsi1: /dev/disk/by-id/ata-WDC_WD10EZEX-00BBHA0_WD-WCC6Y7FUZN3L,size=976762584K,serial=4421
scsi2: local:102/vm-102-disk-2.raw,cache=unsafe,iothread=1,size=400G,ssd=1
scsihw: virtio-scsi-single
smbios1: uuid=24c326dd-3cec-48fc-bb9f-87aa3984e2c9,manufacturer=QVNVUw==,product=VVgzMDVVQQ==,version=MjAyMS4x,serial=MTI0NjY3,sku=MTM0NDY4,family=Ng==,base64=1
sockets: 1
tablet: 0
tpmstate0: local:102/vm-102-disk-1.raw,size=4M,version=v2.0
usb0: host=2-4
vcpus: 10
vga: none
vmgenid: 12ff2d20-3979-404b-91b0-90bdb31cf66f

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment