# Strict mode: abort on command failure, on failure anywhere in a pipeline,
# and on use of an unset variable.
set -o errexit -o pipefail -o nounset

# This script can be used by itself, but it's recommended that you read
# the accompanying tutorial on the Proxmox forum first.

# Hook arguments: the VM ID and the lifecycle phase, passed positionally by
# Proxmox when calling the script. Do not modify these variables.
if (( $# < 2 )); then
    echo "Usage: $0 <vmId> <runPhase>" >&2
    exit 1
fi
vmId="$1"
runPhase="$2"
echo "Running $runPhase on VM=$vmId"
# vCPU pinning should be done 1:1 between guest and host, especially on systems using NUMA and/or CCDs.
# On 5900x the core config, as seen in lscpu -e, looks like the following:
# CCX #0:
# - NUMA: node 0
# - CPU: 0-5, 12-17 (SMT threads/host CPU#)
# - CORE: 0-5
# CCX #1:
# - NUMA: node 1
# - CPU: 6-11, 18-23
# - CORE: 6-11
# "lstopo" shouldn't be used here, as it has a bug when RAM is not NUMA but L3 is:
# vCPU pinning can be semi-automated with scripts taking NUMA etc. into account, but every system is different,
# so it's better to consciously tune it. Some scripts are here:
# There are some unexplored ideas also at
# Useful commands while debugging this code:
# List running tasks with their affinity as of now: (the "]" filters out kthreads)
# ps -T -e -o psr,pid,ppid,pgid,sid,comm,cmd | grep -P '^\s+(6|7|8|9|10|11|18|19|20|21|22|23)' | grep -v -P '\]$' | sort | cut -c-$COLUMNS
# Track cgroups resources usage: systemd-cgtop
# See tree of cgroups: systemd-cgls
# Gets QEMU parent process PID for the current VM.
# Globals: vmId (read)
# Outputs: the PID on stdout; the error message goes to stderr so that callers
#          capturing stdout via $(getQemuPID) never receive it as a "PID"
# Returns: 0 on success, 1 if the PID file is missing, unreadable or empty
getQemuPID () {
    local pidFile="/run/qemu-server/${vmId}.pid"
    local qemuParentPid=""

    if [[ -r "$pidFile" ]]; then
        qemuParentPid=$(< "$pidFile") || qemuParentPid=""
    fi

    if [[ -z "$qemuParentPid" ]]; then
        echo "ERROR: failed to get QEMU parent PID for VM=$vmId" >&2
        return 1
    fi

    echo "$qemuParentPid"
}
# Gets the index of the last logical CPU (thread) of the system.
# Outputs: (number of installed logical CPUs - 1) on stdout
# Returns: non-zero if nproc fails
getLastCpu () {
    local cpuCount
    cpuCount=$(nproc --all) || return 1
    echo $(( cpuCount - 1 ))
}
# Pins a vCPU thread of the current VM to host logical CPU(s) (threads).
# The target SHOULD be a single host thread, but it can be any taskset CPU list.
# Since cgroups v2 (used by Proxmox) does NOT allow moving tasks/threads to
# sub-scopes, affinity has to be set per-thread with taskset here.
# Globals: vmId (read)
# Params: vCPU# hostThread#orList
# Returns: non-zero if the QEMU PID or the vCPU thread cannot be found, or if taskset fails
pinVCpu () {
    local vCpuNum="$1"
    local hostThreadNum="$2"

    local qemuParentPid
    qemuParentPid=$(getQemuPID) || return 1

    # The vCPU thread is looked up by its comm "CPU <n>/KVM". grep -H always prints
    # "<path>:<comm>" (even when only one file matches the glob), so the 5th
    # "/"-separated field of /proc/<pid>/task/<tid>/comm is the thread ID.
    # "|| true": a missing match is reported below instead of tripping errexit/pipefail.
    local vCpuTaskPid
    vCpuTaskPid=$(grep -H "^CPU $vCpuNum/KVM\$" "/proc/$qemuParentPid/task/"*/comm 2>/dev/null \
        | cut -d '/' -f5) || true

    if [[ -z "$vCpuTaskPid" ]]; then
        echo "ERROR: failed to get Task PID for vCPU $vCpuNum" >&2
        return 1
    fi

    echo "Pinning VM $vmId (PPID=$qemuParentPid) vCPU $vCpuNum (TPID=$vCpuTaskPid) to host thread(s) $hostThreadNum"
    taskset --cpu-list --pid "$hostThreadNum" "$vCpuTaskPid"
}
# Pins all non-vCPU QEMU threads (io, emulator, rcu) to host logical CPU(s).
# Unlike in pinVCpu, the target SHOULD probably be a list of host threads.
# Since cgroups v2 (used by Proxmox) does NOT allow moving tasks/threads to
# sub-scopes, affinity has to be set per-thread with taskset here.
# Globals: vmId (read)
# Params: hostThread#orList
# Returns: non-zero if the QEMU PID cannot be found or if taskset fails
pinNonVCpuTasks () {
    local hostThreadNum="$1"

    local qemuParentPid
    qemuParentPid=$(getQemuPID) || return 1

    # Every thread whose comm does not start with "CPU <digit>" is a non-vCPU thread.
    # grep -H always prints "<path>:<comm>", so the 5th "/"-separated field of
    # /proc/<pid>/task/<tid>/comm is the thread ID.
    local nonVCpuTaskPids
    nonVCpuTaskPids=$(grep -H -v -E "^CPU [0-9]" "/proc/$qemuParentPid/task/"*/comm 2>/dev/null \
        | cut -d '/' -f5) || true

    local tpid taskComm
    while IFS= read -r tpid; do
        # An empty list still yields one empty line through the here-string
        [[ -n "$tpid" ]] || continue

        # Only used for logging; the thread may have exited in the meantime
        taskComm=$(cat "/proc/$qemuParentPid/task/$tpid/comm" 2>/dev/null) || taskComm=""
        echo "Pinning VM $vmId (PPID=$qemuParentPid) non-vCPU task \"$taskComm\" (TPID=$tpid) to host thread(s) $hostThreadNum"
        taskset --cpu-list --pid "$hostThreadNum" "$tpid" || return 1
    done <<< "$nonVCpuTaskPids"
}
# Kernel threads (so-called "kthreads") aren't grouped under any of the cgroups. Thus
# to control their affinity manual pinning is needed.
# There are hacky ways to identify kthreads like parsing "ps", but the proper way to do
# that is to actually check the thread type. All kernel threads are marked with the
# PF_KTHREAD mask (0x00200000) in the "flags" field of /proc/<pid>/stat.
# Pinning is best-effort: kthreads that refuse the new affinity are silently skipped.
# Params: hostThread#orList
pinKthreads () {
    local hostThreadNum="$1"
    echo "Attempting to pin all kthreads to $hostThreadNum..."

    local statFile statLine pid comm flags
    local -a statTail
    for statFile in /proc/[0-9]*/stat; do
        # This CAN sometimes fail due to TOC-TOU (process exited before the read)
        { IFS= read -r statLine < "$statFile"; } 2>/dev/null || continue

        # Layout: "<pid> (<comm>) <state> <ppid> <pgrp> <session> <tty_nr> <tpgid> <flags> ..."
        # comm may itself contain spaces or parentheses, so it is cut out between the
        # first "(" and the last ")" instead of relying on whitespace splitting.
        pid=${statLine%% *}
        comm=${statLine#*\(}
        comm=${comm%\)*}
        read -r -a statTail <<< "${statLine##*\) }"
        flags=${statTail[6]:-}
        if [[ -z "$pid" || ! "$flags" =~ ^[0-9]+$ ]]; then continue; fi

        # Ignore not kthreads
        if (( (flags & 0x00200000) != 0x00200000 )); then continue; fi

        # This CAN fail for some kthreads that are needed on specific CPUs
        if taskset --cpu-list --pid "$hostThreadNum" "$pid" > /dev/null 2>&1; then
            echo "Pinned kthread \"$comm\" (PID=$pid) to host thread(s) $hostThreadNum"
        fi
    done
}
# Most IRQs can be moved away from the threads running vCPUs, that can cause jitter
# when these are rescheduled. This function is not perfect as it doesn't set a mask
# for not-yet-triggered IRQs (/proc/irq/default_smp_affinity). However, this shouldn't
# be needed as if the VM isn't started on boot most if not all busy IRQs would have
# been triggered by now.
# Pinning is best-effort: IRQs whose affinity cannot be written are silently skipped.
# Params: hostThread#orList
pinIrqs () {
    local hostThreadNum="$1"
    echo "Pinning IRQs to host thread(s) $hostThreadNum..."

    local irqAffLst irqNum
    for irqAffLst in /proc/irq/*/smp_affinity_list; do
        # Path layout is /proc/irq/<irq#>/smp_affinity_list
        irqNum=${irqAffLst#/proc/irq/}
        irqNum=${irqNum%%/*}

        # The group silences both a failure to open the file and a rejected write
        if { echo "$hostThreadNum" > "$irqAffLst"; } 2> /dev/null; then
            echo "Pinned IRQ $irqNum to host thread(s) $hostThreadNum"
        fi
    done
}
# Sets the frequency scaling governor for a host logical CPU (thread).
# Nothing is written when the requested governor is already active.
# Params: hostThread# desiredGovernor
# Returns: 1 if the current governor cannot be queried; non-zero if the write fails
setGovernor () {
    local hostCpu="$1"
    local reqGov="$2"
    local govFile="/sys/devices/system/cpu/cpu$hostCpu/cpufreq/scaling_governor"

    local curGov=""
    if [[ -r "$govFile" ]]; then
        curGov=$(< "$govFile") || curGov=""
    fi
    if [[ -z "$curGov" ]]; then
        echo "ERROR: failed to query governor for CPU $hostCpu" >&2
        return 1
    fi

    if [[ "$reqGov" == "$curGov" ]]; then
        echo "CPU $hostCpu: requested governor $reqGov - it is already set"
        return 0
    fi

    echo "CPU $hostCpu: changing governor from $curGov to $reqGov"
    echo "$reqGov" > "$govFile"
}
# Sets governor/scaling on a range of host CPUs (threads). Range is inclusive.
# Params: hostThreadFrom# hostThreadTo# desiredGovernor
# Returns: non-zero as soon as setting the governor fails for one CPU
setGovernorRange () {
    # Local counter, so the loop does not clobber a global "i"
    local hostCpu
    for (( hostCpu=$1; hostCpu<=$2; hostCpu++ )); do
        setGovernor "$hostCpu" "$3" || return 1
    done
}
# Resets governor/scaling to default state by restarting the cpufrequtils service.
# Params: <none>
resetGovernor () {
    echo "Resetting CPU governor to default"
    service cpufrequtils restart
}
# Puts a host CPU (thread) into offline or online state.
# Nothing is written when the CPU is already in the requested state.
# Params: hostThread# desiredState{0,1}
# Returns: 1 if the current state cannot be queried or the state change fails
setCpuState () {
    local hostCpu="$1"
    local reqState="$2"
    local stateFile="/sys/devices/system/cpu/cpu$hostCpu/online"

    local curState=""
    if [[ -r "$stateFile" ]]; then
        curState=$(< "$stateFile") || curState=""
    fi
    if [[ -z "$curState" ]]; then
        echo "ERROR: failed to query online status for CPU $hostCpu" >&2
        return 1
    fi

    if [[ "$reqState" == "$curState" ]]; then
        echo "CPU $hostCpu: requested state $reqState - it is already set"
        return 0
    fi

    echo -n "CPU $hostCpu: changing state from $curState to $reqState... "
    # Tested directly in the "if": with errexit active a separate "$?" check
    # after a failed write would never be reached
    if echo "$reqState" > "$stateFile"; then
        echo "[OK]"
    else
        echo "[FAILED]"
        return 1
    fi
}
# Puts a range of host CPUs (threads) into offline or online state. Range is inclusive.
# Params: hostThreadFrom# hostThreadTo# desiredState{0,1}
# Returns: non-zero as soon as the state change fails for one CPU
setCpuStateRange () {
    # Local counter, so the loop does not clobber a global "i"
    local hostCpu
    for (( hostCpu=$1; hostCpu<=$2; hostCpu++ )); do
        setCpuState "$hostCpu" "$3" || return 1
    done
}
# Drops kernel caches and triggers memory compaction via /proc/sys/vm
# (the caller uses this to prepare memory for hugepages).
# Params: <none>
tidyCaches () {
    echo -n "Tidying caches... "
    echo 3 > /proc/sys/vm/drop_caches
    echo 1 > /proc/sys/vm/compact_memory
    echo "[OK]"
}
# Sets cgroup slice or scope CPU isolation through systemd's AllowedCPUs property.
# The change is applied with --runtime, i.e. it is not made persistent.
# Params: sliceOrScopeName hostThreadsList (e.g. 11,12,13-19)
setCgroupAllowedCpus () {
    local entity="$1"
    local allowedCpus="$2"

    echo "Forcing \"$entity\" cgroup to only use CPU(s) $allowedCpus"
    systemctl set-property --runtime -- "$entity" "AllowedCPUs=$allowedCpus"
}
# Sets logical CPUs (threads) which can be used by processes on the host, by
# constraining the init.scope, system.slice and user.slice cgroups.
# Params: hostThreadsList (e.g. 11,12,13-19)
setHostAllowedCpus () {
    echo "Setting host userland CPU constrain to $1"

    local entity
    for entity in "init.scope" "system.slice" "user.slice"; do
        setCgroupAllowedCpus "$entity" "$1" || return 1
    done
}
# Sets logical CPUs (threads) which can be used by QEMU processes, by
# constraining the qemu.slice cgroup.
# Params: hostThreadsList (e.g. 11,12,13-19)
setQemuAllowedCpus () {
    echo "Setting QEMU CPU default constrain to $1"
    setCgroupAllowedCpus "qemu.slice" "$1"
}
# Makes sure that a decoupled slice for some QEMU VMs exists.
# This will only do something the first time a VM starts.
# Params: <none>
# Returns: non-zero if the slice cannot be created or configured
ensureQemuDecoupledSlice () {
    local slicePath="/sys/fs/cgroup/qemu-decoupled.slice"

    if [[ -d "$slicePath" ]]; then
        return 0
    fi

    echo "Creating decoupled QEMU cgroup"
    mkdir "$slicePath" || return 1

    # The slice itself MUST be allowed to run on ALL CPUs. The reason
    # for that is we will move vCPUs to an isolated set of cores BUT
    # put emulator and iothread(s) on the shared CPUs. Since cgroups v2
    # doesn't allow a thread/task to be in a different cgroup than the
    # parent these tasks must stay in the qemu-decoupled.slice but with
    # different affinity
    local lastCPU
    lastCPU=$(getLastCpu) || return 1
    setCgroupAllowedCpus "qemu-decoupled.slice" "0-$lastCPU"
}
# Moves the VM to an isolated cgroup, outside of the OS user/system/init groups, as well
# as away from the standard qemu.slice used by Proxmox; see systemd-cgls
# All processes from host run under system.slice and user.slice, while all QEMU machines run
# under qemu.slice. Proxmox actually hardcodes that slice in their startup code.
# This means that setting "setQemuAllowedCpus" to 1st CCX makes it impossible to pin vCPU
# threads to the 2nd CCX (taskset will fail), as the parent slice where the thread/service is
# running will enforce 1st CCX only AllowedCPUs. The only way around this I found is to migrate
# the VM scope (each one gets a separate one named <VMID>.scope) to a different scope which isn't
# under any of the standard slices. However, this is not supported by systemd, as confirmed by one
# of the systemd authors, but cgroups can be used directly (albeit without warranties).
# Globals: vmId (read)
# Params: <none>
# Returns: non-zero if the QEMU PID cannot be found or the migration fails
decoupleQemuVm () {
    local vmScope="/sys/fs/cgroup/qemu-decoupled.slice/$vmId.scope"

    # Resolve the PID first, so no empty scope is left behind when the VM isn't running
    local qemuParentPid
    qemuParentPid=$(getQemuPID) || return 1

    if [[ ! -d "$vmScope" ]]; then
        echo "Creating cgroups scope for VMID=$vmId at $vmScope"
        mkdir "$vmScope" || return 1
    fi

    echo "Migrating VMID=$vmId PPID=$qemuParentPid to scope $vmScope"
    echo "$qemuParentPid" > "$vmScope/cgroup.procs"
}
# Starts/stops the "idle" Windows VM to force very low GPU power states.
# Params: qmAction (e.g. start, shutdown) [idleVmId, defaults to 107]
setIdleVm () {
    local action="$1"
    local idleVmId="${2:-107}"

    echo "Setting idle VM to $action"
    qm "$action" "$idleVmId"
}
# Since updates around 2023/03/20-22 GPUs and some other PCIe devices will only work once.
# When VM is turned off and on it will just black-screen and the VM never boots. This is a
# workaround for that issue: every PCI device passed through to the VM (hostpciN entries
# of the VM config) is removed from the bus, and the bus is rescanned afterwards.
# Globals: vmId (read)
# Params: <none>
resetVmPciDevices () {
    echo "Resetting VM PCI devices..."

    local confFile="/etc/pve/qemu-server/$vmId.conf"
    local vmPciDevices=""
    if [[ -r "$confFile" ]]; then
        # "|| true": a config without hostpci entries is not an error
        vmPciDevices=$(grep -E '^hostpci[0-9]+:' "$confFile" \
            | grep -o -E '[0-9a-f]+:[0-9a-f]+:[0-9a-f]+(\.[0-9]*)?') || true
    fi
    if [[ -z "$vmPciDevices" ]]; then
        # Nothing was removed, so there is nothing to rescan either
        echo "No passthrough PCI devices found for VM=$vmId - nothing to reset"
        return 0
    fi

    local pciAddr pciAddrFunRm pciAddrFun
    while IFS= read -r pciAddr; do
        [[ -n "$pciAddr" ]] || continue

        # Single function (mostly SR-IOV or vGPU) device
        if [[ "$pciAddr" == *.* ]]; then
            echo "Removing PCI device function at $pciAddr"
            echo 1 > "/sys/bus/pci/devices/$pciAddr/remove" || true
            continue
        fi

        # Whole device specified => remove all functions
        for pciAddrFunRm in "/sys/bus/pci/devices/$pciAddr."*/remove; do
            # Skips an unmatched glob as well as functions that are already gone
            # because removing a previous function took them along
            [[ -e "$pciAddrFunRm" ]] || continue

            # ".<function#>" suffix of the device directory name
            pciAddrFun=${pciAddrFunRm%/remove}
            pciAddrFun=".${pciAddrFun##*.}"
            echo "Removing PCI device $pciAddr function $pciAddrFun"
            echo 1 > "$pciAddrFunRm" || true

            # This is absolutely required. Attempting to remove one function CAN
            # remove all of them but it's not instantaneous. However, if you hit
            # such a case and try to manually do /remove on another function while
            # the first is being removed a "general protection fault" will happen
            # in the subsequent "pci_stop_and_remove_bus_device_locked()"
            while [[ -f "$pciAddrFunRm" ]]; do
                sleep 1
                echo "Still waiting for $pciAddrFunRm..."
            done
        done
    done <<< "$vmPciDevices"

    echo "Re-scanning PCI devices..."
    echo 1 > /sys/bus/pci/rescan

    # rescan is asynchronous; if we wanted to be 100% correct here we should wait
    # for /sys entries to appear, but 2 seconds delay is good enough
    sleep 2
}
# Phase dispatcher: the script is invoked once per VM lifecycle phase.
case "$runPhase" in
    pre-start)
        # Stop idle VM, drop caches & compact memory for hugepages
        setIdleVm shutdown
        tidyCaches
        # NOTE(review): resetVmPciDevices is defined in this file but no call to it is
        # visible in any phase - confirm whether it should be invoked here.
        ;;

    # Designate 2nd CCD (core 6-11, thread 6-11+18-23) to the VM and 1st CCD to host/housekeeping stuff
    # All modifications should be done in post-start as doing them in pre-start will execute them even
    # if the VM fails to start (and thus post-stop will never be called)
    post-start)
        # This will inform cgroups via systemd to not use 2nd CCX, effectively constraining host to 1st CCX.
        # This isn't perfect as it will not stop kthreads. "cset" used to mostly work for kthreads (except like docker &
        # ZFS), but it doesn't work with cgroups v2.
        # I have no idea about any alternatives besides CPU hotplug hack (see below)
        # WARNING: THIS MUST BE DONE BEFORE ANY OTHER PINNING. Manipulating slice/scope CPU lists will reset
        # any manual pinning due to a systemd bug/design choice.
        # The "setQemuAllowedCpus" will be overwritten for just this VM by "decoupleQemuVm" later.
        setHostAllowedCpus "0-5,12-17"
        setQemuAllowedCpus "0-5,12-17"

        # Forcefully move all tasks (user space & kthreads) off the 2nd CCX by offlining them temporarily
        echo "Offlining to-be pinned CPUs to move tasks away..."
        setCpuStateRange 6 11 0
        setCpuStateRange 18 23 0

        # Move kernel threads & IRQs away from vCPU threads
        # Doing this when CPUs are offlined makes it easier as
        # nothing is running on these CPUs actively
        pinIrqs "0-5,12-17"
        pinKthreads "0-5,12-17"

        # Bring second CCX online - nothing should be scheduled on it due to host & QEMU constraints from above
        echo "Onlineing to-be pinned CPUs..."
        setCpuStateRange 6 11 1
        setCpuStateRange 18 23 1

        # Set frequency scaling to performance mode
        setGovernorRange 6 11 performance
        setGovernorRange 18 23 performance

        # Stats generation causes jitter in VR
        sysctl vm.stat_interval=120

        # Migrate this VM to a separate isolation group (TLDR: see systemd-cgls)
        # An alternative hacky way to do that would be to iterate over all currently running VMs and
        # taskset their affinity to 1st CCX, but a new VM starting while this one is running will
        # break this. So, it's better to isolate the whole qemu.slice with exception of this VM. That
        # requires the VM process to be moved to a non-qemu.slice
        ensureQemuDecoupledSlice
        decoupleQemuVm

        # Pin vCPUs to correct threads - this is crucial.
        # Since SMT/HT is enabled and proper SMT is passed to the guest, the vCPUs need to be pinned
        # to correct host logical CPUs. QEMU assigns vCPUs sequentially; i.e. vCPU0 == 1st thread of
        # first core, vCPU1 == 2nd thread of first core, vCPU2 == 1st thread of second core etc.
        # In Linux (at least this one according to lscpu -e) CPU0 is a 1st thread of first core, with
        # CPU12 being the 2nd/SMT thread of first core. For the 2nd CCX it's a 6+18, 7+19, 8+20, etc
        # mapping.
        pinVCpu 0 6
        pinVCpu 1 18
        pinVCpu 2 7
        pinVCpu 3 19
        pinVCpu 4 8
        pinVCpu 5 20
        pinVCpu 6 9
        pinVCpu 7 21
        pinVCpu 8 10
        pinVCpu 9 22
        pinVCpu 10 11
        pinVCpu 11 23

        # Move all QEMU threads (emulator, iothread) of this VM to 1st CCX. This is pretty dumb. IOThread should
        # probably be pinned to a single core, but we're counting on host scheduler being smart.
        # To do static pinning here QMP needs to be used to query types of threads.
        pinNonVCpuTasks "0-5,12-17"
        ;;

    pre-stop)
        # Nothing to do; listed so this phase is not reported as unknown
        ;;

    post-stop)
        lastCpu=$(getLastCpu)

        # Allow kthreads, IRQs, host & QEMU to use all CPUs again
        pinKthreads "0-$lastCpu"
        pinIrqs "0-$lastCpu"
        setHostAllowedCpus "0-$lastCpu"
        setQemuAllowedCpus "0-$lastCpu"

        # Restore default scaling
        resetGovernor

        # Restore default virtual mem stats frequency
        sysctl vm.stat_interval=1

        # Start idle VM
        setIdleVm start
        ;;

    *)
        echo "Unknown run phase \"$runPhase\"!"
        ;;
esac

echo "Finished $runPhase on VM=$vmId"
