Skip to content

Instantly share code, notes, and snippets.

@Jipok
Created February 11, 2024 14:41
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save Jipok/624bf07bf768d923c79d8f64f56786dc to your computer and use it in GitHub Desktop.
Save Jipok/624bf07bf768d923c79d8f64f56786dc to your computer and use it in GitHub Desktop.
TUI with a chart for Radeon power-cap control and temperature monitoring. Sends SIGSTOP to Python processes if the GPU overheats.
#!/usr/bin/env python3
import plotext as plt
from subprocess import check_output
import psutil
import time
import math
# Shell pipeline that prints the number following "cap =" on the PPT line of
# `sensors amdgpu-pci-0500`; the script parses its output with int().
# Raw string so the regex escape `\d` is not treated as a (invalid) string
# escape by Python.
CMD = r"sensors amdgpu-pci-0500 | grep PPT | grep -oP '(?<=cap =) *(.?\d+)'"

# Smoothed temperature band: above MAX the power cap is lowered, below MIN it
# is raised, in between it is left alone.
TARGET_TEMP_MAX = 91
TARGET_TEMP_MIN = 87

# At or above this temperature non-root python processes receive SIGSTOP.
PAUSE_TEMP = 95
# Countdown (one tick per loop iteration, the loop sleeps 1 s) before SIGCONT.
PAUSE_TIME = 5 # Seconds
# At or above this temperature non-root python processes are killed (-9).
CRITICAL_TEMP = 107

# Bounds the computed power cap is clamped to.
CAP_MIN = 2
CAP_MAX = 100

# Number of samples pre-filled into the chart history and kept after trimming.
CHART_HISTORY_COUNT = 55
# Window size of the temperature moving average.
MOVING_AVG_N = 3
plt.theme("pro")

# Start-up state. The histories are seeded to full length so the first frame
# already draws a complete chart instead of a single point.
pause_counter = 0

# First live readings: temperature from psutil, power cap from the CMD pipeline.
temp = psutil.sensors_temperatures()["amdgpu"][1].current
mov_avg = [temp] * MOVING_AVG_N
powercap = int(check_output(CMD, shell=True).decode('utf-8'))

# One fixed leading sample (43 / 100) followed by copies of the first readings,
# CHART_HISTORY_COUNT entries in total.
a_temp = [43] + [temp] * (CHART_HISTORY_COUNT - 1)
a_powercap = [100] + [powercap] * (CHART_HISTORY_COUNT - 1)
def _signal_user_python(kill_args):
    """Run ``kill <kill_args>`` on every non-root python process.

    The pipeline lists processes with ``ps -ef``, keeps lines matching
    "python" (the ``[p]`` bracket stops grep from matching its own command
    line), drops rows whose first column is "root" and feeds the second
    column (the PID) to ``kill``. ``xargs -r`` skips ``kill`` entirely when
    nothing matched.

    kill_args: text appended verbatim after ``kill``, e.g. ``"-s SIGSTOP"``.
    Raises subprocess.CalledProcessError if the pipeline exits non-zero.
    """
    check_output(
        'ps -ef | grep "[p]ython" | awk \'$1!="root" {print $2}\' | xargs -r kill ' + kill_args,
        shell=True,
    )


# Main loop: once per second sample the sensors, redraw the chart, then apply
# the pause / kill / power-cap rules to the smoothed temperature.
# NOTE(review): nothing here sends SIGCONT if the script exits while processes
# are stopped - confirm whether that needs handling.
while True:
    # Trim chart: drop the oldest sample once the history exceeds its limit
    if len(a_temp) > CHART_HISTORY_COUNT:
        a_temp = a_temp[1:]
        a_powercap = a_powercap[1:]

    # Get data: push the newest reading through the moving-average window
    temp = psutil.sensors_temperatures()["amdgpu"][1].current
    mov_avg.append(temp)
    mov_avg = mov_avg[1:]
    temp = sum(mov_avg) / MOVING_AVG_N
    temp = math.ceil(temp*100)/100 # Make cute digits
    a_temp.append(temp)
    powercap = int(check_output(CMD, shell=True).decode('utf-8'))
    a_powercap.append(powercap)

    # Plot temp (left axis) with the target band drawn as two green lines
    plt.clear_data()
    plt.clear_figure()
    plt.clear_terminal()
    plt.theme("pro")
    label = "Temp %s°C" % temp
    plt.plot(a_temp, label=label, color="red")
    plt.ylabel(label)
    plt.ylim(20, 115)
    plt.hline(TARGET_TEMP_MAX, "green")
    plt.hline(TARGET_TEMP_MIN, "green")

    # Plot power cap (right axis)
    label = "Powercap %sW" % powercap
    plt.plot(a_powercap, label=label, yside="right", color="blue")
    plt.ylabel(label, yside="right")
    plt.ylim(CAP_MIN, CAP_MAX + 10, yside="right")
    plt.show()

    # Short pause under high temp. The counter is re-armed on every hot
    # iteration and decremented once per iteration (including the one that
    # armed it); SIGCONT is sent when it reaches zero.
    if temp >= PAUSE_TEMP:
        _signal_user_python('-s SIGSTOP')
        pause_counter = PAUSE_TIME
    if pause_counter > 0:
        pause_counter -= 1
        if pause_counter == 0:
            _signal_user_python('-s SIGCONT')

    # Hard save
    if temp >= CRITICAL_TEMP:
        # Kill all non-root(avoid suicide) python processes
        _signal_user_python('-9')

    # Limit power: raise the cap linearly while below the band, lower it by
    # the square of the overshoot while above it, clamped to CAP_MIN..CAP_MAX.
    newcap = powercap
    if temp < TARGET_TEMP_MIN:
        cap_step = TARGET_TEMP_MIN - temp
        newcap = int(min(powercap + cap_step, CAP_MAX))
    if temp > TARGET_TEMP_MAX:
        cap_step = pow(temp - TARGET_TEMP_MAX, 2)
        newcap = int(max(powercap - cap_step, CAP_MIN))
    # Only call rocm-smi when the integer cap actually changes
    if newcap != powercap:
        check_output("/opt/rocm/bin/rocm-smi --setpoweroverdrive " + str(newcap), shell=True)

    time.sleep(1)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment