Skip to content

Instantly share code, notes, and snippets.

@dniku
Created January 25, 2022 10:00
Show Gist options
  • Save dniku/c901b9a459696d0969a2c78b51878a63 to your computer and use it in GitHub Desktop.
Save dniku/c901b9a459696d0969a2c78b51878a63 to your computer and use it in GitHub Desktop.
#!/usr/bin/env python3
#######
# USAGE
#
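# nvidia-htop.py [-l [length]] [-c] [--stdin]
# print GPU utilization with usernames and CPU stats for each GPU-utilizing process
#
# By default the script runs `nvidia-smi` itself. Pass --stdin to read the
# nvidia-smi output from standard input instead, i.e.:
#
# nvidia-smi | nvidia-htop.py --stdin
#
# -l|--command-length [length] Width of the COMMAND column (default: 20). If `-l'
# is given without `length', 100 is used. Commands
# longer than the column are wrapped onto
# continuation lines.
# -c|--color Colorize the output (green - free GPU, yellow -
# moderately used GPU, red - fully used GPU)
# --stdin Read the nvidia-smi output from standard input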
#
# It is also useful to run this under `watch`, i.e.:
#
# watch -d -n 1 python3 nvidia-htop.py
######
import argparse
import os
import re
import subprocess
import sys
try:
    # Optional dependency: use termcolor's implementation when it is installed.
    from termcolor import colored
except ImportError:
    def colored(x, _):
        """Fallback for a missing termcolor: return ``x`` unchanged.

        The second positional argument (the colour name) is accepted and
        ignored so call sites do not need to care which variant is active.
        """
        return x
# Thresholds (as fractions) compared against memory and GPU utilisation when
# colouring: a value at or above a *_MODERATE_RATIO marks the GPU as fully
# used, a value at or above a *_FREE_RATIO marks it as moderately used.
MEMORY_FREE_RATIO = 0.05
MEMORY_MODERATE_RATIO = 0.9
GPU_FREE_RATIO = 0.05
GPU_MODERATE_RATIO = 0.75

# Command line interface.
parser = argparse.ArgumentParser()
parser.add_argument(
    '-l', '--command-length',
    nargs='?',
    type=int,
    default=20,   # option absent
    const=100,    # option given without a value
)
parser.add_argument('-c', '--color', action='store_true')
parser.add_argument('--stdin', action='store_true')
args = parser.parse_args()

# Shorthands used by the rest of the script.
command_length = args.command_length
color = args.color

# Obtain the nvidia-smi output as a list of lines (line endings kept).
# Precedence: file named by FAKE_STDIN_PATH (for testing), then standard
# input when --stdin is given, otherwise run nvidia-smi ourselves.
fake_stdin_path = os.getenv("FAKE_STDIN_PATH", None)
if fake_stdin_path is not None:
    with open(fake_stdin_path, 'rt') as fake_stdin:
        nvidia_smi_output = fake_stdin.readlines()
elif args.stdin:
    nvidia_smi_output = sys.stdin.readlines()
else:
    from io import StringIO
    raw_smi_output = subprocess.check_output(['nvidia-smi'])
    nvidia_smi_output = StringIO(raw_smi_output.decode('utf-8')).readlines()
# Matches a GPU status row of the nvidia-smi summary table and captures used
# memory (MiB), total memory (MiB) and GPU utilisation (%). The first column
# may be a two-character percentage or the literal "N/A".
_GPU_STATUS_ROW_RE = re.compile(
    r"\| (?:N/A|..%)\s+[0-9]{2,3}C.*\s([0-9]+)MiB\s+\/\s+([0-9]+)MiB.*\s([0-9]+)%"
)


def colorize(_lines):
    """Colour GPU status rows according to their utilisation.

    Every line matching the status-row pattern is wrapped with ``colored``
    together with the line directly above it (presumably the row carrying the
    GPU name -- confirm against the nvidia-smi layout in use):

    * red    -- GPU or memory utilisation is at/above its *_MODERATE_RATIO
    * yellow -- otherwise, GPU or memory utilisation is at/above its
      *_FREE_RATIO
    * green  -- otherwise

    Args:
        _lines: list of output lines; modified in place.

    Returns:
        The same list object, for convenience.
    """
    for i, line in enumerate(_lines):
        m = _GPU_STATUS_ROW_RE.match(line)
        if m is None:
            continue
        used_mem = int(m.group(1))
        total_mem = int(m.group(2))
        gpu_util = int(m.group(3)) / 100.0
        # A reported total of 0 MiB would otherwise raise ZeroDivisionError.
        mem_util = used_mem / float(total_mem) if total_mem else 0.0
        if gpu_util >= GPU_MODERATE_RATIO or mem_util >= MEMORY_MODERATE_RATIO:
            c = 'red'
        elif gpu_util >= GPU_FREE_RATIO or mem_util >= MEMORY_FREE_RATIO:
            c = 'yellow'
        else:
            c = 'green'
        _lines[i] = colored(line, c)
        # Only touch the preceding line when there is one; index -1 would
        # silently colour the *last* line instead.
        if i > 0:
            _lines[i - 1] = colored(_lines[i - 1], c)
    return _lines
# Copy the utilization upper part verbatim: everything that precedes the
# "| Processes:" header, with trailing whitespace removed.
lines_to_print = []
processes_header_index = None
for line_index, raw_line in enumerate(nvidia_smi_output):
    if raw_line.startswith("| Processes:"):
        processes_header_index = line_index
        break
    lines_to_print.append(raw_line.rstrip())

if processes_header_index is None:
    # Without the process table nothing below can work; fail with a clear
    # message instead of an unrelated error further down.
    sys.exit("nvidia-htop: no '| Processes:' section found in the nvidia-smi output")

# `i` is consumed by the process-row parser that follows: index of the first
# process row. Assumes the table header spans 4 lines, starting at the
# "| Processes:" line -- TODO confirm for the nvidia-smi version in use.
i = processes_header_index + 4

# We set the width of the block with processes manually, so the border that
# nvidia-smi printed above the process table is replaced by our own.
if not lines_to_print or not lines_to_print[-1].startswith('+--'):
    sys.exit("nvidia-htop: unexpected nvidia-smi output: "
             "no table border right before the '| Processes:' section")
lines_to_print.pop()
# NOTE(review): 59 is presumably the fixed-width part of the process row
# format used when the table is printed -- confirm that the two agree.
processes_delimeter_line = '+' + '-' * (args.command_length + 59) + '+'
lines_to_print.append(processes_delimeter_line)

if color:
    lines_to_print = colorize(lines_to_print)
for header_line in lines_to_print:
    print(header_line)
# Parse the PIDs from the lower part.
# Maps PID (str) -> {'gpu_num': [...], 'gpu_mem': [...]}; a process that uses
# several GPUs gets one entry in each list per table row.
pids = {}
# Stop at the closing border, or at the end of the input if the table is
# truncated (instead of raising IndexError).
while i < len(nvidia_smi_output) and not nvidia_smi_output[i].startswith("+--"):
    row = nvidia_smi_output[i]
    i += 1
    if "Not Supported" in row:
        continue
    # Whitespace-separated columns: '|', GPU, <two more columns>, PID, ...,
    # memory usage, '|'. str.split() drops leading/trailing empties, so the
    # memory column is always second to last.
    fields = row.split()
    # Rows that carry no numeric PID (e.g. "No running processes found")
    # are not processes and must not be handed to ps.
    if len(fields) < 7 or not fields[4].isdigit():
        continue
    pid = fields[4]
    gpu_num = fields[1]
    gpu_mem = fields[-2]
    entry = pids.setdefault(pid, {'gpu_num': [], 'gpu_mem': []})
    entry['gpu_num'].append(gpu_num)
    entry['gpu_mem'].append(gpu_mem)
# Query the PIDs using ps
ps_format = "pid,user,%cpu,%mem,etime,command"
ps_command = ["ps", "-ww", "-o", ps_format, "-p", ",".join(pids)]
ps_output = b""
if pids:  # `ps -p ""` is an error, so skip the call when there is nothing to ask
    try:
        ps_output = subprocess.check_output(ps_command)
    except subprocess.CalledProcessError as ps_error:
        # ps may exit non-zero, e.g. when the processes are already gone.
        # Treat this as best effort: keep whatever was printed; processes
        # without ps data are still listed, with '?' placeholders.
        ps_output = ps_error.output or b""

# Parse ps output. User names and command lines are not guaranteed to be
# ASCII, so decode leniently rather than crash on a single odd byte.
for line in ps_output.decode('utf-8', errors='replace').split("\n"):
    stripped_line = line.strip()
    if not stripped_line or stripped_line.startswith("PID"):
        continue
    # At most 6 columns; the last one (command) may itself contain spaces.
    parts = re.split(r'\s+', stripped_line, 5)
    if len(parts) != 6 or parts[0] not in pids:
        continue
    pid, ps_user, ps_cpu, ps_mem, ps_etime, ps_cmd = parts
    pids[pid]['user'] = ps_user
    pids[pid]['cpu'] = ps_cpu
    pids[pid]['mem'] = ps_mem
    # An elapsed time containing '-' has a day count before the dash; show
    # only that count.
    if "-" in ps_etime:
        pids[pid]['time'] = ps_etime.split("-")[0] + " days"
    else:
        pids[pid]['time'] = ps_etime
    pids[pid]['command'] = ps_cmd
def _gpu_order(pair):
    """Sort key for a (gpu_num, gpu_mem) pair: numeric GPU indices first, in
    numeric order (so '2' precedes '10'); anything else after, as text."""
    gpu_num = pair[0]
    if gpu_num.isdigit():
        return (0, int(gpu_num), '')
    return (1, 0, gpu_num)


# Order each process' GPU entries by GPU index, keeping the memory figures
# paired with their GPU. Both values end up as tuples.
for process_info in pids.values():
    ordered_pairs = sorted(
        zip(process_info['gpu_num'], process_info['gpu_mem']),
        key=_gpu_order,
    )
    process_info['gpu_num'], process_info['gpu_mem'] = zip(*ordered_pairs)
def parse_mem(mem):
    """Convert a memory string such as ``'1234MiB'`` to an integer.

    Args:
        mem: text made of an integer immediately followed by ``MiB``.

    Returns:
        The integer in front of the ``MiB`` suffix.

    Raises:
        ValueError: if ``mem`` does not end in ``MiB`` or the part before the
            suffix is not an integer.
    """
    suffix = 'MiB'
    # Explicit check instead of `assert`, which is removed under `python -O`.
    if not mem.endswith(suffix):
        raise ValueError("expected a memory size ending in %r, got %r" % (suffix, mem))
    return int(mem[:-len(suffix)])
def sort_key(pid):
    """Ordering key for a PID: largest total GPU memory first, ties broken by
    ascending numeric PID."""
    total_gpu_mem = sum(map(parse_mem, pids[pid]['gpu_mem']))
    return (-total_gpu_mem, int(pid))
# Row template of the process table; the COMMAND column is both padded and
# truncated to `command_length` characters. (Named `row_format` so the
# builtin `format` is not shadowed.)
row_format = ("| %3s %7s %8s %8s %5s %5s %9s %-" + str(command_length) + "." + str(command_length) + "s |")
print(row_format % (
    "GPU", "PID", "USER", "GPU MEM", "%CPU", "%MEM", "TIME", "COMMAND"
))

# Continuation lines of a wrapped command are indented by 4 characters. When
# the column is too narrow for that, wrap without indentation so the chunk
# size never reaches zero (a zero step makes range() raise ValueError).
indent = 4 if command_length > 4 else 0
chunk_size = command_length - indent

for pid in sorted(pids, key=sort_key):
    info = pids[pid]
    command = info.get('command', '?')
    # Main row: first GPU of the process plus the ps data ('?' when ps had
    # nothing to say about this PID).
    print(row_format % (
        info['gpu_num'][0],
        pid,
        info.get('user', '?'),
        info['gpu_mem'][0],
        info.get('cpu', '?'),
        info.get('mem', '?'),
        info.get('time', '?'),
        command[:command_length],
    ))
    # Remainder of the command line, wrapped onto continuation rows.
    if chunk_size > 0:
        for start in range(command_length, len(command), chunk_size):
            command_part = command[start:start + chunk_size]
            print(row_format % ('', '', '', '', '', '', '', ' ' * indent + command_part))
    # One extra row per additional GPU used by the same process.
    for gpu_num, gpu_mem in zip(info['gpu_num'][1:], info['gpu_mem'][1:]):
        print(row_format % (gpu_num, '', '', gpu_mem, '', '', '', ''))

# The process table is expected to be the last thing in the nvidia-smi
# output. Checked explicitly rather than with `assert`, which is removed
# under `python -O`.
if not nvidia_smi_output or not nvidia_smi_output[-1].startswith('+--'):
    sys.exit("nvidia-htop: unexpected nvidia-smi output: "
             "the last line is not a table border")
print(processes_delimeter_line)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment