Skip to content

Instantly share code, notes, and snippets.

@chenzhaiyu
Last active March 8, 2023 14:27
Show Gist options
  • Save chenzhaiyu/c5b7a60fea5d61fa86582f15782616b8 to your computer and use it in GitHub Desktop.
Save chenzhaiyu/c5b7a60fea5d61fa86582f15782616b8 to your computer and use it in GitHub Desktop.
Inspect GPU usage by docker container and stop greedy containers if needed
import os
import re
# GPU count assumed when `nvidia-smi -L` lists no GPUs (the value this script
# was originally hard-coded for).
DEFAULT_GPU_COUNT = 8

# Position, within the "---"-separated `pstree -sg` ancestry line, of the
# entry whose parenthesised ID is used as the container PID.
# NOTE(review): index 3 is inherited from the original script and presumably
# matches this host's process hierarchy -- confirm before reusing elsewhere.
CONTAINER_PSTREE_INDEX = 3

# Placeholder name for GPU processes that cannot be mapped to a container.
UNKNOWN_CONTAINER = "unknown"

# Prints one "<State.Pid> <Name>" line per running container.
DOCKER_INSPECT_COMMAND = (
    "docker ps -q | xargs docker inspect --format '{{.State.Pid}} {{.Name}}'"
)


def run_command(command):
    """Run *command* through the shell, wait for it, and return stripped stdout."""
    # The context manager closes the pipe and reaps the child process.
    with os.popen(command) as pipe:
        return pipe.read().strip()


def count_gpus(listing, default=DEFAULT_GPU_COUNT):
    """Count GPUs in `nvidia-smi -L` output; return *default* if none are listed."""
    # Only unindented "GPU <n>: ..." lines are counted; any other line
    # (e.g. indented sub-device entries) is ignored.
    found = sum(1 for line in listing.splitlines() if line.startswith("GPU "))
    return found or default


def parse_process_line(line):
    """Parse one row of the nvidia-smi process table.

    Returns a ``(gpu_index, pid, command, memory)`` tuple of strings, or
    ``None`` when the line does not look like a process row (empty line,
    unexpected column layout, non-numeric GPU index or PID).
    """
    tokens = line.split()
    # Expected layout: | GPU GI CI PID Type <process name...> <memory> |
    if len(tokens) < 9 or tokens[0] != "|" or tokens[-1] != "|":
        return None
    index, pid, memory = tokens[1], tokens[4], tokens[-2]
    if not (index.isdigit() and pid.isdigit()):
        return None
    # The process name may itself contain spaces, so take everything between
    # the type column and the memory column.
    command = " ".join(tokens[6:-2])
    return index, pid, command, memory


def extract_container_pid(pstree_line):
    """Return the container PID found in a `pstree -sg` line, or ``None``."""
    segments = pstree_line.split("---")
    if len(segments) <= CONTAINER_PSTREE_INDEX:
        # Process is not running under containerd-shim (or the tree is shallower
        # than expected); the caller treats it as an unknown container.
        return None
    segment = segments[CONTAINER_PSTREE_INDEX]
    start = segment.find("(")
    end = segment.find(")", start + 1)
    if start == -1 or end == -1:
        return None
    candidate = segment[start + 1:end]
    return candidate if candidate.isdigit() else None


def lookup_container_name(inspect_output, pid_container):
    """Return the container name whose PID equals *pid_container* exactly.

    *inspect_output* holds one "<pid> <name>" line per container. Falls back
    to ``UNKNOWN_CONTAINER`` when no line matches.
    """
    for line in inspect_output.splitlines():
        fields = line.split()
        # Exact comparison: a prefix match would confuse PID 123 with 1234.
        if len(fields) >= 2 and fields[0] == pid_container:
            return fields[1]
    return UNKNOWN_CONTAINER


def find_greedy_containers(occupancy):
    """Return the container name(s) present on the largest number of GPUs.

    *occupancy* maps a GPU index to a list of dicts carrying a ``"name"`` key.
    Several processes of one container on the same GPU count once. Returns an
    empty list when no process is recorded.
    """
    gpus_by_container = {}
    for gpu_index, entries in occupancy.items():
        for entry in entries:
            gpus_by_container.setdefault(entry["name"], set()).add(gpu_index)
    if not gpus_by_container:
        return []
    most = max(len(gpus) for gpus in gpus_by_container.values())
    return [name for name, gpus in gpus_by_container.items() if len(gpus) == most]


if __name__ == '__main__':
    # get GPUs status
    process_table = run_command("nvidia-smi | grep -w 'C'")

    # occupancy dict by GPU index
    gpu_count = count_gpus(run_command("nvidia-smi -L"))
    occupancy = {str(i): [] for i in range(gpu_count)}

    print("Collecting GPU usage info...")
    container_table = None  # fetched once, and only if a lookup is needed
    for row in process_table.splitlines():
        parsed = parse_process_line(row)
        if parsed is None:
            if row.strip():
                print(f"Skipping unrecognised nvidia-smi line: {row!r}")
            continue
        index, pid, command, memory = parsed

        # find container PID behind child PID
        ancestry = run_command(f"pstree -sg {pid} | grep -w 'containerd-shim'")
        pid_container = extract_container_pid(ancestry)

        # identify container name behind container PID
        if pid_container is None:
            name_container = UNKNOWN_CONTAINER
        else:
            if container_table is None:
                container_table = run_command(DOCKER_INSPECT_COMMAND)
            name_container = lookup_container_name(container_table, pid_container)

        # update occupancy dict (setdefault tolerates an unexpected GPU index)
        occupancy.setdefault(index, []).append({
            "name": name_container,
            "pid": pid_container,
            "memory": memory,
            "command": command,
        })

    # print out status
    for gpu_index, entries in occupancy.items():
        for entry in entries:
            print(f"GPU {gpu_index} (VRAM: {entry['memory']}) in use by container {entry['name']}")

    # trigger for "docker stop": all GPUs are in use
    trigger = bool(occupancy) and all(occupancy.values())
    if trigger:
        print("Triggering GPU fair-use policy...")
        # find out the container using the most resources
        containers_to_stop = find_greedy_containers(occupancy)
        print(f"Containers to stop: {containers_to_stop}")
        for name in containers_to_stop:
            if name == UNKNOWN_CONTAINER:
                # The placeholder is not a real container; nothing to stop.
                print("Skipping GPU processes that could not be mapped to a container")
                continue
            # run_command waits for `docker stop` to finish and closes the pipe
            run_command(f"docker stop {name}")
    else:
        print("GPU usage OK")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment