Last active
March 8, 2023 14:27
-
-
Save chenzhaiyu/c5b7a60fea5d61fa86582f15782616b8 to your computer and use it in GitHub Desktop.
Inspect GPU usage by docker container and stop greedy containers if needed
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os
import re
import subprocess
# Number of GPUs expected on the host. The fair-use policy only triggers
# when every one of these GPUs has at least one compute process on it.
GPU_COUNT = 8

# Placeholder name for GPU processes that cannot be mapped to a container.
UNKNOWN_CONTAINER = "unknown"


def run_command(argv):
    """Run *argv* without a shell and return its standard output as text.

    Returns an empty string when the executable cannot be started, so that
    callers can treat "tool not available" the same as "no output".
    """
    try:
        completed = subprocess.run(
            argv, stdout=subprocess.PIPE, universal_newlines=True
        )
    except OSError:
        return ""
    return completed.stdout


def parse_process_line(line):
    """Parse one row of the ``nvidia-smi`` process table.

    The expected layout is
    ``| <gpu> <gi> <ci> <pid> <type> <process name ...> <memory> |``.

    Returns:
        ``(gpu_index, pid, command, memory)`` as strings for a compute
        process (type ``C`` or ``C+G``), or ``None`` for any other line
        (headers, separators, graphics-only processes, empty lines).
    """
    tokens = line.split()
    if len(tokens) < 9:
        return None
    index, pid, kind, memory = tokens[1], tokens[4], tokens[5], tokens[-2]
    if not (index.isdigit() and pid.isdigit()):
        return None
    if "C" not in kind.split("+"):
        return None
    # The process name may itself contain spaces; keep everything between
    # the type column and the memory column.
    command = " ".join(tokens[6:-2])
    return index, pid, command, memory


def extract_container_pid(pstree_output):
    """Return the id of the process directly below ``containerd-shim``.

    *pstree_output* is the text printed by ``pstree -sg <pid>``; ancestors
    are joined by ``---`` and each entry carries a number in parentheses.
    The entry right after the ``containerd-shim`` ancestor is taken as the
    container's main process, wherever the shim sits in the chain.

    NOTE(review): ``-g`` makes pstree print process *group* ids; this
    presumably equals the PID for a container's main process -- confirm.

    Returns:
        The number as a string, or ``None`` when the process has no
        ``containerd-shim`` ancestor (i.e. it is not in a container).
    """
    for line in pstree_output.splitlines():
        entries = line.strip().split("---")
        for position, entry in enumerate(entries[:-1]):
            if entry.startswith("containerd-shim"):
                match = re.search(r"\((\d+)\)", entries[position + 1])
                return match.group(1) if match else None
    return None


def parse_container_table(inspect_output):
    """Turn ``"<pid> <name>"`` lines into a ``{pid: name}`` dictionary.

    Lookups in the returned mapping are exact, so PID ``123`` can never be
    confused with PID ``1234``.
    """
    containers = {}
    for line in inspect_output.splitlines():
        fields = line.split()
        if len(fields) >= 2:
            containers[fields[0]] = fields[1]
    return containers


def list_running_containers():
    """Return ``{main_pid: name}`` for all running docker containers."""
    container_ids = run_command(["docker", "ps", "-q"]).split()
    if not container_ids:
        return {}
    inspect_output = run_command(
        ["docker", "inspect", "--format", "{{.State.Pid}} {{.Name}}"]
        + container_ids
    )
    return parse_container_table(inspect_output)


def collect_occupancy(gpu_count=GPU_COUNT):
    """Build the occupancy dict: GPU index -> list of ``(name, details)``.

    Every GPU index in ``range(gpu_count)`` is present even when idle.
    Processes that cannot be attributed to a container are recorded under
    ``UNKNOWN_CONTAINER``.
    """
    occupancy = {str(i): [] for i in range(gpu_count)}
    # The container table does not depend on the process, so query it once.
    containers = list_running_containers()
    for line in run_command(["nvidia-smi"]).splitlines():
        parsed = parse_process_line(line)
        if parsed is None:
            continue
        index, pid, command, memory = parsed
        # find container PID behind child PID
        pid_container = extract_container_pid(
            run_command(["pstree", "-sg", pid])
        )
        # identify container name behind container PID
        name_container = containers.get(pid_container, UNKNOWN_CONTAINER)
        details = {
            "name": name_container,
            "pid": pid_container,
            "memory": memory,
            "command": command,
        }
        # setdefault tolerates a GPU index beyond the configured count
        occupancy.setdefault(index, []).append((name_container, details))
    return occupancy


def all_gpus_busy(occupancy):
    """Return True when every GPU in *occupancy* has at least one process."""
    return bool(occupancy) and all(occupancy.values())


def find_greedy_containers(occupancy):
    """Return the container name(s) occupying the largest number of GPUs.

    Several processes of one container on the same GPU count once.
    Processes without a known container are ignored, because there is no
    container to stop for them. Returns an empty list when no known
    container uses a GPU.
    """
    gpus_by_container = {}
    for index, entries in occupancy.items():
        for name, _details in entries:
            if name == UNKNOWN_CONTAINER:
                continue
            gpus_by_container.setdefault(name, set()).add(index)
    if not gpus_by_container:
        return []
    max_count = max(len(gpus) for gpus in gpus_by_container.values())
    return [
        name
        for name, gpus in gpus_by_container.items()
        if len(gpus) == max_count
    ]


def main():
    """Report GPU usage per container and stop the greediest if needed."""
    print("Collecting GPU usage info...")
    occupancy = collect_occupancy()

    # print out status
    for index, entries in occupancy.items():
        for _name, details in entries:
            print(f"GPU {index} (VRAM: {details['memory']}) in use by container {details['name']}")

    # trigger for "docker stop": all GPUs are in use
    if not all_gpus_busy(occupancy):
        print("GPU usage OK")
        return

    print("Triggering GPU fair-use policy...")
    containers_to_stop = find_greedy_containers(occupancy)
    print(f"Containers to stop: {containers_to_stop}")
    for name in containers_to_stop:
        # Runs to completion before moving on to the next container.
        run_command(["docker", "stop", name])


if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment