## monitor GPU usage in jupyter notebook -- rhee.elten@gmail.com
## https://gist.github.com/rhee-elten/1a1070e3a812ca863c3b937b5180b2f8
# pylint: disable=invalid-name
# pylint: disable=using-constant-test
# pylint: disable=wrong-import-position
# pylint: disable=missing-class-docstring
# pylint: disable=missing-function-docstring
if 10:
    import os

    os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"  # make device indices match nvidia-smi (PCI bus) order
    os.environ["HDF5_USE_FILE_LOCKING"] = "FALSE"
if 10:
    import logging

    logging.basicConfig()
    logger = logging.getLogger("nv_gpu_stat")
    logger.setLevel(logging.DEBUG)

    # disable numpy warnings
    import warnings

    warnings.filterwarnings("ignore")

    import sys
    import io
    import os
    from os.path import expanduser, dirname, isfile
    from time import sleep, time
    from socket import gethostname
    from datetime import datetime
    from subprocess import run as subprocess_run

    import numpy as np
    from pandas import read_csv
    from h5py import File as h5file
    import matplotlib.pyplot as plt
def nv_gpu_stat():
    """
    example output:
    print(nv_gpu_stat().to_string(index=False))
     index            name  temperature.gpu  utilization.gpu [%]  power.draw [W]  memory.used [MiB]  memory.total [MiB]
         0  Tesla M40 24GB               17                    0           17.46                  0               24478
         1  Tesla M40 24GB               20                    0           18.23                  0               24478
         2  Tesla M40 24GB               19                    0           18.53                  0               24478
         3  Tesla M40 24GB               20                    0           18.83                  0               24478
    """
    query_gpu = "--query-gpu=index,name,temperature.gpu,utilization.gpu,power.draw,memory.used,memory.total"
    query_format = "--format=csv,nounits"
    try:
        nvidia_smi = "nvidia-smi"
        proc = subprocess_run(
            [nvidia_smi, query_gpu, query_format], capture_output=True, check=False
        )
    except FileNotFoundError:
        # fall back to the default Windows install location
        nvidia_smi = "C:/Program Files/NVIDIA Corporation/NVSMI/nvidia-smi"
        proc = subprocess_run(
            [nvidia_smi, query_gpu, query_format], capture_output=True, check=False
        )
    assert proc.returncode == 0, ("proc failure exitcode:", proc.returncode)
    return read_csv(io.StringIO(proc.stdout.decode("ascii")))
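## Usage sketch (not in the original gist): the returned DataFrame keeps the
## leading space that nvidia-smi's "csv,nounits" output puts after each comma,
## so columns are addressed as " memory.used [MiB]" etc., exactly as
## update_h5_file does below.
#   stat = nv_gpu_stat()
#   print(stat.columns.tolist())               # ['index', ' name', ' temperature.gpu', ...]
#   print(stat[" memory.used [MiB]"].values)   # per-GPU memory usage as a numpy array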
NV_GPU_STAT_COLLECT_FILE = expanduser("~/.nv_gpu_stat/collect.h5")
def nv_gpu_stat_collect(
    collect_file=None, collect_interval=0.5, collect_length=2048, verbose=0
):
    def check_h5_file(f):
        ## the collect file must already contain all five datasets
        if "timestamp" not in f:
            return False
        if "temperature_gpu" not in f:
            return False
        if "utilization_gpu" not in f:
            return False
        if "power_draw" not in f:
            return False
        if "memory_used" not in f:
            return False
        return True

    def initialize_h5_file(f, *, collect_length, num_gpus=None):
        """
        ## fill initial values into file if needed
        """
        # call nv_gpu_stat to get num_gpus
        num_gpus = num_gpus or nv_gpu_stat().values.shape[0]
        f.create_dataset("timestamp", (collect_length,), np.float64)
        f["timestamp"][:] = np.nan
        f.create_dataset("temperature_gpu", (collect_length, num_gpus), np.float64)
        f["temperature_gpu"][:] = np.nan
        f.create_dataset("utilization_gpu", (collect_length, num_gpus), np.float64)
        f["utilization_gpu"][:] = np.nan
        f.create_dataset("power_draw", (collect_length, num_gpus), np.float64)
        f["power_draw"][:] = np.nan
        f.create_dataset("memory_used", (collect_length, num_gpus), np.float64)
        f["memory_used"][:] = np.nan

    def update_h5_file(f, gpu_stat):
        ## read initial values
        timestamp = f["timestamp"][:]
        temperature_gpu = f["temperature_gpu"][:]
        utilization_gpu = f["utilization_gpu"][:]
        power_draw = f["power_draw"][:]
        memory_used = f["memory_used"][:]
        ## roll
        timestamp = np.roll(timestamp, -1, axis=0)
        temperature_gpu = np.roll(temperature_gpu, -1, axis=0)
        utilization_gpu = np.roll(utilization_gpu, -1, axis=0)
        power_draw = np.roll(power_draw, -1, axis=0)
        memory_used = np.roll(memory_used, -1, axis=0)
        ## put new measure at the last
        timestamp[-1] = t_now  # t_now comes from the enclosing while loop (closure)
        temperature_gpu[-1, :] = gpu_stat[" temperature.gpu"].values
        utilization_gpu[-1, :] = gpu_stat[" utilization.gpu [%]"].values
        power_draw[-1, :] = gpu_stat[" power.draw [W]"].values
        memory_used[-1, :] = gpu_stat[" memory.used [MiB]"].values
        ## write back to h5 file
        f["timestamp"][:] = timestamp
        f["temperature_gpu"][:] = temperature_gpu
        f["utilization_gpu"][:] = utilization_gpu
        f["power_draw"][:] = power_draw
        f["memory_used"][:] = memory_used

    collect_file = collect_file or NV_GPU_STAT_COLLECT_FILE
    os.makedirs(dirname(collect_file), exist_ok=True)
    logger.debug("stat_collect: [1]open(a): %s", collect_file)
    try:
        with h5file(collect_file, "a") as f:
            if not check_h5_file(f):
                initialize_h5_file(f, collect_length=collect_length)
            while True:
                t_now = time()
                if verbose:
                    logger.info(">>> nv_gpu_stat: time: %d", t_now)
                gpu_stat = nv_gpu_stat()
                update_h5_file(f, gpu_stat)
                t_next = t_now + collect_interval
                t_sleep = t_next - time()
                if t_sleep > 0.0:
                    sleep(t_sleep)
    except OSError:
        logger.exception("h5py File open failed")
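## Usage sketch (not in the original gist): nv_gpu_stat_collect() loops forever,
## so it is normally run in a background thread or separate process, as
## nv_gpu_stat_monitor() does below; the kwargs here are only illustrative.
#   from threading import Thread
#   collector = Thread(target=nv_gpu_stat_collect,
#                      kwargs=dict(collect_interval=1.0), daemon=True)
#   collector.start()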
def nv_gpu_stat_query(time_up_to=None, collect_file=None):
    time_up_to = time_up_to or time()
    collect_file = collect_file or NV_GPU_STAT_COLLECT_FILE
    if not isfile(collect_file):
        logger.debug("stat_query: file does not exist: %s", collect_file)
        return None
    try:
        logger.debug("stat_query: open(r): %s", collect_file)
        with h5file(collect_file, "r") as f:
            timestamp = f["timestamp"][:]
            temperature_gpu = f["temperature_gpu"][:]
            utilization_gpu = f["utilization_gpu"][:]
            power_draw = f["power_draw"][:]
            memory_used = f["memory_used"][:]
    except Exception:
        _, exc_value, _ = sys.exc_info()
        logger.exception("*** %s", exc_value)
        return None
    if time_up_to:
        # NaN timestamps (slots not yet filled by the collector) compare False and are dropped
        in_range = timestamp <= time_up_to
        timestamp = timestamp[in_range]
        temperature_gpu = temperature_gpu[in_range]
        utilization_gpu = utilization_gpu[in_range]
        power_draw = power_draw[in_range]
        memory_used = memory_used[in_range]
    return dict(
        timestamp=timestamp,
        temperature_gpu=temperature_gpu,
        utilization_gpu=utilization_gpu,
        power_draw=power_draw,
        memory_used=memory_used,
    )
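## Usage sketch (not in the original gist): query the rolling buffer and inspect
## the most recent sample; rows are ordered oldest-to-newest.
#   collect = nv_gpu_stat_query()
#   if collect is not None and len(collect["timestamp"]) > 0:
#       print("last sample at", datetime.fromtimestamp(collect["timestamp"][-1]))
#       print("utilization [%] per GPU:", collect["utilization_gpu"][-1])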
def nv_gpu_plot_values(
    time_series,
    values,
    t_now=None,
    ax=None,
    plot_seconds=450.0,
    stat_seconds=60.0,
    plot_config=None,
    min_ymax=10.0,
    plot_kws=None,
):
    plot_kws = plot_kws or dict()
    ax = ax or plt.gca()
    time_series = np.asarray(time_series)
    t_now = t_now or time_series[-1]
    values = np.asarray(values)
    plot_config = plot_config or dict(title="values", value_fmt="{:>5.1f}", ylim=None)
    title = plot_config["title"]
    value_fmt = plot_config["value_fmt"]
    if not callable(value_fmt):
        value_fmt = lambda x, _fmt=value_fmt: _fmt.format(x)
    ylim = plot_config["ylim"]
    if time_series[0] + stat_seconds > time_series[-1]:
        stat_seconds = time_series[-1] - time_series[0]
    stat_select = (t_now - stat_seconds <= time_series) & (time_series <= t_now)
    for i, ser in enumerate(np.transpose(values)):
        y_stat = ser[stat_select]
        # current value (the last sample)
        val_last = ser[-1]
        if len(y_stat) > 0:
            val_mean = np.nanmean(y_stat)
            val_max = np.nanmax(y_stat)
            # do plot
            label = "G{:d} {:s}, avg={:s}, max={:s}".format(
                i, value_fmt(val_last), value_fmt(val_mean), value_fmt(val_max)
            )
        else:
            # do plot
            label = "G{:d} {:s}".format(i, value_fmt(val_last))
        ax.plot(time_series - t_now, ser, label=label, **plot_kws)
    ax.set_title(title)
    ax.legend(loc="upper left", prop={"size": 8}, bbox_to_anchor=(1, 1))
    # auto-compute the ylim upper bound, with a 10% margin
    max_win = np.amax(values)
    max_win = max(min_ymax, max_win)
    ylim = ylim or [-max_win * 0.1, max_win * 1.1]
    ax.set_ylim(ylim)
    # dashed vertical line marking the stat window, clipped to ylim
    dyn_y_lim = ax.get_ylim()
    ax.vlines(-stat_seconds, *dyn_y_lim, ls="dashed", lw=1.0, color="k", alpha=0.5)
    ax.set_xlim([-plot_seconds, 0])
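## Usage sketch (not in the original gist): nv_gpu_plot_values expects
## time_series of shape (T,) in epoch seconds and values of shape (T, num_gpus);
## the synthetic data below is illustrative only.
#   t = time() - np.arange(120)[::-1]                     # 120 one-second samples
#   fake_util = np.random.uniform(0, 100, size=(120, 2))  # two fake GPUs
#   fig, ax = plt.subplots()
#   nv_gpu_plot_values(t, fake_util, ax=ax,
#                      plot_config=dict(title="utilization.gpu [%]",
#                                       value_fmt="{:>3.0f}%", ylim=None))
#   plt.show()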
def nv_gpu_stat_draw(
    t_now=None,
    collect_file=None,
    values_dict=None,
    plot_configs=None,
    plot_context=None,
    hostname=None,
    figsize=(5.4, 7.4),
    dpi=100,
    plot_seconds=450.0,
    **kwargs
):
    if plot_context is None:
        plot_context = dict()
    t_now = t_now or time()
    hostname = hostname or gethostname()
    fig, axs = plt.subplots(4, 1, figsize=figsize, dpi=dpi)
    axs = axs.flatten()
    plot_context["fig"] = fig
    if plot_configs is None:
        plot_configs = dict(
            temperature_gpu=dict(
                title="temperature.gpu", value_fmt="{:>3.0f}", ylim=None
            ),
            utilization_gpu=dict(
                title="utilization.gpu [%]", value_fmt="{:>3.0f}%", ylim=None
            ),
            power_draw=dict(title="power.draw [W]", value_fmt="{:>3.0f}", ylim=None),
            memory_used=dict(title="memory.used [GB]", value_fmt="{:>4.1f}", ylim=None),
        )
    plot_kws = dict(lw=1, alpha=0.55)
    if values_dict is None:
        values_dict = nv_gpu_stat_query(collect_file=collect_file)
    timestamp = np.asarray(values_dict["timestamp"])
    temperature_gpu = np.asarray(values_dict["temperature_gpu"])
    utilization_gpu = np.asarray(values_dict["utilization_gpu"])
    power_draw = np.asarray(values_dict["power_draw"])
    memory_used = np.asarray(values_dict["memory_used"])
    memory_used = memory_used / 1024  # MiB ==> GiB
    plot_select = (t_now - plot_seconds <= timestamp) & (timestamp <= t_now)
    if not np.any(plot_select):
        logger.warning("nv_gpu_stat_draw: no data")
        return
    timestamp = timestamp[plot_select]
    temperature_gpu = temperature_gpu[plot_select, :]
    utilization_gpu = utilization_gpu[plot_select, :]
    power_draw = power_draw[plot_select, :]
    memory_used = memory_used[plot_select, :]
    nv_gpu_plot_values(
        timestamp,
        utilization_gpu,
        ax=axs[0],
        t_now=t_now,
        plot_seconds=plot_seconds,
        plot_config=plot_configs["utilization_gpu"],
        plot_kws=plot_kws,
        **kwargs
    )
    nv_gpu_plot_values(
        timestamp,
        memory_used,
        ax=axs[1],
        t_now=t_now,
        plot_seconds=plot_seconds,
        plot_config=plot_configs["memory_used"],
        plot_kws=plot_kws,
        **kwargs
    )
    nv_gpu_plot_values(
        timestamp,
        power_draw,
        ax=axs[2],
        t_now=t_now,
        plot_seconds=plot_seconds,
        plot_config=plot_configs["power_draw"],
        plot_kws=plot_kws,
        **kwargs
    )
    nv_gpu_plot_values(
        timestamp,
        temperature_gpu,
        ax=axs[3],
        t_now=t_now,
        plot_seconds=plot_seconds,
        plot_config=plot_configs["temperature_gpu"],
        plot_kws=plot_kws,
        **kwargs
    )
    dtstr = datetime.fromtimestamp(t_now).strftime("%H:%M:%S")
    fig_title = "{:s}\n{:s}".format(hostname, dtstr)
    fig.suptitle(fig_title, fontsize=12)
    fig.tight_layout(rect=[0, 0.03, 1, 0.92])  # fig.tight_layout()
    plt.show()
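## Usage sketch (not in the original gist): draw a single snapshot from the
## collect file; the collector must already be running (see nv_gpu_stat_collect).
#   %matplotlib inline
#   nv_gpu_stat_draw()                    # uses NV_GPU_STAT_COLLECT_FILE
#   nv_gpu_stat_draw(plot_seconds=120.0)  # only the last two minutes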
def nv_gpu_stat_monitor(collect_file=None, interval=5.0, out=None, **kwargs):
    """
    usage:
    %matplotlib inline
    from nv_gpu_stat import nv_gpu_stat_monitor
    nv_gpu_stat_monitor()
    """
    from IPython.display import clear_output

    # from multiprocessing import Process
    from threading import Thread

    # child = Thread(target=(lambda: nv_gpu_stat_collect(verbose=1, collect_interval=2.0)))
    child = Thread(target=nv_gpu_stat_collect)
    child.start()
    try:
        with plt.ion():  # start interactive mode
            plot_context = None
            while True:
                t_now = time()
                collect = nv_gpu_stat_query(time_up_to=t_now, collect_file=collect_file)
                # if the collect_file does not exist yet, keep waiting
                if collect:
                    clear_output(wait=True)
                    nv_gpu_stat_draw(
                        t_now=t_now,
                        values_dict=collect,
                        plot_context=plot_context,
                        **kwargs
                    )
                t_next = t_now + interval
                t_sleep = t_next - time()
                if t_sleep > 0.0:
                    sleep(t_sleep)
    finally:
        child.join()
if __name__ == "__main__":
    try:
        get_ipython().run_line_magic("matplotlib", "inline")
    except Exception:
        _, ex_val, _ = sys.exc_info()
        print(ex_val, file=sys.stderr)
        import matplotlib

        matplotlib.use("agg")  # or Qt5Agg?
    try:
        nv_gpu_stat_monitor()
    finally:
        try:
            # clean up notebook checkpoints and caches (only works inside IPython)
            get_ipython().system("rm -fvr .??*.ipynb .ipynb_checkpoints __pycache__")
        except NameError:
            pass