-
-
Save rhee-elten/1a1070e3a812ca863c3b937b5180b2f8 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
## Monitor GPU usage in a Jupyter notebook -- rhee.elten@gmail.com
## https://gist.github.com/rhee-elten/1a1070e3a812ca863c3b937b5180b2f8
# pylint: disable=invalid-name | |
# pylint: disable=using-constant-test | |
# pylint: disable=wrong-import-position | |
# pylint: disable=missing-class-docstring | |
# pylint: disable=missing-function-docstring | |
# Import-time setup: environment switches, logging and warning filters.
import os

# Both variables are written unconditionally, overriding any inherited value.
os.environ.update(
    {
        "CUDA_DEVICE_ORDER": "PCI_BUS_ID",  # make sure
        "HDF5_USE_FILE_LOCKING": "FALSE",
    }
)

import logging

# Root logging gets a default handler; this module's logger emits DEBUG and up.
logging.basicConfig()
logger = logging.getLogger("nv_gpu_stat")
logger.setLevel(logging.DEBUG)

import warnings

# disable numpy warnings
# NOTE: the filter has no category/module restriction, so it silences every
# warning in the process, not only numpy's.
warnings.filterwarnings("ignore")
import sys | |
import io | |
import os | |
from os.path import expanduser, dirname, isfile | |
from time import sleep, time | |
from socket import gethostname | |
from datetime import datetime | |
from subprocess import run as subprocess_run | |
import numpy as np | |
from pandas import read_csv | |
from h5py import File as h5file | |
import matplotlib.pyplot as plt | |
def nv_gpu_stat():
    """
    Run ``nvidia-smi`` once and return its CSV report as a pandas DataFrame.

    The executable is looked up as ``nvidia-smi`` first; if that cannot be
    started (``FileNotFoundError``) a fixed Windows install path is tried.

    Returns:
        The DataFrame produced by ``pandas.read_csv`` on the command output.
        Column names are taken verbatim from the CSV header; other functions
        in this file look them up with a leading space
        (e.g. ``" temperature.gpu"``).

    Raises:
        FileNotFoundError: neither executable location could be started.
        AssertionError: the command exited with a non-zero status.

    example output:

    print(nv_gpu_stat().to_string(index=False))

    index            name   temperature.gpu   utilization.gpu [%]   power.draw [W]   memory.used [MiB]   memory.total [MiB]
    0   Tesla M40 24GB                17                     0            17.46                   0                24478
    1   Tesla M40 24GB                20                     0            18.23                   0                24478
    2   Tesla M40 24GB                19                     0            18.53                   0                24478
    3   Tesla M40 24GB                20                     0            18.83                   0                24478
    """
    query_args = [
        "--query-gpu=index,name,temperature.gpu,utilization.gpu,power.draw,memory.used,memory.total",
        "--format=csv,nounits",
    ]

    try:
        proc = subprocess_run(
            ["nvidia-smi", *query_args], capture_output=True, check=False
        )
    except FileNotFoundError:
        # Not on PATH: fall back to the fixed Windows location.
        proc = subprocess_run(
            ["C:/Program Files/NVIDIA Corporation/NVSMI/nvidia-smi", *query_args],
            capture_output=True,
            check=False,
        )

    # Raised explicitly (instead of an ``assert`` statement) so the check is
    # not stripped under ``python -O``; the exception type is unchanged.
    if proc.returncode != 0:
        raise AssertionError(
            (
                "proc failure exitcode:",
                proc.returncode,
                proc.stderr.decode("utf-8", errors="replace"),
            )
        )

    # UTF-8 is a superset of ASCII; undecodable bytes are replaced rather
    # than aborting the whole reading.
    return read_csv(io.StringIO(proc.stdout.decode("utf-8", errors="replace")))
# Default location of the rolling HDF5 sample buffer.
NV_GPU_STAT_COLLECT_FILE = expanduser("~/.nv_gpu_stat/collect.h5")


def nv_gpu_stat_collect(
    collect_file=None, collect_interval=0.5, collect_length=2048, verbose=0
):
    """
    Sample ``nv_gpu_stat()`` forever and keep the newest rows in an HDF5 file.

    The file holds a ``timestamp`` dataset of shape ``(collect_length,)`` and
    four per-GPU datasets of shape ``(collect_length, num_gpus)``. Each new
    sample shifts every dataset by one row and is stored in the last row, so
    the oldest row is dropped. Unfilled rows are NaN.

    Args:
        collect_file: path of the HDF5 file; defaults to
            ``NV_GPU_STAT_COLLECT_FILE``.
        collect_interval: target seconds between the start of two samples.
        collect_length: number of rows allocated when the file is initialized
            (an already initialized file keeps its existing shape).
        verbose: when truthy, log each sampling time at INFO level.

    Returns:
        None. The loop only ends through an exception. ``OSError`` is logged
        and swallowed; any other exception propagates.
    """
    # HDF5 dataset name -> column name in the nv_gpu_stat() DataFrame.
    dataset_columns = {
        "temperature_gpu": " temperature.gpu",
        "utilization_gpu": " utilization.gpu [%]",
        "power_draw": " power.draw [W]",
        "memory_used": " memory.used [MiB]",
    }

    def check_h5_file(f):
        """Return True when every expected dataset name is present in *f*."""
        return "timestamp" in f and all(name in f for name in dataset_columns)

    def initialize_h5_file(f, *, collect_length, num_gpus=None):
        """
        ## fill initial values into file if needed
        """
        # call nv_gpu_stat to get num_gpus
        num_gpus = num_gpus or nv_gpu_stat().values.shape[0]
        f.create_dataset("timestamp", (collect_length,), np.float64)
        f["timestamp"][:] = np.nan
        for name in dataset_columns:
            f.create_dataset(name, (collect_length, num_gpus), np.float64)
            f[name][:] = np.nan

    def update_h5_file(f, gpu_stat, t_sample):
        """Shift every dataset up by one row and store the new sample last."""
        ## gather the new row for every dataset first, so a missing column
        ## fails before anything is written
        newest = {"timestamp": t_sample}
        for name, column in dataset_columns.items():
            newest[name] = gpu_stat[column].values

        ## read current contents, roll, put new measure at the last
        rolled = {}
        for name, value in newest.items():
            history = np.roll(f[name][:], -1, axis=0)
            history[-1] = value
            rolled[name] = history

        ## write back to h5 file
        for name, history in rolled.items():
            f[name][:] = history
        # Push buffered data to disk so that a reader opening the file while
        # it is still held open here is more likely to see this sample.
        f.flush()

    collect_file = collect_file or NV_GPU_STAT_COLLECT_FILE

    # dirname() is "" for a bare file name and os.makedirs("") raises,
    # so only create the directory when there is one.
    collect_dir = dirname(collect_file)
    if collect_dir:
        os.makedirs(collect_dir, exist_ok=True)

    logger.debug("stat_collect: [1]open(a): %s", collect_file)

    try:
        with h5file(collect_file, "a") as f:
            if not check_h5_file(f):
                initialize_h5_file(f, collect_length=collect_length)

            while True:
                t_now = time()
                if verbose:
                    logger.info(">>> nv_gpu_stat: time: %d", t_now)

                gpu_stat = nv_gpu_stat()
                update_h5_file(f, gpu_stat, t_now)

                # Sleep only for what is left of the interval after sampling.
                t_sleep = (t_now + collect_interval) - time()
                if t_sleep > 0.0:
                    sleep(t_sleep)
    except OSError:
        # NOTE: this also catches OSError subclasses raised inside the loop
        # (e.g. FileNotFoundError from nv_gpu_stat), not only open failures.
        logger.exception("h5py File open failed")
def nv_gpu_stat_query(time_up_to=None, collect_file=None):
    """
    Read the collected samples from the HDF5 file.

    Args:
        time_up_to: keep only rows whose timestamp is ``<= time_up_to``;
            a falsy value means "now" (``time()``).
        collect_file: path of the HDF5 file; defaults to
            ``NV_GPU_STAT_COLLECT_FILE``.

    Returns:
        A dict with keys ``timestamp``, ``temperature_gpu``,
        ``utilization_gpu``, ``power_draw`` and ``memory_used`` holding the
        selected rows as numpy arrays, or ``None`` when the file does not
        exist or cannot be read. Rows with a NaN timestamp never satisfy the
        comparison and are therefore dropped.
    """
    time_up_to = time_up_to or time()
    collect_file = collect_file or NV_GPU_STAT_COLLECT_FILE

    if not isfile(collect_file):
        logger.debug("stat_query: file not exist: %s", collect_file)
        return None

    dataset_names = (
        "timestamp",
        "temperature_gpu",
        "utilization_gpu",
        "power_draw",
        "memory_used",
    )

    try:
        logger.debug("stat_query: open(r): %s", collect_file)
        with h5file(collect_file, "r") as f:
            series = {name: f[name][:] for name in dataset_names}
    except Exception as exc_value:
        # Best effort: any read problem is logged and reported as "no data".
        # Only Exception is caught, so KeyboardInterrupt / SystemExit still
        # reach the caller and can stop a polling loop.
        logger.exception("*** %s", exc_value)
        return None

    # time_up_to is always set at this point, so the filter always applies.
    in_range = series["timestamp"] <= time_up_to
    return {name: data[in_range] for name, data in series.items()}
def nv_gpu_plot_values(
    time_series,
    values,
    t_now=None,
    ax=None,
    plot_seconds=450.0,
    stat_seconds=60.0,
    plot_config=None,
    min_ymax=10.0,
    plot_kws=None,
):
    """
    Plot one line per column of *values* against time relative to *t_now*.

    Args:
        time_series: sample times, one per row of *values*; must be non-empty.
        values: array of shape ``(len(time_series), n_series)``; a 1-D array
            is treated as a single series.
        t_now: reference time mapped to x = 0; a falsy value means the last
            entry of *time_series*.
        ax: axes-like object to draw on; falls back to ``plt.gca()``.
        plot_seconds: the x axis spans ``[-plot_seconds, 0]``.
        stat_seconds: length of the trailing window used for the avg/max shown
            in the legend; shortened to the span of *time_series* if longer.
        plot_config: dict with optional keys ``title``, ``value_fmt`` (format
            string or callable returning a string) and ``ylim``; missing keys
            use the defaults ``"values"``, ``"{:>5.1f}"`` and ``None``.
        min_ymax: lower bound for the automatically chosen y-axis maximum.
        plot_kws: extra keyword arguments forwarded to ``ax.plot``.

    Returns:
        None; the function only draws on *ax*.
    """
    plot_kws = plot_kws or {}
    ax = ax or plt.gca()

    time_series = np.asarray(time_series)
    values = np.asarray(values)
    if values.ndim == 1:
        # A single series: make it one column so the per-series loop works.
        values = values[:, np.newaxis]
    t_now = t_now or time_series[-1]

    # Merge over the defaults so that a partial config does not raise KeyError.
    config = {"title": "values", "value_fmt": "{:>5.1f}", "ylim": None}
    config.update(plot_config or {})
    title = config["title"]
    ylim = config["ylim"]
    value_fmt = config["value_fmt"]
    format_value = value_fmt if callable(value_fmt) else value_fmt.format

    # The statistics window cannot be longer than the data actually present.
    if time_series[0] + stat_seconds > time_series[-1]:
        stat_seconds = time_series[-1] - time_series[0]
    stat_select = (t_now - stat_seconds <= time_series) & (time_series <= t_now)

    for i, ser in enumerate(np.transpose(values)):
        y_stat = ser[stat_select]
        # current value (the last sample)
        val_last = ser[-1]
        if len(y_stat) > 0:
            val_mean = np.nanmean(y_stat)
            val_max = np.nanmax(y_stat)
            label = "G{:d} {:s}, avg={:s}, max={:s}".format(
                i, format_value(val_last), format_value(val_mean), format_value(val_max)
            )
        else:
            label = "G{:d} {:s}".format(i, format_value(val_last))
        ax.plot(time_series - t_now, ser, label=label, **plot_kws)

    ax.set_title(title)
    ax.legend(loc="upper left", prop={"size": 8}, bbox_to_anchor=(1, 1))

    # Automatic y range with a 10% margin below zero and above the peak.
    # nanmax ignores NaN samples; with amax a single NaN made the peak NaN,
    # which max() then silently replaced by min_ymax.
    peak = np.nanmax(values)
    if not np.isfinite(peak):
        peak = min_ymax
    max_win = max(min_ymax, peak)
    ylim = ylim or [-max_win * 0.1, max_win * 1.1]
    ax.set_ylim(ylim)

    # dashed vertical line, spanning the y range, at the start of the
    # statistics window
    dyn_y_lim = ax.get_ylim()
    ax.vlines(-stat_seconds, *dyn_y_lim, ls="dashed", lw=1.0, color="k", alpha=0.5)

    ax.set_xlim([-plot_seconds, 0])
def nv_gpu_stat_draw(
    t_now=None,
    collect_file=None,
    values_dict=None,
    plot_configs=None,
    plot_context=None,
    hostname=None,
    figsize=(5.4, 7.4),
    dpi=100,
    plot_seconds=450.0,
    **kwargs
):
    """
    Draw a four-panel figure (utilization, memory, power, temperature).

    Args:
        t_now: reference time for the x axis and the title; a falsy value
            means ``time()``.
        collect_file: passed to ``nv_gpu_stat_query`` when *values_dict* is
            not given.
        values_dict: dict as returned by ``nv_gpu_stat_query``; queried from
            the collect file when ``None``.
        plot_configs: per-panel config dicts keyed by ``temperature_gpu``,
            ``utilization_gpu``, ``power_draw`` and ``memory_used``.
        plot_context: optional dict; the created figure is stored under the
            key ``"fig"`` when a figure is drawn.
        hostname: first line of the figure title; defaults to
            ``gethostname()``.
        figsize, dpi: forwarded to ``plt.subplots``.
        plot_seconds: only samples within the last *plot_seconds* before
            *t_now* are drawn.
        **kwargs: forwarded to every ``nv_gpu_plot_values`` call.

    Returns:
        None. When there is nothing to draw a warning is logged and no figure
        is created.
    """
    if plot_context is None:
        plot_context = {}

    t_now = t_now or time()
    hostname = hostname or gethostname()

    if plot_configs is None:
        plot_configs = dict(
            temperature_gpu=dict(
                title="temperature.gpu", value_fmt="{:>3.0f}", ylim=None
            ),
            utilization_gpu=dict(
                title="utilization.gpu [%]", value_fmt="{:>3.0f}%", ylim=None
            ),
            power_draw=dict(title="power.draw [W]", value_fmt="{:>3.0f}", ylim=None),
            memory_used=dict(title="memory.used [GB]", value_fmt="{:>4.1f}", ylim=None),
        )

    plot_kws = dict(lw=1, alpha=0.55)

    if values_dict is None:
        values_dict = nv_gpu_stat_query(collect_file=collect_file)
    if values_dict is None:
        # The query returns None when the file is missing or unreadable.
        logger.warning("nv_gpu_stat_draw: no data")
        return

    timestamp = np.asarray(values_dict["timestamp"])
    plot_select = (t_now - plot_seconds <= timestamp) & (timestamp <= t_now)

    # Check for data before creating the figure, so that the early return
    # does not leave an empty figure open.
    if not np.any(plot_select):
        logger.warning("nv_gpu_stat_draw: no data")
        return

    timestamp = timestamp[plot_select]

    # Panels from top to bottom.
    panel_order = ("utilization_gpu", "memory_used", "power_draw", "temperature_gpu")
    series = {
        name: np.asarray(values_dict[name])[plot_select, :] for name in panel_order
    }
    series["memory_used"] = series["memory_used"] / 1024  # MiB ==> GB

    fig, axs = plt.subplots(4, 1, figsize=figsize, dpi=dpi)
    plot_context["fig"] = fig

    for panel_ax, name in zip(axs.flatten(), panel_order):
        nv_gpu_plot_values(
            timestamp,
            series[name],
            ax=panel_ax,
            t_now=t_now,
            plot_seconds=plot_seconds,
            plot_config=plot_configs[name],
            plot_kws=plot_kws,
            **kwargs
        )

    dtstr = datetime.fromtimestamp(t_now).strftime("%H:%M:%S")
    fig_title = "{:s}\n{:s}".format(hostname, dtstr)
    fig.suptitle(fig_title, fontsize=12)
    # Leave room at the top for the two-line suptitle.
    fig.tight_layout(rect=[0, 0.03, 1, 0.92])

    plt.show()
def nv_gpu_stat_monitor(collect_file=None, interval=5.0, out=None, **kwargs):
    """
    Start a background collector and redraw the GPU charts every *interval*
    seconds until interrupted.

    usage:

    %matplotlib inline
    from nv_gpu_stat import nv_gpu_stat_monitor
    nv_gpu_stat_monitor()

    Args:
        collect_file: HDF5 file used by both the collector thread and the
            query; ``None`` selects the module default.
        interval: target seconds between two redraws.
        out: accepted for compatibility; not used by this function.
        **kwargs: forwarded to ``nv_gpu_stat_draw``.

    Returns:
        None. The loop runs until an exception (e.g. KeyboardInterrupt).
    """
    from IPython.display import clear_output
    from threading import Thread

    # The collector must write the same file the loop below reads.
    # It is a daemon thread because its loop never ends on its own; a
    # non-daemon thread would keep the interpreter from exiting.
    child = Thread(
        target=nv_gpu_stat_collect,
        kwargs={"collect_file": collect_file},
        daemon=True,
    )
    child.start()

    try:
        with plt.ion():  # start interactive mode
            plot_context = {}
            while True:
                t_now = time()

                collect = nv_gpu_stat_query(time_up_to=t_now, collect_file=collect_file)

                # nothing is drawn until the collect file can be read
                if collect:
                    clear_output(wait=True)
                    # Close the figure of the previous round so figures do not
                    # pile up on backends that do not close them on show().
                    previous_fig = plot_context.pop("fig", None)
                    if previous_fig is not None:
                        plt.close(previous_fig)
                    nv_gpu_stat_draw(
                        t_now=t_now,
                        values_dict=collect,
                        plot_context=plot_context,
                        **kwargs
                    )

                # Sleep only for what is left of the interval after drawing.
                t_sleep = (t_now + interval) - time()
                if t_sleep > 0.0:
                    sleep(t_sleep)
    finally:
        # An unbounded join would hang here: the collector only stops on an
        # error. Wait briefly in case it has already finished.
        child.join(timeout=1.0)
if __name__ == "__main__":
    # Prefer inline plotting when running under IPython; otherwise fall back
    # to the non-interactive "agg" backend.
    try:
        get_ipython().run_line_magic("matplotlib", "inline")
    except Exception as ex_val:  # e.g. NameError when not running in IPython
        print(ex_val, file=sys.stderr)
        import matplotlib

        matplotlib.use("agg")  # or Qt5Agg?

    try:
        nv_gpu_stat_monitor()
    finally:
        # Cleanup of notebook leftovers in the current directory. It needs
        # the IPython shell; outside IPython it is skipped, so a NameError
        # here cannot mask the exception that ended the monitor.
        try:
            ipython_shell = get_ipython()
        except NameError:
            ipython_shell = None
        if ipython_shell is not None:
            ipython_shell.system("rm -fvr .??*.ipynb .ipynb_checkpoints __pycache__")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment