Skip to content

Instantly share code, notes, and snippets.

@rolisz
Created November 21, 2018 13:07
Show Gist options
  • Star 5 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save rolisz/f1f5a33f2758e62c0090fcdeab4e3eba to your computer and use it in GitHub Desktop.
Script to send GPU monitoring data obtained with nvidia-smi to Stackdriver.
import os
import sys
import time
import traceback
from subprocess import Popen, PIPE
def compute_stats(samples=10, interval=1.0):
    """Sample per-GPU utilization via nvidia-smi and aggregate it.

    Polls ``nvidia-smi`` *samples* times, sleeping *interval* seconds after
    each poll, and collects GPU-core and GPU-memory utilization for every
    device reported.

    Args:
        samples: number of polls to take (default 10, the original count).
        interval: seconds to sleep after each poll (default 1, as before).

    Returns:
        Tuple of four lists indexed by GPU number:
        ``(max_gpu, avg_gpu, max_mem, avg_mem)`` — utilization percentages.
    """
    all_gpu = []  # one list of per-device GPU utilization per sample
    all_mem = []  # one list of per-device memory utilization per sample
    num_devices = 0
    for _ in range(samples):
        p = Popen(
            ["nvidia-smi",
             "--query-gpu=utilization.gpu,utilization.memory",
             "--format=csv,noheader,nounits"],
            stdout=PIPE,
        )
        stdout, _stderr = p.communicate()
        output = stdout.decode('UTF-8')
        # splitlines() handles both \n and \r\n; the original split on
        # os.linesep, which is the *platform* separator and need not match
        # nvidia-smi's output, and relied on a trailing-separator artifact
        # for the device count. Filter blanks instead of len(lines)-1.
        lines = [ln for ln in output.splitlines() if ln.strip()]
        num_devices = len(lines)
        gpu = []
        mem = []
        for line in lines:
            # Each line is "<gpu util>, <mem util>" (csv, nounits).
            vals = line.split(', ')
            gpu.append(float(vals[0]))
            mem.append(float(vals[1]))
        all_gpu.append(gpu)
        all_mem.append(mem)
        time.sleep(interval)
    max_gpu = [max(s[i] for s in all_gpu) for i in range(num_devices)]
    avg_gpu = [sum(s[i] for s in all_gpu) / len(all_gpu) for i in range(num_devices)]
    max_mem = [max(s[i] for s in all_mem) for i in range(num_devices)]
    avg_mem = [sum(s[i] for s in all_mem) / len(all_mem) for i in range(num_devices)]
    return max_gpu, avg_gpu, max_mem, avg_mem
# Stackdriver (Cloud Monitoring) client setup; the import lives here,
# mid-file, in the original gist.
from google.cloud import monitoring_v3
client = monitoring_v3.MetricServiceClient()
# GCP project that receives the custom metrics — replace with your own ID.
project = 'yourGCPProjectID'
project_name = client.project_path(project)
def write_time_series(name, gpu_nr, value):
    """Push one data point for a custom Stackdriver metric.

    The GPU index is folded into the ``instance_id`` resource label so
    that every GPU shows up as its own time series. Uses the module-level
    ``client``/``project_name`` and the instance name from ``sys.argv[1]``.
    """
    ts = monitoring_v3.types.TimeSeries()
    ts.metric.type = 'custom.googleapis.com/' + name
    ts.resource.type = 'gce_instance'
    ts.resource.labels['instance_id'] = sys.argv[1] + "_gpu_" + str(gpu_nr)
    ts.resource.labels['zone'] = 'us-central1-f'

    pt = ts.points.add()
    pt.value.double_value = value

    # Split the wall-clock timestamp into whole seconds plus nanoseconds.
    stamp = time.time()
    secs = int(stamp)
    pt.interval.end_time.seconds = secs
    pt.interval.end_time.nanos = int((stamp - secs) * 10**9)

    client.create_time_series(project_name, [ts])
# Script entry: argv[1] is the instance name used to label the metrics.
if len(sys.argv) < 2:
    print("You need to pass the instance name as first argument")
    sys.exit(1)
try:
    max_gpu, avg_gpu, max_mem, avg_mem = compute_stats()
    # One time series per metric per GPU.
    for i in range(len(max_gpu)):
        write_time_series('max_gpu_utilization', i, max_gpu[i])
        write_time_series('max_gpu_memory', i, max_mem[i])
        write_time_series('avg_gpu_utilization', i, avg_gpu[i])
        write_time_series('avg_gpu_memory', i, avg_mem[i])
except Exception:
    # Top-level boundary: report the full traceback on stderr instead of
    # printing just the exception message to stdout (the original
    # `print(e)` discarded the traceback, making failures hard to debug).
    traceback.print_exc()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment