Skip to content

Instantly share code, notes, and snippets.

@ozancaglayan
Created November 29, 2017 11:16
Show Gist options
  • Star 3 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save ozancaglayan/40aaae8397edca78d9d473a3e1ef6e78 to your computer and use it in GitHub Desktop.
Save ozancaglayan/40aaae8397edca78d9d473a3e1ef6e78 to your computer and use it in GitHub Desktop.
NVIDIA GPU textfile exporter
#!/usr/bin/env python3
import time
import atexit
from collections import OrderedDict
from py3nvml import py3nvml as nv
from prometheus_client import Gauge, CollectorRegistry
from prometheus_client import write_to_textfile
class GPUCollector:
    """Publish NVIDIA GPU readings as Prometheus gauges in a textfile.

    On construction the collector asks NVML for the driver version, the
    number of GPUs and a handle for each of them, and registers one gauge
    per metric in a private ``CollectorRegistry``.  Every call to
    :meth:`update` refreshes all gauges and rewrites ``write_file``.

    ``nv.nvmlInit()`` must have been called before instantiating the class.

    Args:
        write_file: Path of the textfile the metrics are written to.
    """

    # Value published when the driver reports a reading as "not supported".
    _UNSUPPORTED = -1

    def __init__(self, write_file):
        self.labels = ['gpu', 'name', 'driver']
        self.driver = nv.nvmlSystemGetDriverVersion()
        self.registry = CollectorRegistry()
        self.write_file = write_file

        self.n_gpu = nv.nvmlDeviceGetCount()
        self.hnds = [nv.nvmlDeviceGetHandleByIndex(i)
                     for i in range(self.n_gpu)]

        # Label values are static per device, so build them only once.
        self.args = []
        for i, hnd in enumerate(self.hnds):
            args = OrderedDict()
            args['gpu'] = 'gpu%d' % i
            args['name'] = nv.nvmlDeviceGetName(hnd)
            args['driver'] = self.driver
            self.args.append(args)

        self.gpu_memused = self._gauge(
            'node_gpu_memory_bytes_used', 'Used GPU Memory')         # bytes
        self.gpu_memtotal = self._gauge(
            'node_gpu_memory_bytes_total', 'Total GPU Memory')       # bytes
        self.gpu_powertotal = self._gauge(
            'node_gpu_power_watts', 'GPU Power Utilization')         # watts
        self.gpu_temp = self._gauge(
            'node_gpu_temp_celsius', 'GPU Temperature')
        self.gpu_utilpct = self._gauge(
            'node_gpu_usage_ratio', 'GPU Usage Percentage')          # percent

    def _gauge(self, name, documentation):
        """Create a gauge with the common labels in this collector's registry."""
        return Gauge(name, documentation, self.labels,
                     registry=self.registry)

    def temperature(self, hnd):
        """Return the GPU temperature, or -1 if the device cannot report it."""
        try:
            return nv.nvmlDeviceGetTemperature(hnd, nv.NVML_TEMPERATURE_GPU)
        except nv.NVMLError_NotSupported:
            return self._UNSUPPORTED

    def usage_ratio(self, hnd):
        """Return the GPU utilization, or -1 if the device cannot report it."""
        try:
            return nv.nvmlDeviceGetUtilizationRates(hnd).gpu
        except nv.NVMLError_NotSupported:
            # Same sentinel as temperature(): one GPU lacking this reading
            # must not take down the whole exporter.
            return self._UNSUPPORTED

    def mem_info(self, hnd):
        """Return the NVML memory info object (``used`` / ``total`` fields)."""
        return nv.nvmlDeviceGetMemoryInfo(hnd)

    def power_usage(self, hnd):
        """Return the power draw in watts, or -1 if the device cannot report it."""
        try:
            milliwatts = nv.nvmlDeviceGetPowerUsage(hnd)
        except nv.NVMLError_NotSupported:
            # Many consumer boards have no power sensor exposed via NVML.
            return self._UNSUPPORTED
        # NVML reports milliwatts; the metric is published in watts.
        return milliwatts / 1000

    def update(self):
        """Refresh every gauge for every GPU and rewrite the textfile."""
        for hnd, args in zip(self.hnds, self.args):
            mem = self.mem_info(hnd)
            self.gpu_memused.labels(**args).set(mem.used)
            self.gpu_memtotal.labels(**args).set(mem.total)
            self.gpu_powertotal.labels(**args).set(self.power_usage(hnd))
            self.gpu_utilpct.labels(**args).set(self.usage_ratio(hnd))
            self.gpu_temp.labels(**args).set(self.temperature(hnd))

        # Write once, after all devices have been sampled.
        write_to_textfile(self.write_file, self.registry)
def _main():
    """Initialise NVML and refresh the GPU metrics file every 15 seconds."""
    nv.nvmlInit()
    # Release NVML when the interpreter exits, however the loop ends.
    atexit.register(nv.nvmlShutdown)

    collector = GPUCollector(
        '/var/lib/node_exporter/textfile_collector/gpu.prom')

    # Runs until the process is terminated.
    while True:
        collector.update()
        time.sleep(15)


if __name__ == "__main__":
    _main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment