Skip to content

Instantly share code, notes, and snippets.

@remyd1
Created September 19, 2018 14:35
Show Gist options
  • Save remyd1/00547d7306f70adb85709d2699256155 to your computer and use it in GitHub Desktop.
Save remyd1/00547d7306f70adb85709d2699256155 to your computer and use it in GitHub Desktop.
ganglia module for nvidia cards
#!/usr/bin/env python
'''
Ganglia/gmond Python module that runs nvidia-smi to report per-GPU metrics (utilization, memory, temperature, fan speed, power draw) to Ganglia.
Reference:
https://github.com/ganglia/monitor-core/blob/master/gmond/modules/python/README.in
'''
import subprocess
if not hasattr(subprocess, "check_output"):
    def f(*popenargs, **kwargs):
        """Stand-in for ``subprocess.check_output`` when the module lacks it.

        Runs the command through ``subprocess.Popen`` with stdout captured
        and returns the captured output.

        Raises:
            ValueError: if the caller passes ``stdout`` (it is always
                replaced by a pipe).
            subprocess.CalledProcessError: if the command exits with a
                non-zero status.
        """
        if 'stdout' in kwargs:
            raise ValueError('stdout argument not allowed, it will be overridden.')
        child = subprocess.Popen(stdout=subprocess.PIPE, *popenargs, **kwargs)
        captured = child.communicate()[0]
        status = child.poll()
        if not status:
            return captured
        # Report the command the same way Popen received it: the explicit
        # ``args`` keyword wins, otherwise the first positional argument.
        failed_cmd = kwargs.get("args")
        if failed_cmd is None:
            failed_cmd = popenargs[0]
        raise subprocess.CalledProcessError(status, failed_cmd)

    # Install the stand-in so later code can call subprocess.check_output.
    subprocess.check_output = f
def metric_value(name, runner=None):
    """Query nvidia-smi for the metric encoded in ``name`` and return an int.

    ``name`` has the form ``<n>:<Query_Fields>:GPU<k>`` (the shape produced
    by ``metric_init``), for example ``2:Memory_Used,Memory_Total:GPU1``:

    * ``<Query_Fields>`` is lower-cased and ``_`` is replaced by ``.`` to
      obtain the ``--query-gpu`` field list.
    * ``<k>`` is the 1-based card number; it is converted to the 0-based
      ``--id`` value. Any number of digits is accepted.

    When the field list contains a comma, the result is the first value as
    an integer percentage of the second; otherwise the single reported value
    is rounded to the nearest integer.

    Args:
        name: metric name as described above.
        runner: optional callable taking the argument list and returning the
            command output (``str`` or ``bytes``). Defaults to
            ``subprocess.check_output``.

    Returns:
        The metric value as an ``int``.

    Raises:
        ValueError: if ``name`` does not match the expected form, or the
            command output is not numeric.
    """
    head, gpu_sep, card_number = name.rpartition(':GPU')
    _, index_sep, fields = head.partition(':')
    if not gpu_sep or not index_sep or not card_number.isdigit():
        raise ValueError('unrecognised metric name: %r' % (name,))

    gpu_id = int(card_number) - 1
    query = fields.lower().replace('_', '.')

    # Argument list instead of a shell string: nothing here needs a shell.
    cmd = [
        'nvidia-smi',
        '--query-gpu=%s' % query,
        '--id=%d' % gpu_id,
        '--format=csv,noheader,nounits',
    ]
    if runner is None:
        runner = subprocess.check_output
    output = runner(cmd)
    if not isinstance(output, str):
        # Python 3 returns bytes from check_output; normalise to text.
        output = output.decode('utf-8', 'replace')

    if ',' in query:
        used, total = [int(value) for value in output.split(',')[:2]]
        # Floor division keeps the result an int on Python 2 and 3 alike.
        return 100 * used // total
    return int(round(float(output.split()[0])))
def nbcards(gpu_dir='/proc/driver/nvidia/gpus'):
    """Return the number of entries in the NVIDIA per-GPU driver directory.

    The directory is read directly rather than by piping ``ls`` into
    ``wc -w``, so no child processes are spawned or left unreaped.

    Args:
        gpu_dir: directory whose entries are counted. Defaults to
            ``/proc/driver/nvidia/gpus``.

    Returns:
        The number of non-hidden entries in ``gpu_dir``, or 0 when the
        directory cannot be listed (for example because it does not exist).
    """
    import os  # local import keeps this function self-contained

    try:
        entries = os.listdir(gpu_dir)
    except OSError:
        # The ``ls | wc -w`` pipeline also yielded 0 for an unreadable path.
        return 0
    # ``ls`` hides dot-files, so leave them out of the count as well.
    return len([entry for entry in entries if not entry.startswith('.')])
def metric_init(params):
    """Build one metric descriptor per (query, card) pair.

    Each descriptor is a copy of a shared set of defaults with its own
    ``name``, ``description`` and ``units``. Names have the form
    ``<n>:<Query_Fields>:GPU<k>``, where ``<n>`` is the 1-based position of
    the query in the table below and ``<k>`` the 1-based card number; this
    is the string later handed to ``metric_value`` through ``call_back``.

    Args:
        params: unused by this function.

    Returns:
        A list of descriptor dicts, grouped by query and then by card. The
        list is empty when ``nbcards()`` reports no cards.
    """
    defaults = dict(
        name='Default',  # replaced per descriptor below
        units='MiB',  # replaced per descriptor below
        description='GPU Memory Free',  # replaced per descriptor below
        call_back=metric_value,
        time_max=600,
        value_type='uint',
        slope='both',
        format='%u',
        groups='gpu')
    query_defs = [
        {"name": "Utilization_GPU:GPU", "desc": "Utilization GPU", "units": '%'},
        {"name": "Memory_Used,Memory_Total:GPU", "desc": "Memory Utilization GPU", "units": '%'},
        {"name": "Memory_Used:GPU", "desc": "Memory Used GPU", "units": 'MiB'},
        {"name": "Temperature_GPU:GPU", "desc": "Temperature GPU", "units": 'C'},
        {"name": "Fan_Speed:GPU", "desc": "Fan Speed GPU", "units": '%'},
        {"name": "Power_Draw:GPU", "desc": "Power Consumption GPU", "units": 'Watts'},
    ]
    nb_cards = nbcards()
    descriptors = []
    # enumerate() ties the numeric prefix to the table itself, so adding or
    # removing a query needs no matching change to a hard-coded range.
    for index, query in enumerate(query_defs, 1):
        for card in range(1, nb_cards + 1):
            descriptors.append(dict(
                defaults,
                name='%d:%s%d' % (index, query["name"], card),
                description='%s%d' % (query["desc"], card),
                units=query["units"]))
    return descriptors
def metric_cleanup():
    """Module clean-up hook; there is nothing to release, so it is a no-op."""
    return None
# Manual smoke test: build every descriptor, invoke its callback once and
# print the value it returns.
if __name__ == '__main__':
    descriptors = metric_init({})
    for descriptor in descriptors:
        name = descriptor['name']
        value = descriptor['call_back'](name)
        # A single parenthesised argument is valid both as a Python 2 print
        # statement and as a Python 3 print() call.
        print('value for %s is %u %s' % (name, value, descriptor['units']))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment