Created
September 19, 2018 14:35
-
-
Save remyd1/00547d7306f70adb85709d2699256155 to your computer and use it in GitHub Desktop.
ganglia module for nvidia cards
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python
'''
Ganglia/gmond Python module that uses nvidia-smi to inject GPU information into Ganglia.

Reference:
https://github.com/ganglia/monitor-core/blob/master/gmond/modules/python/README.in
'''
import os
import subprocess
# Interpreters whose subprocess module lacks check_output get a local
# stand-in installed under that name.
if not hasattr(subprocess, "check_output"):
    def f(*popenargs, **kwargs):
        """Run a command and return everything it wrote to stdout.

        Arguments are forwarded to ``subprocess.Popen``. Raises ``ValueError``
        if ``stdout`` is supplied, and ``subprocess.CalledProcessError`` when
        the command exits with a non-zero status.
        """
        if 'stdout' in kwargs:
            raise ValueError('stdout argument not allowed, it will be overridden.')
        child = subprocess.Popen(stdout=subprocess.PIPE, *popenargs, **kwargs)
        captured = child.communicate()[0]
        status = child.poll()
        if not status:
            return captured
        # Report the command the same way it was handed to Popen.
        failed_cmd = kwargs.get("args")
        if failed_cmd is None:
            failed_cmd = popenargs[0]
        raise subprocess.CalledProcessError(status, failed_cmd)

    subprocess.check_output = f
def _split_metric_name(name):
    """Split ``<index>:<Query>:GPU<n>`` into (nvidia-smi query, zero-based GPU id)."""
    lowered = name.lower().replace('_', '.')
    # Drop the leading "<index>:" and peel the trailing ":gpu<n>" off the end,
    # so neither the index nor the card number is limited to a single digit.
    rest = lowered.partition(':')[2]
    query, _, gpu_tag = rest.rpartition(':')
    if not query or not gpu_tag.startswith('gpu'):
        raise ValueError('unrecognised metric name: %r' % (name,))
    gpu_number = int(gpu_tag[len('gpu'):])
    if gpu_number < 1:
        raise ValueError('GPU numbers start at 1: %r' % (name,))
    return query, gpu_number - 1


def _to_number(field):
    """Return the first token of a CSV field as a float, or None if not numeric."""
    tokens = field.split()
    if not tokens:
        return None
    try:
        return float(tokens[0])
    except ValueError:
        return None


def metric_value(name):
    """Return the current integer reading for the metric called ``name``.

    ``name`` follows the ``<index>:<Query>:GPU<n>`` pattern used by the
    descriptors in this module, e.g. ``"1:Utilization_GPU:GPU1"``. The query
    part is lower-cased, ``_`` becomes ``.``, and the result is passed to
    ``nvidia-smi --query-gpu`` for card ``n - 1``. A query holding two
    comma-separated fields is reported as ``100 * first // second``; a single
    field is rounded to the nearest integer.

    Returns 0 when a needed field is not numeric or the divisor is zero.

    Raises:
        ValueError: if ``name`` does not match the expected pattern.
        subprocess.CalledProcessError: if nvidia-smi exits with an error.
        OSError: if nvidia-smi cannot be executed.
    """
    query, gpu_id = _split_metric_name(name)
    # An argument list (no shell) keeps the query text from being re-parsed.
    cmd = [
        'nvidia-smi',
        '--query-gpu=' + query,
        '--id=%d' % gpu_id,
        '--format=csv,noheader,nounits',
    ]
    output = subprocess.check_output(cmd)
    if not isinstance(output, str):
        # Python 3 hands back bytes; the parsing below works on text.
        output = output.decode('ascii', 'replace')
    values = [_to_number(field) for field in output.split(',')]

    if ',' in query:
        if len(values) < 2 or values[0] is None or not values[1]:
            return 0
        # Floor division keeps the result an integer on Python 2 and 3 alike.
        return int(100 * values[0] // values[1])

    reading = values[0]
    # NOTE(review): nvidia-smi is understood to print placeholders such as
    # "[Not Supported]" for sensors a card lacks; those are reported as 0.
    if reading is None:
        return 0
    return int(round(reading))
def nbcards(gpu_dir='/proc/driver/nvidia/gpus'):
    """Return how many entries ``gpu_dir`` contains, taken as the card count.

    The directory is read directly instead of piping ``ls`` into ``wc -w``,
    so no child processes are spawned or left unreaped.

    Args:
        gpu_dir: directory expected to hold one entry per NVIDIA card.

    Returns:
        The number of non-hidden entries, or 0 when the directory cannot be
        read (for example when the NVIDIA driver is not loaded).
    """
    try:
        entries = os.listdir(gpu_dir)
    except OSError:
        return 0
    # Skip dot-files so the count matches what a plain ``ls`` would list.
    return len([entry for entry in entries if not entry.startswith('.')])
def metric_init(params):
    """Build and return the metric descriptors, one per (query, card) pair.

    ``params`` is accepted but not used. Descriptor names have the form
    ``<index>:<Query>:GPU<n>`` where ``index`` is the 1-based position of the
    query and ``n`` the 1-based card number; ``metric_value`` is installed as
    the callback of every descriptor.
    """
    # Fields shared by every descriptor; name, units and description are
    # overridden for each metric below.
    base = {
        'name': 'Default',
        'units': 'MiB',
        'description': 'GPU Memory Free',
        'call_back': metric_value,
        'time_max': 600,
        'value_type': 'uint',
        'slope': 'both',
        'format': '%u',
        'groups': 'gpu',
    }
    queries = [
        {"name": "Utilization_GPU:GPU", "desc": "Utilization GPU", "units": '%'},
        {"name": "Memory_Used,Memory_Total:GPU", "desc": "Memory Utilization GPU", "units": '%'},
        {"name": "Memory_Used:GPU", "desc": "Memory Used GPU", "units": 'MiB'},
        {"name": "Temperature_GPU:GPU", "desc": "Temperature GPU", "units": 'C'},
        {"name": "Fan_Speed:GPU", "desc": "Fan Speed GPU", "units": '%'},
        {"name": "Power_Draw:GPU", "desc": "Power Consumption GPU", "units": 'Watts'},
    ]
    card_count = nbcards()
    # Queries form the outer loop so all cards of one metric stay adjacent.
    return [
        dict(
            base,
            name='%d:%s%d' % (position, query["name"], card),
            description='%s%d' % (query["desc"], card),
            units=query["units"],
        )
        for position, query in enumerate(queries, 1)
        for card in range(1, card_count + 1)
    ]
def metric_cleanup():
    """Clean-up hook for the metric module; there is nothing to release."""
    return None
# Manual test: print the current value of every metric descriptor.
if __name__ == '__main__':
    descriptors = metric_init({})
    for descriptor in descriptors:
        name = descriptor['name']
        value = descriptor['call_back'](name)
        # A parenthesised single argument is valid both as the Python 2 print
        # statement and as the Python 3 print function.
        print('value for %s is %u %s' % (name, value, descriptor['units']))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment