Skip to content

Instantly share code, notes, and snippets.

@agaoglu
Created February 2, 2011 21:48
Show Gist options
  • Star 2 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save agaoglu/808523 to your computer and use it in GitHub Desktop.
Save agaoglu/808523 to your computer and use it in GitHub Desktop.
Ganglia metric module for NVIDIA GPU monitoring
import os
# Module-level list of metric descriptors; replaced by metric_init().
descriptors = []
def getString():
    """Run nvidia-smi for GPU 0 and return the tail of its report as text.

    The command is a fixed shell pipeline that keeps the last 23 lines of
    ``nvidia-smi -q --gpu=0``.

    Returns:
        The captured standard output of the pipeline, or the literal string
        "Error" if the pipe could not be opened or read.
    """
    test_file = "nvidia-smi -q --gpu=0 | tail -23"
    try:
        p = os.popen(test_file, 'r')
    except (IOError, OSError):
        return "Error"
    try:
        return p.read()
    except (IOError, OSError):
        return "Error"
    finally:
        # Always close the pipe so each poll does not leak a file descriptor
        # and an un-reaped child process.
        p.close()
def readl(key):
    """Return the value text of the first report line that starts with ``key``.

    The report is fetched with getString() and scanned line by line after
    stripping surrounding whitespace. For a matching line, the text between
    the first and second ':' is taken as the value.

    Args:
        key: Prefix of the line to look for, e.g. 'Temperature' or 'Fan Speed'.

    Returns:
        For 'Temperature', the text before the first 'C' (whitespace
        stripped). For any other key, the value with one trailing '%'
        removed when present. ``None`` if no line with a value matches.
    """
    for line in getString().split('\n'):
        line = line.strip()
        if not line.startswith(key):
            continue
        fields = line.split(':')
        if len(fields) < 2:
            # A matching line without a ':' carries no value (e.g. a section
            # heading); keep scanning instead of raising IndexError.
            continue
        value = fields[1].strip()
        if key == 'Temperature':
            return value.split('C')[0].strip()
        if value.endswith('%'):
            # Only drop the final character when it really is the percent
            # sign, so a bare number is not truncated.
            return value[:-1]
        return value
    return None
def Gpu_Temp(name):
    """Return the value of the 'Temperature' report line as an int.

    ``name`` is accepted but not used by this callback.
    """
    reading = readl('Temperature')
    return int(reading)
def Fan_Speed(name):
    """Return the value of the 'Fan Speed' report line as an int.

    ``name`` is accepted but not used by this callback.
    """
    reading = readl('Fan Speed')
    return int(reading)
def Gpu_Util(name):
    """Return the value of the first report line starting with 'GPU' as an int.

    ``name`` is accepted but not used by this callback.
    """
    reading = readl('GPU')
    return int(reading)
def Mem_Util(name):
    """Return the value of the first report line starting with 'Memory' as an int.

    ``name`` is accepted but not used by this callback.
    """
    reading = readl('Memory')
    return int(reading)
def metric_init(params):
    """Build the four GPU metric descriptors and store them in ``descriptors``.

    Args:
        params: Accepted for interface compatibility; not read here.

    Returns:
        A list of four descriptor dicts (temperature, fan speed, GPU
        utilization, memory utilization). The same list is assigned to the
        module-level ``descriptors``.
    """
    global descriptors

    def _descriptor(name, call_back, units, description):
        # Every metric shares the same timing, type, slope, format and group;
        # only the name, callback, units and description differ.
        return {'name': name,
                'call_back': call_back,
                'time_max': 90,
                'value_type': 'uint',
                'units': units,
                'slope': 'both',
                'format': '%u',
                'description': description,
                'groups': 'gpu'}

    descriptors = [
        _descriptor('Gpu_Temperature', Gpu_Temp, 'C', 'GPU Temperature'),
        _descriptor('Fan_Speed', Fan_Speed, '%', 'Fan Speed'),
        # Description previously read 'GPU GPU Utilization' (duplicated word).
        _descriptor('Gpu_Utilization', Gpu_Util, '%', 'GPU Utilization'),
        _descriptor('Memory_Utilization', Mem_Util, '%',
                    'GPU Memory Utilization'),
    ]
    return descriptors
def metric_cleanup():
    """Clean-up hook for the metric module; performs no work."""
    return None
# Testing: when run as a script, build the descriptors and print the value
# returned by each metric's callback.
if __name__ == '__main__':
    metric_init({})
    for d in descriptors:
        v = d['call_back']('')
        # Parenthesized single-argument form prints the same text under both
        # Python 2 and Python 3, unlike the bare print statement.
        print('value for %s is %u' % (d['name'], v))
# Loads the Python module support (modpython.so) under the name "python_module".
# params is a placeholder path — presumably the directory that contains the
# gpuwatch module; replace it with the real location.
modules {
module {
name = "python_module"
path = "modpython.so"
params = "/path/to/dir/of/gpuwatch"
}
}
# Declares the "gpuwatch" metric module and marks its language as Python.
modules {
module {
name = "gpuwatch"
language = "python"
}
}
# Collection group for the four GPU metrics. Each metric name below matches a
# 'name' entry in the descriptors built by metric_init().
# NOTE(review): collect_every / time_threshold are presumably in seconds —
# confirm against the gmond.conf documentation.
collection_group {
collect_every = 20
time_threshold = 20
metric {
name = "Gpu_Temperature"
title = "Gpu Temperature"
}
metric {
name = "Fan_Speed"
title = "Fan Speed Percentage"
}
metric {
name = "Gpu_Utilization"
title = "Gpu Utilization Percentage"
}
metric {
name = "Memory_Utilization"
title = "Memory Utilization Percentage"
}
}
@agaoglu
Copy link
Author

agaoglu commented Feb 3, 2011

See our post for some details: http://goo.gl/g2zpS

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment