Skip to content

Instantly share code, notes, and snippets.

@rtomson
Created July 20, 2016 15:16
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save rtomson/716568f55121add042e80a03495173eb to your computer and use it in GitHub Desktop.
Save rtomson/716568f55121add042e80a03495173eb to your computer and use it in GitHub Desktop.
#!/usr/bin/env python
from __future__ import print_function
from nvidia.pynvml import *
import socket
import time
# Short host name (everything before the first dot), used in metric paths.
HOSTNAME = socket.gethostname().split(".")[0]

# Poll every NVML-visible GPU once and print one "<path> <value> <timestamp>"
# line per statistic (this looks like the Graphite plaintext format --
# presumably the output is piped to a carbon listener; confirm with the
# caller).
#
# NOTE: no `from __future__ import division` is in effect, so `/` below is
# integer division on Python 2 and true division on Python 3; the printed
# values are therefore ints or floats depending on the interpreter.

# Tracks whether nvmlInit() succeeded, so that the cleanup below does not call
# nvmlShutdown() on a library that never initialised (which would raise a
# second NVMLError out of the `finally` block).
nvmlReady = False
try:
    nvmlInit()
    nvmlReady = True

    # A single timestamp is shared by every metric emitted in this run.
    now = int(time.time())

    for i in range(nvmlDeviceGetCount()):
        hndl = nvmlDeviceGetHandleByIndex(i)
        # Fresh dict per device so no value can carry over between GPUs.
        gpuStats = {}

        # Memory (scaled down by 1024 * 1024; presumably bytes -> MiB).
        memInfo = nvmlDeviceGetMemoryInfo(hndl)
        gpuStats['memUsed'] = memInfo.used / 1024 / 1024
        gpuStats['memTotal'] = memInfo.total / 1024 / 1024

        # Thermal
        temp = nvmlDeviceGetTemperature(hndl, NVML_TEMPERATURE_GPU)
        gpuStats['tempC'] = temp
        gpuStats['tempF'] = temp * 9 / 5 + 32

        # Power (scaled down by 1000; presumably milliwatts -> watts).
        try:
            # Return value is discarded; presumably called only so that it
            # raises NVMLError on devices without power management.
            nvmlDeviceGetPowerManagementMode(hndl)
            gpuStats['powDraw'] = nvmlDeviceGetPowerUsage(hndl) / 1000
            gpuStats['powLimit'] = nvmlDeviceGetPowerManagementLimit(hndl) / 1000
        except NVMLError:
            # Best effort: report zeros when the power queries fail.
            gpuStats['powDraw'] = 0
            gpuStats['powLimit'] = 0

        # Clocks
        try:
            gpuStats['clockGraphics'] = nvmlDeviceGetClockInfo(hndl, NVML_CLOCK_GRAPHICS)
        except NVMLError:
            gpuStats['clockGraphics'] = 0

        # Utilization: query once so both values come from the same sample.
        utilRates = nvmlDeviceGetUtilizationRates(hndl)
        gpuStats['gpuUtil'] = utilRates.gpu
        gpuStats['memUtil'] = utilRates.memory

        # PCI Util
        try:
            gpuStats['pciUtilTx'] = nvmlDeviceGetPcieThroughput(hndl, NVML_PCIE_UTIL_TX_BYTES)
            gpuStats['pciUtilRx'] = nvmlDeviceGetPcieThroughput(hndl, NVML_PCIE_UTIL_RX_BYTES)
        except NVMLError:
            gpuStats['pciUtilTx'] = 0
            gpuStats['pciUtilRx'] = 0

        # strip() drops the template's trailing newline; print() adds one back.
        print(
            "farm.stats.{host}.gpu{i}.memUsed {memUsed} {now}\n"
            "farm.stats.{host}.gpu{i}.memTotal {memTotal} {now}\n"
            "farm.stats.{host}.gpu{i}.tempC {tempC} {now}\n"
            "farm.stats.{host}.gpu{i}.tempF {tempF} {now}\n"
            "farm.stats.{host}.gpu{i}.powDraw {powDraw} {now}\n"
            "farm.stats.{host}.gpu{i}.powLimit {powLimit} {now}\n"
            "farm.stats.{host}.gpu{i}.clockGraphics {clockGraphics} {now}\n"
            "farm.stats.{host}.gpu{i}.gpuUtil {gpuUtil} {now}\n"
            "farm.stats.{host}.gpu{i}.memUtil {memUtil} {now}\n"
            "farm.stats.{host}.gpu{i}.pciUtilTx {pciUtilTx} {now}\n"
            "farm.stats.{host}.gpu{i}.pciUtilRx {pciUtilRx} {now}\n"
            "".format(host=HOSTNAME, i=i, now=now, **gpuStats).strip())
except NVMLError as err:
    # Any NVML failure not handled per-metric above (including a failed
    # nvmlInit) ends the run; the error text is printed as before.
    print(err)
except KeyboardInterrupt:
    # Exit quietly on Ctrl-C.
    pass
finally:
    # Only release NVML if it was actually initialised.
    if nvmlReady:
        nvmlShutdown()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment