Skip to content

Instantly share code, notes, and snippets.

@lrakai
Created November 23, 2017 22:19
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save lrakai/b6445734e5e2101180c08fb0f494c76a to your computer and use it in GitHub Desktop.
Save lrakai/b6445734e5e2101180c08fb0f494c76a to your computer and use it in GitHub Desktop.
CloudWatch GPU Monitoring script
# Copyright 2017 Amazon.com, Inc. or its affiliates. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License").
# You may not use this file except in compliance with the License.
# A copy of the License is located at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# or in the "license" file accompanying this file. This file is distributed
# on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
# express or implied. See the License for the specific language governing
# permissions and limitations under the License.
import urllib2
import boto3
from pynvml import *
from datetime import datetime
from time import sleep
### CHOOSE NAMESPACE PARAMETERS HERE ###
# CloudWatch namespace under which every GPU metric is published.
my_NameSpace = 'DeepLearningTrain'

### CHOOSE PUSH INTERVAL ###
# Seconds to sleep between polling cycles (passed to time.sleep).
sleep_interval = 10

### CHOOSE STORAGE RESOLUTION ###
# CloudWatch accepts only two values here: 1 (high resolution) or
# 60 (standard resolution). Intermediate values are rejected.
store_reso = 60

# Instance information, read from the EC2 instance metadata endpoint.
BASE_URL = 'http://169.254.169.254/latest/meta-data/'
# Bound every metadata request so the script fails fast with a URLError
# instead of blocking forever when the endpoint is unreachable.
METADATA_TIMEOUT = 5
INSTANCE_ID = urllib2.urlopen(
    BASE_URL + 'instance-id', timeout=METADATA_TIMEOUT).read()
IMAGE_ID = urllib2.urlopen(
    BASE_URL + 'ami-id', timeout=METADATA_TIMEOUT).read()
INSTANCE_TYPE = urllib2.urlopen(
    BASE_URL + 'instance-type', timeout=METADATA_TIMEOUT).read()
INSTANCE_AZ = urllib2.urlopen(
    BASE_URL + 'placement/availability-zone', timeout=METADATA_TIMEOUT).read()
# Region is derived by dropping the last character of the availability zone
# (assumes zone names of the form '<region><letter>', e.g. 'us-east-1a').
EC2_REGION = INSTANCE_AZ[:-1]

# Local CSV log; the hour-granular timestamp is fixed once at start-up.
TIMESTAMP = datetime.now().strftime('%Y-%m-%dT%H')
TMP_FILE = '/tmp/GPU_TEMP'
TMP_FILE_SAVED = TMP_FILE + TIMESTAMP

# Create CloudWatch client in the instance's own region, so the script does
# not depend on a separately configured default region.
cloudwatch = boto3.client('cloudwatch', region_name=EC2_REGION)

# Flag to push to CloudWatch; cleared when a GPU reading fails.
PUSH_TO_CW = True
def getPowerDraw(handle):
    """Return the GPU's power draw as a string with two decimals.

    The raw NVML reading is divided by 1000 to give the value that is later
    published as 'Power Usage (Watts)'.

    Args:
        handle: NVML device handle for the GPU to query.

    Returns:
        The formatted power draw, or the string produced by handleError()
        when NVML raises an error. In the error case the module-level
        PUSH_TO_CW flag is cleared so logResults() skips the CloudWatch push.
    """
    # Without this declaration the assignment below would only create a
    # function-local name and the module-level flag would never change.
    global PUSH_TO_CW
    try:
        powDraw = nvmlDeviceGetPowerUsage(handle) / 1000.0
        powDrawStr = '%.2f' % powDraw
    except NVMLError as err:
        # NOTE(review): handleError is not defined in this file; presumably it
        # is expected to come from the pynvml star import - confirm.
        powDrawStr = handleError(err)
        PUSH_TO_CW = False
    return powDrawStr
def getTemp(handle):
    """Return the GPU temperature reading as a string.

    Args:
        handle: NVML device handle for the GPU to query.

    Returns:
        str() of the NVML_TEMPERATURE_GPU sensor reading, or the string
        produced by handleError() when NVML raises an error. In the error
        case the module-level PUSH_TO_CW flag is cleared so logResults()
        skips the CloudWatch push.
    """
    # Without this declaration the assignment below would only create a
    # function-local name and the module-level flag would never change.
    global PUSH_TO_CW
    try:
        temp = str(nvmlDeviceGetTemperature(handle, NVML_TEMPERATURE_GPU))
    except NVMLError as err:
        # NOTE(review): handleError is not defined in this file; presumably it
        # is expected to come from the pynvml star import - confirm.
        temp = handleError(err)
        PUSH_TO_CW = False
    return temp
def getUtilization(handle):
    """Return the GPU and memory utilization rates for one device.

    Args:
        handle: NVML device handle for the GPU to query.

    Returns:
        A tuple (util, gpu_util, mem_util):
          * util     - the object returned by nvmlDeviceGetUtilizationRates,
                       or None when the query failed;
          * gpu_util - str(util.gpu), or the handleError() string on failure;
          * mem_util - str(util.memory), or the handleError() string on
                       failure.
        On failure the module-level PUSH_TO_CW flag is cleared so
        logResults() skips the CloudWatch push.
    """
    # Without this declaration the assignment below would only create a
    # function-local name and the module-level flag would never change.
    global PUSH_TO_CW
    try:
        util = nvmlDeviceGetUtilizationRates(handle)
        gpu_util = str(util.gpu)
        mem_util = str(util.memory)
    except NVMLError as err:
        # NOTE(review): handleError is not defined in this file; presumably it
        # is expected to come from the pynvml star import - confirm.
        error = handleError(err)
        # Bind util explicitly: leaving it unset made the return statement
        # raise UnboundLocalError on the error path.
        util = None
        gpu_util = error
        mem_util = error
        PUSH_TO_CW = False
    return util, gpu_util, mem_util
def _buildMetric(name, unit, value, dimensions):
    """Return one CloudWatch MetricData entry at the configured resolution."""
    return {
        'MetricName': name,
        'Dimensions': dimensions,
        'Unit': unit,
        'StorageResolution': store_reso,
        'Value': value,
    }


def logResults(i, util, gpu_util, mem_util, powDrawStr, temp):
    """Append one GPU sample to the local log and push it to CloudWatch.

    Args:
        i: Index of the GPU the sample belongs to.
        util: Utilization object exposing numeric .gpu and .memory values.
        gpu_util: GPU utilization as a string (written to the log file).
        mem_util: Memory utilization as a string (written to the log file).
        powDrawStr: Power draw as a string; converted with float() for the
            CloudWatch push.
        temp: Temperature as a string; converted with int() for the
            CloudWatch push.

    The log line is 'i,gpu_util,mem_util,powDrawStr,temp' appended to
    TMP_FILE_SAVED. A failure to write the file is reported and otherwise
    ignored. The CloudWatch push happens only while PUSH_TO_CW is true and
    every reading is numeric.
    """
    fields = (i, gpu_util, mem_util, powDrawStr, temp)
    writeString = ','.join(str(field) for field in fields) + '\n'
    try:
        # The context manager closes the file even when write() fails, and
        # avoids touching an unbound handle when open() itself fails.
        with open(TMP_FILE_SAVED, 'a+') as gpu_logs:
            gpu_logs.write(writeString)
    except (IOError, OSError):
        # Local logging is best effort; keep going so metrics still flow.
        print("Error writing to file %s" % TMP_FILE_SAVED)

    if not PUSH_TO_CW:
        return

    try:
        gpu_percent = util.gpu
        mem_percent = util.memory
        power_watts = float(powDrawStr)
        temperature_c = int(temp)
    except (AttributeError, TypeError, ValueError):
        # At least one reading is an error string (or util is missing), so
        # there is nothing numeric to publish for this sample.
        return

    MY_DIMENSIONS = [
        {'Name': 'InstanceId', 'Value': INSTANCE_ID},
        {'Name': 'ImageId', 'Value': IMAGE_ID},
        {'Name': 'InstanceType', 'Value': INSTANCE_TYPE},
        {'Name': 'GPUNumber', 'Value': str(i)},
    ]
    cloudwatch.put_metric_data(
        MetricData=[
            _buildMetric('GPU Usage', 'Percent', gpu_percent, MY_DIMENSIONS),
            _buildMetric('Memory Usage', 'Percent', mem_percent,
                         MY_DIMENSIONS),
            _buildMetric('Power Usage (Watts)', 'None', power_watts,
                         MY_DIMENSIONS),
            _buildMetric('Temperature (C)', 'None', temperature_c,
                         MY_DIMENSIONS),
        ],
        Namespace=my_NameSpace
    )
# Initialise NVML at import time, ahead of the nvml* queries below
# (presumably required before any other NVML call - confirm). main() calls
# nvmlShutdown() when its loop exits.
nvmlInit()
# Number of GPUs reported by NVML; main() polls device indices
# 0 .. deviceCount - 1.
deviceCount = nvmlDeviceGetCount()
def main():
    """Poll every GPU forever, logging and publishing one sample per cycle.

    Each cycle re-arms the module-level PUSH_TO_CW flag, reads power draw,
    temperature and utilization for every device index below deviceCount,
    hands the readings to logResults(), then sleeps for sleep_interval
    seconds. NVML is shut down when the loop is left for any reason
    (for example an exception or Ctrl-C).
    """
    # The flag is read by logResults() at module level; without this
    # declaration the reset below would only bind a function-local name.
    global PUSH_TO_CW
    try:
        while True:
            PUSH_TO_CW = True
            # Find the metrics for each GPU on instance
            for i in range(deviceCount):
                handle = nvmlDeviceGetHandleByIndex(i)
                powDrawStr = getPowerDraw(handle)
                temp = getTemp(handle)
                util, gpu_util, mem_util = getUtilization(handle)
                logResults(i, util, gpu_util, mem_util, powDrawStr, temp)
            # One pause per polling cycle, after all GPUs have been sampled.
            sleep(sleep_interval)
    finally:
        nvmlShutdown()
# Start the monitoring loop only when this file is executed as a script.
if __name__ == '__main__':
    main()
@amritha-devadiga
Copy link

HI

How do I get the BASE_URL details?

I am getting a connection issue:

File "/usr/lib64/python3.7/socket.py", line 716, in create_connection
sock.connect(sa)
ConnectionRefusedError: [Errno 111] Connection refused

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
File "gpumon.py", line 37, in <module>
INSTANCE_ID = urlopen(BASE_URL + 'instance-id').read()
File "/usr/lib64/python3.7/urllib/request.py", line 222, in urlopen
return opener.open(url, data, timeout)
File "/usr/lib64/python3.7/urllib/request.py", line 525, in open
response = self._open(req, data)
File "/usr/lib64/python3.7/urllib/request.py", line 543, in _open
'_open', req)
File "/usr/lib64/python3.7/urllib/request.py", line 503, in _call_chain
result = func(*args)
File "/usr/lib64/python3.7/urllib/request.py", line 1378, in http_open
return self.do_open(http.client.HTTPConnection, req)
File "/usr/lib64/python3.7/urllib/request.py", line 1352, in do_open
raise URLError(err)
urllib.error.URLError: <urlopen error [Errno 111] Connection refused>

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment