Created
November 23, 2017 22:19
-
-
Save lrakai/b6445734e5e2101180c08fb0f494c76a to your computer and use it in GitHub Desktop.
CloudWatch GPU Monitoring script
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Copyright 2017 Amazon.com, Inc. or its affiliates. All Rights Reserved. | |
# | |
# Licensed under the Apache License, Version 2.0 (the "License"). | |
# You may not use this file except in compliance with the License. | |
# A copy of the License is located at | |
# | |
# http://www.apache.org/licenses/LICENSE-2.0 | |
# | |
# or in the "license" file accompanying this file. This file is distributed | |
# on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either | |
# express or implied. See the License for the specific language governing | |
# permissions and limitations under the License. | |
import urllib2 | |
import boto3 | |
from pynvml import * | |
from datetime import datetime | |
from time import sleep | |
### CHOOSE NAMESPACE PARAMETERS HERE ###
# CloudWatch namespace under which every metric is published.
my_NameSpace = 'DeepLearningTrain'

### CHOOSE PUSH INTERVAL ###
# Seconds to sleep between two polling rounds.
sleep_interval = 10

### CHOOSE STORAGE RESOLUTION (1 = high resolution, 60 = standard) ###
# Passed as 'StorageResolution' for every metric datum.
store_reso = 60

# Instance information, read from the EC2 instance metadata endpoint.
# NOTE: these requests run at import time and fail when the endpoint is
# unreachable (e.g. when the script is not running on an EC2 instance).
BASE_URL = 'http://169.254.169.254/latest/meta-data/'
INSTANCE_ID = urllib2.urlopen(BASE_URL + 'instance-id').read()
IMAGE_ID = urllib2.urlopen(BASE_URL + 'ami-id').read()
INSTANCE_TYPE = urllib2.urlopen(BASE_URL + 'instance-type').read()
INSTANCE_AZ = urllib2.urlopen(BASE_URL + 'placement/availability-zone').read()
# The region name is the availability zone without its trailing character.
EC2_REGION = INSTANCE_AZ[:-1]

# Local log file; the name carries the start hour of this run.
TIMESTAMP = datetime.now().strftime('%Y-%m-%dT%H')
TMP_FILE = '/tmp/GPU_TEMP'
TMP_FILE_SAVED = TMP_FILE + TIMESTAMP

# Create CloudWatch client.
# Bind the client to the instance's own region so the script also works
# when no default region is configured for boto3 on the instance.
cloudwatch = boto3.client('cloudwatch', region_name=EC2_REGION)

# Flag to push to CloudWatch
PUSH_TO_CW = True
def getPowerDraw(handle):
    """Return the power draw of a GPU as a string with two decimals.

    The raw NVML reading is divided by 1000.0 before formatting; the
    result is later published under the 'Power Usage (Watts)' metric.

    Args:
        handle: NVML device handle of the GPU to query.

    Returns:
        The formatted power draw, or the value returned by
        ``handleError`` when NVML reports an error.
    """
    # Without this declaration the assignment below would only create a
    # local variable and the module-level flag would never be cleared.
    global PUSH_TO_CW
    try:
        powDraw = nvmlDeviceGetPowerUsage(handle) / 1000.0
        powDrawStr = '%.2f' % powDraw
    except NVMLError as err:
        # NOTE(review): handleError is not defined in the visible part of
        # this file; presumably it is provided elsewhere -- confirm.
        powDrawStr = handleError(err)
        # The reading is not numeric, so skip the CloudWatch push.
        PUSH_TO_CW = False
    return powDrawStr
def getTemp(handle):
    """Return the GPU temperature reading as a string.

    Args:
        handle: NVML device handle of the GPU to query.

    Returns:
        The NVML_TEMPERATURE_GPU sensor value converted with ``str``, or
        the value returned by ``handleError`` when NVML reports an error.
    """
    # Without this declaration the assignment below would only create a
    # local variable and the module-level flag would never be cleared.
    global PUSH_TO_CW
    try:
        temp = str(nvmlDeviceGetTemperature(handle, NVML_TEMPERATURE_GPU))
    except NVMLError as err:
        # NOTE(review): handleError is not defined in the visible part of
        # this file; presumably it is provided elsewhere -- confirm.
        temp = handleError(err)
        # The reading is not numeric, so skip the CloudWatch push.
        PUSH_TO_CW = False
    return temp
def getUtilization(handle):
    """Return the utilization rates of a GPU.

    Args:
        handle: NVML device handle of the GPU to query.

    Returns:
        A tuple ``(util, gpu_util, mem_util)`` where ``util`` is the object
        returned by ``nvmlDeviceGetUtilizationRates`` and the other two are
        its ``gpu`` and ``memory`` attributes as strings. When NVML reports
        an error, ``util`` is ``None`` and both strings hold the value
        returned by ``handleError``.
    """
    # Without this declaration the assignment below would only create a
    # local variable and the module-level flag would never be cleared.
    global PUSH_TO_CW
    try:
        util = nvmlDeviceGetUtilizationRates(handle)
        gpu_util = str(util.gpu)
        mem_util = str(util.memory)
    except NVMLError as err:
        # Bind util explicitly; otherwise the return statement below would
        # raise UnboundLocalError on the error path.
        util = None
        # NOTE(review): handleError is not defined in the visible part of
        # this file; presumably it is provided elsewhere -- confirm.
        error = handleError(err)
        gpu_util = error
        mem_util = error
        # No usable reading, so skip the CloudWatch push.
        PUSH_TO_CW = False
    return util, gpu_util, mem_util
def logResults(i, util, gpu_util, mem_util, powDrawStr, temp):
    """Append one GPU sample to the local log and push it to CloudWatch.

    Args:
        i: Index of the GPU the sample belongs to.
        util: Utilization object exposing ``gpu`` and ``memory`` attributes.
        gpu_util: GPU utilization as a string (written to the log file).
        mem_util: Memory utilization as a string (written to the log file).
        powDrawStr: Power draw as a string; converted with ``float`` for
            CloudWatch.
        temp: Temperature as a string; converted with ``int`` for CloudWatch.

    The CloudWatch push is skipped when the module-level ``PUSH_TO_CW``
    flag is false.
    """
    writeString = '%s,%s,%s,%s,%s\n' % (i, gpu_util, mem_util, powDrawStr, temp)
    try:
        # The context manager closes the file even if the write fails, and
        # nothing references the file object when open() itself fails.
        with open(TMP_FILE_SAVED, 'a+') as gpu_logs:
            gpu_logs.write(writeString)
    except (IOError, OSError) as err:
        # Logging is best effort: report the problem and carry on.
        print("Error writing to file %s: %s" % (TMP_FILE_SAVED, err))

    if not PUSH_TO_CW:
        return

    # Every metric of this sample shares the same dimensions.
    MY_DIMENSIONS = [
        {
            'Name': 'InstanceId',
            'Value': INSTANCE_ID
        },
        {
            'Name': 'ImageId',
            'Value': IMAGE_ID
        },
        {
            'Name': 'InstanceType',
            'Value': INSTANCE_TYPE
        },
        {
            'Name': 'GPUNumber',
            'Value': str(i)
        }
    ]
    cloudwatch.put_metric_data(
        MetricData=[
            {
                'MetricName': 'GPU Usage',
                'Dimensions': MY_DIMENSIONS,
                'Unit': 'Percent',
                'StorageResolution': store_reso,
                'Value': util.gpu
            },
            {
                'MetricName': 'Memory Usage',
                'Dimensions': MY_DIMENSIONS,
                'Unit': 'Percent',
                'StorageResolution': store_reso,
                'Value': util.memory
            },
            {
                'MetricName': 'Power Usage (Watts)',
                'Dimensions': MY_DIMENSIONS,
                'Unit': 'None',
                'StorageResolution': store_reso,
                'Value': float(powDrawStr)
            },
            {
                'MetricName': 'Temperature (C)',
                'Dimensions': MY_DIMENSIONS,
                'Unit': 'None',
                'StorageResolution': store_reso,
                'Value': int(temp)
            },
        ],
        Namespace=my_NameSpace
    )
# Initialise NVML once at start-up and record how many GPUs to poll.
nvmlInit()
deviceCount = nvmlDeviceGetCount()


def main():
    """Poll every GPU forever, logging and publishing one sample per round.

    Each round re-enables the CloudWatch push, collects power draw,
    temperature and utilization for every GPU index below ``deviceCount``,
    hands them to ``logResults`` and then sleeps for ``sleep_interval``.
    NVML is shut down when the loop is left for any reason.
    """
    # The flag is module-level state shared with logResults; without this
    # declaration the reset below would only bind a local variable.
    global PUSH_TO_CW
    try:
        while True:
            PUSH_TO_CW = True
            # Find the metrics for each GPU on instance
            for i in range(deviceCount):
                handle = nvmlDeviceGetHandleByIndex(i)
                powDrawStr = getPowerDraw(handle)
                temp = getTemp(handle)
                util, gpu_util, mem_util = getUtilization(handle)
                logResults(i, util, gpu_util, mem_util, powDrawStr, temp)
            sleep(sleep_interval)
    finally:
        nvmlShutdown()


if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Hi,
How do I get the BASE_URL details?
I am getting a connection issue:
File "/usr/lib64/python3.7/socket.py", line 716, in create_connection
sock.connect(sa)
ConnectionRefusedError: [Errno 111] Connection refused
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "gpumon.py", line 37, in <module>
INSTANCE_ID = urlopen(BASE_URL + 'instance-id').read()
File "/usr/lib64/python3.7/urllib/request.py", line 222, in urlopen
return opener.open(url, data, timeout)
File "/usr/lib64/python3.7/urllib/request.py", line 525, in open
response = self._open(req, data)
File "/usr/lib64/python3.7/urllib/request.py", line 543, in _open
'_open', req)
File "/usr/lib64/python3.7/urllib/request.py", line 503, in _call_chain
result = func(*args)
File "/usr/lib64/python3.7/urllib/request.py", line 1378, in http_open
return self.do_open(http.client.HTTPConnection, req)
File "/usr/lib64/python3.7/urllib/request.py", line 1352, in do_open
raise URLError(err)
urllib.error.URLError: <urlopen error [Errno 111] Connection refused>