# lrakai/gpumon.py

## gpumon.py
# Copyright 2017 Amazon.com, Inc. or its affiliates. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License").
# You may not use this file except in compliance with the License.
# A copy of the License is located at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
#  or in the "license" file accompanying this file. This file is distributed
#  on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
#  express or implied. See the License for the specific language governing
#  permissions and limitations under the License.


import urllib2
import boto3
from pynvml import *
from datetime import datetime
from time import sleep

### CHOOSE NAMESPACE PARAMETERS HERE ###
# CloudWatch namespace under which every metric of this script is published.
my_NameSpace = 'DeepLearningTrain'

### CHOOSE PUSH INTERVAL ####
# Passed to sleep() between two polling rounds, i.e. a delay in seconds.
sleep_interval = 10

### CHOOSE STORAGE RESOLUTION ####
# Sent as 'StorageResolution' with every metric datum.
# NOTE(review): the CloudWatch API documents 1 (high resolution) and 60
# (standard) as the accepted values, not an arbitrary value in 1-60.
store_reso = 60

# Instance information, fetched once at import time from the EC2 instance
# metadata endpoint.
BASE_URL = 'http://169.254.169.254/latest/meta-data/'
INSTANCE_ID = urllib2.urlopen(BASE_URL + 'instance-id').read()
IMAGE_ID = urllib2.urlopen(BASE_URL + 'ami-id').read()
INSTANCE_TYPE = urllib2.urlopen(BASE_URL + 'instance-type').read()
INSTANCE_AZ = urllib2.urlopen(BASE_URL + 'placement/availability-zone').read()
# Region name = availability zone without its last character.
EC2_REGION = INSTANCE_AZ[:-1]

# Local log file; the suffix is the hour at which the script was started.
TIMESTAMP = datetime.now().strftime('%Y-%m-%dT%H')
TMP_FILE = '/tmp/GPU_TEMP'
TMP_FILE_SAVED = TMP_FILE + TIMESTAMP

# Create CloudWatch client in the instance's own region. Passing the region
# explicitly keeps the client from failing when no default region is
# configured on the instance (EC2_REGION was previously computed but unused).
cloudwatch = boto3.client('cloudwatch', region_name=EC2_REGION)

# Flag to push to CloudWatch
PUSH_TO_CW = True

def getPowerDraw(handle):
    """Return the power draw of one GPU as a string with two decimals.

    handle -- NVML device handle of the GPU to query.

    The reading of nvmlDeviceGetPowerUsage is divided by 1000 before it is
    formatted (published later as 'Power Usage (Watts)'). When NVML raises
    NVMLError, the result of handleError(err) is returned instead and the
    module-level PUSH_TO_CW flag is cleared so logResults skips the push.

    NOTE(review): handleError is not defined in this file; presumably it is
    expected to come from the pynvml star import -- confirm.
    """
    # Required: without it the assignment below would only bind a local
    # name and the module-level flag read by logResults would never change.
    global PUSH_TO_CW
    try:
        powDraw = nvmlDeviceGetPowerUsage(handle) / 1000.0
        powDrawStr = '%.2f' % powDraw
    except NVMLError as err:
        powDrawStr = handleError(err)
        PUSH_TO_CW = False
    return powDrawStr

def getTemp(handle):
    """Return the GPU temperature reading of one device as a string.

    handle -- NVML device handle of the GPU to query.

    The value is read with the NVML_TEMPERATURE_GPU sensor selector and is
    published later as 'Temperature (C)'. When NVML raises NVMLError, the
    result of handleError(err) is returned instead and the module-level
    PUSH_TO_CW flag is cleared so logResults skips the push.

    NOTE(review): handleError is not defined in this file; presumably it is
    expected to come from the pynvml star import -- confirm.
    """
    # Required: without it the assignment below would only bind a local
    # name and the module-level flag read by logResults would never change.
    global PUSH_TO_CW
    try:
        temp = str(nvmlDeviceGetTemperature(handle, NVML_TEMPERATURE_GPU))
    except NVMLError as err:
        temp = handleError(err)
        PUSH_TO_CW = False
    return temp

def getUtilization(handle):
    """Return the utilization rates of one GPU.

    handle -- NVML device handle of the GPU to query.

    Returns a tuple (util, gpu_util, mem_util):
      util     -- object returned by nvmlDeviceGetUtilizationRates, or None
                  when the query failed
      gpu_util -- str(util.gpu), or the handleError(err) result on failure
      mem_util -- str(util.memory), or the handleError(err) result on failure

    On NVMLError the module-level PUSH_TO_CW flag is cleared so logResults
    skips the push.

    NOTE(review): handleError is not defined in this file; presumably it is
    expected to come from the pynvml star import -- confirm.
    """
    # Required: without it the assignment below would only bind a local
    # name and the module-level flag read by logResults would never change.
    global PUSH_TO_CW
    # Pre-bind so the return statement cannot hit an unbound local when the
    # NVML call raises before `util` is assigned.
    util = None
    try:
        util = nvmlDeviceGetUtilizationRates(handle)
        gpu_util = str(util.gpu)
        mem_util = str(util.memory)
    except NVMLError as err:
        error = handleError(err)
        gpu_util = error
        mem_util = error
        PUSH_TO_CW = False
    return util, gpu_util, mem_util

def logResults(i, util, gpu_util, mem_util, powDrawStr, temp):
    """Log one GPU sample locally and, if enabled, push it to CloudWatch.

    i          -- index of the GPU the sample belongs to
    util       -- utilization object; its .gpu and .memory attributes are
                  sent as metric values
    gpu_util   -- GPU utilization as text (log file only)
    mem_util   -- memory utilization as text (log file only)
    powDrawStr -- power draw as text; converted with float() for the push
    temp       -- temperature as text; converted with int() for the push

    A line "i,gpu_util,mem_util,powDrawStr,temp" is appended to
    TMP_FILE_SAVED. A failure to open or write that file is reported on
    stdout and does not prevent the CloudWatch push. The push happens only
    while the module-level PUSH_TO_CW flag is true.
    """
    # %-formatting yields the same text as the former string concatenation
    # for string arguments, but cannot fail on a non-string value.
    writeString = '%s,%s,%s,%s,%s\n' % (i, gpu_util, mem_util, powDrawStr, temp)
    try:
        # `with` closes the file on every path; the former try/finally
        # referenced an unbound name when open() itself failed.
        with open(TMP_FILE_SAVED, 'a+') as gpu_logs:
            gpu_logs.write(writeString)
    except (IOError, OSError) as err:
        print("Error writing to file %s: %s" % (TMP_FILE_SAVED, err))

    if not PUSH_TO_CW:
        return

    # The same dimension set identifies every metric of this GPU.
    MY_DIMENSIONS = [
        {
            'Name': 'InstanceId',
            'Value': INSTANCE_ID
        },
        {
            'Name': 'ImageId',
            'Value': IMAGE_ID
        },
        {
            'Name': 'InstanceType',
            'Value': INSTANCE_TYPE
        },
        {
            'Name': 'GPUNumber',
            'Value': str(i)
        },
    ]
    cloudwatch.put_metric_data(
        MetricData=[
            {
                'MetricName': 'GPU Usage',
                'Dimensions': MY_DIMENSIONS,
                'Unit': 'Percent',
                'StorageResolution': store_reso,
                'Value': util.gpu
            },
            {
                'MetricName': 'Memory Usage',
                'Dimensions': MY_DIMENSIONS,
                'Unit': 'Percent',
                'StorageResolution': store_reso,
                'Value': util.memory
            },
            {
                'MetricName': 'Power Usage (Watts)',
                'Dimensions': MY_DIMENSIONS,
                'Unit': 'None',
                'StorageResolution': store_reso,
                'Value': float(powDrawStr)
            },
            {
                'MetricName': 'Temperature (C)',
                'Dimensions': MY_DIMENSIONS,
                'Unit': 'None',
                'StorageResolution': store_reso,
                'Value': int(temp)
            },
        ],
        Namespace=my_NameSpace
    )


# Initialise NVML once at import time; main() calls nvmlShutdown() on exit.
nvmlInit()
# Number of GPUs reported by NVML; main() polls indices 0..deviceCount-1.
deviceCount = nvmlDeviceGetCount()

def main():
    """Poll every GPU forever and hand each sample to logResults.

    Each round queries power draw, temperature and utilization for GPU
    indices 0..deviceCount-1, then sleeps for sleep_interval. The loop has
    no exit condition; it ends only when an exception propagates (for
    example KeyboardInterrupt), in which case nvmlShutdown() is still
    called.
    """
    # Required: without it the assignment in the loop would only bind an
    # unused local and never reset the module-level flag read by logResults.
    global PUSH_TO_CW
    try:
        while True:
            # Re-arm pushing at the start of every polling round.
            PUSH_TO_CW = True
            # Find the metrics for each GPU on instance
            for i in range(deviceCount):
                handle = nvmlDeviceGetHandleByIndex(i)

                powDrawStr = getPowerDraw(handle)
                temp = getTemp(handle)
                util, gpu_util, mem_util = getUtilization(handle)
                logResults(i, util, gpu_util, mem_util, powDrawStr, temp)

            sleep(sleep_interval)

    finally:
        nvmlShutdown()

# Start the polling loop only when executed as a script, not on import.
if __name__ == '__main__':
    main()
	# Copyright 2017 Amazon.com, Inc. or its affiliates. All Rights Reserved.
	#
	# Licensed under the Apache License, Version 2.0 (the "License").
	# You may not use this file except in compliance with the License.
	# A copy of the License is located at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# or in the "license" file accompanying this file. This file is distributed
	# on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
	# express or implied. See the License for the specific language governing
	# permissions and limitations under the License.


	import urllib2
	import boto3
	from pynvml import *
	from datetime import datetime
	from time import sleep

	### CHOOSE NAMESPACE PARAMETERS HERE ###
	# CloudWatch namespace under which every metric of this script is published.
	my_NameSpace = 'DeepLearningTrain'

	### CHOOSE PUSH INTERVAL ####
	# Passed to sleep() between two polling rounds, i.e. a delay in seconds.
	sleep_interval = 10

	### CHOOSE STORAGE RESOLUTION ####
	# Sent as 'StorageResolution' with every metric datum.
	# NOTE(review): the CloudWatch API documents 1 (high resolution) and 60
	# (standard) as the accepted values, not an arbitrary value in 1-60.
	store_reso = 60

	# Instance information, fetched once at import time from the EC2 instance
	# metadata endpoint.
	BASE_URL = 'http://169.254.169.254/latest/meta-data/'
	INSTANCE_ID = urllib2.urlopen(BASE_URL + 'instance-id').read()
	IMAGE_ID = urllib2.urlopen(BASE_URL + 'ami-id').read()
	INSTANCE_TYPE = urllib2.urlopen(BASE_URL + 'instance-type').read()
	INSTANCE_AZ = urllib2.urlopen(BASE_URL + 'placement/availability-zone').read()
	# Region name = availability zone without its last character.
	EC2_REGION = INSTANCE_AZ[:-1]

	# Local log file; the suffix is the hour at which the script was started.
	TIMESTAMP = datetime.now().strftime('%Y-%m-%dT%H')
	TMP_FILE = '/tmp/GPU_TEMP'
	TMP_FILE_SAVED = TMP_FILE + TIMESTAMP

	# Create CloudWatch client in the instance's own region. Passing the region
	# explicitly keeps the client from failing when no default region is
	# configured on the instance (EC2_REGION was previously computed but unused).
	cloudwatch = boto3.client('cloudwatch', region_name=EC2_REGION)

	# Flag to push to CloudWatch
	PUSH_TO_CW = True

	def getPowerDraw(handle):
	    """Return the power draw of one GPU as a string with two decimals.

	    handle -- NVML device handle of the GPU to query.

	    The reading of nvmlDeviceGetPowerUsage is divided by 1000 before it is
	    formatted (published later as 'Power Usage (Watts)'). When NVML raises
	    NVMLError, the result of handleError(err) is returned instead and the
	    module-level PUSH_TO_CW flag is cleared so logResults skips the push.

	    NOTE(review): handleError is not defined in this file; presumably it is
	    expected to come from the pynvml star import -- confirm.
	    """
	    # Required: without it the assignment below would only bind a local
	    # name and the module-level flag read by logResults would never change.
	    global PUSH_TO_CW
	    try:
	        powDraw = nvmlDeviceGetPowerUsage(handle) / 1000.0
	        powDrawStr = '%.2f' % powDraw
	    except NVMLError as err:
	        powDrawStr = handleError(err)
	        PUSH_TO_CW = False
	    return powDrawStr

	def getTemp(handle):
	    """Return the GPU temperature reading of one device as a string.

	    handle -- NVML device handle of the GPU to query.

	    The value is read with the NVML_TEMPERATURE_GPU sensor selector and is
	    published later as 'Temperature (C)'. When NVML raises NVMLError, the
	    result of handleError(err) is returned instead and the module-level
	    PUSH_TO_CW flag is cleared so logResults skips the push.

	    NOTE(review): handleError is not defined in this file; presumably it is
	    expected to come from the pynvml star import -- confirm.
	    """
	    # Required: without it the assignment below would only bind a local
	    # name and the module-level flag read by logResults would never change.
	    global PUSH_TO_CW
	    try:
	        temp = str(nvmlDeviceGetTemperature(handle, NVML_TEMPERATURE_GPU))
	    except NVMLError as err:
	        temp = handleError(err)
	        PUSH_TO_CW = False
	    return temp

	def getUtilization(handle):
	    """Return the utilization rates of one GPU.

	    handle -- NVML device handle of the GPU to query.

	    Returns a tuple (util, gpu_util, mem_util):
	      util     -- object returned by nvmlDeviceGetUtilizationRates, or None
	                  when the query failed
	      gpu_util -- str(util.gpu), or the handleError(err) result on failure
	      mem_util -- str(util.memory), or the handleError(err) result on failure

	    On NVMLError the module-level PUSH_TO_CW flag is cleared so logResults
	    skips the push.

	    NOTE(review): handleError is not defined in this file; presumably it is
	    expected to come from the pynvml star import -- confirm.
	    """
	    # Required: without it the assignment below would only bind a local
	    # name and the module-level flag read by logResults would never change.
	    global PUSH_TO_CW
	    # Pre-bind so the return statement cannot hit an unbound local when the
	    # NVML call raises before `util` is assigned.
	    util = None
	    try:
	        util = nvmlDeviceGetUtilizationRates(handle)
	        gpu_util = str(util.gpu)
	        mem_util = str(util.memory)
	    except NVMLError as err:
	        error = handleError(err)
	        gpu_util = error
	        mem_util = error
	        PUSH_TO_CW = False
	    return util, gpu_util, mem_util

	def logResults(i, util, gpu_util, mem_util, powDrawStr, temp):
	    """Log one GPU sample locally and, if enabled, push it to CloudWatch.

	    i          -- index of the GPU the sample belongs to
	    util       -- utilization object; its .gpu and .memory attributes are
	                  sent as metric values
	    gpu_util   -- GPU utilization as text (log file only)
	    mem_util   -- memory utilization as text (log file only)
	    powDrawStr -- power draw as text; converted with float() for the push
	    temp       -- temperature as text; converted with int() for the push

	    A line "i,gpu_util,mem_util,powDrawStr,temp" is appended to
	    TMP_FILE_SAVED. A failure to open or write that file is reported on
	    stdout and does not prevent the CloudWatch push. The push happens only
	    while the module-level PUSH_TO_CW flag is true.
	    """
	    # %-formatting yields the same text as the former string concatenation
	    # for string arguments, but cannot fail on a non-string value.
	    writeString = '%s,%s,%s,%s,%s\n' % (i, gpu_util, mem_util, powDrawStr, temp)
	    try:
	        # `with` closes the file on every path; the former try/finally
	        # referenced an unbound name when open() itself failed.
	        with open(TMP_FILE_SAVED, 'a+') as gpu_logs:
	            gpu_logs.write(writeString)
	    except (IOError, OSError) as err:
	        print("Error writing to file %s: %s" % (TMP_FILE_SAVED, err))

	    if not PUSH_TO_CW:
	        return

	    # The same dimension set identifies every metric of this GPU.
	    MY_DIMENSIONS = [
	        {
	            'Name': 'InstanceId',
	            'Value': INSTANCE_ID
	        },
	        {
	            'Name': 'ImageId',
	            'Value': IMAGE_ID
	        },
	        {
	            'Name': 'InstanceType',
	            'Value': INSTANCE_TYPE
	        },
	        {
	            'Name': 'GPUNumber',
	            'Value': str(i)
	        },
	    ]
	    cloudwatch.put_metric_data(
	        MetricData=[
	            {
	                'MetricName': 'GPU Usage',
	                'Dimensions': MY_DIMENSIONS,
	                'Unit': 'Percent',
	                'StorageResolution': store_reso,
	                'Value': util.gpu
	            },
	            {
	                'MetricName': 'Memory Usage',
	                'Dimensions': MY_DIMENSIONS,
	                'Unit': 'Percent',
	                'StorageResolution': store_reso,
	                'Value': util.memory
	            },
	            {
	                'MetricName': 'Power Usage (Watts)',
	                'Dimensions': MY_DIMENSIONS,
	                'Unit': 'None',
	                'StorageResolution': store_reso,
	                'Value': float(powDrawStr)
	            },
	            {
	                'MetricName': 'Temperature (C)',
	                'Dimensions': MY_DIMENSIONS,
	                'Unit': 'None',
	                'StorageResolution': store_reso,
	                'Value': int(temp)
	            },
	        ],
	        Namespace=my_NameSpace
	    )


	# Initialise NVML once at import time; main() calls nvmlShutdown() on exit.
	nvmlInit()
	# Number of GPUs reported by NVML; main() polls indices 0..deviceCount-1.
	deviceCount = nvmlDeviceGetCount()

	def main():
	    """Poll every GPU forever and hand each sample to logResults.

	    Each round queries power draw, temperature and utilization for GPU
	    indices 0..deviceCount-1, then sleeps for sleep_interval. The loop has
	    no exit condition; it ends only when an exception propagates (for
	    example KeyboardInterrupt), in which case nvmlShutdown() is still
	    called.
	    """
	    # Required: without it the assignment in the loop would only bind an
	    # unused local and never reset the module-level flag read by logResults.
	    global PUSH_TO_CW
	    try:
	        while True:
	            # Re-arm pushing at the start of every polling round.
	            PUSH_TO_CW = True
	            # Find the metrics for each GPU on instance
	            for i in range(deviceCount):
	                handle = nvmlDeviceGetHandleByIndex(i)

	                powDrawStr = getPowerDraw(handle)
	                temp = getTemp(handle)
	                util, gpu_util, mem_util = getUtilization(handle)
	                logResults(i, util, gpu_util, mem_util, powDrawStr, temp)

	            sleep(sleep_interval)

	    finally:
	        nvmlShutdown()

	# Start the polling loop only when executed as a script, not on import.
	# The call is indented under the guard; it had lost its indentation.
	if __name__ == '__main__':
	    main()