How to use

  1. Start with a GPU EC2 instance whose service role has a policy that allows putting metrics to Amazon CloudWatch (see https://aws.amazon.com/blogs/machine-learning/monitoring-gpu-utilization-with-amazon-cloudwatch/). A sketch of attaching such a policy follows this list.
  2. Put a bunch of 299x299 JPEG images in test-images/, e.g. cd test-images && for run in {1..10}; do wget via.placeholder.com/299x299.jpg; done (wget saves the repeated downloads as 299x299.jpg.1, 299x299.jpg.2, ..., which is what the glob pattern in the TensorFlow script matches).
  3. pip3 install nvidia-ml-py3
  4. Edit run.sh to run gpumon.py alongside another process (which is presumably using the GPU).
  5. Open CloudWatch in the AWS Console to view the graphs.
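
For step 1, a minimal sketch of granting the instance's service role permission to publish metrics, here via boto3. The role name is a hypothetical placeholder, and the managed CloudWatchAgentServerPolicy is one built-in policy that includes the cloudwatch:PutMetricData permission:

import boto3

iam = boto3.client('iam')
# Role name is a placeholder; use the role actually attached to your instance profile.
iam.attach_role_policy(
    RoleName='MyGpuInstanceRole',
    PolicyArn='arn:aws:iam::aws:policy/CloudWatchAgentServerPolicy'
)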
# gpumon.py
# Copyright 2017 Amazon.com, Inc. or its affiliates. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License").
# You may not use this file except in compliance with the License.
# A copy of the License is located at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# or in the "license" file accompanying this file. This file is distributed
# on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
# express or implied. See the License for the specific language governing
# permissions and limitations under the License.
import urllib.request
import boto3
from pynvml import *
from datetime import datetime
from time import sleep
### CHOOSE REGION ####
EC2_REGION = 'us-east-1'
### CHOOSE NAMESPACE PARAMETERS HERE ###
my_NameSpace = 'DeepLearningTrain'
### CHOOSE PUSH INTERVAL ####
sleep_interval = 10
### CHOOSE STORAGE RESOLUTION (BETWEEN 1-60) ####
store_reso = 60
#Instance information
BASE_URL = 'http://169.254.169.254/latest/meta-data/'
INSTANCE_ID = urllib.request.urlopen(BASE_URL + 'instance-id').read().decode('utf-8')
IMAGE_ID = urllib.request.urlopen(BASE_URL + 'ami-id').read().decode('utf-8')
INSTANCE_TYPE = urllib.request.urlopen(BASE_URL + 'instance-type').read().decode('utf-8')
INSTANCE_AZ = urllib.request.urlopen(BASE_URL + 'placement/availability-zone').read().decode('utf-8')
EC2_REGION = INSTANCE_AZ[:-1]  # derive the region from the AZ; overrides the default set above
TIMESTAMP = datetime.now().strftime('%Y-%m-%dT%H')
TMP_FILE = '/tmp/GPU_TEMP'
TMP_FILE_SAVED = TMP_FILE + TIMESTAMP
print(EC2_REGION)
# Create CloudWatch client
cloudwatch = boto3.client('cloudwatch', region_name=EC2_REGION)
# Flag to push to CloudWatch
PUSH_TO_CW = True
# pynvml's nvidia_smi module normally supplies handleError; a minimal
# equivalent is defined here so the script is self-contained.
def handleError(err):
    if err.value == NVML_ERROR_NOT_SUPPORTED:
        return 'N/A'
    return err.__str__()

def getPowerDraw(handle):
    global PUSH_TO_CW
    try:
        powDraw = nvmlDeviceGetPowerUsage(handle) / 1000.0
        powDrawStr = '%.2f' % powDraw
    except NVMLError as err:
        powDrawStr = handleError(err)
        PUSH_TO_CW = False
    return powDrawStr

def getTemp(handle):
    global PUSH_TO_CW
    try:
        temp = str(nvmlDeviceGetTemperature(handle, NVML_TEMPERATURE_GPU))
    except NVMLError as err:
        temp = handleError(err)
        PUSH_TO_CW = False
    return temp

def getUtilization(handle):
    global PUSH_TO_CW
    try:
        util = nvmlDeviceGetUtilizationRates(handle)
        gpu_util = str(util.gpu)
        mem_util = str(util.memory)
    except NVMLError as err:
        error = handleError(err)
        util = None
        gpu_util = error
        mem_util = error
        PUSH_TO_CW = False
    return util, gpu_util, mem_util

def logResults(i, util, gpu_util, mem_util, powDrawStr, temp):
    try:
        with open(TMP_FILE_SAVED, 'a+') as gpu_logs:
            writeString = str(i) + ',' + gpu_util + ',' + mem_util + ',' + powDrawStr + ',' + temp + '\n'
            gpu_logs.write(writeString)
    except IOError as err:
        print('Error writing to file', TMP_FILE_SAVED, err)
    if PUSH_TO_CW:
        MY_DIMENSIONS = [
            {
                'Name': 'InstanceId',
                'Value': INSTANCE_ID
            },
            {
                'Name': 'ImageId',
                'Value': IMAGE_ID
            },
            {
                'Name': 'InstanceType',
                'Value': INSTANCE_TYPE
            },
            {
                'Name': 'GPUNumber',
                'Value': str(i)
            }
        ]
        cloudwatch.put_metric_data(
            MetricData=[
                {
                    'MetricName': 'GPU Usage',
                    'Dimensions': MY_DIMENSIONS,
                    'Unit': 'Percent',
                    'StorageResolution': store_reso,
                    'Value': util.gpu
                },
                {
                    'MetricName': 'Memory Usage',
                    'Dimensions': MY_DIMENSIONS,
                    'Unit': 'Percent',
                    'StorageResolution': store_reso,
                    'Value': util.memory
                },
                {
                    'MetricName': 'Power Usage (Watts)',
                    'Dimensions': MY_DIMENSIONS,
                    'Unit': 'None',
                    'StorageResolution': store_reso,
                    'Value': float(powDrawStr)
                },
                {
                    'MetricName': 'Temperature (C)',
                    'Dimensions': MY_DIMENSIONS,
                    'Unit': 'None',
                    'StorageResolution': store_reso,
                    'Value': int(temp)
                },
            ],
            Namespace=my_NameSpace
        )
nvmlInit()
deviceCount = nvmlDeviceGetCount()
def main():
    global PUSH_TO_CW
    try:
        while True:
            PUSH_TO_CW = True
            # Find the metrics for each GPU on the instance
            for i in range(deviceCount):
                handle = nvmlDeviceGetHandleByIndex(i)
                powDrawStr = getPowerDraw(handle)
                temp = getTemp(handle)
                util, gpu_util, mem_util = getUtilization(handle)
                logResults(i, util, gpu_util, mem_util, powDrawStr, temp)
            sleep(sleep_interval)
    finally:
        nvmlShutdown()

if __name__ == '__main__':
    main()
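
Once gpumon.py has been pushing for a while, the datapoints can also be read back programmatically instead of through the console. A rough sketch using boto3's get_metric_statistics; note that CloudWatch only matches a metric when the full dimension set used at publish time is supplied, and the dimension values below are placeholders for your instance's actual metadata:

import boto3
from datetime import datetime, timedelta

cloudwatch = boto3.client('cloudwatch', region_name='us-east-1')
response = cloudwatch.get_metric_statistics(
    Namespace='DeepLearningTrain',
    MetricName='GPU Usage',
    Dimensions=[
        {'Name': 'InstanceId', 'Value': 'i-0123456789abcdef0'},   # placeholder
        {'Name': 'ImageId', 'Value': 'ami-0123456789abcdef0'},    # placeholder
        {'Name': 'InstanceType', 'Value': 'p3.8xlarge'},          # placeholder
        {'Name': 'GPUNumber', 'Value': '0'}
    ],
    StartTime=datetime.utcnow() - timedelta(hours=1),
    EndTime=datetime.utcnow(),
    Period=60,
    Statistics=['Average']
)
for point in sorted(response['Datapoints'], key=lambda p: p['Timestamp']):
    print(point['Timestamp'], point['Average'])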
# how-to-use-all-gpus-in-tensorflow.py
import glob
import tensorflow as tf
# Get matching filenames
pattern = '../../test-images/*.jpg.*' # 299x299x3 JPEGs; wget names the repeated downloads *.jpg.1, *.jpg.2, ...
matching_filenames = glob.iglob(pattern)
# Pipeline settings
NUM_GPUS = 2
BATCH_SIZE = 32
INNER_LOOP_ITERS = 3
FEED_BATCH_SIZE = NUM_GPUS * BATCH_SIZE * INNER_LOOP_ITERS # ONE batch will be split into NUM_GPUS batches, so make the size appropriate.
# Generate batches of images (jpeg encoded)
batches, batch = [], []
id_batches, id_batch = [], []
for i, filename in enumerate(matching_filenames):
    with open(filename, 'rb') as f:
        batch.append(f.read())
    id_batch.append(i)
    if len(batch) == FEED_BATCH_SIZE:
        batches.append(batch)
        id_batches.append(id_batch)
        batch = []
        id_batch = []
###########################################
# Tensorflow pipeline for decoding images #
###########################################
# Create a dataset returning slices of `image_strings`
image_strings = tf.placeholder(tf.string, shape=[None])
id_ints = tf.placeholder(tf.int32, shape=[None])
dataset = tf.data.Dataset.from_tensor_slices((id_ints, image_strings))
# Parse every image in the dataset using `map`
def _parse_function(image_id, image_string):
    image_decoded = tf.image.decode_jpeg(image_string, channels=3)
    image = tf.cast(image_decoded, tf.float32)
    return image_id, image
dataset = dataset.map(_parse_function)
dataset = dataset.batch(BATCH_SIZE)
dataset = dataset.repeat()
# Create iterator and final tensor
# a = tf.ones(shape = [9,5,5,3])
# b = tf.random_uniform(shape = [9,5,3,1])
# c = tf.matmul(a,b) [shape=9,5,5,1]
iterator = dataset.make_initializable_iterator()
results = []
for i in range(NUM_GPUS):
    with tf.device('/gpu:{}'.format(i)):
        image_ids, images = iterator.get_next()
        W = tf.Variable(tf.random_normal(shape=(BATCH_SIZE, 299, 3, 1)))
        # Batched matmul: [BATCH_SIZE, 299, 299, 3] x [BATCH_SIZE, 299, 3, 1] -> [BATCH_SIZE, 299, 299, 1]
        result = tf.matmul(images, W)
        results.append((image_ids, result))
init_op = tf.global_variables_initializer()
# Feed image data to the dataset pipeline
config = tf.ConfigProto(device_count={'GPU': NUM_GPUS}, allow_soft_placement=True, log_device_placement=True)
with tf.Session(config=config) as sess:
    sess.run(init_op)
    while True:
        for id_batch, batch in zip(id_batches, batches):
            # Initialize the dataset iterator with new inputs (batch and id_batch
            # both have FEED_BATCH_SIZE elements)
            sess.run(iterator.initializer, {image_strings: batch, id_ints: id_batch})
            for _ in range(INNER_LOOP_ITERS):
                all_results = sess.run(results)
                for result in all_results:
                    ids, vectors = result
                    print('Got ids:', ids)
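
A quick arithmetic check of the batch sizing above: each sess.run(results) pulls one batch per GPU from the shared iterator, and FEED_BATCH_SIZE is chosen so the inner loop drains one feed exactly.

NUM_GPUS, BATCH_SIZE, INNER_LOOP_ITERS = 2, 32, 3
feed_size = NUM_GPUS * BATCH_SIZE * INNER_LOOP_ITERS   # 192 images per iterator feed
per_run = NUM_GPUS * BATCH_SIZE                        # 64 images consumed per sess.run(results)
assert INNER_LOOP_ITERS * per_run == feed_size         # 3 runs x 64 = 192: the feed is drained exactly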
#!/bin/bash
# run.sh
if pgrep -f "gpumon.py" &>/dev/null; then
    echo "gpumon already running"
else
    echo "gpumon not already running"
    python3 gpumon.py &
fi
python3 how-to-use-all-gpus-in-tensorflow.py
@sturfee-petrl

nvmlDeviceGetUtilizationRates doesn't return truly representative numbers: it retrieves only the current utilization rate (0-100) at the instant of the call, so it doesn't show the real picture if the input to the GPU isn't constant.
nvmlDeviceGetSamples returns an array of sampled rates, from which you can compute a real average.
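
A rough sketch of that approach, assuming the same nvidia-ml-py3 binding used above (NVML keeps an internal ring buffer of recent samples; each returned sample carries a timestamp and a value union):

from pynvml import *

nvmlInit()
handle = nvmlDeviceGetHandleByIndex(0)
# Passing 0 as the last-seen timestamp asks for every buffered sample.
sample_type, samples = nvmlDeviceGetSamples(handle, NVML_GPU_UTILIZATION_SAMPLES, 0)
if samples:
    avg_util = sum(s.sampleValue.uiVal for s in samples) / len(samples)
    print('Average GPU utilization over %d samples: %.1f%%' % (len(samples), avg_util))
nvmlShutdown()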
