How to use

  1. Start with a GPU EC2 instance whose service role has a policy that allows putting metrics to Amazon CloudWatch (see https://aws.amazon.com/blogs/machine-learning/monitoring-gpu-utilization-with-amazon-cloudwatch/). A sketch of attaching such a policy follows this list.
  2. Put a bunch of 299x299 JPEG images in test-images/, e.g. cd test-images && for run in {1..10}; do wget via.placeholder.com/299x299.jpg; done (wget saves the repeated downloads as 299x299.jpg.1, 299x299.jpg.2, ..., which is what the glob pattern in the TensorFlow script matches).
  3. pip3 install nvidia-ml-py3
  4. Edit run.sh to run gpumon.py alongside another process (which is presumably using the GPU).
  5. Open CloudWatch in the AWS Console to view the graphs.
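
For step 1, a minimal sketch of granting the instance's service role permission to publish metrics, here via boto3. The role name is a hypothetical placeholder, and the managed CloudWatchAgentServerPolicy is one built-in policy that includes the cloudwatch:PutMetricData permission:

import boto3

iam = boto3.client('iam')
# Role name is a placeholder; use the role actually attached to your instance profile.
iam.attach_role_policy(
    RoleName='MyGpuInstanceRole',
    PolicyArn='arn:aws:iam::aws:policy/CloudWatchAgentServerPolicy'
)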
# gpumon.py
# Copyright 2017 Amazon.com, Inc. or its affiliates. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License").
# You may not use this file except in compliance with the License.
# A copy of the License is located at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# or in the "license" file accompanying this file. This file is distributed
# on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
# express or implied. See the License for the specific language governing
# permissions and limitations under the License.
import urllib.request
import boto3
from pynvml import *
from datetime import datetime
from time import sleep
### CHOOSE REGION ####
EC2_REGION = 'us-east-1'
### CHOOSE NAMESPACE PARAMETERS HERE ###
my_NameSpace = 'DeepLearningTrain'
### CHOOSE PUSH INTERVAL ####
sleep_interval = 10
### CHOOSE STORAGE RESOLUTION (BETWEEN 1-60) ####
store_reso = 60
#Instance information
BASE_URL = 'http://169.254.169.254/latest/meta-data/'
INSTANCE_ID = urllib.request.urlopen(BASE_URL + 'instance-id').read().decode('utf-8')
IMAGE_ID = urllib.request.urlopen(BASE_URL + 'ami-id').read().decode('utf-8')
INSTANCE_TYPE = urllib.request.urlopen(BASE_URL + 'instance-type').read().decode('utf-8')
INSTANCE_AZ = urllib.request.urlopen(BASE_URL + 'placement/availability-zone').read().decode('utf-8')
EC2_REGION = INSTANCE_AZ[:-1]  # derive the region from the AZ; overrides the default set above
TIMESTAMP = datetime.now().strftime('%Y-%m-%dT%H')
TMP_FILE = '/tmp/GPU_TEMP'
TMP_FILE_SAVED = TMP_FILE + TIMESTAMP
print(EC2_REGION)
# Create CloudWatch client
cloudwatch = boto3.client('cloudwatch', region_name=EC2_REGION)
# Flag to push to CloudWatch
PUSH_TO_CW = True
# pynvml's nvidia_smi module normally supplies handleError; a minimal
# equivalent is defined here so the script is self-contained.
def handleError(err):
    if err.value == NVML_ERROR_NOT_SUPPORTED:
        return 'N/A'
    return err.__str__()

def getPowerDraw(handle):
    global PUSH_TO_CW
    try:
        powDraw = nvmlDeviceGetPowerUsage(handle) / 1000.0
        powDrawStr = '%.2f' % powDraw
    except NVMLError as err:
        powDrawStr = handleError(err)
        PUSH_TO_CW = False
    return powDrawStr

def getTemp(handle):
    global PUSH_TO_CW
    try:
        temp = str(nvmlDeviceGetTemperature(handle, NVML_TEMPERATURE_GPU))
    except NVMLError as err:
        temp = handleError(err)
        PUSH_TO_CW = False
    return temp

def getUtilization(handle):
    global PUSH_TO_CW
    try:
        util = nvmlDeviceGetUtilizationRates(handle)
        gpu_util = str(util.gpu)
        mem_util = str(util.memory)
    except NVMLError as err:
        error = handleError(err)
        util = None
        gpu_util = error
        mem_util = error
        PUSH_TO_CW = False
    return util, gpu_util, mem_util

def logResults(i, util, gpu_util, mem_util, powDrawStr, temp):
    try:
        with open(TMP_FILE_SAVED, 'a+') as gpu_logs:
            writeString = str(i) + ',' + gpu_util + ',' + mem_util + ',' + powDrawStr + ',' + temp + '\n'
            gpu_logs.write(writeString)
    except IOError as err:
        print('Error writing to file', TMP_FILE_SAVED, err)
    if PUSH_TO_CW:
        MY_DIMENSIONS = [
            {
                'Name': 'InstanceId',
                'Value': INSTANCE_ID
            },
            {
                'Name': 'ImageId',
                'Value': IMAGE_ID
            },
            {
                'Name': 'InstanceType',
                'Value': INSTANCE_TYPE
            },
            {
                'Name': 'GPUNumber',
                'Value': str(i)
            }
        ]
        cloudwatch.put_metric_data(
            MetricData=[
                {
                    'MetricName': 'GPU Usage',
                    'Dimensions': MY_DIMENSIONS,
                    'Unit': 'Percent',
                    'StorageResolution': store_reso,
                    'Value': util.gpu
                },
                {
                    'MetricName': 'Memory Usage',
                    'Dimensions': MY_DIMENSIONS,
                    'Unit': 'Percent',
                    'StorageResolution': store_reso,
                    'Value': util.memory
                },
                {
                    'MetricName': 'Power Usage (Watts)',
                    'Dimensions': MY_DIMENSIONS,
                    'Unit': 'None',
                    'StorageResolution': store_reso,
                    'Value': float(powDrawStr)
                },
                {
                    'MetricName': 'Temperature (C)',
                    'Dimensions': MY_DIMENSIONS,
                    'Unit': 'None',
                    'StorageResolution': store_reso,
                    'Value': int(temp)
                },
            ],
            Namespace=my_NameSpace
        )
nvmlInit()
deviceCount = nvmlDeviceGetCount()
def main():
    global PUSH_TO_CW
    try:
        while True:
            PUSH_TO_CW = True
            # Find the metrics for each GPU on the instance
            for i in range(deviceCount):
                handle = nvmlDeviceGetHandleByIndex(i)
                powDrawStr = getPowerDraw(handle)
                temp = getTemp(handle)
                util, gpu_util, mem_util = getUtilization(handle)
                logResults(i, util, gpu_util, mem_util, powDrawStr, temp)
            sleep(sleep_interval)
    finally:
        nvmlShutdown()

if __name__ == '__main__':
    main()
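
Once gpumon.py has been pushing for a while, the datapoints can also be read back programmatically instead of through the console. A rough sketch using boto3's get_metric_statistics; note that CloudWatch only matches a metric when the full dimension set used at publish time is supplied, and the dimension values below are placeholders for your instance's actual metadata:

import boto3
from datetime import datetime, timedelta

cloudwatch = boto3.client('cloudwatch', region_name='us-east-1')
response = cloudwatch.get_metric_statistics(
    Namespace='DeepLearningTrain',
    MetricName='GPU Usage',
    Dimensions=[
        {'Name': 'InstanceId', 'Value': 'i-0123456789abcdef0'},   # placeholder
        {'Name': 'ImageId', 'Value': 'ami-0123456789abcdef0'},    # placeholder
        {'Name': 'InstanceType', 'Value': 'p3.8xlarge'},          # placeholder
        {'Name': 'GPUNumber', 'Value': '0'}
    ],
    StartTime=datetime.utcnow() - timedelta(hours=1),
    EndTime=datetime.utcnow(),
    Period=60,
    Statistics=['Average']
)
for point in sorted(response['Datapoints'], key=lambda p: p['Timestamp']):
    print(point['Timestamp'], point['Average'])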
# how-to-use-all-gpus-in-tensorflow.py
import glob
import tensorflow as tf
# Get matching filenames
pattern = '../../test-images/*.jpg.*' # 299x299x3 JPEGs; wget names the repeated downloads *.jpg.1, *.jpg.2, ...
matching_filenames = glob.iglob(pattern)
# Pipeline settings
NUM_GPUS = 2
BATCH_SIZE = 32
INNER_LOOP_ITERS = 3
FEED_BATCH_SIZE = NUM_GPUS * BATCH_SIZE * INNER_LOOP_ITERS # ONE batch will be split into NUM_GPUS batches, so make the size appropriate.
# Generate batches of images (jpeg encoded)
batches, batch = [], []
id_batches, id_batch = [], []
for i, filename in enumerate(matching_filenames):
    with open(filename, 'rb') as f:
        batch.append(f.read())
    id_batch.append(i)
    if len(batch) == FEED_BATCH_SIZE:
        batches.append(batch)
        id_batches.append(id_batch)
        batch = []
        id_batch = []
###########################################
# Tensorflow pipeline for decoding images #
###########################################
# Create a dataset returning slices of `image_strings`
image_strings = tf.placeholder(tf.string, shape=[None])
id_ints = tf.placeholder(tf.int32, shape=[None])
dataset = tf.data.Dataset.from_tensor_slices((id_ints, image_strings))
# Parse every image in the dataset using `map`
def _parse_function(image_id, image_string):
    image_decoded = tf.image.decode_jpeg(image_string, channels=3)
    image = tf.cast(image_decoded, tf.float32)
    return image_id, image
dataset = dataset.map(_parse_function)
dataset = dataset.batch(BATCH_SIZE)
dataset = dataset.repeat()
# Create iterator and final tensor
# a = tf.ones(shape = [9,5,5,3])
# b = tf.random_uniform(shape = [9,5,3,1])
# c = tf.matmul(a,b) [shape=9,5,5,1]
iterator = dataset.make_initializable_iterator()
results = []
for i in range(NUM_GPUS):
    with tf.device('/gpu:{}'.format(i)):
        image_ids, images = iterator.get_next()
        W = tf.Variable(tf.random_normal(shape=(BATCH_SIZE, 299, 3, 1)))
        # Batched matmul: [BATCH_SIZE, 299, 299, 3] x [BATCH_SIZE, 299, 3, 1] -> [BATCH_SIZE, 299, 299, 1]
        result = tf.matmul(images, W)
        results.append((image_ids, result))
init_op = tf.global_variables_initializer()
# Feed image data to the dataset pipeline
config = tf.ConfigProto(device_count={'GPU': NUM_GPUS}, allow_soft_placement=True, log_device_placement=True)
with tf.Session(config=config) as sess:
    sess.run(init_op)
    while True:
        for id_batch, batch in zip(id_batches, batches):
            # Initialize the dataset iterator with new inputs (batch and id_batch
            # both have FEED_BATCH_SIZE elements)
            sess.run(iterator.initializer, {image_strings: batch, id_ints: id_batch})
            for _ in range(INNER_LOOP_ITERS):
                all_results = sess.run(results)
                for result in all_results:
                    ids, vectors = result
                    print('Got ids:', ids)
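
A quick arithmetic check of the batch sizing above: each sess.run(results) pulls one batch per GPU from the shared iterator, and FEED_BATCH_SIZE is chosen so the inner loop drains one feed exactly.

NUM_GPUS, BATCH_SIZE, INNER_LOOP_ITERS = 2, 32, 3
feed_size = NUM_GPUS * BATCH_SIZE * INNER_LOOP_ITERS   # 192 images per iterator feed
per_run = NUM_GPUS * BATCH_SIZE                        # 64 images consumed per sess.run(results)
assert INNER_LOOP_ITERS * per_run == feed_size         # 3 runs x 64 = 192: the feed is drained exactly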
#!/bin/bash
# run.sh
if pgrep -f "gpumon.py" &>/dev/null; then
    echo "gpumon already running"
else
    echo "gpumon not already running"
    python3 gpumon.py &
fi
python3 how-to-use-all-gpus-in-tensorflow.py
@sturfee-petrl

nvmlDeviceGetUtilizationRates doesn't return truly representative numbers: it retrieves only the current utilization rate (0-100) at the instant of the call, so it doesn't show the real picture if the input to the GPU isn't constant.
nvmlDeviceGetSamples returns an array of sampled rates, from which you can compute a real average.
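
A rough sketch of that approach, assuming the same nvidia-ml-py3 binding used above (NVML keeps an internal ring buffer of recent samples; each returned sample carries a timestamp and a value union):

from pynvml import *

nvmlInit()
handle = nvmlDeviceGetHandleByIndex(0)
# Passing 0 as the last-seen timestamp asks for every buffered sample.
sample_type, samples = nvmlDeviceGetSamples(handle, NVML_GPU_UTILIZATION_SAMPLES, 0)
if samples:
    avg_util = sum(s.sampleValue.uiVal for s in samples) / len(samples)
    print('Average GPU utilization over %d samples: %.1f%%' % (len(samples), avg_util))
nvmlShutdown()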
