Created

Embed URL

HTTPS clone URL

SSH clone URL

You can clone with HTTPS or SSH.

Download Gist

Torque submit filter that sets CUDA_VISIBLE_DEVICES based on the number of GPUs requested for a job.

View filter.py
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32
#!/usr/bin/python2.7
 
"""
torque submit filter that automatically sets CUDA_VISIBLE_DEVICES
based upon the number of GPUs requested in a job.
Notes
-----
Assumes that /var/spool/torque/filters/trqgpu.py is available.
"""
 
import re
import sys
 
# Line inserted after the PBS directives; the helper script prints the
# comma-separated GPU ids for the node the job runs on.
EXPORT_LINE = ("export CUDA_VISIBLE_DEVICES="
               "$(/usr/bin/python2.7 /var/spool/torque/filters/trqgpu.py)\n")


def filter_script(lines):
    """
    Rewrite a submitted job script so that CUDA_VISIBLE_DEVICES is exported.

    Parameters
    ----------
    lines : iterable of str
        Lines of the submitted job script (normally ``sys.stdin``).

    Returns
    -------
    result : list of str
        Newline-terminated output lines: a ``#!/bin/sh`` interpreter line,
        every ``#PBS`` directive in its original order, the export line,
        and then all remaining lines in their original order.  Any input
        line starting with ``#!`` is dropped.
    """

    directive_lines = []
    remaining_lines = []
    for line in lines:
        # The last line of the input may lack a trailing newline.  Without
        # this, a final "#PBS" directive would end up on the same output
        # line as the export statement that follows the directives.
        if not line.endswith('\n'):
            line += '\n'

        if line.startswith('#!'):
            continue
        elif line.startswith('#PBS'):
            directive_lines.append(line)
        else:
            remaining_lines.append(line)

    # Make sure that all PBS directives occur before CUDA_VISIBLE_DEVICES
    # is set:
    return (['#!/bin/sh\n'] + directive_lines + [EXPORT_LINE] +
            remaining_lines)


def main():
    """Filter the job script read from stdin and write it to stdout."""

    sys.stdout.writelines(filter_script(sys.stdin))


if __name__ == '__main__':
    main()
View filter.py
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94
#!/usr/bin/python2.7
 
"""
Utilities to facilitate retrieval of available GPUs set by torque.
"""
 
import re
import os
 
import sys
import socket
import fcntl
import struct
import array
 
def all_interfaces():
    """
    Get all network interfaces.

    Returns
    -------
    result : list of tuple
        One ``(name, address)`` pair per record returned by the
        SIOCGIFCONF ioctl, where `address` is a dotted-quad IPv4 string.

    Notes
    -----
    http://code.activestate.com/recipes/439093-get-names-of-all-up-network-interfaces-linux-only/
    """

    is_64bits = sys.maxsize > 2**32
    struct_size = 40 if is_64bits else 32  # size of one returned record
    max_possible = 8  # initial guess at the number of interfaces
    s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
    try:
        while True:
            buf_len = max_possible * struct_size
            names = array.array('B', '\0' * buf_len)
            outbytes = struct.unpack('iL', fcntl.ioctl(
                s.fileno(),
                0x8912,  # SIOCGIFCONF
                struct.pack('iL', buf_len, names.buffer_info()[0])
            ))[0]
            # A completely filled buffer may mean the list was truncated;
            # retry with twice the room.
            if outbytes == buf_len:
                max_possible *= 2
            else:
                break
    finally:
        # The socket is only needed as a handle for the ioctl; close it so
        # the descriptor is not leaked.
        s.close()

    namestr = names.tostring()
    # Within each record the NUL-padded interface name occupies the first
    # 16 bytes and the packed IPv4 address is read from bytes 20-23.
    return [(namestr[i:i+16].split('\0', 1)[0],
             socket.inet_ntoa(namestr[i+20:i+24]))
            for i in range(0, outbytes, struct_size)]
 
def get_gpus():
    """
    Retrieve available GPUs set by torque.

    Reads the file named by the PBS_GPUFILE environment variable.  Each
    line is matched against ``<hostname>-gpu<N>``; lines that do not match
    (for example blank lines) are skipped.

    Returns
    -------
    result : dict
        Maps hostnames to lists of available GPU identifiers (ints, in the
        order they appear in the file).  Empty if PBS_GPUFILE is unset or
        the file it names does not exist.
    """

    filename = os.getenv('PBS_GPUFILE')
    if filename is None or not os.path.exists(filename):
        return {}

    result = {}
    with open(filename, 'r') as f:
        for line in f:
            match = re.search(r'^(.*)-gpu(\d+)', line)
            if match is None:
                # Ignore malformed lines rather than failing on
                # `None.groups()`.
                continue
            hostname, gpu = match.groups()
            result.setdefault(hostname, []).append(int(gpu))
    return result
 
def cuda_visible_devices():
    """
    Create the CUDA_VISIBLE_DEVICES value based on the available GPUs set by torque.

    Returns
    -------
    result : str
        Comma-separated GPU identifiers of the first hostname reported by
        `get_gpus` that resolves to the address of a local interface other
        than 'lo', or '' if no hostname matches.

    Notes
    -----
    The returned value corresponds to that for the machine on which the function is run.
    """

    local_addrs = {addr for name, addr in all_interfaces() if name != 'lo'}
    for hostname, gpus in get_gpus().items():
        try:
            addr = socket.gethostbyname(hostname)
        except socket.error:
            # A host that cannot be resolved from here cannot be matched to
            # a local address; keep checking the remaining hosts instead
            # of aborting the whole lookup.
            continue
        if addr in local_addrs:
            return ','.join(map(str, gpus))
    return ''
 
if __name__ == '__main__':
    # Write the device list for this machine on its own line; write nothing
    # at all when no devices were found.
    devices = cuda_visible_devices()
    if devices:
        sys.stdout.write(devices + '\n')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Something went wrong with that request. Please try again.