Last active
September 25, 2016 15:32
-
-
Save achimnol/3404967 to your computer and use it in GitHub Desktop.
An upstart configuration for NVIDIA CUDA device initialization without X environments (derived from Sangjin Han's init.d script)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Upstart job that prepares NVIDIA CUDA devices on a host without X.
description "CUDA initialization without X environments"
# Start when entering runlevels 2-5; stop when entering any other runlevel.
start on runlevel [2345]
stop on runlevel [!2345]
pre-start script
    # Loads the NVIDIA kernel module and creates the /dev/nvidia* character
    # devices before the main process is started.
    #
    # Upstart runs script sections with "sh -e", so a failing command aborts
    # the script on the spot. Every failure is therefore handled inline with
    # "||" so that its error message is actually printed.
    DRIVER=nvidia
    HOLD_GPU=/usr/local/bin/hold_gpu.py

    # Refuse to start when the holder program is not installed.
    [ -x "$HOLD_GPU" ] || { echo "$HOLD_GPU is missing or not executable!"; exit 1; }

    echo "Loading $DRIVER kernel module..."
    modprobe "$DRIVER" NVreg_EnableMSI=1 || { echo "Loading $DRIVER kernel module has failed."; exit 1; }

    echo -n "Initializing CUDA /dev entries... "
    # Count the NVIDIA entries that lspci reports as "3D controller" or
    # "VGA compatible controller". Keep "| wc -l" (not "grep -c"): the exit
    # status of the pipeline is then wc's, so zero matches do not trip "sh -e".
    N3D=`/usr/bin/lspci | grep -i NVIDIA | grep "3D controller" | wc -l`
    NVGA=`/usr/bin/lspci | grep -i NVIDIA | grep "VGA compatible controller" | wc -l`
    # Index of the last device node to create; -1 means no device was found.
    N=$(($N3D + $NVGA - 1))
    [ "$N" = -1 ] && { echo "No CUDA devices are found."; stop; exit 0; }

    # One character device per GPU: major 195, minor = device index.
    for i in `seq 0 $N`; do
        DEVFILE=/dev/nvidia$i
        if [ ! -c "$DEVFILE" ]; then
            mknod -m 666 "$DEVFILE" c 195 "$i" || { echo "Creation of /dev/nvidia$i failed."; exit 1; }
        fi
    done

    # The control device uses minor number 255.
    DEVFILE=/dev/nvidiactl
    if [ ! -c "$DEVFILE" ]; then
        mknod -m 666 "$DEVFILE" c 195 255 || { echo "Creation of /dev/nvidiactl failed."; exit 1; }
    fi
    echo "Done."
end script
# Main process of the job; pre-start verifies that this same path is executable.
exec /usr/local/bin/hold_gpu.py
post-stop script
    # Unloads the NVIDIA kernel modules and removes the /dev/nvidia* entries
    # after the main process has stopped.
    #
    # Upstart runs script sections with "sh -e", so failures are handled
    # inline with "||"; a separate "$?" check would never be reached.
    echo "Unloading nvidia_uvm, nvidia kernel module..."
    # pre-start only loads "nvidia", so nvidia_uvm may not be loaded at all;
    # skip it in that case instead of aborting the rest of the cleanup.
    if grep -q '^nvidia_uvm ' /proc/modules; then
        rmmod -f nvidia_uvm || { echo "Unloading nvidia_uvm kernel module has failed."; exit 1; }
    fi
    rmmod -f nvidia || { echo "Unloading nvidia kernel module has failed."; exit 1; }
    echo -n "Removing CUDA /dev entries... "
    rm -f /dev/nvidia*
    echo "Done."
end script
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#! /usr/bin/env python | |
# This script sets the IRQ CPU affinity of each GPU and holds a handle to the CUDA device by staying alive.
# Copy it into the /usr/local/bin directory, or create a symlink to it there.
from __future__ import print_function | |
import sys, time | |
import logging, logging.handlers | |
import subprocess, multiprocessing | |
import ctypes, ctypes.util | |
# Shared libraries used through ctypes. Each one is located by its short
# name with ctypes.util.find_library() and then loaded with ctypes.CDLL().
_numa_library_path = ctypes.util.find_library('numa')
_libnuma = ctypes.CDLL(_numa_library_path)
_cudart_library_path = ctypes.util.find_library('cudart')
_libcuda = ctypes.CDLL(_cudart_library_path)
def execute(cmd, check_returncode=True):
    """Run *cmd* through the shell and return its standard output as text.

    Only stdout is captured; it is decoded as ASCII before being returned.
    The command line is logged at DEBUG level once the process has finished.

    Args:
        cmd: Shell command line to run.
        check_returncode: When true, a non-zero exit status raises
            ``subprocess.CalledProcessError``.

    Returns:
        The captured standard output, decoded as ASCII.
    """
    child = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE)
    # stderr is not piped, so only the first element of the pair is useful.
    captured = child.communicate()[0]
    logger.debug(cmd)
    exit_status = child.returncode
    if check_returncode and exit_status != 0:
        raise subprocess.CalledProcessError(exit_status, cmd)
    return captured.decode('ascii')
def read_sysfs(bus_id, name):
    """Return the stripped contents of a PCI device attribute file in sysfs.

    Args:
        bus_id: Directory name of the device under ``/sys/bus/pci/devices``.
        name: Name of the attribute file inside that directory.

    Returns:
        The file contents with leading and trailing whitespace removed.
    """
    attribute_path = '/sys/bus/pci/devices/{0}/{1}'.format(bus_id, name)
    with open(attribute_path, 'r') as attribute_file:
        contents = attribute_file.read()
    return contents.strip()
def get_cpu_topology():
    """Group logical CPU indices by the NUMA node that libnuma reports for them.

    Returns:
        A list with one entry per configured NUMA node; entry ``i`` lists,
        in ascending order, the CPU indices that libnuma places on node ``i``.
    """
    node_count = _libnuma.numa_num_configured_nodes()
    cpus_by_node = [[] for _ in range(node_count)]
    for cpu in range(multiprocessing.cpu_count()):
        owner = int(_libnuma.numa_node_of_cpu(ctypes.c_int(cpu)))
        cpus_by_node[owner].append(cpu)
    return cpus_by_node
def set_gpu_affinity():
    """Grab a CUDA context and pin each NVIDIA GPU's IRQ to a core of its NUMA node.

    The NVIDIA GPUs are enumerated with ``lspci``; for each one the NUMA node
    and IRQ number are read from sysfs, a core is picked from the end of that
    node's CPU list (one further from the end for every additional GPU on the
    same node), and the core's bit mask is written to
    ``/proc/irq/<irq>/smp_affinity``.

    Raises:
        AssertionError: If there are fewer than 4 CPUs, a CUDA runtime call
            returns a non-zero status, or the CUDA device count differs from
            the number of NVIDIA GPUs listed by ``lspci``.
        subprocess.CalledProcessError: If a shell command run through
            ``execute()`` exits with a non-zero status.
    """
    def _require(condition, message):
        # Raised explicitly (rather than with ``assert``) so the checks are
        # not stripped under ``python -O``; the exception type is unchanged.
        if not condition:
            raise AssertionError(message)

    num_cpus = multiprocessing.cpu_count()
    _require(num_cpus >= 4, 'Too few number of CPU cores.')

    # You should put the path to the CUDA toolkit library into LD_LIBRARY_PATH envvar
    # or into the system-wide configuration at /etc/ld.so.conf.d/.
    count = ctypes.c_int(0)
    # This will initialize a CUDA device context,
    # which holds the handle to the device until this program exits.
    ret = _libcuda.cudaGetDeviceCount(ctypes.byref(count))
    _require(ret == 0, 'cudaGetDeviceCount() has failed.')
    logger.info('Grabbed a CUDA context.')
    logger.info('Number of GPUs detected: {0}'.format(count.value))

    # Match both device classes that the upstart job counts when it creates
    # the /dev/nvidiaN entries ("VGA compatible controller" and "3D controller").
    result = execute('lspci | grep -i nvidia | grep -E "VGA|3D controller"').strip()
    lines = result.splitlines()
    _require(len(lines) == count.value,
             'Mismatch of CUDA device count and # NVIDIA GPUs on the PCI bus?')

    # Example: for dual processor octa-core systems with HT enabled (total 32 logical cores),
    # the two GPUs are mapped to cores 23 and 31 respectively.
    # Example2: if there are two GPUs per node, they are mapped to core 22/23,
    # 30/31 depending on their actual node location.
    node_cpus = get_cpu_topology()
    node_devices = [0] * len(node_cpus)
    for line in lines:
        bus_id = line.split(' ', 1)[0]
        # lspci omits the PCI domain unless asked to print it; only add the
        # default domain when the address has the short "bus:device.function" form.
        if bus_id.count(':') == 1:
            bus_id = '0000:' + bus_id
        node_id = int(read_sysfs(bus_id, 'numa_node'))
        # sysfs reports -1 when the device has no NUMA node assigned; treat
        # that as node 0 instead of silently indexing the last node.
        if node_id < 0:
            node_id = 0
        irq = int(read_sysfs(bus_id, 'irq'))
        node_devices[node_id] += 1
        # The n-th GPU found on a node gets the n-th core from the end of
        # that node's CPU list.
        core_id = node_cpus[node_id][-node_devices[node_id]]
        cuda_dev_id = ctypes.c_int(0)
        # c_char_p needs a byte string on Python 3.
        ret = _libcuda.cudaDeviceGetByPCIBusId(
            ctypes.byref(cuda_dev_id), ctypes.c_char_p(bus_id.encode('ascii'))
        )
        _require(ret == 0, 'cudaDeviceGetByPCIBusId() has failed.')
        logger.info('GPU (BusId: {0}, DeviceId: {1}) is on node {2}, setting IRQ {3}\'s affinity to core {4}.'.format(
            bus_id, cuda_dev_id.value, node_id, irq, core_id
        ))
        # smp_affinity takes the mask as comma-separated groups of at most
        # 8 hex digits; for cores below 32 this is the plain hex mask.
        mask = '{0:x}'.format(1 << core_id)
        groups = []
        while mask:
            groups.insert(0, mask[-8:])
            mask = mask[:-8]
        execute('echo {0} > /proc/irq/{1:d}/smp_affinity'.format(','.join(groups), irq))
if __name__ == '__main__':
    # Script entry point: configure logging, set the GPU IRQ affinity once,
    # then stay alive until the process is asked to stop.
    import signal

    log_formatter = logging.Formatter('%(module)s[%(process)d]: %(levelname)s: %(message)s')
    logger = logging.getLogger(__name__)
    logger.setLevel(logging.INFO)
    # Log both to the local syslog socket and to the console stream.
    syslog_handler = logging.handlers.SysLogHandler(address='/dev/log')
    syslog_handler.setFormatter(log_formatter)
    syslog_handler.setLevel(logging.DEBUG)
    stdout_handler = logging.StreamHandler()
    stdout_handler.setFormatter(log_formatter)
    logger.addHandler(syslog_handler)
    logger.addHandler(stdout_handler)

    try:
        set_gpu_affinity()
    except Exception:
        # Top-level boundary: besides failed checks (AssertionError), also
        # record failed shell commands and sysfs read errors before exiting.
        logger.exception('Terminating due to an error.')
        sys.exit(1)

    def _exit_on_sigterm(signum, frame):
        # SIGTERM does not raise an exception by default; turn it into
        # SystemExit so the shutdown path below runs.
        sys.exit(0)

    signal.signal(signal.SIGTERM, _exit_on_sigterm)

    logger.info('Staying alive...')
    try:
        while True:
            time.sleep(1)
    except (SystemExit, KeyboardInterrupt):
        print()
    logger.info('Terminated.')
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment