@achimnol
Last active September 25, 2016 15:32
An Upstart configuration for NVIDIA CUDA device initialization without an X environment (derived from Sangjin Han's init.d script)
description "CUDA initialization without X environments"
start on runlevel [2345]
stop on runlevel [!2345]
pre-start script
DRIVER=nvidia
HOLD_GPU=/usr/local/bin/hold_gpu.py
[ -x "$HOLD_GPU" ] || { echo "$HOLD_GPU is missing or not executable!"; exit 1; }
echo "Loading $DRIVER kernel module..."
modprobe $DRIVER NVreg_EnableMSI=1
[ "$?" = 0 ] || { echo "Loading $DRIVER kernel module has failed."; exit 1; }
echo -n "Initializing CUDA /dev entries... "
N3D=`/usr/bin/lspci | grep -i NVIDIA | grep "3D controller" | wc -l`
NVGA=`/usr/bin/lspci | grep -i NVIDIA | grep "VGA compatible controller" | wc -l`
N=$(($N3D + $NVGA - 1))
[ "$N" = -1 ] && { echo "No CUDA devices are found."; stop; exit 0; }
for i in `seq 0 $N`; do
DEVFILE=/dev/nvidia$i
[ ! -c $DEVFILE ] && mknod -m 666 $DEVFILE c 195 $i || :
[ "$?" = 0 ] || { echo "Creation of /dev/nvidia$i failed."; exit 1; }
done
DEVFILE=/dev/nvidiactl
[ ! -c $DEVFILE ] && mknod -m 666 $DEVFILE c 195 255 || :
[ "$?" = 0 ] || { echo "Creation of /dev/nvidiactl failed."; exit 1; }
echo "Done."
end script
exec /usr/local/bin/hold_gpu.py
post-stop script
echo "Unloading nvidia_uvm, nvidia kernel module..."
rmmod -f nvidia_uvm
[ "$?" = 0 ] || { echo "Unloading nvidia_uvm kernel module has failed."; exit 1; }
rmmod -f nvidia
[ "$?" = 0 ] || { echo "Unloading nvidia kernel module has failed."; exit 1; }
echo -n "Removing CUDA /dev entries... "
rm -f /dev/nvidia*
echo "Done."
end script
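
Before relying on the job, it can help to verify that the /dev entries the pre-start section creates look right. Below is a minimal sanity-check sketch, not part of the original gist; the device paths and the major number 195 are taken from the script above.

#!/usr/bin/env python
# Sanity-check sketch (illustration only): verify that the CUDA /dev entries exist
# and are character devices with major number 195, as created by the pre-start section.
from __future__ import print_function
import glob, os, stat

for path in sorted(glob.glob('/dev/nvidia[0-9]*')) + ['/dev/nvidiactl']:
    st = os.stat(path)
    assert stat.S_ISCHR(st.st_mode), '{0} is not a character device'.format(path)
    assert os.major(st.st_rdev) == 195, '{0} has an unexpected major number'.format(path)
    print('{0}: OK (major {1}, minor {2})'.format(path, os.major(st.st_rdev), os.minor(st.st_rdev)))

The hold_gpu.py script referenced by the exec stanza follows.
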
#! /usr/bin/env python
# This script sets IRQ affinity for the GPUs and holds a handle to the CUDA devices by not dying.
# Copy it, or make a symlink to it, in the /usr/local/bin directory.
from __future__ import print_function
import sys, time
import logging, logging.handlers
import subprocess, multiprocessing
import ctypes, ctypes.util

_libnuma = ctypes.CDLL(ctypes.util.find_library('numa'))
_libcuda = ctypes.CDLL(ctypes.util.find_library('cudart'))


def execute(cmd, check_returncode=True):
    proc = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE)
    stdout, stderr = proc.communicate()
    logger.debug(cmd)
    if check_returncode and proc.returncode != 0:
        raise subprocess.CalledProcessError(proc.returncode, cmd)
    return stdout.decode('ascii')


def read_sysfs(bus_id, name):
    with open('/sys/bus/pci/devices/{0}/{1}'.format(bus_id, name), 'r') as f:
        return f.read().strip()


def get_cpu_topology():
    # Map each logical CPU to its NUMA node using libnuma.
    num_nodes = _libnuma.numa_num_configured_nodes()
    node_cpus = []
    for n in range(num_nodes):
        node_cpus.append([])
    for c in range(multiprocessing.cpu_count()):
        node_id = int(_libnuma.numa_node_of_cpu(ctypes.c_int(c)))
        node_cpus[node_id].append(c)
    return node_cpus


def set_gpu_affinity():
    num_cpus = multiprocessing.cpu_count()
    assert num_cpus >= 4, 'Too few CPU cores.'
    # You should put the path to the CUDA toolkit library into the LD_LIBRARY_PATH envvar
    # or into the system-wide configuration at /etc/ld.so.conf.d/.
    count = ctypes.c_int(0)
    # This will initialize a CUDA device context,
    # which holds the handle to the device until this program exits.
    ret = _libcuda.cudaGetDeviceCount(ctypes.byref(count))
    assert ret == 0, 'cudaGetDeviceCount() has failed.'
    logger.info('Grabbed a CUDA context.')
    logger.info('Number of GPUs detected: {0}'.format(count.value))
    result = execute('lspci | grep -i nvidia | grep VGA').strip()
    lines = result.splitlines()
    assert len(lines) == count.value, 'Mismatch between the CUDA device count and the number of NVIDIA GPUs on the PCI bus?'
    # Example: on a dual-processor octa-core system with HT enabled (32 logical cores in total),
    # each GPU is mapped to core 23 or 31.
    # Example 2: if there are two GPUs per node, they are mapped to cores 22/23 or
    # 30/31 depending on their actual node location.
    node_cpus = get_cpu_topology()
    node_devices = [0] * len(node_cpus)
    for dev_idx, line in enumerate(lines):
        bus_id, _ = line.split(' ', 1)
        bus_id = '0000:' + bus_id
        node_id = int(read_sysfs(bus_id, 'numa_node'))
        irq = int(read_sysfs(bus_id, 'irq'))
        # Pick the next unused core counted from the end of the GPU's NUMA node.
        node_devices[node_id] += 1
        core_id = node_cpus[node_id][-node_devices[node_id]]
        cuda_dev_id = ctypes.c_int(0)
        # c_char_p needs a bytes object (encode keeps this working on both Python 2 and 3).
        ret = _libcuda.cudaDeviceGetByPCIBusId(ctypes.byref(cuda_dev_id),
                                               ctypes.c_char_p(bus_id.encode('ascii')))
        assert ret == 0, 'cudaDeviceGetByPCIBusId() has failed.'
        logger.info('GPU (BusId: {0}, DeviceId: {1}) is on node {2}, setting IRQ {3}\'s affinity to core {4}.'.format(
            bus_id, cuda_dev_id.value, node_id, irq, core_id
        ))
        execute('echo {0:x} > /proc/irq/{1:d}/smp_affinity'.format(1 << core_id, irq))


if __name__ == '__main__':
    log_formatter = logging.Formatter('%(module)s[%(process)d]: %(levelname)s: %(message)s')
    logger = logging.getLogger(__name__)
    logger.setLevel(logging.INFO)
    syslog_handler = logging.handlers.SysLogHandler(address='/dev/log')
    syslog_handler.setFormatter(log_formatter)
    syslog_handler.setLevel(logging.DEBUG)
    stdout_handler = logging.StreamHandler()
    stdout_handler.setFormatter(log_formatter)
    logger.addHandler(syslog_handler)
    logger.addHandler(stdout_handler)
    try:
        set_gpu_affinity()
    except AssertionError:
        logger.exception('Terminating due to an error.')
        sys.exit(1)
    logger.info('Staying alive...')
    try:
        while True:
            time.sleep(1)
    except (SystemExit, KeyboardInterrupt):
        print()
    logger.info('Terminated.')
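
For reference, the value written to /proc/irq/<irq>/smp_affinity by set_gpu_affinity() is a hexadecimal CPU bitmask with a single bit set for the chosen core. A tiny illustration using the 32-logical-core example from the comments above (the core numbers are illustrative only):

# Illustration: how the smp_affinity mask relates to the chosen core index.
for core_id in (23, 31):
    print('core {0} -> smp_affinity mask {1:x}'.format(core_id, 1 << core_id))
# core 23 -> smp_affinity mask 800000
# core 31 -> smp_affinity mask 80000000

When run by hand instead of via the Upstart job, the script needs root privileges (it writes under /proc/irq), logs to both syslog and stdout, and then sleeps until interrupted.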