-
-
Save f0k/63a664160d016a491b2cbea15913d549 to your computer and use it in GitHub Desktop.
#!/usr/bin/env python | |
# -*- coding: utf-8 -*- | |
""" | |
Outputs some information on CUDA-enabled devices on your computer, | |
including current memory usage. | |
It's a port of https://gist.github.com/f0k/0d6431e3faa60bffc788f8b4daa029b1 | |
from C to Python with ctypes, so it can run without compiling anything. Note | |
that this is a direct translation with no attempt to make the code Pythonic. | |
It's meant as a general demonstration on how to obtain CUDA device information | |
from Python without resorting to nvidia-smi or a compiled Python extension. | |
Author: Jan Schlüter | |
License: MIT (https://gist.github.com/f0k/63a664160d016a491b2cbea15913d549#gistcomment-3870498) | |
""" | |
import sys | |
import ctypes | |
# Some constants taken from cuda.h | |
CUDA_SUCCESS = 0 | |
CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT = 16 | |
CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR = 39 | |
CU_DEVICE_ATTRIBUTE_CLOCK_RATE = 13 | |
CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE = 36 | |
def ConvertSMVer2Cores(major, minor): | |
# Returns the number of CUDA cores per multiprocessor for a given | |
# Compute Capability version. There is no way to retrieve that via | |
# the API, so it needs to be hard-coded. | |
# See _ConvertSMVer2Cores in helper_cuda.h in NVIDIA's CUDA Samples. | |
return {(1, 0): 8, # Tesla | |
(1, 1): 8, | |
(1, 2): 8, | |
(1, 3): 8, | |
(2, 0): 32, # Fermi | |
(2, 1): 48, | |
(3, 0): 192, # Kepler | |
(3, 2): 192, | |
(3, 5): 192, | |
(3, 7): 192, | |
(5, 0): 128, # Maxwell | |
(5, 2): 128, | |
(5, 3): 128, | |
(6, 0): 64, # Pascal | |
(6, 1): 128, | |
(6, 2): 128, | |
(7, 0): 64, # Volta | |
(7, 2): 64, | |
(7, 5): 64, # Turing | |
(8, 0): 64, # Ampere | |
(8, 6): 128, | |
(8, 7): 128, | |
(8, 9): 128, # Ada | |
(9, 0): 128, # Hopper | |
}.get((major, minor), 0) | |
def main(): | |
libnames = ('libcuda.so', 'libcuda.dylib', 'nvcuda.dll', 'cuda.dll') | |
for libname in libnames: | |
try: | |
cuda = ctypes.CDLL(libname) | |
except OSError: | |
continue | |
else: | |
break | |
else: | |
raise OSError("could not load any of: " + ' '.join(libnames)) | |
nGpus = ctypes.c_int() | |
name = b' ' * 100 | |
cc_major = ctypes.c_int() | |
cc_minor = ctypes.c_int() | |
cores = ctypes.c_int() | |
threads_per_core = ctypes.c_int() | |
clockrate = ctypes.c_int() | |
freeMem = ctypes.c_size_t() | |
totalMem = ctypes.c_size_t() | |
result = ctypes.c_int() | |
device = ctypes.c_int() | |
context = ctypes.c_void_p() | |
error_str = ctypes.c_char_p() | |
result = cuda.cuInit(0) | |
if result != CUDA_SUCCESS: | |
cuda.cuGetErrorString(result, ctypes.byref(error_str)) | |
print("cuInit failed with error code %d: %s" % (result, error_str.value.decode())) | |
return 1 | |
result = cuda.cuDeviceGetCount(ctypes.byref(nGpus)) | |
if result != CUDA_SUCCESS: | |
cuda.cuGetErrorString(result, ctypes.byref(error_str)) | |
print("cuDeviceGetCount failed with error code %d: %s" % (result, error_str.value.decode())) | |
return 1 | |
print("Found %d device(s)." % nGpus.value) | |
for i in range(nGpus.value): | |
result = cuda.cuDeviceGet(ctypes.byref(device), i) | |
if result != CUDA_SUCCESS: | |
cuda.cuGetErrorString(result, ctypes.byref(error_str)) | |
print("cuDeviceGet failed with error code %d: %s" % (result, error_str.value.decode())) | |
return 1 | |
print("Device: %d" % i) | |
if cuda.cuDeviceGetName(ctypes.c_char_p(name), len(name), device) == CUDA_SUCCESS: | |
print(" Name: %s" % (name.split(b'\0', 1)[0].decode())) | |
if cuda.cuDeviceComputeCapability(ctypes.byref(cc_major), ctypes.byref(cc_minor), device) == CUDA_SUCCESS: | |
print(" Compute Capability: %d.%d" % (cc_major.value, cc_minor.value)) | |
if cuda.cuDeviceGetAttribute(ctypes.byref(cores), CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, device) == CUDA_SUCCESS: | |
print(" Multiprocessors: %d" % cores.value) | |
print(" CUDA Cores: %s" % (cores.value * ConvertSMVer2Cores(cc_major.value, cc_minor.value) or "unknown")) | |
if cuda.cuDeviceGetAttribute(ctypes.byref(threads_per_core), CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR, device) == CUDA_SUCCESS: | |
print(" Concurrent threads: %d" % (cores.value * threads_per_core.value)) | |
if cuda.cuDeviceGetAttribute(ctypes.byref(clockrate), CU_DEVICE_ATTRIBUTE_CLOCK_RATE, device) == CUDA_SUCCESS: | |
print(" GPU clock: %g MHz" % (clockrate.value / 1000.)) | |
if cuda.cuDeviceGetAttribute(ctypes.byref(clockrate), CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE, device) == CUDA_SUCCESS: | |
print(" Memory clock: %g MHz" % (clockrate.value / 1000.)) | |
try: | |
result = cuda.cuCtxCreate_v2(ctypes.byref(context), 0, device) | |
except AttributeError: | |
result = cuda.cuCtxCreate(ctypes.byref(context), 0, device) | |
if result != CUDA_SUCCESS: | |
cuda.cuGetErrorString(result, ctypes.byref(error_str)) | |
print("cuCtxCreate failed with error code %d: %s" % (result, error_str.value.decode())) | |
else: | |
try: | |
result = cuda.cuMemGetInfo_v2(ctypes.byref(freeMem), ctypes.byref(totalMem)) | |
except AttributeError: | |
result = cuda.cuMemGetInfo(ctypes.byref(freeMem), ctypes.byref(totalMem)) | |
if result == CUDA_SUCCESS: | |
print(" Total Memory: %ld MiB" % (totalMem.value / 1024**2)) | |
print(" Free Memory: %ld MiB" % (freeMem.value / 1024**2)) | |
else: | |
cuda.cuGetErrorString(result, ctypes.byref(error_str)) | |
print("cuMemGetInfo failed with error code %d: %s" % (result, error_str.value.decode())) | |
cuda.cuCtxDetach(context) | |
return 0 | |
if __name__=="__main__": | |
sys.exit(main()) |
This sounds as if something was wrong with the arguments to the cuInit
call, but it's called with a value of 0
, which is currently the only valid value according to the documentation: https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__INITIALIZE.html. The documentation says it could also "return error codes from previous, asynchronous launches", but it's the very first CUDA API call in this process. So I assume it indicates something is wrong with your setup.
Things to try:
- Run the same with
sudo
, I once had a situation where this had to be done once after booting to make it work for all users - Check that you can run the command
nvidia-smi
and it does not display any error - Check that the driver version installed supports your GPU: Go to https://www.nvidia.com/object/unix.html, go to the archive for your architecture, click the driver version, click "Supported Products"
- Check that the driver version is sufficient for the installed version of CUDA: https://docs.nvidia.com/deploy/cuda-compatibility/index.html#binary-compatibility
Good luck!
Hi Jan,
here's an up-to-date version of ConvertSMVer2Cores
:
def ConvertSMVer2Cores(major, minor):
# Returns the number of CUDA cores per multiprocessor for a given
# Compute Capability version. There is no way to retrieve that via
# the API, so it needs to be hard-coded.
return {
# Tesla
(1, 0): 8, # SM 1.0
(1, 1): 8, # SM 1.1
(1, 2): 8, # SM 1.2
(1, 3): 8, # SM 1.3
# Fermi
(2, 0): 32, # SM 2.0: GF100 class
(2, 1): 48, # SM 2.1: GF10x class
# Kepler
(3, 0): 192, # SM 3.0: GK10x class
(3, 2): 192, # SM 3.2: GK10x class
(3, 5): 192, # SM 3.5: GK11x class
(3, 7): 192, # SM 3.7: GK21x class
# Maxwell
(5, 0): 128, # SM 5.0: GM10x class
(5, 2): 128, # SM 5.2: GM20x class
(5, 3): 128, # SM 5.3: GM20x class
# Pascal
(6, 0): 64, # SM 6.0: GP100 class
(6, 1): 128, # SM 6.1: GP10x class
(6, 2): 128, # SM 6.2: GP10x class
# Volta
(7, 0): 64, # SM 7.0: GV100 class
(7, 2): 64, # SM 7.2: GV11b class
# Turing
(7, 5): 64, # SM 7.5: TU10x class
}.get((major, minor), 64) # unknown architecture, return a default value
Thanks @fwyzard! I've updated the gist accordingly.
Thank you for this useful script @f0k!
I noticed an issue with the device memory size displayed. On my Tesla V100 with 16GB memory, it shows:
Total Memory: 4016 MiB
Free Memory: 3946 MiB
Running the equivalent C code (from https://gist.github.com/f0k/0d6431e3faa60bffc788f8b4daa029b1) shows the correct values.
Also, changing this script to use the CUDA runtime API (while commenting out the cuCtxCreate
call) displays the correct values too:
freeMem = ctypes.c_size_t()
totalMem = ctypes.c_size_t()
libcudart = ctypes.cdll.LoadLibrary("libcudart.so")
result = libcudart.cudaMemGetInfo(ctypes.byref(freeMem), ctypes.byref(totalMem))
Total Memory: 16160 MiB
Free Memory: 15844 MiB
Do you have any idea what the problem could be?
I noticed an issue with the device memory size displayed. On my Tesla V100 with 16GB memory, it shows:
Total Memory: 4016 MiB Free Memory: 3946 MiB
The following patch fixes this issue:
--- old Thu Mar 4 21:28:23 2021
+++ new Thu Mar 4 21:28:09 2021
@@ -109,12 +109,12 @@
print(" GPU clock: %g MHz" % (clockrate.value / 1000.))
if cuda.cuDeviceGetAttribute(ctypes.byref(clockrate), CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE, device) == CUDA_SUCCESS:
print(" Memory clock: %g MHz" % (clockrate.value / 1000.))
- result = cuda.cuCtxCreate(ctypes.byref(context), 0, device)
+ result = cuda.cuCtxCreate_v2(ctypes.byref(context), 0, device)
if result != CUDA_SUCCESS:
cuda.cuGetErrorString(result, ctypes.byref(error_str))
print("cuCtxCreate failed with error code %d: %s" % (result, error_str.value.decode()))
else:
- result = cuda.cuMemGetInfo(ctypes.byref(freeMem), ctypes.byref(totalMem))
+ result = cuda.cuMemGetInfo_v2(ctypes.byref(freeMem), ctypes.byref(totalMem))
if result == CUDA_SUCCESS:
print(" Total Memory: %ld MiB" % (totalMem.value / 1024**2))
print(" Free Memory: %ld MiB" % (freeMem.value / 1024**2))
The reason for this is the following. The CUDA API in cuda.h
defines two versions of cuMemGetInfo
(and other functions) depending on the CUDA API version: cuMemGetInfo
(__CUDA_API_VERSION < 3020
) and cuMemGetInfo_v2
(__CUDA_API_VERSION >= 3020
; for this CUDA version, cuMemGetInfo
is #define
d to cuMemGetInfo_v2
). When compiling C code, running cuMemGetInfo
will therefore actually call cuMemGetInfo_v2
for the newer API version. ctypes
does not know about this and will happily call the old cuMemGetInfo
version, whose prototype is:
CUresult CUDAAPI cuMemGetInfo(unsigned int *free, unsigned int *total);
ie., it only returns 32-bit integers, not size_t.
Thanks @matthiasdiener
@f0k Can you please update script with above corection, since 3020 api rarely used now? I roll with it too and patch solves the ptoblem
@f0k this is a great resource, can you please specify a license?
Thanks @matthiasdiener! Sorry for the delay, I've updated the gist now (with a fallback to the old API if needed). I also updated the ConvertSMVer2Cores
helper function for Ampere while I was at it.
@zoombinis: I'll make it MIT.
MIT License
Copyright (c) 2017-2021 Jan Schlüter
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
https://gist.github.com/f0k/63a664160d016a491b2cbea15913d549#file-cuda_check-py-L60
I believe it should look for nvcuda.dll here on windows.
As i know cuda.dll is deprecated since 1.1 version of cuda and then replaced by nvcuda.dll which distributed with Nvidia Driver since 169.x.x version. I test it localy and it works. I am not an expert though.
This is Cuda programming code and Compare GPU vs CPU
https://debuggingsolution.blogspot.com/2021/09/vector-addition-cuda-parallel.html
Thanks @f0k for the excellent snippet! Here is an importable version which can be run inside other scripts as get_cuda_device_specs()
. It returns a list of specification dicts per CUDA device
[
{
"name": "NVIDIA GeForce RTX 3080 Laptop GPU",
"compute_capability": [
8,
6
],
"architecture": "ampere",
"cores": 48,
"cuda_cores": 3072,
"concurrent_threads": 73728,
"gpu_clock_mhz": 1245.0,
"mem_clock_mhz": 6001.0,
"total_mem_mb": 16125.3125,
"free_mem_mb": 15733.25
}
]
I also made some minor cosmetic updates
- Refactor
str.format()
to f-strings for readability - Refactor camel case to snake case (for PEP linting)
- Move semantic versioning map to constant named dict
- Add another mapping to the architecture key name
- Switch
sys.exit
codes toRuntimeError
andwarnings.warn
where appropriate
import ctypes
import json
from typing import Any, Dict, List
from warnings import warn
# TODO define decorator to share the RuntimeError/CUDA_SUCCESS logic among different library functions
# One of the following libraries must be available to load
libnames = ('libcuda.so', 'libcuda.dylib', 'cuda.dll')
for libname in libnames:
try:
cuda = ctypes.CDLL(libname)
except OSError:
continue
else:
break
else:
raise ImportError(f'Could not load any of: {", ".join(libnames)}')
# Constants from cuda.h
CUDA_SUCCESS = 0
CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT = 16
CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR = 39
CU_DEVICE_ATTRIBUTE_CLOCK_RATE = 13
CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE = 36
# Conversions from semantic version numbers
# Borrowed from original gist and updated from the "GPUs supported" section of this Wikipedia article
# https://en.wikipedia.org/wiki/CUDA
SEMVER_TO_CORES = {
(1, 0): 8, # Tesla
(1, 1): 8,
(1, 2): 8,
(1, 3): 8,
(2, 0): 32, # Fermi
(2, 1): 48,
(3, 0): 192, # Kepler
(3, 2): 192,
(3, 5): 192,
(3, 7): 192,
(5, 0): 128, # Maxwell
(5, 2): 128,
(5, 3): 128,
(6, 0): 64, # Pascal
(6, 1): 128,
(6, 2): 128,
(7, 0): 64, # Volta
(7, 2): 64,
(7, 5): 64, # Turing
(8, 0): 64, # Ampere
(8, 6): 64,
}
SEMVER_TO_ARCH = {
(1, 0): 'tesla',
(1, 1): 'tesla',
(1, 2): 'tesla',
(1, 3): 'tesla',
(2, 0): 'fermi',
(2, 1): 'fermi',
(3, 0): 'kepler',
(3, 2): 'kepler',
(3, 5): 'kepler',
(3, 7): 'kepler',
(5, 0): 'maxwell',
(5, 2): 'maxwell',
(5, 3): 'maxwell',
(6, 0): 'pascal',
(6, 1): 'pascal',
(6, 2): 'pascal',
(7, 0): 'volta',
(7, 2): 'volta',
(7, 5): 'turing',
(8, 0): 'ampere',
(8, 6): 'ampere',
}
def get_cuda_device_specs() -> List[Dict[str, Any]]:
"""Generate spec for each GPU device with format
{
'name': str,
'compute_capability': (major: int, minor: int),
'cores': int,
'cuda_cores': int,
'concurrent_threads': int,
'gpu_clock_mhz': float,
'mem_clock_mhz': float,
'total_mem_mb': float,
'free_mem_mb': float
}
"""
# Type-binding definitions for ctypes
num_gpus = ctypes.c_int()
name = b' ' * 100
cc_major = ctypes.c_int()
cc_minor = ctypes.c_int()
cores = ctypes.c_int()
threads_per_core = ctypes.c_int()
clockrate = ctypes.c_int()
free_mem = ctypes.c_size_t()
total_mem = ctypes.c_size_t()
result = ctypes.c_int()
device = ctypes.c_int()
context = ctypes.c_void_p()
error_str = ctypes.c_char_p()
# Check expected initialization
result = cuda.cuInit(0)
if result != CUDA_SUCCESS:
cuda.cuGetErrorString(result, ctypes.byref(error_str))
raise RuntimeError(f'cuInit failed with error code {result}: {error_str.value.decode()}')
result = cuda.cuDeviceGetCount(ctypes.byref(num_gpus))
if result != CUDA_SUCCESS:
cuda.cuGetErrorString(result, ctypes.byref(error_str))
raise RuntimeError(f'cuDeviceGetCount failed with error code {result}: {error_str.value.decode()}')
# Iterate through available devices
device_specs = []
for i in range(num_gpus.value):
spec = {}
result = cuda.cuDeviceGet(ctypes.byref(device), i)
if result != CUDA_SUCCESS:
cuda.cuGetErrorString(result, ctypes.byref(error_str))
raise RuntimeError(f'cuDeviceGet failed with error code {result}: {error_str.value.decode()}')
# Parse specs for each device
if cuda.cuDeviceGetName(ctypes.c_char_p(name), len(name), device) == CUDA_SUCCESS:
spec.update(name=name.split(b'\0', 1)[0].decode())
if cuda.cuDeviceComputeCapability(ctypes.byref(cc_major), ctypes.byref(cc_minor), device) == CUDA_SUCCESS:
spec.update(compute_capability=(cc_major.value, cc_minor.value))
spec.update(architecture=SEMVER_TO_ARCH.get((cc_major.value, cc_minor.value), 'unknown'))
if cuda.cuDeviceGetAttribute(ctypes.byref(cores), CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, device) == CUDA_SUCCESS:
spec.update(
cores=cores.value,
cuda_cores=cores.value * SEMVER_TO_CORES.get((cc_major.value, cc_minor.value), 'unknown'))
if cuda.cuDeviceGetAttribute(ctypes.byref(threads_per_core), CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR, device) == CUDA_SUCCESS:
spec.update(concurrent_threads=cores.value * threads_per_core.value)
if cuda.cuDeviceGetAttribute(ctypes.byref(clockrate), CU_DEVICE_ATTRIBUTE_CLOCK_RATE, device) == CUDA_SUCCESS:
spec.update(gpu_clock_mhz=clockrate.value / 1000.)
if cuda.cuDeviceGetAttribute(ctypes.byref(clockrate), CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE, device) == CUDA_SUCCESS:
spec.update(mem_clock_mhz=clockrate.value / 1000.)
# Attempt to determine available vs. free memory
try:
result = cuda.cuCtxCreate_v2(ctypes.byref(context), 0, device)
except AttributeError:
result = cuda.cuCtxCreate(ctypes.byref(context), 0, device)
if result != CUDA_SUCCESS:
cuda.cuGetErrorString(result, ctypes.byref(error_str))
warn(f'cuCtxCreate failed with error code {result}: {error_str.value.decode()}')
else:
try:
result = cuda.cuMemGetInfo_v2(ctypes.byref(free_mem), ctypes.byref(total_mem))
except AttributeError:
result = cuda.cuMemGetInfo(ctypes.byref(free_mem), ctypes.byref(total_mem))
if result == CUDA_SUCCESS:
spec.update(
total_mem_mb=total_mem.value / 1024**2,
free_mem_mb=free_mem.value / 1024**2)
else:
cuda.cuGetErrorString(result, ctypes.byref(error_str))
warn(f'cuMemGetInfo failed with error code {result}: {error_str.value.decode()}')
cuda.cuCtxDetach(context)
device_specs.append(spec)
return device_specs
if __name__ == '__main__':
print(json.dumps(get_cuda_device_specs(), indent=2))
Thank you for this script! It helped me debug an issue with getting CUDA working in Windows 10 with Ubuntu WSL.
See this: bitsandbytes-foundation/bitsandbytes#337 - Thanks again @f0k !!!
Thanks! Further refactoring with decorators.
import ctypes
import json
from functools import wraps
from typing import Any, Dict, List
from warnings import warn
# Constants from cuda.h
CUDA_SUCCESS = 0
CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT = 16
CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR = 39
CU_DEVICE_ATTRIBUTE_CLOCK_RATE = 13
CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE = 36
# Conversions from semantic version numbers
# Borrowed from original gist and updated from the "GPUs supported" section of this Wikipedia article
# https://en.wikipedia.org/wiki/CUDA
SEMVER_TO_CORES = {
(1, 0): 8, # Tesla
(1, 1): 8,
(1, 2): 8,
(1, 3): 8,
(2, 0): 32, # Fermi
(2, 1): 48,
(3, 0): 192, # Kepler
(3, 2): 192,
(3, 5): 192,
(3, 7): 192,
(5, 0): 128, # Maxwell
(5, 2): 128,
(5, 3): 128,
(6, 0): 64, # Pascal
(6, 1): 128,
(6, 2): 128,
(7, 0): 64, # Volta
(7, 2): 64,
(7, 5): 64, # Turing
(8, 0): 64, # Ampere
(8, 6): 64,
}
SEMVER_TO_ARCH = {
(1, 0): "tesla",
(1, 1): "tesla",
(1, 2): "tesla",
(1, 3): "tesla",
(2, 0): "fermi",
(2, 1): "fermi",
(3, 0): "kepler",
(3, 2): "kepler",
(3, 5): "kepler",
(3, 7): "kepler",
(5, 0): "maxwell",
(5, 2): "maxwell",
(5, 3): "maxwell",
(6, 0): "pascal",
(6, 1): "pascal",
(6, 2): "pascal",
(7, 0): "volta",
(7, 2): "volta",
(7, 5): "turing",
(8, 0): "ampere",
(8, 6): "ampere",
}
# Decorator for CUDA API calls
def cuda_api_call(func):
"""
Decorator to wrap CUDA API calls and check their results.
Raises RuntimeError if the CUDA call does not return CUDA_SUCCESS.
"""
@wraps(func)
def wrapper(*args, **kwargs):
result = func(*args, **kwargs)
if result != CUDA_SUCCESS:
error_str = ctypes.c_char_p()
cuda.cuGetErrorString(result, ctypes.byref(error_str))
raise RuntimeError(
f"{func.__name__} failed with error code {result}: {error_str.value.decode()}"
)
return result
return wrapper
def cuda_api_call_warn(func):
"""
Decorator to wrap CUDA API calls and check their results.
Prints a warning message if the CUDA call does not return CUDA_SUCCESS.
"""
@wraps(func)
def wrapper(*args, **kwargs):
result = func(*args, **kwargs)
if result != CUDA_SUCCESS:
error_str = ctypes.c_char_p()
cuda.cuGetErrorString(result, ctypes.byref(error_str))
warn(
f"Warning: {func.__name__} failed with error code {result}: {error_str.value.decode()}"
)
return result
return wrapper
# Attempt to load the CUDA library
libnames = ("libcuda.so", "libcuda.dylib", "cuda.dll")
for libname in libnames:
try:
cuda = ctypes.CDLL(libname)
except OSError:
continue
else:
break
else:
raise ImportError(f'Could not load any of: {", ".join(libnames)}')
# CUDA API calls wrapped with the decorator
@cuda_api_call
def cuInit(flags):
return cuda.cuInit(flags)
@cuda_api_call
def cuDeviceGetCount(count):
return cuda.cuDeviceGetCount(count)
@cuda_api_call
def cuDeviceGet(device, ordinal):
return cuda.cuDeviceGet(device, ordinal)
@cuda_api_call
def cuDeviceGetName(name, len, dev):
return cuda.cuDeviceGetName(name, len, dev)
@cuda_api_call
def cuDeviceComputeCapability(major, minor, dev):
return cuda.cuDeviceComputeCapability(major, minor, dev)
@cuda_api_call
def cuDeviceGetAttribute(pi, attrib, dev):
return cuda.cuDeviceGetAttribute(pi, attrib, dev)
@cuda_api_call_warn
def cuCtxCreate(pctx, flags, dev):
try:
result = cuda.cuCtxCreate_v2(pctx, flags, dev)
except AttributeError:
result = cuda.cuCtxCreate(pctx, flags, dev)
return result
@cuda_api_call_warn
def cuMemGetInfo(free, total):
try:
result = cuda.cuMemGetInfo_v2(free, total)
except AttributeError:
result = cuda.cuMemGetInfo(free, total)
return result
@cuda_api_call
def cuCtxDetach(ctx):
return cuda.cuCtxDetach(ctx)
# Main function to get CUDA device specs
def get_cuda_device_specs() -> List[Dict[str, Any]]:
"""Generate spec for each GPU device with format
{
'name': str,
'compute_capability': (major: int, minor: int),
'cores': int,
'cuda_cores': int,
'concurrent_threads': int,
'gpu_clock_mhz': float,
'mem_clock_mhz': float,
'total_mem_mb': float,
'free_mem_mb': float,
'architecture': str,
'cuda_cores': int
}
"""
# Initialize CUDA
cuInit(0)
num_gpus = ctypes.c_int()
cuDeviceGetCount(ctypes.byref(num_gpus))
device_specs = []
for i in range(num_gpus.value):
spec = {}
device = ctypes.c_int()
cuDeviceGet(ctypes.byref(device), i)
name = b" " * 100
cuDeviceGetName(ctypes.c_char_p(name), len(name), device)
spec["name"] = name.split(b"\0", 1)[0].decode()
cc_major = ctypes.c_int()
cc_minor = ctypes.c_int()
cuDeviceComputeCapability(
ctypes.byref(cc_major), ctypes.byref(cc_minor), device
)
compute_capability = (cc_major.value, cc_minor.value)
spec["compute_capability"] = compute_capability
cores = ctypes.c_int()
cuDeviceGetAttribute(
ctypes.byref(cores), CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, device
)
spec["cores"] = cores.value
threads_per_core = ctypes.c_int()
cuDeviceGetAttribute(
ctypes.byref(threads_per_core),
CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR,
device,
)
spec["concurrent_threads"] = cores.value * threads_per_core.value
clockrate = ctypes.c_int()
cuDeviceGetAttribute(
ctypes.byref(clockrate), CU_DEVICE_ATTRIBUTE_CLOCK_RATE, device
)
spec["gpu_clock_mhz"] = clockrate.value / 1000.0
cuDeviceGetAttribute(
ctypes.byref(clockrate), CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE, device
)
spec["mem_clock_mhz"] = clockrate.value / 1000.0
context = ctypes.c_void_p()
if cuCtxCreate(ctypes.byref(context), 0, device) == CUDA_SUCCESS:
free_mem = ctypes.c_size_t()
total_mem = ctypes.c_size_t()
cuMemGetInfo(ctypes.byref(free_mem), ctypes.byref(total_mem))
spec["total_mem_mb"] = total_mem.value / 1024**2
spec["free_mem_mb"] = free_mem.value / 1024**2
spec["architecture"] = SEMVER_TO_ARCH.get(compute_capability, "unknown")
spec["cuda_cores"] = cores.value * SEMVER_TO_CORES.get(
compute_capability, "unknown"
)
cuCtxDetach(context)
device_specs.append(spec)
return device_specs
if __name__ == "__main__":
print(json.dumps(get_cuda_device_specs(), indent=2))
Thanks for sharing @addisonklinke and @IanBoyanZhang! Looks good except that it would probably be easier to maintain if the two SEMVER dictionaries were joined into one, and the SEMVER_TO_CORES.get()
should default to 0 instead of "unknown", otherwise you will get a very long string in spec["cuda_cores"] for new architectures :) I will not update the gist as the original is so much shorter, but yours will be handy for people who need to access the information from another script.
Hi, thanks you for this code. I have an error:
cuInit failed with error code 1: invalid argument