
@f0k
Last active December 5, 2024 13:35
Simple Python script to obtain CUDA device information
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Outputs some information on CUDA-enabled devices on your computer,
including current memory usage.
It's a port of https://gist.github.com/f0k/0d6431e3faa60bffc788f8b4daa029b1
from C to Python with ctypes, so it can run without compiling anything. Note
that this is a direct translation with no attempt to make the code Pythonic.
It's meant as a general demonstration on how to obtain CUDA device information
from Python without resorting to nvidia-smi or a compiled Python extension.
Author: Jan Schlüter
License: MIT (https://gist.github.com/f0k/63a664160d016a491b2cbea15913d549#gistcomment-3870498)
"""
import sys
import ctypes
# Some constants taken from cuda.h
CUDA_SUCCESS = 0
CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT = 16
CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR = 39
CU_DEVICE_ATTRIBUTE_CLOCK_RATE = 13
CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE = 36
def ConvertSMVer2Cores(major, minor):
    # Returns the number of CUDA cores per multiprocessor for a given
    # Compute Capability version. There is no way to retrieve that via
    # the API, so it needs to be hard-coded.
    # See _ConvertSMVer2Cores in helper_cuda.h in NVIDIA's CUDA Samples.
    return {(1, 0): 8,    # Tesla
            (1, 1): 8,
            (1, 2): 8,
            (1, 3): 8,
            (2, 0): 32,   # Fermi
            (2, 1): 48,
            (3, 0): 192,  # Kepler
            (3, 2): 192,
            (3, 5): 192,
            (3, 7): 192,
            (5, 0): 128,  # Maxwell
            (5, 2): 128,
            (5, 3): 128,
            (6, 0): 64,   # Pascal
            (6, 1): 128,
            (6, 2): 128,
            (7, 0): 64,   # Volta
            (7, 2): 64,
            (7, 5): 64,   # Turing
            (8, 0): 64,   # Ampere
            (8, 6): 128,
            (8, 7): 128,
            (8, 9): 128,  # Ada
            (9, 0): 128,  # Hopper
            }.get((major, minor), 0)
def main():
    libnames = ('libcuda.so', 'libcuda.dylib', 'nvcuda.dll', 'cuda.dll')
    for libname in libnames:
        try:
            cuda = ctypes.CDLL(libname)
        except OSError:
            continue
        else:
            break
    else:
        raise OSError("could not load any of: " + ' '.join(libnames))

    nGpus = ctypes.c_int()
    name = b' ' * 100
    cc_major = ctypes.c_int()
    cc_minor = ctypes.c_int()
    cores = ctypes.c_int()
    threads_per_core = ctypes.c_int()
    clockrate = ctypes.c_int()
    freeMem = ctypes.c_size_t()
    totalMem = ctypes.c_size_t()
    result = ctypes.c_int()
    device = ctypes.c_int()
    context = ctypes.c_void_p()
    error_str = ctypes.c_char_p()

    result = cuda.cuInit(0)
    if result != CUDA_SUCCESS:
        cuda.cuGetErrorString(result, ctypes.byref(error_str))
        print("cuInit failed with error code %d: %s" % (result, error_str.value.decode()))
        return 1
    result = cuda.cuDeviceGetCount(ctypes.byref(nGpus))
    if result != CUDA_SUCCESS:
        cuda.cuGetErrorString(result, ctypes.byref(error_str))
        print("cuDeviceGetCount failed with error code %d: %s" % (result, error_str.value.decode()))
        return 1
    print("Found %d device(s)." % nGpus.value)
    for i in range(nGpus.value):
        result = cuda.cuDeviceGet(ctypes.byref(device), i)
        if result != CUDA_SUCCESS:
            cuda.cuGetErrorString(result, ctypes.byref(error_str))
            print("cuDeviceGet failed with error code %d: %s" % (result, error_str.value.decode()))
            return 1
        print("Device: %d" % i)
        if cuda.cuDeviceGetName(ctypes.c_char_p(name), len(name), device) == CUDA_SUCCESS:
            print("  Name: %s" % (name.split(b'\0', 1)[0].decode()))
        if cuda.cuDeviceComputeCapability(ctypes.byref(cc_major), ctypes.byref(cc_minor), device) == CUDA_SUCCESS:
            print("  Compute Capability: %d.%d" % (cc_major.value, cc_minor.value))
        if cuda.cuDeviceGetAttribute(ctypes.byref(cores), CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, device) == CUDA_SUCCESS:
            print("  Multiprocessors: %d" % cores.value)
            print("  CUDA Cores: %s" % (cores.value * ConvertSMVer2Cores(cc_major.value, cc_minor.value) or "unknown"))
            if cuda.cuDeviceGetAttribute(ctypes.byref(threads_per_core), CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR, device) == CUDA_SUCCESS:
                print("  Concurrent threads: %d" % (cores.value * threads_per_core.value))
        if cuda.cuDeviceGetAttribute(ctypes.byref(clockrate), CU_DEVICE_ATTRIBUTE_CLOCK_RATE, device) == CUDA_SUCCESS:
            print("  GPU clock: %g MHz" % (clockrate.value / 1000.))
        if cuda.cuDeviceGetAttribute(ctypes.byref(clockrate), CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE, device) == CUDA_SUCCESS:
            print("  Memory clock: %g MHz" % (clockrate.value / 1000.))
        try:
            result = cuda.cuCtxCreate_v2(ctypes.byref(context), 0, device)
        except AttributeError:
            result = cuda.cuCtxCreate(ctypes.byref(context), 0, device)
        if result != CUDA_SUCCESS:
            cuda.cuGetErrorString(result, ctypes.byref(error_str))
            print("cuCtxCreate failed with error code %d: %s" % (result, error_str.value.decode()))
        else:
            try:
                result = cuda.cuMemGetInfo_v2(ctypes.byref(freeMem), ctypes.byref(totalMem))
            except AttributeError:
                result = cuda.cuMemGetInfo(ctypes.byref(freeMem), ctypes.byref(totalMem))
            if result == CUDA_SUCCESS:
                print("  Total Memory: %ld MiB" % (totalMem.value / 1024**2))
                print("  Free Memory: %ld MiB" % (freeMem.value / 1024**2))
            else:
                cuda.cuGetErrorString(result, ctypes.byref(error_str))
                print("cuMemGetInfo failed with error code %d: %s" % (result, error_str.value.decode()))
            cuda.cuCtxDetach(context)
    return 0


if __name__ == "__main__":
    sys.exit(main())
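
As a small aside (not part of the original gist): the same ctypes approach can also report the CUDA version supported by the installed driver via the driver API call cuDriverGetVersion, which encodes the version as 1000 * major + 10 * minor. The untested sketch below assumes cuda has already been loaded with ctypes.CDLL as in main() above.

    driver_version = ctypes.c_int()
    if cuda.cuDriverGetVersion(ctypes.byref(driver_version)) == CUDA_SUCCESS:
        # e.g. 12020 -> CUDA 12.2
        print("Driver CUDA version: %d.%d" % (driver_version.value // 1000,
                                              (driver_version.value % 1000) // 10))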
@addisonklinke

Thanks @f0k for the excellent snippet! Here is an importable version which can be called from other scripts as get_cuda_device_specs(). It returns a list of specification dicts, one per CUDA device:

[
  {
    "name": "NVIDIA GeForce RTX 3080 Laptop GPU",
    "compute_capability": [
      8,
      6
    ],
    "architecture": "ampere",
    "cores": 48,
    "cuda_cores": 3072,
    "concurrent_threads": 73728,
    "gpu_clock_mhz": 1245.0,
    "mem_clock_mhz": 6001.0,
    "total_mem_mb": 16125.3125,
    "free_mem_mb": 15733.25
  }
]

I also made some minor cosmetic updates:

  • Refactor str.format() to f-strings for readability
  • Refactor camel case to snake case (for PEP linting)
  • Move the semantic versioning map into a named constant dict
  • Add another mapping for the architecture key name
  • Switch sys.exit codes to RuntimeError and warnings.warn where appropriate
import ctypes
import json
from typing import Any, Dict, List
from warnings import warn

# TODO define decorator to share the RuntimeError/CUDA_SUCCESS logic among different library functions

# One of the following libraries must be available to load
libnames = ('libcuda.so', 'libcuda.dylib', 'cuda.dll')
for libname in libnames:
    try:
        cuda = ctypes.CDLL(libname)
    except OSError:
        continue
    else:
        break
else:
    raise ImportError(f'Could not load any of: {", ".join(libnames)}')

# Constants from cuda.h
CUDA_SUCCESS = 0
CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT = 16
CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR = 39
CU_DEVICE_ATTRIBUTE_CLOCK_RATE = 13
CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE = 36

# Conversions from semantic version numbers
# Borrowed from original gist and updated from the "GPUs supported" section of this Wikipedia article
# https://en.wikipedia.org/wiki/CUDA
SEMVER_TO_CORES = {
    (1, 0): 8,    # Tesla
    (1, 1): 8,
    (1, 2): 8,
    (1, 3): 8,
    (2, 0): 32,   # Fermi
    (2, 1): 48,
    (3, 0): 192,  # Kepler
    (3, 2): 192,
    (3, 5): 192,
    (3, 7): 192,
    (5, 0): 128,  # Maxwell
    (5, 2): 128,
    (5, 3): 128,
    (6, 0): 64,   # Pascal
    (6, 1): 128,
    (6, 2): 128,
    (7, 0): 64,   # Volta
    (7, 2): 64,
    (7, 5): 64,   # Turing
    (8, 0): 64,   # Ampere
    (8, 6): 64,
}
SEMVER_TO_ARCH = {
    (1, 0): 'tesla',
    (1, 1): 'tesla',
    (1, 2): 'tesla',
    (1, 3): 'tesla',

    (2, 0): 'fermi',
    (2, 1): 'fermi',

    (3, 0): 'kepler',
    (3, 2): 'kepler',
    (3, 5): 'kepler',
    (3, 7): 'kepler',

    (5, 0): 'maxwell',
    (5, 2): 'maxwell',
    (5, 3): 'maxwell',

    (6, 0): 'pascal',
    (6, 1): 'pascal',
    (6, 2): 'pascal',

    (7, 0): 'volta',
    (7, 2): 'volta',

    (7, 5): 'turing',

    (8, 0): 'ampere',
    (8, 6): 'ampere',
}


def get_cuda_device_specs() -> List[Dict[str, Any]]:
    """Generate spec for each GPU device with format

    {
        'name': str,
        'compute_capability': (major: int, minor: int),
        'cores': int,
        'cuda_cores': int,
        'concurrent_threads': int,
        'gpu_clock_mhz': float,
        'mem_clock_mhz': float,
        'total_mem_mb': float,
        'free_mem_mb': float
    }
    """

    # Type-binding definitions for ctypes
    num_gpus = ctypes.c_int()
    name = b' ' * 100
    cc_major = ctypes.c_int()
    cc_minor = ctypes.c_int()
    cores = ctypes.c_int()
    threads_per_core = ctypes.c_int()
    clockrate = ctypes.c_int()
    free_mem = ctypes.c_size_t()
    total_mem = ctypes.c_size_t()
    result = ctypes.c_int()
    device = ctypes.c_int()
    context = ctypes.c_void_p()
    error_str = ctypes.c_char_p()

    # Check expected initialization
    result = cuda.cuInit(0)
    if result != CUDA_SUCCESS:
        cuda.cuGetErrorString(result, ctypes.byref(error_str))
        raise RuntimeError(f'cuInit failed with error code {result}: {error_str.value.decode()}')
    result = cuda.cuDeviceGetCount(ctypes.byref(num_gpus))
    if result != CUDA_SUCCESS:
        cuda.cuGetErrorString(result, ctypes.byref(error_str))
        raise RuntimeError(f'cuDeviceGetCount failed with error code {result}: {error_str.value.decode()}')

    # Iterate through available devices
    device_specs = []
    for i in range(num_gpus.value):
        spec = {}
        result = cuda.cuDeviceGet(ctypes.byref(device), i)
        if result != CUDA_SUCCESS:
            cuda.cuGetErrorString(result, ctypes.byref(error_str))
            raise RuntimeError(f'cuDeviceGet failed with error code {result}: {error_str.value.decode()}')

        # Parse specs for each device
        if cuda.cuDeviceGetName(ctypes.c_char_p(name), len(name), device) == CUDA_SUCCESS:
            spec.update(name=name.split(b'\0', 1)[0].decode())
        if cuda.cuDeviceComputeCapability(ctypes.byref(cc_major), ctypes.byref(cc_minor), device) == CUDA_SUCCESS:
            spec.update(compute_capability=(cc_major.value, cc_minor.value))
            spec.update(architecture=SEMVER_TO_ARCH.get((cc_major.value, cc_minor.value), 'unknown'))
        if cuda.cuDeviceGetAttribute(ctypes.byref(cores), CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, device) == CUDA_SUCCESS:
            spec.update(
                cores=cores.value,
                cuda_cores=cores.value * SEMVER_TO_CORES.get((cc_major.value, cc_minor.value), 'unknown'))
            if cuda.cuDeviceGetAttribute(ctypes.byref(threads_per_core), CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR, device) == CUDA_SUCCESS:
                spec.update(concurrent_threads=cores.value * threads_per_core.value)
        if cuda.cuDeviceGetAttribute(ctypes.byref(clockrate), CU_DEVICE_ATTRIBUTE_CLOCK_RATE, device) == CUDA_SUCCESS:
            spec.update(gpu_clock_mhz=clockrate.value / 1000.)
        if cuda.cuDeviceGetAttribute(ctypes.byref(clockrate), CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE, device) == CUDA_SUCCESS:
            spec.update(mem_clock_mhz=clockrate.value / 1000.)

        # Attempt to determine available vs. free memory
        try:
            result = cuda.cuCtxCreate_v2(ctypes.byref(context), 0, device)
        except AttributeError:
            result = cuda.cuCtxCreate(ctypes.byref(context), 0, device)
        if result != CUDA_SUCCESS:
            cuda.cuGetErrorString(result, ctypes.byref(error_str))
            warn(f'cuCtxCreate failed with error code {result}: {error_str.value.decode()}')
        else:
            try:
                result = cuda.cuMemGetInfo_v2(ctypes.byref(free_mem), ctypes.byref(total_mem))
            except AttributeError:
                result = cuda.cuMemGetInfo(ctypes.byref(free_mem), ctypes.byref(total_mem))
            if result == CUDA_SUCCESS:
                spec.update(
                    total_mem_mb=total_mem.value / 1024**2,
                    free_mem_mb=free_mem.value / 1024**2)
            else:
                cuda.cuGetErrorString(result, ctypes.byref(error_str))
                warn(f'cuMemGetInfo failed with error code {result}: {error_str.value.decode()}')
            cuda.cuCtxDetach(context)
        device_specs.append(spec)
    return device_specs


if __name__ == '__main__':

    print(json.dumps(get_cuda_device_specs(), indent=2))
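
A minimal usage sketch (the module name cuda_specs is hypothetical; it just assumes the snippet above was saved as cuda_specs.py):

from cuda_specs import get_cuda_device_specs  # hypothetical module name

for spec in get_cuda_device_specs():
    print(f"{spec.get('name', 'unknown')}: {spec.get('cuda_cores', 0)} CUDA cores, "
          f"{spec.get('free_mem_mb', 0):.0f} of {spec.get('total_mem_mb', 0):.0f} MiB free")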

@ksylvan

ksylvan commented Apr 23, 2023

Thank you for this script! It helped me debug an issue with getting CUDA working in Windows 10 with Ubuntu WSL.

@ksylvan

ksylvan commented Apr 24, 2023

See this: bitsandbytes-foundation/bitsandbytes#337 - Thanks again @f0k !!!

@IanBoyanZhang

Thanks! Here is a further refactoring that wraps the CUDA API calls with decorators:

import ctypes
import json
from functools import wraps
from typing import Any, Dict, List
from warnings import warn

# Constants from cuda.h
CUDA_SUCCESS = 0
CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT = 16
CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR = 39
CU_DEVICE_ATTRIBUTE_CLOCK_RATE = 13
CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE = 36

# Conversions from semantic version numbers
# Borrowed from original gist and updated from the "GPUs supported" section of this Wikipedia article
# https://en.wikipedia.org/wiki/CUDA
SEMVER_TO_CORES = {
    (1, 0): 8,  # Tesla
    (1, 1): 8,
    (1, 2): 8,
    (1, 3): 8,
    (2, 0): 32,  # Fermi
    (2, 1): 48,
    (3, 0): 192,  # Kepler
    (3, 2): 192,
    (3, 5): 192,
    (3, 7): 192,
    (5, 0): 128,  # Maxwell
    (5, 2): 128,
    (5, 3): 128,
    (6, 0): 64,  # Pascal
    (6, 1): 128,
    (6, 2): 128,
    (7, 0): 64,  # Volta
    (7, 2): 64,
    (7, 5): 64,  # Turing
    (8, 0): 64,  # Ampere
    (8, 6): 64,
}
SEMVER_TO_ARCH = {
    (1, 0): "tesla",
    (1, 1): "tesla",
    (1, 2): "tesla",
    (1, 3): "tesla",
    (2, 0): "fermi",
    (2, 1): "fermi",
    (3, 0): "kepler",
    (3, 2): "kepler",
    (3, 5): "kepler",
    (3, 7): "kepler",
    (5, 0): "maxwell",
    (5, 2): "maxwell",
    (5, 3): "maxwell",
    (6, 0): "pascal",
    (6, 1): "pascal",
    (6, 2): "pascal",
    (7, 0): "volta",
    (7, 2): "volta",
    (7, 5): "turing",
    (8, 0): "ampere",
    (8, 6): "ampere",
}


# Decorator for CUDA API calls
def cuda_api_call(func):
    """
    Decorator to wrap CUDA API calls and check their results.
    Raises RuntimeError if the CUDA call does not return CUDA_SUCCESS.
    """

    @wraps(func)
    def wrapper(*args, **kwargs):
        result = func(*args, **kwargs)
        if result != CUDA_SUCCESS:
            error_str = ctypes.c_char_p()
            cuda.cuGetErrorString(result, ctypes.byref(error_str))
            raise RuntimeError(
                f"{func.__name__} failed with error code {result}: {error_str.value.decode()}"
            )
        return result

    return wrapper


def cuda_api_call_warn(func):
    """
    Decorator to wrap CUDA API calls and check their results.
    Prints a warning message if the CUDA call does not return CUDA_SUCCESS.
    """

    @wraps(func)
    def wrapper(*args, **kwargs):
        result = func(*args, **kwargs)
        if result != CUDA_SUCCESS:
            error_str = ctypes.c_char_p()
            cuda.cuGetErrorString(result, ctypes.byref(error_str))
            warn(
                f"Warning: {func.__name__} failed with error code {result}: {error_str.value.decode()}"
            )
        return result

    return wrapper


# Attempt to load the CUDA library
libnames = ("libcuda.so", "libcuda.dylib", "cuda.dll")
for libname in libnames:
    try:
        cuda = ctypes.CDLL(libname)
    except OSError:
        continue
    else:
        break
else:
    raise ImportError(f'Could not load any of: {", ".join(libnames)}')


# CUDA API calls wrapped with the decorator
@cuda_api_call
def cuInit(flags):
    return cuda.cuInit(flags)


@cuda_api_call
def cuDeviceGetCount(count):
    return cuda.cuDeviceGetCount(count)


@cuda_api_call
def cuDeviceGet(device, ordinal):
    return cuda.cuDeviceGet(device, ordinal)


@cuda_api_call
def cuDeviceGetName(name, len, dev):
    return cuda.cuDeviceGetName(name, len, dev)


@cuda_api_call
def cuDeviceComputeCapability(major, minor, dev):
    return cuda.cuDeviceComputeCapability(major, minor, dev)


@cuda_api_call
def cuDeviceGetAttribute(pi, attrib, dev):
    return cuda.cuDeviceGetAttribute(pi, attrib, dev)


@cuda_api_call_warn
def cuCtxCreate(pctx, flags, dev):
    try:
        result = cuda.cuCtxCreate_v2(pctx, flags, dev)
    except AttributeError:
        result = cuda.cuCtxCreate(pctx, flags, dev)
    return result


@cuda_api_call_warn
def cuMemGetInfo(free, total):
    try:
        result = cuda.cuMemGetInfo_v2(free, total)
    except AttributeError:
        result = cuda.cuMemGetInfo(free, total)
    return result


@cuda_api_call
def cuCtxDetach(ctx):
    return cuda.cuCtxDetach(ctx)


# Main function to get CUDA device specs
def get_cuda_device_specs() -> List[Dict[str, Any]]:
    """Generate spec for each GPU device with format
    {
        'name': str,
        'compute_capability': (major: int, minor: int),
        'cores': int,
        'cuda_cores': int,
        'concurrent_threads': int,
        'gpu_clock_mhz': float,
        'mem_clock_mhz': float,
        'total_mem_mb': float,
        'free_mem_mb': float,
        'architecture': str
    }
    """
    # Initialize CUDA
    cuInit(0)

    num_gpus = ctypes.c_int()
    cuDeviceGetCount(ctypes.byref(num_gpus))

    device_specs = []
    for i in range(num_gpus.value):
        spec = {}
        device = ctypes.c_int()
        cuDeviceGet(ctypes.byref(device), i)

        name = b" " * 100
        cuDeviceGetName(ctypes.c_char_p(name), len(name), device)
        spec["name"] = name.split(b"\0", 1)[0].decode()

        cc_major = ctypes.c_int()
        cc_minor = ctypes.c_int()
        cuDeviceComputeCapability(
            ctypes.byref(cc_major), ctypes.byref(cc_minor), device
        )
        compute_capability = (cc_major.value, cc_minor.value)
        spec["compute_capability"] = compute_capability

        cores = ctypes.c_int()
        cuDeviceGetAttribute(
            ctypes.byref(cores), CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, device
        )
        spec["cores"] = cores.value

        threads_per_core = ctypes.c_int()
        cuDeviceGetAttribute(
            ctypes.byref(threads_per_core),
            CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR,
            device,
        )
        spec["concurrent_threads"] = cores.value * threads_per_core.value

        clockrate = ctypes.c_int()
        cuDeviceGetAttribute(
            ctypes.byref(clockrate), CU_DEVICE_ATTRIBUTE_CLOCK_RATE, device
        )
        spec["gpu_clock_mhz"] = clockrate.value / 1000.0

        cuDeviceGetAttribute(
            ctypes.byref(clockrate), CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE, device
        )
        spec["mem_clock_mhz"] = clockrate.value / 1000.0

        context = ctypes.c_void_p()
        if cuCtxCreate(ctypes.byref(context), 0, device) == CUDA_SUCCESS:
            free_mem = ctypes.c_size_t()
            total_mem = ctypes.c_size_t()

            cuMemGetInfo(ctypes.byref(free_mem), ctypes.byref(total_mem))

            spec["total_mem_mb"] = total_mem.value / 1024**2
            spec["free_mem_mb"] = free_mem.value / 1024**2

            spec["architecture"] = SEMVER_TO_ARCH.get(compute_capability, "unknown")
            spec["cuda_cores"] = cores.value * SEMVER_TO_CORES.get(
                compute_capability, "unknown"
            )

            cuCtxDetach(context)

        device_specs.append(spec)
    return device_specs


if __name__ == "__main__":
    print(json.dumps(get_cuda_device_specs(), indent=2))

@f0k
Author

f0k commented Jun 8, 2024

Thanks for sharing @addisonklinke and @IanBoyanZhang! Looks good except that it would probably be easier to maintain if the two SEMVER dictionaries were joined into one, and the SEMVER_TO_CORES.get() should default to 0 instead of "unknown", otherwise you will get a very long string in spec["cuda_cores"] for new architectures :) I will not update the gist as the original is so much shorter, but yours will be handy for people who need to access the information from another script.
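
A minimal sketch of that suggestion (the merged name SEMVER_TO_CORES_AND_ARCH is made up here, not from the thread): map each compute capability to a (cores per SM, architecture) pair and default the core count to 0 so the multiplication stays numeric:

# hypothetical merged table; remaining entries as in SEMVER_TO_CORES / SEMVER_TO_ARCH above
SEMVER_TO_CORES_AND_ARCH = {
    (7, 0): (64, 'volta'),
    (7, 5): (64, 'turing'),
    (8, 0): (64, 'ampere'),
    # ...
}

cores_per_sm, arch = SEMVER_TO_CORES_AND_ARCH.get((cc_major.value, cc_minor.value), (0, 'unknown'))
spec['architecture'] = arch
spec['cuda_cores'] = cores.value * cores_per_sm  # 0 rather than a repeated string for unknown versions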
