SGC Ruby CUDA Device Query with Runtime API
There are 2 devices supporting CUDA
Device 0: "GeForce GTX 285"
CUDA Driver Version / Runtime Version 4.0 / 4.0
CUDA Capability Major/Minor version number: 1.3
Total amount of global memory: 1024 MBytes (1073545216 bytes)
(30) Multiprocessors x ( 8) CUDA Cores/MP: 240 CUDA Cores
GPU Clock Speed: 1.48 GHz
Memory Clock rate: 1242.00 Mhz
Memory Bus Width: 512-bit
Max Texture Dimension Size (x,y,z) 1D=(8192), 2D=(65536,32768), 3D=(2048,2048,2048)
Max Layered Texture Size (dim) x layers 1D=(8192) x 512, 2D=(8192,8192) x 512
Total amount of constant memory: 65536 bytes
Total amount of shared memory per block: 16384 bytes
Total number of registers available per block: 16384
Warp size: 32
Maximum number of threads per block: 512
Maximum sizes of each dimension of a block: 512 x 512 x 64
Maximum sizes of each dimension of a grid: 65535 x 65535 x 1
Maximum memory pitch: 2147483647 bytes
Texture alignment: 256 bytes
Concurrent copy and execution: Yes with 1 copy engine(s)
Run time limit on kernels: No
Integrated GPU sharing Host Memory: No
Support host page-locked memory mapping: Yes
Concurrent kernel execution: No
Alignment requirement for Surfaces: Yes
Device has ECC support enabled: No
Device is using TCC driver mode: No
Device supports Unified Addressing (UVA): No
Device PCI Bus ID / PCI location ID: 8 / 0
Compute Mode:
< Default (multiple host threads can use ::cudaSetDevice() with device simultaneously) >
Device 1: "GeForce GTX 460"
CUDA Driver Version / Runtime Version 4.0 / 4.0
CUDA Capability Major/Minor version number: 2.1
Total amount of global memory: 1023 MBytes (1072889856 bytes)
( 7) Multiprocessors x (48) CUDA Cores/MP: 336 CUDA Cores
GPU Clock Speed: 1.55 GHz
Memory Clock rate: 2000.00 Mhz
Memory Bus Width: 256-bit
L2 Cache Size: 524288 bytes
Max Texture Dimension Size (x,y,z) 1D=(65536), 2D=(65536,65535), 3D=(2048,2048,2048)
Max Layered Texture Size (dim) x layers 1D=(16384) x 2048, 2D=(16384,16384) x 2048
Total amount of constant memory: 65536 bytes
Total amount of shared memory per block: 49152 bytes
Total number of registers available per block: 32768
Warp size: 32
Maximum number of threads per block: 1024
Maximum sizes of each dimension of a block: 1024 x 1024 x 64
Maximum sizes of each dimension of a grid: 65535 x 65535 x 65535
Maximum memory pitch: 2147483647 bytes
Texture alignment: 512 bytes
Concurrent copy and execution: Yes with 1 copy engine(s)
Run time limit on kernels: Yes
Integrated GPU sharing Host Memory: No
Support host page-locked memory mapping: Yes
Concurrent kernel execution: Yes
Alignment requirement for Surfaces: Yes
Device has ECC support enabled: No
Device is using TCC driver mode: No
Device supports Unified Addressing (UVA): Yes
Device PCI Bus ID / PCI location ID: 1 / 0
Compute Mode:
< Default (multiple host threads can use ::cudaSetDevice() with device simultaneously) >
/*
* Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
*
* Please refer to the NVIDIA end user license agreement (EULA) associated
* with this source code for terms and conditions that govern your use of
* this software. Any use, reproduction, disclosure, or distribution of
* this software and related documentation outside the terms of the EULA
* is strictly prohibited.
*
*/
/* This sample queries the properties of the CUDA devices present in the system via CUDA Runtime API. */
// Shared Utilities (QA Testing)
#include <shrUtils.h>
#include <shrQATest.h>
// std::system includes
#include <memory>
#include <iostream>
#include <string>   // std::string is used for the CSV master log below
#include <cstdio>   // fprintf, sprintf
#include <cstdlib>  // exit
// CUDA-C includes
#include <cuda.h>
#include <cuda_runtime_api.h>
// This function wraps the CUDA Driver API into a template function
template <class T>
inline void getCudaAttribute(T *attribute, CUdevice_attribute device_attribute, int device)
{
    CUresult error = cuDeviceGetAttribute(attribute, device_attribute, device);
    if (CUDA_SUCCESS != error) {
        fprintf(stderr, "cuSafeCallNoSync() Driver API error = %04d from file <%s>, line %i.\n",
                error, __FILE__, __LINE__);
        exit(-1);
    }
}
////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
int main(int argc, char **argv)
{
    shrQAStart(argc, argv);
    shrSetLogFileName("deviceQuery.txt");
    shrLog("%s Starting...\n\n", argv[0]);
    shrLog(" CUDA Device Query (Runtime API) version (CUDART static linking)\n\n");

    int deviceCount = 0;
    if (cudaGetDeviceCount(&deviceCount) != cudaSuccess) {
        shrLog("cudaGetDeviceCount FAILED: CUDA Driver and Runtime version may be mismatched.\n");
        shrQAFinishExit2(false, "deviceQuery", QA_FAILED);
    }

    // This function call returns 0 if there are no CUDA capable devices.
    if (deviceCount == 0)
        shrLog("There is no device supporting CUDA\n");

    int dev, driverVersion = 0, runtimeVersion = 0;
    for (dev = 0; dev < deviceCount; ++dev) {
        cudaDeviceProp deviceProp;
        cudaGetDeviceProperties(&deviceProp, dev);

        if (dev == 0) {
            // This function call returns 9999 for both major & minor fields if no CUDA capable devices are present
            if (deviceProp.major == 9999 && deviceProp.minor == 9999)
                shrLog("There are no available device(s) that support CUDA.\n");
            else if (deviceCount == 1)
                shrLog("There is 1 device supporting CUDA\n");
            else
                shrLog("There are %d devices supporting CUDA\n", deviceCount);
        }

        shrLog("\nDevice %d: \"%s\"\n", dev, deviceProp.name);

#if CUDART_VERSION >= 2020
        // Console log
        cudaDriverGetVersion(&driverVersion);
        cudaRuntimeGetVersion(&runtimeVersion);
        shrLog(" CUDA Driver Version / Runtime Version %d.%d / %d.%d\n", driverVersion/1000, driverVersion%100, runtimeVersion/1000, runtimeVersion%100);
#endif
        shrLog(" CUDA Capability Major/Minor version number: %d.%d\n", deviceProp.major, deviceProp.minor);

        char msg[256];
        sprintf(msg, " Total amount of global memory: %.0f MBytes (%llu bytes)\n",
                (float)deviceProp.totalGlobalMem/1048576.0f, (unsigned long long)deviceProp.totalGlobalMem);
        shrLog(msg);

#if CUDART_VERSION >= 2000
        shrLog(" (%2d) Multiprocessors x (%2d) CUDA Cores/MP: %d CUDA Cores\n",
               deviceProp.multiProcessorCount,
               ConvertSMVer2Cores(deviceProp.major, deviceProp.minor),
               ConvertSMVer2Cores(deviceProp.major, deviceProp.minor) * deviceProp.multiProcessorCount);
#endif
        shrLog(" GPU Clock Speed: %.2f GHz\n", deviceProp.clockRate * 1e-6f);

#if CUDART_VERSION >= 4000
        // These attributes are not available via the CUDA Runtime API, so we make the
        // necessary calls to the driver API to support this output.
        int memoryClock;
        getCudaAttribute<int>(&memoryClock, CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE, dev);
        shrLog(" Memory Clock rate: %.2f Mhz\n", memoryClock * 1e-3f);
        int memBusWidth;
        getCudaAttribute<int>(&memBusWidth, CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH, dev);
        shrLog(" Memory Bus Width: %d-bit\n", memBusWidth);
        int L2CacheSize;
        getCudaAttribute<int>(&L2CacheSize, CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE, dev);
        if (L2CacheSize) {
            shrLog(" L2 Cache Size: %d bytes\n", L2CacheSize);
        }
        shrLog(" Max Texture Dimension Size (x,y,z) 1D=(%d), 2D=(%d,%d), 3D=(%d,%d,%d)\n",
               deviceProp.maxTexture1D, deviceProp.maxTexture2D[0], deviceProp.maxTexture2D[1],
               deviceProp.maxTexture3D[0], deviceProp.maxTexture3D[1], deviceProp.maxTexture3D[2]);
        shrLog(" Max Layered Texture Size (dim) x layers 1D=(%d) x %d, 2D=(%d,%d) x %d\n",
               deviceProp.maxTexture1DLayered[0], deviceProp.maxTexture1DLayered[1],
               deviceProp.maxTexture2DLayered[0], deviceProp.maxTexture2DLayered[1], deviceProp.maxTexture2DLayered[2]);
#endif
        shrLog(" Total amount of constant memory: %u bytes\n", deviceProp.totalConstMem);
        shrLog(" Total amount of shared memory per block: %u bytes\n", deviceProp.sharedMemPerBlock);
        shrLog(" Total number of registers available per block: %d\n", deviceProp.regsPerBlock);
        shrLog(" Warp size: %d\n", deviceProp.warpSize);
        shrLog(" Maximum number of threads per block: %d\n", deviceProp.maxThreadsPerBlock);
        shrLog(" Maximum sizes of each dimension of a block: %d x %d x %d\n",
               deviceProp.maxThreadsDim[0],
               deviceProp.maxThreadsDim[1],
               deviceProp.maxThreadsDim[2]);
        shrLog(" Maximum sizes of each dimension of a grid: %d x %d x %d\n",
               deviceProp.maxGridSize[0],
               deviceProp.maxGridSize[1],
               deviceProp.maxGridSize[2]);
        shrLog(" Maximum memory pitch: %u bytes\n", deviceProp.memPitch);
        shrLog(" Texture alignment: %u bytes\n", deviceProp.textureAlignment);

#if CUDART_VERSION >= 4000
        shrLog(" Concurrent copy and execution: %s with %d copy engine(s)\n", (deviceProp.deviceOverlap ? "Yes" : "No"), deviceProp.asyncEngineCount);
#else
        shrLog(" Concurrent copy and execution: %s\n", deviceProp.deviceOverlap ? "Yes" : "No");
#endif
#if CUDART_VERSION >= 2020
        shrLog(" Run time limit on kernels: %s\n", deviceProp.kernelExecTimeoutEnabled ? "Yes" : "No");
        shrLog(" Integrated GPU sharing Host Memory: %s\n", deviceProp.integrated ? "Yes" : "No");
        shrLog(" Support host page-locked memory mapping: %s\n", deviceProp.canMapHostMemory ? "Yes" : "No");
#endif
#if CUDART_VERSION >= 3000
        shrLog(" Concurrent kernel execution: %s\n", deviceProp.concurrentKernels ? "Yes" : "No");
        shrLog(" Alignment requirement for Surfaces: %s\n", deviceProp.surfaceAlignment ? "Yes" : "No");
#endif
#if CUDART_VERSION >= 3010
        shrLog(" Device has ECC support enabled: %s\n", deviceProp.ECCEnabled ? "Yes" : "No");
#endif
#if CUDART_VERSION >= 3020
        shrLog(" Device is using TCC driver mode: %s\n", deviceProp.tccDriver ? "Yes" : "No");
#endif
#if CUDART_VERSION >= 4000
        shrLog(" Device supports Unified Addressing (UVA): %s\n", deviceProp.unifiedAddressing ? "Yes" : "No");
        shrLog(" Device PCI Bus ID / PCI location ID: %d / %d\n", deviceProp.pciBusID, deviceProp.pciDeviceID);
#endif
#if CUDART_VERSION >= 2020
        const char *sComputeMode[] = {
            "Default (multiple host threads can use ::cudaSetDevice() with device simultaneously)",
            "Exclusive (only one host thread in one process is able to use ::cudaSetDevice() with this device)",
            "Prohibited (no host thread can use ::cudaSetDevice() with this device)",
            "Exclusive Process (many threads in one process are able to use ::cudaSetDevice() with this device)",
            "Unknown",
            NULL
        };
        shrLog(" Compute Mode:\n");
        shrLog(" < %s >\n", sComputeMode[deviceProp.computeMode]);
#endif
    }

    // csv masterlog info
    // *****************************
    // exe and CUDA driver name
    shrLog("\n");
    std::string sProfileString = "deviceQuery, CUDA Driver = CUDART";
    char cTemp[10];

    // driver version
    sProfileString += ", CUDA Driver Version = ";
#ifdef WIN32
    sprintf_s(cTemp, 10, "%d.%d", driverVersion/1000, driverVersion%100);
#else
    sprintf(cTemp, "%d.%d", driverVersion/1000, driverVersion%100);
#endif
    sProfileString += cTemp;

    // Runtime version
    sProfileString += ", CUDA Runtime Version = ";
#ifdef WIN32
    sprintf_s(cTemp, 10, "%d.%d", runtimeVersion/1000, runtimeVersion%100);
#else
    sprintf(cTemp, "%d.%d", runtimeVersion/1000, runtimeVersion%100);
#endif
    sProfileString += cTemp;

    // Device count
    sProfileString += ", NumDevs = ";
#ifdef WIN32
    sprintf_s(cTemp, 10, "%d", deviceCount);
#else
    sprintf(cTemp, "%d", deviceCount);
#endif
    sProfileString += cTemp;

    // First 2 device names, if any
    for (dev = 0; dev < ((deviceCount > 2) ? 2 : deviceCount); ++dev)
    {
        cudaDeviceProp deviceProp;
        cudaGetDeviceProperties(&deviceProp, dev);
        sProfileString += ", Device = ";
        sProfileString += deviceProp.name;
    }
    sProfileString += "\n";
    shrLogEx(LOGBOTH | MASTER, 0, sProfileString.c_str());

    // finish
    shrQAFinishExit(argc, (const char **)argv, QA_PASSED);
}
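Both the C sample above and the Ruby script below decode CUDA's packed version integer (1000 × major + 10 × minor) with `v / 1000` and `v % 100`. A minimal sketch of that arithmetic, runnable without a GPU; `decode_cuda_version` is a hypothetical helper, and note it divides the remainder by 10 (as later SDK samples do) so that 4010 prints as "4.1" rather than "4.10":

```ruby
# CUDA reports driver/runtime versions as a packed integer:
# 1000 * major + 10 * minor, so 4000 encodes 4.0 and 4010 encodes 4.1.
def decode_cuda_version(v)
  # v % 100 yields 10x the minor version; divide by 10 to recover it.
  "#{v / 1000}.#{(v % 100) / 10}"
end

puts decode_cuda_version(4000)  # => "4.0"
puts decode_cuda_version(4010)  # => "4.1"
```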
require 'rubycuda'
include SGC::Cuda
def ncores_per_sm(major, minor)
  # CUDA cores per multiprocessor, keyed by the compute capability
  # packed as (major << 4) + minor.
  table = {
    0x10 => 8,
    0x11 => 8,
    0x12 => 8,
    0x13 => 8,
    0x20 => 32,
    0x21 => 48,
  }
  n = table[(major << 4) + minor]
  n or "?"
end

puts "Number of CUDA devices: #{CudaDevice.count}"
puts

(0...CudaDevice.count).each do |i|
  CudaDevice.current = i
  prop = CudaDevice.properties
  puts "CUDA device #{i}"
  puts "CUDA device name : #{prop.name}"
  puts "CUDA driver version : #{driver_version/1000}.#{driver_version%100}"
  puts "CUDA runtime version : #{runtime_version/1000}.#{runtime_version%100}"
  puts "CUDA compute capability : #{prop.major}.#{prop.minor}"
  puts "Total global memory : #{prop.total_global_mem/1048576} MB"
  puts "CUDA cores : #{mpc = prop.multi_processor_count} x #{nps = ncores_per_sm(prop.major, prop.minor)} => #{mpc*nps.to_i}"
  puts "GPU clock rate : #{prop.clock_rate/1000} MHz"
  puts "Memory clock rate : #{prop.memory_clock_rate/1000} MHz"
  puts "Memory bus width : #{prop.global_memory_bus_width}-bit"
  puts "L2 cache size : #{prop.l2_cache_size} bytes"
  puts "Total constant memory : #{prop.total_const_mem} bytes"
  puts "Total shared memory per block : #{prop.shared_mem_per_block} bytes"
  puts "Total registers available per block : #{prop.regs_per_block}"
  puts "Warp size : #{prop.warp_size}"
  puts "Max dimension sizes of a block : #{prop.max_threads_dim[0]} x #{prop.max_threads_dim[1]} x #{prop.max_threads_dim[2]}"
  puts "Max dimension sizes of a grid : #{prop.max_grid_size[0]} x #{prop.max_grid_size[1]} x #{prop.max_grid_size[2]}"
  puts "Number of concurrent copy engines : #{prop.async_engine_count}"
  puts "Support unified addressing? : #{prop.unified_addressing > 0 ? "Yes" : "No"}"
  puts "Compute mode : #{CudaComputeMode[prop.compute_mode]}"
  puts
end
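The `ncores_per_sm` lookup above packs the compute capability into one byte (major version in the high nibble, minor in the low nibble), then the script multiplies by the multiprocessor count to get total cores. A standalone sketch of that arithmetic (the names `CORES_PER_SM` and `total_cores` are illustrative, not part of rubycuda), checked against the two devices in the output below:

```ruby
# Cores per multiprocessor for a few compute capabilities,
# keyed as (major << 4) + minor, e.g. 0x13 for compute 1.3.
CORES_PER_SM = {
  0x13 => 8,   # GT200 (compute 1.3)
  0x20 => 32,  # Fermi (compute 2.0)
  0x21 => 48,  # Fermi (compute 2.1)
}

def total_cores(multiprocessors, major, minor)
  multiprocessors * CORES_PER_SM.fetch((major << 4) + minor, 0)
end

puts total_cores(30, 1, 3)  # GTX 285: 30 SMs x 8  => 240
puts total_cores(7, 2, 1)   # GTX 460:  7 SMs x 48 => 336
```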
Number of CUDA devices: 2
CUDA device 0
CUDA device name : GeForce GTX 285
CUDA driver version : 4.0
CUDA runtime version : 4.0
CUDA compute capability : 1.3
Total global memory : 1023 MB
CUDA cores : 30 x 8 => 240
GPU clock rate : 1476 MHz
Memory clock rate : 1242 MHz
Memory bus width : 512-bit
L2 cache size : 0 bytes
Total constant memory : 65536 bytes
Total shared memory per block : 16384 bytes
Total registers available per block : 16384
Warp size : 32
Max dimension sizes of a block : 512 x 512 x 64
Max dimension sizes of a grid : 65535 x 65535 x 1
Number of concurrent copy engines : 1
Support unified addressing? : No
Compute mode : DEFAULT
CUDA device 1
CUDA device name : GeForce GTX 460
CUDA driver version : 4.0
CUDA runtime version : 4.0
CUDA compute capability : 2.1
Total global memory : 1023 MB
CUDA cores : 7 x 48 => 336
GPU clock rate : 1550 MHz
Memory clock rate : 2000 MHz
Memory bus width : 256-bit
L2 cache size : 524288 bytes
Total constant memory : 65536 bytes
Total shared memory per block : 49152 bytes
Total registers available per block : 32768
Warp size : 32
Max dimension sizes of a block : 1024 x 1024 x 64
Max dimension sizes of a grid : 65535 x 65535 x 65535
Number of concurrent copy engines : 1
Support unified addressing? : Yes
Compute mode : DEFAULT
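Note that device 0's global memory differs between the two listings: 1024 MBytes from the C sample but 1023 MB from the Ruby script. Neither is a bug; the C sample converts to float and rounds with `%.0f`, while the Ruby script uses integer division, which truncates. A quick check of both conversions on the byte count reported above:

```ruby
bytes = 1073545216  # GTX 285 total global memory from the listings above

# The Ruby script's integer division truncates 1023.8125 down:
puts bytes / 1048576                    # => 1023

# The C sample's float division plus %.0f rounds to nearest:
puts format("%.0f", bytes / 1048576.0)  # => 1024
```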