Skip to content

Instantly share code, notes, and snippets.

@Timer
Last active March 23, 2016 21:03
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save Timer/d6b040465a1e15b67ee3 to your computer and use it in GitHub Desktop.
#include <stdio.h>
// Map a compute capability (major.minor) to CUDA cores ("stream processors")
// per multiprocessor. The runtime API does not expose this count, so it is
// derived from the architecture tables in NVIDIA's CUDA C Programming Guide.
// Returns 0 for architectures not in the table.
static int coresPerSM(int major, int minor) {
    switch (major) {
        case 2: return (minor == 1) ? 48 : 32;   // Fermi
        case 3: return 192;                      // Kepler
        case 5: return 128;                      // Maxwell
        case 6: return (minor == 0) ? 64 : 128;  // Pascal
        case 7: return 64;                       // Volta / Turing
        case 8: return (minor == 0) ? 64 : 128;  // Ampere / Ada
        case 9: return 128;                      // Hopper
        default: return 0;                       // unknown/future architecture
    }
}

// Enumerate every CUDA device and print its key execution-configuration
// limits (SM count, cores, warp size, block/grid dimension maxima).
int main() {
    int devices = 0;
    cudaError_t err = cudaGetDeviceCount(&devices);
    if (err != cudaSuccess) {
        fprintf(stderr, "cudaGetDeviceCount failed: %s\n", cudaGetErrorString(err));
        return 1;
    }
    for (int d = 0; d < devices; ++d) {
        cudaDeviceProp p;
        err = cudaGetDeviceProperties(&p, d);
        if (err != cudaSuccess) {
            fprintf(stderr, "cudaGetDeviceProperties(%d) failed: %s\n", d, cudaGetErrorString(err));
            return 1;
        }
        int mp = p.multiProcessorCount;
        int sp = coresPerSM(p.major, p.minor);
        printf("Device %d: %s\n", d, p.name);
        printf(" -> compute capability: %d.%d\n", p.major, p.minor);
        printf(" -> multiprocessor count: %d\n", mp);
        printf(" -> stream processor count: %d (total %d)\n", sp, sp * mp);
        printf(" -> warp size: %d\n", p.warpSize);
        printf(" -> max threads per block: %d\n", p.maxThreadsPerBlock);
        printf(" -> max block dimensions: %d x %d x %d\n", p.maxThreadsDim[0], p.maxThreadsDim[1], p.maxThreadsDim[2]);
        printf(" -> max grid dimensions: %d x %d x %d\n", p.maxGridSize[0], p.maxGridSize[1], p.maxGridSize[2]);
        puts("");
    }
    return 0;
}
#include <stdio.h>
const int ARRAY_SIZE = 200;
// Doubles each of the first `count` entries of `arr` in place.
// Launch as a 1-D grid with at least `count` total threads; surplus threads
// in the final block fall out via the bounds check.
__global__ void Dup(long count, int *arr) {
    // Cast BEFORE multiplying: blockIdx.x * blockDim.x is a 32-bit unsigned
    // product and would silently wrap for grids past 2^32 threads, even
    // though the result is stored into a long.
    long idx = (long) blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < count) {
        arr[idx] *= 2;
    }
}
// Forward declaration: getLaunchConfiguration is defined later in this file,
// and a template must be visible before its first use.
template <class T>
__host__ void getLaunchConfiguration(T t, int n, int *blocks, int *threads);

// Host driver: fills arr with 0..ARRAY_SIZE-1, doubles it on the GPU with
// Dup, copies the result back, and verifies every entry.
// Returns 0 on success ("PASS"), 1 on any allocation/CUDA/verification error.
int main() {
    int *arr = (int *) malloc(ARRAY_SIZE * sizeof(int));
    if (arr == NULL) {
        fprintf(stderr, "malloc failed!\n");
        return 1;
    }
    for (int i = 0; i < ARRAY_SIZE; ++i) arr[i] = i;

    int *c_arr = NULL;
    cudaError_t err = cudaMalloc((void **) &c_arr, ARRAY_SIZE * sizeof(int));
    if (err != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed: %s\n", cudaGetErrorString(err));
        free(arr);
        return 1;
    }
    err = cudaMemcpy(c_arr, arr, ARRAY_SIZE * sizeof(int), cudaMemcpyHostToDevice);
    if (err != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed: %s\n", cudaGetErrorString(err));
        cudaFree(c_arr);
        free(arr);
        return 1;
    }

    int blocks, threads;
    getLaunchConfiguration(Dup, ARRAY_SIZE, &blocks, &threads);
    printf("Launch %d blocks of %d threads for %d elements.\n", blocks, threads, ARRAY_SIZE);
    Dup<<<blocks, threads>>>(ARRAY_SIZE, c_arr);

    // Kernel launches do not return errors directly: bad launch configs
    // surface via cudaGetLastError(), in-kernel faults at the next sync.
    err = cudaGetLastError();
    if (err == cudaSuccess) err = cudaDeviceSynchronize();
    if (err != cudaSuccess) {
        fprintf(stderr, "kernel launch failed: %s\n", cudaGetErrorString(err));
        cudaFree(c_arr);
        free(arr);
        return 1;
    }

    err = cudaMemcpy(arr, c_arr, ARRAY_SIZE * sizeof(int), cudaMemcpyDeviceToHost);
    if (err != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed: %s\n", cudaGetErrorString(err));
        cudaFree(c_arr);
        free(arr);
        return 1;
    }
    cudaFree(c_arr);

    for (int i = 0; i < ARRAY_SIZE; ++i) {
        if (arr[i] != i * 2) {
            fprintf(stderr, "CUDA failed to double array entries (index %d) ...\n", i);
            free(arr);
            return 1;
        }
    }
    free(arr);
    puts("PASS");
    return 0;
}
// Declaration of the CUDA runtime occupancy helper (provided by
// cuda_runtime.h); repeated here so the snippet documents the exact
// signature used below.
template <class T>
cudaError_t cudaOccupancyMaxPotentialBlockSize(int *minGridSize, int *blockSize, T kernel, size_t dynamicSMemSize, int blockSizeLimit);

// Picks a 1-D launch configuration for kernel `t` over `n` elements:
// `*threads` is the occupancy-optimal block size (capped at n by the
// blockSizeLimit argument) and `*blocks` = ceil(n / *threads) so every
// element is covered by a thread.
// Falls back to 256 threads per block if the occupancy query fails.
template <class T>
__host__ void getLaunchConfiguration(T t, int n, int *blocks, int *threads) {
    int minGrid = 0;  // occupancy-minimizing grid size; not used by callers
    cudaError_t err = cudaOccupancyMaxPotentialBlockSize(&minGrid, threads, t, 0, n);
    if (err != cudaSuccess || *threads <= 0) {
        *threads = 256;  // safe, warp-aligned default
    }
    *blocks = (n + *threads - 1) / *threads;  // ceil-div covers the tail
}
//Define a kernel `k` ...
// Example kernel skeleton: derives a flat 1-D global thread index and
// bounds-checks it, since the grid rarely divides the data evenly.
__global__ void k(void *data, int count) {
    const int gid = threadIdx.x + blockIdx.x * blockDim.x;
    if (gid >= count) return;  // surplus threads in the last block do nothing
    // Here be dragons
}
int blocks = 64, threads = 512;
//Given kernel `k`, we can invoke it by writing the following:
k<<<blocks, threads>>>(...);
//Variables `blocks` and `threads` are of type `dim3` or `int`
//Note: when launching with a multi-dimensional `dim3` configuration, the computation of `idx` inside the kernel must also be extended to account for the extra dimensions
//By default, passing an int to the kernel invocation creates a single dimension `dim3` (int x 1 x 1)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment