@andreinechaev
Created April 1, 2018 23:17
#include <stdio.h>
#define N 2048 * 2048 // Number of elements in each vector
/*
* Optimize this already-accelerated codebase. Work iteratively,
* and use nvprof to support your work.
*
* Aim to get `saxpy` (without modifying `N`) running in under
* 50us, as measured by the profiler.
*
* Some bugs have been placed in this codebase for your edification.
*/
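/*
 * For reference, profiles like the ones below can be reproduced with an
 * invocation along these lines (the compile flags are an assumption; the
 * profiler output only shows the binary name ./saxpy, and sm_70 matches
 * the Tesla V100 reported by nvprof):
 *
 *   nvcc -arch=sm_70 -o saxpy saxpy.cu
 *   nvprof ./saxpy
 */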
__global__ void init(int *a, int *b, int *c) {
    int i = threadIdx.x + blockIdx.x * blockDim.x;
    a[i] = 2;
    b[i] = 1;
    c[i] = 0;
}
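// Note: init writes one element per thread and has no grid-stride loop,
// so it only covers all of N if gridDim.x * blockDim.x >= N.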
__global__ void saxpy(int *a, int *b, int *c)
{
    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    int stride = blockDim.x * gridDim.x;
    for (int i = tid; i < N; i += stride)
        c[tid] = 2 * a[tid] + b[tid];
}
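// (See the discussion at the bottom of the page: the loop advances i,
// but the body still indexes with tid.)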
/*
*** Initial Solution ***
==1372== Profiling application: ./saxpy
==1372== Profiling result:
Type Time(%) Time Calls Avg Min Max Name
GPU activities: 100.00% 12.031ms 1 12.031ms 12.031ms 12.031ms saxpy(int*, int*, int*)
API calls: 91.44% 159.38ms 3 53.127ms 37.803us 159.27ms cudaMallocManaged
6.91% 12.037ms 1 12.037ms 12.037ms 12.037ms cudaDeviceSynchronize
1.31% 2.2753ms 3 758.44us 743.90us 785.92us cudaFree
0.14% 247.27us 1 247.27us 247.27us 247.27us cuDeviceTotalMem
0.12% 209.19us 94 2.2250us 265ns 70.087us cuDeviceGetAttribute
0.07% 116.13us 1 116.13us 116.13us 116.13us cudaLaunch
0.01% 16.413us 1 16.413us 16.413us 16.413us cuDeviceGetName
0.00% 7.2780us 3 2.4260us 339ns 6.3490us cudaSetupArgument
0.00% 2.7880us 3 929ns 298ns 1.6590us cuDeviceGetCount
0.00% 2.5550us 1 2.5550us 2.5550us 2.5550us cudaConfigureCall
0.00% 1.2180us 2 609ns 308ns 910ns cuDeviceGet
==1372== Unified Memory profiling result:
Device "Tesla V100-SXM2-16GB (0)"
Count Avg Size Min Size Max Size Total Size Total Time Name
2349 20.924KB 4.0000KB 648.00KB 48.00000MB 10.44067ms Host To Device
4 32.000KB 4.0000KB 60.000KB 128.0000KB 17.15200us Device To Host
7 - - - - 15.48042ms Gpu page fault groups
Total CPU Page faults: 146
***********************************
*** Prefetching to Device ***
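(The change at this stage, judging by the single cudaMemPrefetchAsync
call recorded below, is prefetching one of the managed vectors to the
GPU before the launch, i.e. something like

    cudaMemPrefetchAsync(a, size, deviceId);

so its pages migrate in bulk instead of faulting on demand. The final
main() below prefetches all three vectors.)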
==1318== Profiling application: ./saxpy
==1318== Profiling result:
Type Time(%) Time Calls Avg Min Max Name
GPU activities: 100.00% 8.0467ms 1 8.0467ms 8.0467ms 8.0467ms saxpy(int*, int*, int*)
API calls: 92.61% 164.93ms 3 54.977ms 23.943us 164.88ms cudaMallocManaged
5.00% 8.9075ms 1 8.9075ms 8.9075ms 8.9075ms cudaDeviceSynchronize
1.20% 2.1337ms 3 711.24us 696.87us 728.92us cudaFree
0.84% 1.4973ms 1 1.4973ms 1.4973ms 1.4973ms cudaMemPrefetchAsync
0.14% 247.00us 1 247.00us 247.00us 247.00us cuDeviceTotalMem
0.12% 214.18us 94 2.2780us 287ns 71.352us cuDeviceGetAttribute
0.07% 129.07us 1 129.07us 129.07us 129.07us cudaLaunch
0.01% 16.381us 1 16.381us 16.381us 16.381us cuDeviceGetName
0.00% 7.1160us 3 2.3720us 409ns 6.0360us cudaSetupArgument
0.00% 4.7940us 1 4.7940us 4.7940us 4.7940us cudaGetDevice
0.00% 2.7170us 3 905ns 325ns 1.5530us cuDeviceGetCount
0.00% 1.5860us 2 793ns 375ns 1.2110us cuDeviceGet
0.00% 1.5270us 1 1.5270us 1.5270us 1.5270us cudaConfigureCall
==1318== Unified Memory profiling result:
Device "Tesla V100-SXM2-16GB (0)"
Count Avg Size Min Size Max Size Total Size Total Time Name
1534 32.041KB 4.0000KB 2.0000MB 48.00000MB 8.485440ms Host To Device
4 32.000KB 4.0000KB 60.000KB 128.0000KB 17.37600us Device To Host
5 - - - - 8.955392ms Gpu page fault groups
Total CPU Page faults: 146
*****************************
*** Prefetch back to Host ***
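(Two cudaMemPrefetchAsync calls now: the device prefetch plus a prefetch
of results back to the CPU, presumably

    cudaMemPrefetchAsync(c, size, cudaCpuDeviceId);

which is why the Device To Host traffic below moves as a handful of
up-to-2MB migrations instead of small fault-driven copies.)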
==1426== Profiling application: ./saxpy
==1426== Profiling result:
Type Time(%) Time Calls Avg Min Max Name
GPU activities: 100.00% 8.0743ms 1 8.0743ms 8.0743ms 8.0743ms saxpy(int*, int*, int*)
API calls: 91.62% 161.66ms 3 53.888ms 16.200us 161.61ms cudaMallocManaged
5.05% 8.9063ms 1 8.9063ms 8.9063ms 8.9063ms cudaDeviceSynchronize
1.74% 3.0698ms 2 1.5349ms 1.5340ms 1.5358ms cudaMemPrefetchAsync
1.24% 2.1957ms 3 731.91us 697.31us 776.04us cudaFree
0.14% 246.96us 1 246.96us 246.96us 246.96us cuDeviceTotalMem
0.12% 211.51us 94 2.2500us 286ns 69.702us cuDeviceGetAttribute
0.07% 126.57us 1 126.57us 126.57us 126.57us cudaLaunch
0.01% 16.835us 1 16.835us 16.835us 16.835us cuDeviceGetName
0.00% 6.5720us 3 2.1900us 393ns 5.6740us cudaSetupArgument
0.00% 5.3160us 1 5.3160us 5.3160us 5.3160us cudaGetDevice
0.00% 2.7870us 3 929ns 277ns 1.6330us cuDeviceGetCount
0.00% 1.5030us 1 1.5030us 1.5030us 1.5030us cudaConfigureCall
0.00% 1.3250us 2 662ns 328ns 997ns cuDeviceGet
==1426== Unified Memory profiling result:
Device "Tesla V100-SXM2-16GB (0)"
Count Avg Size Min Size Max Size Total Size Total Time Name
1570 31.307KB 4.0000KB 2.0000MB 48.00000MB 8.613984ms Host To Device
12 1.3438MB 4.0000KB 2.0000MB 16.12500MB 1.348320ms Device To Host
6 - - - - 10.80938ms Gpu page fault groups
Total CPU Page faults: 146
**************************
*** Using GPU props for configs + init on GPU ***
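(Initialization now happens on the GPU in the init kernel, with the
launch configuration taken from cudaGetDeviceProperties, so the Host To
Device traffic disappears entirely; the six cudaMemPrefetchAsync calls
are the three device prefetches plus the three prefetches back to the
host, as in main() below.)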
==2206== Profiling application: ./saxpy
==2206== Profiling result:
Type Time(%) Time Calls Avg Min Max Name
GPU activities: 98.75% 4.9893ms 1 4.9893ms 4.9893ms 4.9893ms init(int*, int*, int*)
1.25% 63.358us 1 63.358us 63.358us 63.358us saxpy(int*, int*, int*)
API calls: 91.51% 159.09ms 3 53.031ms 19.148us 159.04ms cudaMallocManaged
3.48% 6.0512ms 6 1.0085ms 9.1330us 1.7907ms cudaMemPrefetchAsync
2.91% 5.0628ms 2 2.5314ms 68.505us 4.9943ms cudaDeviceSynchronize
1.60% 2.7738ms 3 924.61us 909.12us 940.08us cudaFree
0.14% 246.63us 1 246.63us 246.63us 246.63us cuDeviceTotalMem
0.12% 208.38us 1 208.38us 208.38us 208.38us cudaGetDeviceProperties
0.12% 207.78us 94 2.2100us 267ns 69.321us cuDeviceGetAttribute
0.11% 186.28us 2 93.140us 89.779us 96.501us cudaLaunch
0.01% 16.105us 1 16.105us 16.105us 16.105us cuDeviceGetName
0.00% 3.8030us 6 633ns 252ns 1.5990us cudaSetupArgument
0.00% 2.2580us 1 2.2580us 2.2580us 2.2580us cudaGetDevice
0.00% 2.1740us 3 724ns 256ns 1.2250us cuDeviceGetCount
0.00% 1.9630us 2 981ns 560ns 1.4030us cudaConfigureCall
0.00% 1.1520us 2 576ns 347ns 805ns cuDeviceGet
==2206== Unified Memory profiling result:
Device "Tesla V100-SXM2-16GB (0)"
Count Avg Size Min Size Max Size Total Size Total Time Name
24 2.0000MB 2.0000MB 2.0000MB 48.00000MB 3.998144ms Device To Host
25 - - - - 5.183680ms Gpu page fault groups
**************************
*** Using strides and SM num ***
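(The stage title points at the grid-stride loop in saxpy plus a block
count keyed to the number of SMs; note that main() below actually uses
props.warpSize * 32 blocks, whereas a configuration tied to the SM count
would presumably read props.multiProcessorCount. Either way, saxpy now
profiles at ~23us, comfortably under the 50us target.)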
==2303== Profiling application: ./saxpy
==2303== Profiling result:
Type Time(%) Time Calls Avg Min Max Name
GPU activities: 98.35% 1.3869ms 1 1.3869ms 1.3869ms 1.3869ms init(int*, int*, int*)
1.65% 23.199us 1 23.199us 23.199us 23.199us saxpy(int*, int*, int*)
API calls: 93.00% 163.44ms 3 54.479ms 17.125us 163.38ms cudaMallocManaged
3.82% 6.7140ms 6 1.1190ms 12.456us 1.7746ms cudaMemPrefetchAsync
1.35% 2.3701ms 2 1.1850ms 978.79us 1.3913ms cudaDeviceSynchronize
1.32% 2.3198ms 3 773.28us 751.62us 791.74us cudaFree
0.14% 246.41us 1 246.41us 246.41us 246.41us cuDeviceTotalMem
0.12% 210.68us 1 210.68us 210.68us 210.68us cudaGetDeviceProperties
0.12% 209.46us 94 2.2280us 266ns 71.233us cuDeviceGetAttribute
0.11% 201.84us 2 100.92us 98.129us 103.71us cudaLaunch
0.01% 15.606us 1 15.606us 15.606us 15.606us cuDeviceGetName
0.00% 3.7930us 6 632ns 253ns 1.7190us cudaSetupArgument
0.00% 2.5320us 3 844ns 316ns 1.3630us cuDeviceGetCount
0.00% 2.4480us 1 2.4480us 2.4480us 2.4480us cudaGetDevice
0.00% 1.9920us 2 996ns 604ns 1.3880us cudaConfigureCall
0.00% 1.2850us 2 642ns 319ns 966ns cuDeviceGet
==2303== Unified Memory profiling result:
Device "Tesla V100-SXM2-16GB (0)"
Count Avg Size Min Size Max Size Total Size Total Time Name
24 2.0000MB 2.0000MB 2.0000MB 48.00000MB 3.998368ms Device To Host
7 - - - - 1.367936ms Gpu page fault groups
*/
int main()
{
    int *a, *b, *c;
    int size = N * sizeof(int); // The total number of bytes per vector
    cudaMallocManaged(&a, size);
    cudaMallocManaged(&b, size);
    cudaMallocManaged(&c, size);

    int deviceId;
    cudaGetDevice(&deviceId);
    cudaDeviceProp props;
    cudaGetDeviceProperties(&props, deviceId);

    int threads_per_block = props.maxThreadsPerBlock;
    int number_of_blocks = props.warpSize * 32;

    // Initialize memory
    init<<<number_of_blocks, threads_per_block>>>(a, b, c);
    cudaError_t asyncErr = cudaDeviceSynchronize();
    if (asyncErr != cudaSuccess) printf("Error: %s\n", cudaGetErrorString(asyncErr));

    cudaMemPrefetchAsync(a, size, deviceId);
    cudaMemPrefetchAsync(b, size, deviceId);
    cudaMemPrefetchAsync(c, size, deviceId);

    saxpy<<<number_of_blocks, threads_per_block>>>(a, b, c);
    asyncErr = cudaDeviceSynchronize();
    if (asyncErr != cudaSuccess) printf("Error: %s\n", cudaGetErrorString(asyncErr));

    cudaMemPrefetchAsync(a, size, cudaCpuDeviceId);
    cudaMemPrefetchAsync(b, size, cudaCpuDeviceId);
    cudaMemPrefetchAsync(c, size, cudaCpuDeviceId);

    // Print out the first and last 5 values of c for a quality check
    for (int i = 0; i < 5; ++i)
        printf("c[%d] = %d, ", i, c[i]);
    printf("\n");
    for (int i = N - 5; i < N; ++i)
        printf("c[%d] = %d, ", i, c[i]);
    printf("\n");

    cudaFree(a); cudaFree(b); cudaFree(c);
}
@manicely6005

I was running your code and noticed both of your CUDA kernels have errors.

  1. init - You are launching it with a grid size based on the warp size, not on the size of N, as if you intended grid-stride looping, but you don't loop in the kernel.

  2. saxpy - You are trying to use grid-stride looping, but inside your for loop you are using tid as the index. Remember that it is i that increases, not tid.

Use the following:

```c
__global__ void init(int *a, int *b, int *c)
{
    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    int stride = blockDim.x * gridDim.x;

    for (int i = tid; i < N; i += stride)
    {
        a[i] = 2;
        b[i] = 1;
        c[i] = 0;
    }
}

__global__ void saxpy(int *a, int *b, int *c)
{
    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    int stride = blockDim.x * gridDim.x;

    for (int i = tid; i < N; i += stride)
        c[i] = 2 * a[i] + b[i];
}
```
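With these grid-stride versions, the launch configuration no longer has to cover all of N exactly. A minimal sketch of a matching launch, assuming a, b and c have been allocated with cudaMallocManaged as in the gist's main(); the block and grid sizes here are illustrative choices, not taken from the gist:

```c
int deviceId;
cudaGetDevice(&deviceId);
cudaDeviceProp props;
cudaGetDeviceProperties(&props, deviceId);

// Illustrative sizing: enough blocks to give every SM work; the
// grid-stride loop in the kernels covers whatever remains of N.
int threads_per_block = 256;
int number_of_blocks = 32 * props.multiProcessorCount;

// Both kernels run on the default stream, so saxpy waits for init
// automatically; one sync at the end is enough before reading c.
init<<<number_of_blocks, threads_per_block>>>(a, b, c);
saxpy<<<number_of_blocks, threads_per_block>>>(a, b, c);
cudaDeviceSynchronize();
```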
