@andreinechaev
Created April 1, 2018 23:17
#include <stdio.h>
#define N 2048 * 2048 // Number of elements in each vector
/*
* Optimize this already-accelerated codebase. Work iteratively,
* and use nvprof to support your work.
*
* Aim to get `saxpy` (without modifying `N`) running in under
* 50us, as measured by the profiler.
*
* Some bugs have been placed in this codebase for your edification.
*/
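/*
 * For reference, profiles like the ones below can be reproduced with an
 * invocation along these lines (the compile flags are an assumption; the
 * profiler output only shows the binary name ./saxpy, and sm_70 matches
 * the Tesla V100 reported by nvprof):
 *
 *   nvcc -arch=sm_70 -o saxpy saxpy.cu
 *   nvprof ./saxpy
 */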
__global__ void init(int *a, int *b, int *c) {
    int i = threadIdx.x + blockIdx.x * blockDim.x;
    a[i] = 2;
    b[i] = 1;
    c[i] = 0;
}
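// Note: init writes one element per thread and has no grid-stride loop,
// so it only covers all of N if gridDim.x * blockDim.x >= N.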
__global__ void saxpy(int *a, int *b, int *c)
{
    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    int stride = blockDim.x * gridDim.x;
    for (int i = tid; i < N; i += stride)
        c[tid] = 2 * a[tid] + b[tid];
}
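// (See the discussion at the bottom of the page: the loop advances i,
// but the body still indexes with tid.)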
/*
*** Initial Solution ***
==1372== Profiling application: ./saxpy
==1372== Profiling result:
Type Time(%) Time Calls Avg Min Max Name
GPU activities: 100.00% 12.031ms 1 12.031ms 12.031ms 12.031ms saxpy(int*, int*, int*)
API calls: 91.44% 159.38ms 3 53.127ms 37.803us 159.27ms cudaMallocManaged
6.91% 12.037ms 1 12.037ms 12.037ms 12.037ms cudaDeviceSynchronize
1.31% 2.2753ms 3 758.44us 743.90us 785.92us cudaFree
0.14% 247.27us 1 247.27us 247.27us 247.27us cuDeviceTotalMem
0.12% 209.19us 94 2.2250us 265ns 70.087us cuDeviceGetAttribute
0.07% 116.13us 1 116.13us 116.13us 116.13us cudaLaunch
0.01% 16.413us 1 16.413us 16.413us 16.413us cuDeviceGetName
0.00% 7.2780us 3 2.4260us 339ns 6.3490us cudaSetupArgument
0.00% 2.7880us 3 929ns 298ns 1.6590us cuDeviceGetCount
0.00% 2.5550us 1 2.5550us 2.5550us 2.5550us cudaConfigureCall
0.00% 1.2180us 2 609ns 308ns 910ns cuDeviceGet
==1372== Unified Memory profiling result:
Device "Tesla V100-SXM2-16GB (0)"
Count Avg Size Min Size Max Size Total Size Total Time Name
2349 20.924KB 4.0000KB 648.00KB 48.00000MB 10.44067ms Host To Device
4 32.000KB 4.0000KB 60.000KB 128.0000KB 17.15200us Device To Host
7 - - - - 15.48042ms Gpu page fault groups
Total CPU Page faults: 146
***********************************
*** Prefetching to Device ***
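(The change at this stage, judging by the single cudaMemPrefetchAsync
call recorded below, is prefetching one of the managed vectors to the
GPU before the launch, i.e. something like

    cudaMemPrefetchAsync(a, size, deviceId);

so its pages migrate in bulk instead of faulting on demand. The final
main() below prefetches all three vectors.)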
==1318== Profiling application: ./saxpy
==1318== Profiling result:
Type Time(%) Time Calls Avg Min Max Name
GPU activities: 100.00% 8.0467ms 1 8.0467ms 8.0467ms 8.0467ms saxpy(int*, int*, int*)
API calls: 92.61% 164.93ms 3 54.977ms 23.943us 164.88ms cudaMallocManaged
5.00% 8.9075ms 1 8.9075ms 8.9075ms 8.9075ms cudaDeviceSynchronize
1.20% 2.1337ms 3 711.24us 696.87us 728.92us cudaFree
0.84% 1.4973ms 1 1.4973ms 1.4973ms 1.4973ms cudaMemPrefetchAsync
0.14% 247.00us 1 247.00us 247.00us 247.00us cuDeviceTotalMem
0.12% 214.18us 94 2.2780us 287ns 71.352us cuDeviceGetAttribute
0.07% 129.07us 1 129.07us 129.07us 129.07us cudaLaunch
0.01% 16.381us 1 16.381us 16.381us 16.381us cuDeviceGetName
0.00% 7.1160us 3 2.3720us 409ns 6.0360us cudaSetupArgument
0.00% 4.7940us 1 4.7940us 4.7940us 4.7940us cudaGetDevice
0.00% 2.7170us 3 905ns 325ns 1.5530us cuDeviceGetCount
0.00% 1.5860us 2 793ns 375ns 1.2110us cuDeviceGet
0.00% 1.5270us 1 1.5270us 1.5270us 1.5270us cudaConfigureCall
==1318== Unified Memory profiling result:
Device "Tesla V100-SXM2-16GB (0)"
Count Avg Size Min Size Max Size Total Size Total Time Name
1534 32.041KB 4.0000KB 2.0000MB 48.00000MB 8.485440ms Host To Device
4 32.000KB 4.0000KB 60.000KB 128.0000KB 17.37600us Device To Host
5 - - - - 8.955392ms Gpu page fault groups
Total CPU Page faults: 146
*****************************
*** Prefetch back to Host ***
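(Two cudaMemPrefetchAsync calls now: the device prefetch plus a prefetch
of results back to the CPU, presumably

    cudaMemPrefetchAsync(c, size, cudaCpuDeviceId);

which is why the Device To Host traffic below moves as a handful of
up-to-2MB migrations instead of small fault-driven copies.)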
==1426== Profiling application: ./saxpy
==1426== Profiling result:
Type Time(%) Time Calls Avg Min Max Name
GPU activities: 100.00% 8.0743ms 1 8.0743ms 8.0743ms 8.0743ms saxpy(int*, int*, int*)
API calls: 91.62% 161.66ms 3 53.888ms 16.200us 161.61ms cudaMallocManaged
5.05% 8.9063ms 1 8.9063ms 8.9063ms 8.9063ms cudaDeviceSynchronize
1.74% 3.0698ms 2 1.5349ms 1.5340ms 1.5358ms cudaMemPrefetchAsync
1.24% 2.1957ms 3 731.91us 697.31us 776.04us cudaFree
0.14% 246.96us 1 246.96us 246.96us 246.96us cuDeviceTotalMem
0.12% 211.51us 94 2.2500us 286ns 69.702us cuDeviceGetAttribute
0.07% 126.57us 1 126.57us 126.57us 126.57us cudaLaunch
0.01% 16.835us 1 16.835us 16.835us 16.835us cuDeviceGetName
0.00% 6.5720us 3 2.1900us 393ns 5.6740us cudaSetupArgument
0.00% 5.3160us 1 5.3160us 5.3160us 5.3160us cudaGetDevice
0.00% 2.7870us 3 929ns 277ns 1.6330us cuDeviceGetCount
0.00% 1.5030us 1 1.5030us 1.5030us 1.5030us cudaConfigureCall
0.00% 1.3250us 2 662ns 328ns 997ns cuDeviceGet
==1426== Unified Memory profiling result:
Device "Tesla V100-SXM2-16GB (0)"
Count Avg Size Min Size Max Size Total Size Total Time Name
1570 31.307KB 4.0000KB 2.0000MB 48.00000MB 8.613984ms Host To Device
12 1.3438MB 4.0000KB 2.0000MB 16.12500MB 1.348320ms Device To Host
6 - - - - 10.80938ms Gpu page fault groups
Total CPU Page faults: 146
**************************
*** Using GPU props for configs + init on GPU ***
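(Initialization now happens on the GPU in the init kernel, with the
launch configuration taken from cudaGetDeviceProperties, so the Host To
Device traffic disappears entirely; the six cudaMemPrefetchAsync calls
are the three device prefetches plus the three prefetches back to the
host, as in main() below.)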
==2206== Profiling application: ./saxpy
==2206== Profiling result:
Type Time(%) Time Calls Avg Min Max Name
GPU activities: 98.75% 4.9893ms 1 4.9893ms 4.9893ms 4.9893ms init(int*, int*, int*)
1.25% 63.358us 1 63.358us 63.358us 63.358us saxpy(int*, int*, int*)
API calls: 91.51% 159.09ms 3 53.031ms 19.148us 159.04ms cudaMallocManaged
3.48% 6.0512ms 6 1.0085ms 9.1330us 1.7907ms cudaMemPrefetchAsync
2.91% 5.0628ms 2 2.5314ms 68.505us 4.9943ms cudaDeviceSynchronize
1.60% 2.7738ms 3 924.61us 909.12us 940.08us cudaFree
0.14% 246.63us 1 246.63us 246.63us 246.63us cuDeviceTotalMem
0.12% 208.38us 1 208.38us 208.38us 208.38us cudaGetDeviceProperties
0.12% 207.78us 94 2.2100us 267ns 69.321us cuDeviceGetAttribute
0.11% 186.28us 2 93.140us 89.779us 96.501us cudaLaunch
0.01% 16.105us 1 16.105us 16.105us 16.105us cuDeviceGetName
0.00% 3.8030us 6 633ns 252ns 1.5990us cudaSetupArgument
0.00% 2.2580us 1 2.2580us 2.2580us 2.2580us cudaGetDevice
0.00% 2.1740us 3 724ns 256ns 1.2250us cuDeviceGetCount
0.00% 1.9630us 2 981ns 560ns 1.4030us cudaConfigureCall
0.00% 1.1520us 2 576ns 347ns 805ns cuDeviceGet
==2206== Unified Memory profiling result:
Device "Tesla V100-SXM2-16GB (0)"
Count Avg Size Min Size Max Size Total Size Total Time Name
24 2.0000MB 2.0000MB 2.0000MB 48.00000MB 3.998144ms Device To Host
25 - - - - 5.183680ms Gpu page fault groups
**************************
*** Using strides and SM num ***
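(The stage title points at the grid-stride loop in saxpy plus a block
count keyed to the number of SMs; note that main() below actually uses
props.warpSize * 32 blocks, whereas a configuration tied to the SM count
would presumably read props.multiProcessorCount. Either way, saxpy now
profiles at ~23us, comfortably under the 50us target.)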
==2303== Profiling application: ./saxpy
==2303== Profiling result:
Type Time(%) Time Calls Avg Min Max Name
GPU activities: 98.35% 1.3869ms 1 1.3869ms 1.3869ms 1.3869ms init(int*, int*, int*)
1.65% 23.199us 1 23.199us 23.199us 23.199us saxpy(int*, int*, int*)
API calls: 93.00% 163.44ms 3 54.479ms 17.125us 163.38ms cudaMallocManaged
3.82% 6.7140ms 6 1.1190ms 12.456us 1.7746ms cudaMemPrefetchAsync
1.35% 2.3701ms 2 1.1850ms 978.79us 1.3913ms cudaDeviceSynchronize
1.32% 2.3198ms 3 773.28us 751.62us 791.74us cudaFree
0.14% 246.41us 1 246.41us 246.41us 246.41us cuDeviceTotalMem
0.12% 210.68us 1 210.68us 210.68us 210.68us cudaGetDeviceProperties
0.12% 209.46us 94 2.2280us 266ns 71.233us cuDeviceGetAttribute
0.11% 201.84us 2 100.92us 98.129us 103.71us cudaLaunch
0.01% 15.606us 1 15.606us 15.606us 15.606us cuDeviceGetName
0.00% 3.7930us 6 632ns 253ns 1.7190us cudaSetupArgument
0.00% 2.5320us 3 844ns 316ns 1.3630us cuDeviceGetCount
0.00% 2.4480us 1 2.4480us 2.4480us 2.4480us cudaGetDevice
0.00% 1.9920us 2 996ns 604ns 1.3880us cudaConfigureCall
0.00% 1.2850us 2 642ns 319ns 966ns cuDeviceGet
==2303== Unified Memory profiling result:
Device "Tesla V100-SXM2-16GB (0)"
Count Avg Size Min Size Max Size Total Size Total Time Name
24 2.0000MB 2.0000MB 2.0000MB 48.00000MB 3.998368ms Device To Host
7 - - - - 1.367936ms Gpu page fault groups
*/
int main()
{
    int *a, *b, *c;
    int size = N * sizeof(int); // The total number of bytes per vector
    cudaMallocManaged(&a, size);
    cudaMallocManaged(&b, size);
    cudaMallocManaged(&c, size);

    int deviceId;
    cudaGetDevice(&deviceId);
    cudaDeviceProp props;
    cudaGetDeviceProperties(&props, deviceId);

    int threads_per_block = props.maxThreadsPerBlock;
    int number_of_blocks = props.warpSize * 32;

    // Initialize memory
    init<<<number_of_blocks, threads_per_block>>>(a, b, c);
    cudaError_t asyncErr = cudaDeviceSynchronize();
    if (asyncErr != cudaSuccess) printf("Error: %s\n", cudaGetErrorString(asyncErr));

    cudaMemPrefetchAsync(a, size, deviceId);
    cudaMemPrefetchAsync(b, size, deviceId);
    cudaMemPrefetchAsync(c, size, deviceId);

    saxpy<<<number_of_blocks, threads_per_block>>>(a, b, c);
    asyncErr = cudaDeviceSynchronize();
    if (asyncErr != cudaSuccess) printf("Error: %s\n", cudaGetErrorString(asyncErr));

    cudaMemPrefetchAsync(a, size, cudaCpuDeviceId);
    cudaMemPrefetchAsync(b, size, cudaCpuDeviceId);
    cudaMemPrefetchAsync(c, size, cudaCpuDeviceId);

    // Print out the first and last 5 values of c for a quality check
    for (int i = 0; i < 5; ++i)
        printf("c[%d] = %d, ", i, c[i]);
    printf("\n");
    for (int i = N - 5; i < N; ++i)
        printf("c[%d] = %d, ", i, c[i]);
    printf("\n");

    cudaFree(a); cudaFree(b); cudaFree(c);
}
@manicely6005

I was running your code and noticed both of your CUDA kernels have errors.

  1. init - You are launching it with a grid size based on the warp size, not on the size of N, as if you intended grid-stride looping, but you don't loop in the kernel.

  2. saxpy - You are trying to use grid-stride looping, but inside your for loop you are using tid as the index. Remember that it is i that increases, not tid.

Use the following:

```c
__global__ void init(int *a, int *b, int *c)
{
    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    int stride = blockDim.x * gridDim.x;

    for (int i = tid; i < N; i += stride)
    {
        a[i] = 2;
        b[i] = 1;
        c[i] = 0;
    }
}

__global__ void saxpy(int *a, int *b, int *c)
{
    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    int stride = blockDim.x * gridDim.x;

    for (int i = tid; i < N; i += stride)
        c[i] = 2 * a[i] + b[i];
}
```
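With these grid-stride versions, the launch configuration no longer has to cover all of N exactly. A minimal sketch of a matching launch, assuming a, b and c have been allocated with cudaMallocManaged as in the gist's main(); the block and grid sizes here are illustrative choices, not taken from the gist:

```c
int deviceId;
cudaGetDevice(&deviceId);
cudaDeviceProp props;
cudaGetDeviceProperties(&props, deviceId);

// Illustrative sizing: enough blocks to give every SM work; the
// grid-stride loop in the kernels covers whatever remains of N.
int threads_per_block = 256;
int number_of_blocks = 32 * props.multiProcessorCount;

// Both kernels run on the default stream, so saxpy waits for init
// automatically; one sync at the end is enough before reading c.
init<<<number_of_blocks, threads_per_block>>>(a, b, c);
saxpy<<<number_of_blocks, threads_per_block>>>(a, b, c);
cudaDeviceSynchronize();
```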
