nwh/output

## output
kernel_1: thread 0 writing 0 to shared memory.
kernel_1: thread 1 writing 1 to shared memory.
kernel_1: thread 2 writing 2 to shared memory.
kernel_1: thread 3 writing 3 to shared memory.
kernel_1: thread 4 writing 4 to shared memory.
kernel_2: thread 0 reading 0 from shared memory.
kernel_2: thread 1 reading 1 from shared memory.
kernel_2: thread 2 reading 2 from shared memory.
kernel_2: thread 3 reading 3 from shared memory.
kernel_2: thread 4 reading 4 from shared memory.
--- end of persisten-shared-mem ---

## persistent-shared-mem.cu
#include <iostream>
#include <cstdio>

// kernel_1: every thread stores its global thread index into the block's
// dynamic shared memory, at the slot selected by threadIdx.x, and reports
// the value it stored via device-side printf.
//
// Launch requirement: the third launch parameter must provide at least
// blockDim.x * sizeof(int) bytes of dynamic shared memory.
__global__
void kernel_1() {
  extern __shared__ int smem[];

  // Global index of this thread across the whole (1D) grid.
  const int id = threadIdx.x + blockIdx.x * blockDim.x;

  // The stored value is simply the thread's own global index.
  smem[threadIdx.x] = id;
  printf("kernel_1: thread %d writing %d to shared memory.\n",id,id);
}

// kernel_2: every thread loads the int found in the block's dynamic shared
// memory at slot threadIdx.x and prints it next to its global thread index.
//
// NOTE: this kernel never writes smem before reading it, so the value it
// prints is whatever happens to be in that memory. That is the purpose of
// the experiment; the result must not be relied upon.
//
// Launch requirement: the third launch parameter must provide at least
// blockDim.x * sizeof(int) bytes of dynamic shared memory.
__global__
void kernel_2() {
  extern __shared__ int smem[];

  // Global index of this thread across the whole (1D) grid.
  const int id = threadIdx.x + blockIdx.x * blockDim.x;

  // Deliberately read without any prior write in this kernel.
  const int seen = smem[threadIdx.x];
  printf("kernel_2: thread %d reading %d from shared memory.\n",id,seen);
}

// Launches kernel_1 followed by kernel_2, each with one block of five
// threads and enough dynamic shared memory for one int per thread, then
// waits for the device to finish and prints a closing banner.
//
// Returns 0 on success. Returns 1 (after printing a diagnostic to stderr)
// if either launch is rejected or the device reports an error while the
// kernels execute; in that case the closing banner is not printed.
int main() {
  using std::cerr;
  using std::cout;
  using std::endl;

  const int num_block = 1;
  const int num_thread = 5;
  // One int of dynamic shared memory per thread in the block.
  const size_t smem_bytes = num_thread * sizeof(int);

  kernel_1<<<num_block,num_thread,smem_bytes>>>();
  // A kernel launch returns nothing; configuration errors are only
  // visible through cudaGetLastError().
  cudaError_t err = cudaGetLastError();
  if (err != cudaSuccess) {
    cerr << "kernel_1 launch failed: " << cudaGetErrorString(err) << endl;
    return 1;
  }

  kernel_2<<<num_block,num_thread,smem_bytes>>>();
  err = cudaGetLastError();
  if (err != cudaSuccess) {
    cerr << "kernel_2 launch failed: " << cudaGetErrorString(err) << endl;
    return 1;
  }

  // Wait for both kernels to complete; errors raised while they run are
  // reported by this call.
  err = cudaDeviceSynchronize();
  if (err != cudaSuccess) {
    cerr << "cudaDeviceSynchronize failed: " << cudaGetErrorString(err) << endl;
    return 1;
  }

  cout << "--- end of persisten-shared-mem ---" << endl;

  return 0;
}

## shared-memory.md

      
    Raw
  

              shared-memory.md
            
          
This simple example shows the contents of shared memory appearing to persist
between two kernel launches.  I believe this to be a lucky coincidence: the
second kernel is really reading uninitialized shared memory, and this behavior
is not guaranteed in general.
From the CUDA Programming Guide:
The __shared__ qualifier, optionally used together with __device__, declares a variable that:
* Resides in the shared memory space of a thread block,
* Has the lifetime of the block,
* Is only accessible from all the threads within the block.

Question and answer from Stack Overflow.
	kernel_1: thread 0 writing 0 to shared memory.
	kernel_1: thread 1 writing 1 to shared memory.
	kernel_1: thread 2 writing 2 to shared memory.
	kernel_1: thread 3 writing 3 to shared memory.
	kernel_1: thread 4 writing 4 to shared memory.
	kernel_2: thread 0 reading 0 from shared memory.
	kernel_2: thread 1 reading 1 from shared memory.
	kernel_2: thread 2 reading 2 from shared memory.
	kernel_2: thread 3 reading 3 from shared memory.
	kernel_2: thread 4 reading 4 from shared memory.
	--- end of persisten-shared-mem ---
	#include <iostream>
	#include <cstdio>

	// kernel_1: every thread stores its global thread index into the block's
	// dynamic shared memory, at the slot selected by threadIdx.x, and reports
	// the value it stored via device-side printf.
	//
	// Launch requirement: the third launch parameter must provide at least
	// blockDim.x * sizeof(int) bytes of dynamic shared memory.
	__global__
	void kernel_1() {
	  extern __shared__ int smem[];

	  // Global index of this thread across the whole (1D) grid.
	  const int id = threadIdx.x + blockIdx.x * blockDim.x;

	  // The stored value is simply the thread's own global index.
	  smem[threadIdx.x] = id;
	  printf("kernel_1: thread %d writing %d to shared memory.\n",id,id);
	}

	// kernel_2: every thread loads the int found in the block's dynamic shared
	// memory at slot threadIdx.x and prints it next to its global thread index.
	//
	// NOTE: this kernel never writes smem before reading it, so the value it
	// prints is whatever happens to be in that memory. That is the purpose of
	// the experiment; the result must not be relied upon.
	//
	// Launch requirement: the third launch parameter must provide at least
	// blockDim.x * sizeof(int) bytes of dynamic shared memory.
	__global__
	void kernel_2() {
	  extern __shared__ int smem[];

	  // Global index of this thread across the whole (1D) grid.
	  const int id = threadIdx.x + blockIdx.x * blockDim.x;

	  // Deliberately read without any prior write in this kernel.
	  const int seen = smem[threadIdx.x];
	  printf("kernel_2: thread %d reading %d from shared memory.\n",id,seen);
	}

	// Launches kernel_1 followed by kernel_2, each with one block of five
	// threads and enough dynamic shared memory for one int per thread, then
	// waits for the device to finish and prints a closing banner.
	//
	// Returns 0 on success. Returns 1 (after printing a diagnostic to stderr)
	// if either launch is rejected or the device reports an error while the
	// kernels execute; in that case the closing banner is not printed.
	int main() {
	  using std::cerr;
	  using std::cout;
	  using std::endl;

	  const int num_block = 1;
	  const int num_thread = 5;
	  // One int of dynamic shared memory per thread in the block.
	  const size_t smem_bytes = num_thread * sizeof(int);

	  kernel_1<<<num_block,num_thread,smem_bytes>>>();
	  // A kernel launch returns nothing; configuration errors are only
	  // visible through cudaGetLastError().
	  cudaError_t err = cudaGetLastError();
	  if (err != cudaSuccess) {
	    cerr << "kernel_1 launch failed: " << cudaGetErrorString(err) << endl;
	    return 1;
	  }

	  kernel_2<<<num_block,num_thread,smem_bytes>>>();
	  err = cudaGetLastError();
	  if (err != cudaSuccess) {
	    cerr << "kernel_2 launch failed: " << cudaGetErrorString(err) << endl;
	    return 1;
	  }

	  // Wait for both kernels to complete; errors raised while they run are
	  // reported by this call.
	  err = cudaDeviceSynchronize();
	  if (err != cudaSuccess) {
	    cerr << "cudaDeviceSynchronize failed: " << cudaGetErrorString(err) << endl;
	    return 1;
	  }

	  cout << "--- end of persisten-shared-mem ---" << endl;

	  return 0;
	}