sithhell/gist:ebdfcd0c7ac6ae626163a1fe366ac269 Secret

## gistfile1.txt
[17:45:13]:heller@daint104:/users/heller:1:$ /users/biddisco/apps/daint/llvm/bin/clang++ --cuda-path=$CUDATOOLKIT_HOME hello.cu -L/opt/nvidia/cudatoolkit8.0/8.0.54_2.2.8_ga620558-2.1/lib64 -L/opt/nvidia/cudatoolkit8.0/8.0.54_2.2.8_ga620558-2.1/extras/CUPTI/lib64 -Wl,--as-needed -Wl,-lcupti -Wl,-lcudart -Wl,--no-as-needed -L/opt/cray/nvidia/default/lib64  -L/opt/cray/nvidia/default/lib64 -lcuda
[17:45:42]:heller@daint104:/users/heller:0:$
[17:45:43]:heller@daint104:/users/heller:0:$
[17:45:43]:heller@daint104:/users/heller:0:$
[17:45:43]:heller@daint104:/users/heller:0:$ ./a.out
Hello Hello
[17:45:47]:heller@daint104:/users/heller:0:$ cat hello.cu


// This is the REAL "hello world" for CUDA!
// It takes the string "Hello ", prints it, then passes it to CUDA with an array
// of offsets. Then the offsets are added in parallel to produce the string "World!"
// By Ingemar Ragnemalm 2010

#include <stdio.h>

const int N = 16;
const int blocksize = 16;

__global__
void hello(char *a, int *b)
{
    a[threadIdx.x] += b[threadIdx.x];
}

int main()
{
    char a[N] = "Hello \0\0\0\0\0\0";
    int b[N] = {15, 10, 6, 0, -11, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};

    char *ad;
    int *bd;
    const int csize = N*sizeof(char);
    const int isize = N*sizeof(int);

    printf("%s", a);

    cudaMalloc( (void**)&ad, csize );
    cudaMalloc( (void**)&bd, isize );
    cudaMemcpy( ad, a, csize, cudaMemcpyHostToDevice );
    cudaMemcpy( bd, b, isize, cudaMemcpyHostToDevice );

    dim3 dimBlock( blocksize, 1 );
    dim3 dimGrid( 1, 1 );
    hello<<<dimGrid, dimBlock>>>(ad, bd);
    cudaMemcpy( a, ad, csize, cudaMemcpyDeviceToHost );
    cudaFree( ad );
    cudaFree( bd );

    printf("%s\n", a);
    return EXIT_SUCCESS;
}
[17:46:09]:heller@daint104:/users/heller:0:$ salloc -N 1 --partition=normal --time=00:10:00
salloc: error: You have to specify, at least, what sort of node you need: -C gpu for GPU enabled nodes, or -C mc for multicore nodes.
Other features are possible, but 'gpu' and 'mc' are exclusive.
salloc: error: Job submit/allocate failed: Requested node configuration is not available
[17:46:25]:heller@daint104:/users/heller:1:$ salloc -C gpu -N 1 --partition=normal --time=00:10:00
salloc: Pending job allocation 3416139
salloc: job 3416139 queued and waiting for resources
salloc: job 3416139 has been allocated resources
salloc: Granted job allocation 3416139
ModuleCmd_Switch.c(179):ERROR:152: Module 'PrgEnv-cray' is currently not loaded
[17:47:08]:heller@daint104:/users/heller:0:$ srun ./a.out
Hello World!
[17:47:23]:heller@daint104:/users/heller:0:$
	[17:45:13]:heller@daint104:/users/heller:1:$ /users/biddisco/apps/daint/llvm/bin/clang++ --cuda-path=$CUDATOOLKIT_HOME hello.cu -L/opt/nvidia/cudatoolkit8.0/8.0.54_2.2.8_ga620558-2.1/lib64 -L/opt/nvidia/cudatoolkit8.0/8.0.54_2.2.8_ga620558-2.1/extras/CUPTI/lib64 -Wl,--as-needed -Wl,-lcupti -Wl,-lcudart -Wl,--no-as-needed -L/opt/cray/nvidia/default/lib64 -L/opt/cray/nvidia/default/lib64 -lcuda
	[17:45:42]:heller@daint104:/users/heller:0:$
	[17:45:43]:heller@daint104:/users/heller:0:$
	[17:45:43]:heller@daint104:/users/heller:0:$
	[17:45:43]:heller@daint104:/users/heller:0:$ ./a.out
	Hello Hello
	[17:45:47]:heller@daint104:/users/heller:0:$ cat hello.cu


	// This is the REAL "hello world" for CUDA!
	// It takes the string "Hello ", prints it, then passes it to CUDA with an array
	// of offsets. Then the offsets are added in parallel to produce the string "World!"
	// By Ingemar Ragnemalm 2010

	#include <stdio.h>

	const int N = 16;
	const int blocksize = 16;

	__global__
	void hello(char a, int b)
	{
	a[threadIdx.x] += b[threadIdx.x];
	}

	int main()
	{
	char a[N] = "Hello \0\0\0\0\0\0";
	int b[N] = {15, 10, 6, 0, -11, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};

	char *ad;
	int *bd;
	const int csize = N*sizeof(char);
	const int isize = N*sizeof(int);

	printf("%s", a);

	cudaMalloc( (void**)&ad, csize );
	cudaMalloc( (void**)&bd, isize );
	cudaMemcpy( ad, a, csize, cudaMemcpyHostToDevice );
	cudaMemcpy( bd, b, isize, cudaMemcpyHostToDevice );

	dim3 dimBlock( blocksize, 1 );
	dim3 dimGrid( 1, 1 );
	hello<<<dimGrid, dimBlock>>>(ad, bd);
	cudaMemcpy( a, ad, csize, cudaMemcpyDeviceToHost );
	cudaFree( ad );
	cudaFree( bd );

	printf("%s\n", a);
	return EXIT_SUCCESS;
	}
	[17:46:09]:heller@daint104:/users/heller:0:$ salloc -N 1 --partition=normal --time=00:10:00
	salloc: error: You have to specify, at least, what sort of node you need: -C gpu for GPU enabled nodes, or -C mc for multicore nodes.
	Other features are possible, but 'gpu' and 'mc' are exclusive.
	salloc: error: Job submit/allocate failed: Requested node configuration is not available
	[17:46:25]:heller@daint104:/users/heller:1:$ salloc -C gpu -N 1 --partition=normal --time=00:10:00
	salloc: Pending job allocation 3416139
	salloc: job 3416139 queued and waiting for resources
	salloc: job 3416139 has been allocated resources
	salloc: Granted job allocation 3416139
	ModuleCmd_Switch.c(179):ERROR:152: Module 'PrgEnv-cray' is currently not loaded
	[17:47:08]:heller@daint104:/users/heller:0:$ srun ./a.out
	Hello World!
	[17:47:23]:heller@daint104:/users/heller:0:$