abidrahmank/example.cu

## example.cu
#include <stdio.h>

typedef unsigned int uint;

/*
 * square: pairwise sum-then-square along each row.
 *
 * Every thread handles one adjacent pair (d_in[loc], d_in[loc+1]) inside the
 * row that belongs to its block and stores (a + b)^2 at d_out[loc/2].
 *
 * Launch layout: 1-D grid with one block per row. rows/2 threads per block
 * are sufficient; threads whose pair would start at or cross the end of the
 * row return without touching memory, so launching a larger block is safe.
 *
 * d_out : device buffer, at least gridDim.x * rows / 2 elements
 * d_in  : device buffer, at least gridDim.x * rows elements
 * rows  : number of elements per row (used as the row stride). Expected to be
 *         even; a trailing unpaired element of an odd-length row is skipped.
 */
__global__ void square(uint *d_out, uint *d_in, int rows){
	// Column of the first element of this thread's pair within the row.
	const int col = 2*threadIdx.x;

	// Guard: without it, surplus threads spill into the next row and, in the
	// last block, read and write past the end of the buffers.
	if (col + 1 >= rows) return;

	const int loc = blockIdx.x*rows + col;

	// Unsigned arithmetic: wraps on overflow instead of invoking signed-overflow UB.
	const uint f = d_in[loc] + d_in[loc+1];
	d_out[loc/2] = f*f;
}

/*
 * Host driver for the `square` kernel.
 *
 * Fills an ELEMS x ELEMS matrix with 0, 1, 2, ... in row-major order, lets the
 * GPU compute (a + b)^2 for every adjacent pair in each row, and prints the
 * ELEMS x (ELEMS/2) results one per line as "<flat index> <value>".
 *
 * Returns 0 on success, 1 if any CUDA call fails (the error is reported on
 * stderr). Waits for a key press before exiting.
 */
int main(){

	const int ELEMS = 64;            // matrix is ELEMS x ELEMS
	const int E = ELEMS/2;           // results per row: one per adjacent pair
	const size_t IN_BYTES  = sizeof(uint)*ELEMS*ELEMS;
	const size_t OUT_BYTES = sizeof(uint)*ELEMS*E;

	uint h_in[ELEMS][ELEMS];
	uint h_out[ELEMS][E];

	for (int i=0; i<ELEMS; i++){
		for (int j=0; j<ELEMS; j++){
			h_in[i][j] = uint(i*ELEMS+j);
		}
	}

	uint* d_in = NULL;
	uint* d_out = NULL;

	// Each step runs only if everything before it succeeded; the first
	// failure is kept in `err` and reported after cleanup.
	cudaError_t err = cudaMalloc((void**) &d_in, IN_BYTES);
	if (err == cudaSuccess) err = cudaMalloc((void**) &d_out, OUT_BYTES);
	if (err == cudaSuccess) err = cudaMemcpy(d_in, h_in, IN_BYTES, cudaMemcpyHostToDevice);
	if (err == cudaSuccess){
		// One block per row, one thread per pair. Launching ELEMS threads per
		// block would make the upper half of each block run past its row.
		square<<<ELEMS,E>>>(d_out, d_in, ELEMS);
		err = cudaGetLastError();    // launch-configuration errors
	}
	// Blocking copy: also surfaces errors raised while the kernel was running.
	if (err == cudaSuccess) err = cudaMemcpy(h_out, d_out, OUT_BYTES, cudaMemcpyDeviceToHost);

	// cudaFree on a null pointer is a no-op, so this is safe on every path.
	cudaFree(d_in);
	cudaFree(d_out);

	if (err != cudaSuccess){
		fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(err));
		return 1;
	}

/* ----------------- Print the result -------------------------- */

	printf("\n");

	for (int i=0; i<ELEMS; i++){
		for (int j=0; j<E; j++){
			// %u: the results are unsigned.
			printf("%d %u \n", i*E+j, h_out[i][j]);
		}
	}

	getchar();   // keep the console window open until a key is pressed

	return 0;
}
	#include <stdio.h>

	typedef unsigned int uint;

	/*
	 * square: pairwise sum-then-square along each row.
	 *
	 * Every thread handles one adjacent pair (d_in[loc], d_in[loc+1]) inside
	 * the row that belongs to its block and stores (a + b)^2 at d_out[loc/2].
	 *
	 * Launch layout: 1-D grid with one block per row. rows/2 threads per block
	 * are sufficient; threads whose pair would start at or cross the end of
	 * the row return without touching memory.
	 *
	 * d_out : device buffer, at least gridDim.x * rows / 2 elements
	 * d_in  : device buffer, at least gridDim.x * rows elements
	 * rows  : number of elements per row (used as the row stride). Expected to
	 *         be even; a trailing unpaired element of an odd row is skipped.
	 *
	 * NOTE: the pointer declarators and multiplication operators had been
	 * stripped from this copy (the '*' characters were lost); they are
	 * restored here.
	 */
	__global__ void square(uint *d_out, uint *d_in, int rows){
		// Column of the first element of this thread's pair within the row.
		const int col = 2*threadIdx.x;

		// Guard: surplus threads would otherwise spill into the next row and,
		// in the last block, access memory past the end of the buffers.
		if (col + 1 >= rows) return;

		const int loc = blockIdx.x*rows + col;

		// Unsigned arithmetic: wraps on overflow instead of signed-overflow UB.
		const uint f = d_in[loc] + d_in[loc+1];
		d_out[loc/2] = f*f;
	}

	/*
	 * Host driver for the `square` kernel.
	 *
	 * Fills an ELEMS x ELEMS matrix with 0, 1, 2, ... in row-major order, lets
	 * the GPU compute (a + b)^2 for every adjacent pair in each row, and
	 * prints the ELEMS x (ELEMS/2) results one per line as
	 * "<flat index> <value>".
	 *
	 * Returns 0 on success, 1 if any CUDA call fails (the error is reported on
	 * stderr). Waits for a key press before exiting.
	 *
	 * NOTE: the '*' operators in the size expressions had been stripped from
	 * this copy; they are restored here.
	 */
	int main(){

		const int ELEMS = 64;            // matrix is ELEMS x ELEMS
		const int E = ELEMS/2;           // results per row: one per adjacent pair
		const size_t IN_BYTES  = sizeof(uint)*ELEMS*ELEMS;
		const size_t OUT_BYTES = sizeof(uint)*ELEMS*E;

		uint h_in[ELEMS][ELEMS];
		uint h_out[ELEMS][E];

		for (int i=0; i<ELEMS; i++){
			for (int j=0; j<ELEMS; j++){
				h_in[i][j] = uint(i*ELEMS+j);
			}
		}

		uint* d_in = NULL;
		uint* d_out = NULL;

		// Each step runs only if everything before it succeeded; the first
		// failure is kept in `err` and reported after cleanup.
		cudaError_t err = cudaMalloc((void**) &d_in, IN_BYTES);
		if (err == cudaSuccess) err = cudaMalloc((void**) &d_out, OUT_BYTES);
		if (err == cudaSuccess) err = cudaMemcpy(d_in, h_in, IN_BYTES, cudaMemcpyHostToDevice);
		if (err == cudaSuccess){
			// One block per row, one thread per pair. Launching ELEMS threads
			// per block would make the upper half of each block run past its row.
			square<<<ELEMS,E>>>(d_out, d_in, ELEMS);
			err = cudaGetLastError();    // launch-configuration errors
		}
		// Blocking copy: also surfaces errors raised while the kernel was running.
		if (err == cudaSuccess) err = cudaMemcpy(h_out, d_out, OUT_BYTES, cudaMemcpyDeviceToHost);

		// cudaFree on a null pointer is a no-op, so this is safe on every path.
		cudaFree(d_in);
		cudaFree(d_out);

		if (err != cudaSuccess){
			fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(err));
			return 1;
		}

		/* ----------------- Print the result -------------------------- */

		printf("\n");

		for (int i=0; i<ELEMS; i++){
			for (int j=0; j<E; j++){
				// %u: the results are unsigned.
				printf("%d %u \n", i*E+j, h_out[i][j]);
			}
		}

		getchar();   // keep the console window open until a key is pressed

		return 0;
	}