@ehzawad
Created August 12, 2023 06:57
CUDA programming: element-wise multiplication of two 3-D arrays
#include <stdio.h>
#include <cuda_runtime.h>

#define X 3
#define Y 6
#define Z 10

// Element-wise product of two X*Y*Z arrays stored in row-major order
// (despite the name, this is not a matrix-matrix multiplication).
__global__ void multiplyMatrices(int *a, int *b, int *c) {
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int y = blockIdx.y * blockDim.y + threadIdx.y;
    int z = blockIdx.z * blockDim.z + threadIdx.z;
    if (x < X && y < Y && z < Z) {
        int idx = x * Y * Z + y * Z + z;
        c[idx] = a[idx] * b[idx];
    }
}

int main() {
    int a[X * Y * Z];
    int b[X * Y * Z];
    int c[X * Y * Z];

    // Initialize a and b with the values 1 .. X*Y*Z
    for (int i = 0; i < X * Y * Z; i++) {
        a[i] = i + 1;
        b[i] = i + 1;
    }

    int *d_a, *d_b, *d_c;
    cudaMalloc((void**)&d_a, X * Y * Z * sizeof(int));
    cudaMalloc((void**)&d_b, X * Y * Z * sizeof(int));
    cudaMalloc((void**)&d_c, X * Y * Z * sizeof(int));

    cudaMemcpy(d_a, a, X * Y * Z * sizeof(int), cudaMemcpyHostToDevice);
    cudaMemcpy(d_b, b, X * Y * Z * sizeof(int), cudaMemcpyHostToDevice);

    // The block extents must match the index mapping in the kernel:
    // blockDim.x covers the X dimension, blockDim.y covers Y, blockDim.z
    // covers Z. (X, Y, Z) = (3, 6, 10) is 180 threads, which fits in a
    // single block, so a 1x1x1 grid is enough here.
    dim3 threadsPerBlock(X, Y, Z);
    dim3 numBlocks(1, 1, 1);
    multiplyMatrices<<<numBlocks, threadsPerBlock>>>(d_a, d_b, d_c);

    // cudaMemcpy on the default stream waits for the kernel to finish
    // before copying the result back to the host.
    cudaMemcpy(c, d_c, X * Y * Z * sizeof(int), cudaMemcpyDeviceToHost);

    cudaFree(d_a);
    cudaFree(d_b);
    cudaFree(d_c);

    // Print the result
    printf("Result (3D Matrix):\n");
    for (int x = 0; x < X; x++) {
        for (int y = 0; y < Y; y++) {
            for (int z = 0; z < Z; z++) {
                printf("%d ", c[x * Y * Z + y * Z + z]);
            }
            printf("\n");
        }
        printf("\n");
    }

    return 0;
}
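The gist ignores the return values of the CUDA runtime calls, so a failed allocation or a bad launch configuration would go unnoticed. Below is a minimal error-checking sketch; the macro name CUDA_CHECK and the usage lines are my own additions rather than part of the original gist, while cudaError_t, cudaGetErrorString, cudaGetLastError, and cudaDeviceSynchronize are standard CUDA runtime APIs.

#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>

// Hypothetical helper macro: evaluates a runtime call, prints the error
// string, and exits if the call did not return cudaSuccess.
#define CUDA_CHECK(call)                                                  \
    do {                                                                  \
        cudaError_t err_ = (call);                                        \
        if (err_ != cudaSuccess) {                                        \
            fprintf(stderr, "CUDA error %s at %s:%d\n",                   \
                    cudaGetErrorString(err_), __FILE__, __LINE__);        \
            exit(EXIT_FAILURE);                                           \
        }                                                                 \
    } while (0)

// Example usage against the calls in the program above:
//   CUDA_CHECK(cudaMalloc((void**)&d_a, X * Y * Z * sizeof(int)));
//   multiplyMatrices<<<numBlocks, threadsPerBlock>>>(d_a, d_b, d_c);
//   CUDA_CHECK(cudaGetLastError());       // catches launch-configuration errors
//   CUDA_CHECK(cudaDeviceSynchronize());  // catches errors raised while the kernel runs

The whole file builds with nvcc as usual, e.g. nvcc multiply.cu -o multiply, assuming the source is saved as multiply.cu.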