Skip to content

Instantly share code, notes, and snippets.

@Bhavya031
Created September 11, 2023 04:34
Show Gist options
  • Save Bhavya031/8932c375424b9751a58610d0018d3095 to your computer and use it in GitHub Desktop.
Save Bhavya031/8932c375424b9751a58610d0018d3095 to your computer and use it in GitHub Desktop.
CUDA matrix multiplication (host driver + kernel)
#include <stdio.h>
#include <stdlib.h>
// Matrix dimensions
#define N 4
#define M 4
#define P 4
// CUDA kernel for matrix multiplication
// Computes C = A * B for row-major integer matrices, one output element per
// thread. A is N x M, B is M x P, C is N x P (compile-time constants above).
// Expects a 2D launch where (blockIdx.x, threadIdx.x) index columns and
// (blockIdx.y, threadIdx.y) index rows; the grid may overshoot the matrix,
// so out-of-range threads exit via the guard below.
__global__ void matrixMul(int *a, int *b, int *c) {
  const int outRow = blockIdx.y * blockDim.y + threadIdx.y;
  const int outCol = blockIdx.x * blockDim.x + threadIdx.x;
  // Guard clause: threads outside the N x P output do nothing.
  if (outRow >= N || outCol >= P) return;
  int acc = 0;
  for (int i = 0; i < M; ++i)
    acc += a[outRow * M + i] * b[i * P + outCol];
  c[outRow * P + outCol] = acc;
}
// Abort with a readable message if a CUDA runtime call failed.
static void checkCuda(cudaError_t err, const char *what) {
  if (err != cudaSuccess) {
    fprintf(stderr, "CUDA error in %s: %s\n", what, cudaGetErrorString(err));
    exit(EXIT_FAILURE);
  }
}

// Host driver: builds N x M and M x P integer matrices, multiplies them on
// the GPU with matrixMul, copies back the N x P product and verifies it
// against a CPU reference. Returns 0 on success, nonzero on any failure.
int main() {
  int *a, *b, *c;       // host matrices (row-major)
  int *d_a, *d_b, *d_c; // device copies

  const size_t bytesA = N * M * sizeof(int);
  const size_t bytesB = M * P * sizeof(int);
  const size_t bytesC = N * P * sizeof(int);

  // Allocate memory on the CPU (and actually check the result).
  a = (int *)malloc(bytesA);
  b = (int *)malloc(bytesB);
  c = (int *)malloc(bytesC);
  if (!a || !b || !c) {
    fprintf(stderr, "host allocation failed\n");
    return EXIT_FAILURE;
  }

  // Initialize matrices a and b with small deterministic values.
  // (The original copied uninitialized host memory to the device.)
  for (int i = 0; i < N * M; ++i) a[i] = i % 7;
  for (int i = 0; i < M * P; ++i) b[i] = i % 5;

  // Allocate memory on the GPU.
  checkCuda(cudaMalloc((void **)&d_a, bytesA), "cudaMalloc d_a");
  checkCuda(cudaMalloc((void **)&d_b, bytesB), "cudaMalloc d_b");
  checkCuda(cudaMalloc((void **)&d_c, bytesC), "cudaMalloc d_c");

  // Copy inputs from CPU to GPU.
  checkCuda(cudaMemcpy(d_a, a, bytesA, cudaMemcpyHostToDevice), "copy a");
  checkCuda(cudaMemcpy(d_b, b, bytesB, cudaMemcpyHostToDevice), "copy b");

  // Launch configuration: 16x16 thread blocks, grid rounded UP so every
  // output element gets a thread. The original N/16 truncated to a (0,0)
  // grid for N,P < 16, so the kernel never ran at all. x maps to columns
  // (P) and y to rows (N), matching the kernel's indexing.
  dim3 dimBlock(16, 16);
  dim3 dimGrid((P + dimBlock.x - 1) / dimBlock.x,
               (N + dimBlock.y - 1) / dimBlock.y);
  matrixMul<<<dimGrid, dimBlock>>>(d_a, d_b, d_c);
  checkCuda(cudaGetLastError(), "kernel launch");        // bad-config errors
  checkCuda(cudaDeviceSynchronize(), "kernel execution"); // async exec errors

  // Copy the result from GPU to CPU (cudaMemcpy also synchronizes).
  checkCuda(cudaMemcpy(c, d_c, bytesC, cudaMemcpyDeviceToHost), "copy c");

  // Verify against a straightforward CPU reference (exact integer match).
  int status = EXIT_SUCCESS;
  for (int row = 0; row < N && status == EXIT_SUCCESS; ++row) {
    for (int col = 0; col < P; ++col) {
      int expect = 0;
      for (int k = 0; k < M; ++k)
        expect += a[row * M + k] * b[k * P + col];
      if (c[row * P + col] != expect) {
        fprintf(stderr, "mismatch at (%d,%d): got %d, want %d\n",
                row, col, c[row * P + col], expect);
        status = EXIT_FAILURE;
        break;
      }
    }
  }

  // Free GPU memory.
  cudaFree(d_a);
  cudaFree(d_b);
  cudaFree(d_c);
  // Free CPU memory.
  free(a);
  free(b);
  free(c);
  return status;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment