Last active
July 10, 2024 13:44
-
-
Save DWarez/e5e0b9d30dc12b26c1a973271ee49a1f to your computer and use it in GitHub Desktop.
Matrix multiplications CUDA kernels
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <stdio.h> | |
#define TILE_WIDTH 7 | |
// Tiled matrix multiply: C = A * B for square Width x Width row-major
// matrices. Expects a 2D launch with blockDim == (TILE_WIDTH, TILE_WIDTH)
// and a grid that covers the matrix. Boundary-guarded, so Width need not
// be a multiple of TILE_WIDTH.
__global__ void matMulKernel(float* A, float* B, float* C, int Width) {
    __shared__ float Ads[TILE_WIDTH][TILE_WIDTH];
    __shared__ float Bds[TILE_WIDTH][TILE_WIDTH];

    int bx = blockIdx.x;
    int by = blockIdx.y;
    int tx = threadIdx.x;
    int ty = threadIdx.y;

    int row = by * TILE_WIDTH + ty;  // output row this thread computes
    int col = bx * TILE_WIDTH + tx;  // output column this thread computes

    float Cvalue = 0.0f;
    // Integer ceil-division replaces the original float ceil() that was
    // re-evaluated in the loop condition on every iteration: same phase
    // count, no per-iteration float round-trip.
    int numPhases = (Width + TILE_WIDTH - 1) / TILE_WIDTH;
    for (int ph = 0; ph < numPhases; ++ph) {
        // Cooperative load of one tile of A and one tile of B.
        // Out-of-range elements are zero-filled so the inner product
        // below needs no guards.
        if (row < Width && ph * TILE_WIDTH + tx < Width)
            Ads[ty][tx] = A[row * Width + ph * TILE_WIDTH + tx];
        else
            Ads[ty][tx] = 0.0f;
        if (ph * TILE_WIDTH + ty < Width && col < Width)
            Bds[ty][tx] = B[(ph * TILE_WIDTH + ty) * Width + col];
        else
            Bds[ty][tx] = 0.0f;
        __syncthreads();  // tiles fully populated before any thread reads them

        for (int i = 0; i < TILE_WIDTH; ++i) {
            Cvalue += Ads[ty][i] * Bds[i][tx];
        }
        __syncthreads();  // all reads done before the next phase overwrites tiles
    }

    if (row < Width && col < Width)
        C[row * Width + col] = Cvalue;
}
// Report-and-exit wrapper for CUDA runtime calls so failures are not
// silent (every CUDA API call returns a cudaError_t that must be checked).
#define CUDA_CHECK(call)                                                     \
    do {                                                                     \
        cudaError_t err_ = (call);                                           \
        if (err_ != cudaSuccess) {                                           \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,    \
                    cudaGetErrorString(err_));                               \
            return 1;                                                        \
        }                                                                    \
    } while (0)

// Print an n x n row-major matrix with a heading (debug output for small n).
static void printMatrix(const char* name, const float* M, int n) {
    printf("%s\n", name);
    for (int i = 0; i < n; ++i) {
        for (int j = 0; j < n; ++j) {
            printf("%.2f ", M[i * n + j]);
        }
        printf("\n");
    }
}

// Initialize two NxN matrices on the host, multiply them on the GPU with
// the tiled kernel, and print A, B, and C = A * B.
int main() {
    const int N = 16;  // Size of the NxN matrix
    const int size = N * N * sizeof(float);

    // Allocate host memory.
    float* A_h = (float*)malloc(size);
    float* B_h = (float*)malloc(size);
    float* C_h = (float*)malloc(size);
    if (A_h == NULL || B_h == NULL || C_h == NULL) {
        fprintf(stderr, "host allocation failed\n");
        return 1;
    }

    // Initialize host matrices with their linear index.
    for (int i = 0; i < N * N; ++i) {
        A_h[i] = i;
        B_h[i] = i;
    }

    // Allocate device memory.
    float *A_d, *B_d, *C_d;
    CUDA_CHECK(cudaMalloc(&A_d, size));
    CUDA_CHECK(cudaMalloc(&B_d, size));
    CUDA_CHECK(cudaMalloc(&C_d, size));

    // Copy input data from host to device.
    CUDA_CHECK(cudaMemcpy(A_d, A_h, size, cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(B_d, B_h, size, cudaMemcpyHostToDevice));

    // One thread per output element; ceil-div so the grid covers N even
    // when N is not a multiple of TILE_WIDTH.
    dim3 blockSize(TILE_WIDTH, TILE_WIDTH);
    dim3 gridSize((N + blockSize.x - 1) / blockSize.x,
                  (N + blockSize.y - 1) / blockSize.y);

    // Launch the matrix multiplication kernel. Launch-configuration errors
    // surface via cudaGetLastError(); in-kernel faults surface at the sync.
    matMulKernel<<<gridSize, blockSize>>>(A_d, B_d, C_d, N);
    CUDA_CHECK(cudaGetLastError());
    CUDA_CHECK(cudaDeviceSynchronize());

    // Copy the result back to host.
    CUDA_CHECK(cudaMemcpy(C_h, C_d, size, cudaMemcpyDeviceToHost));

    // Print the result (for small matrices).
    printMatrix("Matrix A:", A_h, N);
    printMatrix("\nMatrix B:", B_h, N);
    printMatrix("\nMatrix C:", C_h, N);

    // Free device memory.
    CUDA_CHECK(cudaFree(A_d));
    CUDA_CHECK(cudaFree(B_d));
    CUDA_CHECK(cudaFree(C_d));

    // Free host memory.
    free(A_h);
    free(B_h);
    free(C_h);
    return 0;
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <stdio.h> | |
// Naive matrix multiply: each thread computes one element of C = A * B
// for square Width x Width row-major matrices. Expects a 2D launch whose
// grid covers the matrix; threads outside it exit early.
__global__ void matMulKernel(float* A, float* B, float* C, int Width) {
    const int r = blockIdx.y * blockDim.y + threadIdx.y;
    const int c = blockIdx.x * blockDim.x + threadIdx.x;
    if (r >= Width || c >= Width)
        return;  // guard: this thread falls outside the matrix

    // Dot product of row r of A with column c of B.
    float acc = 0.0f;
    for (int k = 0; k < Width; ++k)
        acc += A[r * Width + k] * B[k * Width + c];
    C[r * Width + c] = acc;
}
// Report-and-exit wrapper for CUDA runtime calls so failures are not
// silent (every CUDA API call returns a cudaError_t that must be checked).
#define CUDA_CHECK(call)                                                     \
    do {                                                                     \
        cudaError_t err_ = (call);                                           \
        if (err_ != cudaSuccess) {                                           \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,    \
                    cudaGetErrorString(err_));                               \
            return 1;                                                        \
        }                                                                    \
    } while (0)

// Print an n x n row-major matrix with a heading (debug output for small n).
static void printMatrix(const char* name, const float* M, int n) {
    printf("%s\n", name);
    for (int i = 0; i < n; ++i) {
        for (int j = 0; j < n; ++j) {
            printf("%.2f ", M[i * n + j]);
        }
        printf("\n");
    }
}

// Initialize two NxN matrices on the host, multiply them on the GPU with
// the naive kernel, and print A, B, and C = A * B.
int main() {
    const int N = 16;  // Size of the NxN matrix
    const int size = N * N * sizeof(float);

    // Allocate host memory.
    float* A_h = (float*)malloc(size);
    float* B_h = (float*)malloc(size);
    float* C_h = (float*)malloc(size);
    if (A_h == NULL || B_h == NULL || C_h == NULL) {
        fprintf(stderr, "host allocation failed\n");
        return 1;
    }

    // Initialize host matrices with their linear index.
    for (int i = 0; i < N * N; ++i) {
        A_h[i] = i;
        B_h[i] = i;
    }

    // Allocate device memory.
    float *A_d, *B_d, *C_d;
    CUDA_CHECK(cudaMalloc(&A_d, size));
    CUDA_CHECK(cudaMalloc(&B_d, size));
    CUDA_CHECK(cudaMalloc(&C_d, size));

    // Copy input data from host to device.
    CUDA_CHECK(cudaMemcpy(A_d, A_h, size, cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(B_d, B_h, size, cudaMemcpyHostToDevice));

    // One thread per output element; ceil-div so the grid covers N even
    // when N is not a multiple of the block dimension.
    dim3 blockSize(16, 16);
    dim3 gridSize((N + blockSize.x - 1) / blockSize.x,
                  (N + blockSize.y - 1) / blockSize.y);

    // Launch the matrix multiplication kernel. Launch-configuration errors
    // surface via cudaGetLastError(); in-kernel faults surface at the sync.
    matMulKernel<<<gridSize, blockSize>>>(A_d, B_d, C_d, N);
    CUDA_CHECK(cudaGetLastError());
    CUDA_CHECK(cudaDeviceSynchronize());

    // Copy the result back to host.
    CUDA_CHECK(cudaMemcpy(C_h, C_d, size, cudaMemcpyDeviceToHost));

    // Print the result (for small matrices).
    printMatrix("Matrix A:", A_h, N);
    printMatrix("\nMatrix B:", B_h, N);
    printMatrix("\nMatrix C:", C_h, N);

    // Free device memory.
    CUDA_CHECK(cudaFree(A_d));
    CUDA_CHECK(cudaFree(B_d));
    CUDA_CHECK(cudaFree(C_d));

    // Free host memory.
    free(A_h);
    free(B_h);
    free(C_h);
    return 0;
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <stdio.h> | |
#define TILE_WIDTH 4 | |
// Tiled matrix multiply: C = A * B for square Width x Width row-major
// matrices. Expects a 2D launch with blockDim == (TILE_WIDTH, TILE_WIDTH).
//
// The original kernel assumed Width was an exact multiple of TILE_WIDTH:
// for any other Width it read and wrote out of bounds and dropped the
// remainder phase. This version zero-fills out-of-range tile elements and
// guards the final store, so it is correct for any Width while producing
// bit-identical results in the divisible case used by this program.
__global__ void matMulKernel(float* A, float* B, float* C, int Width) {
    __shared__ float Ads[TILE_WIDTH][TILE_WIDTH];
    __shared__ float Bds[TILE_WIDTH][TILE_WIDTH];

    int bx = blockIdx.x;
    int by = blockIdx.y;
    int tx = threadIdx.x;
    int ty = threadIdx.y;

    int row = by * TILE_WIDTH + ty;  // output row this thread computes
    int col = bx * TILE_WIDTH + tx;  // output column this thread computes

    float Cvalue = 0.0f;
    // Ceil-div so a partial final tile is still processed.
    int numPhases = (Width + TILE_WIDTH - 1) / TILE_WIDTH;
    for (int ph = 0; ph < numPhases; ++ph) {
        // Cooperative load of one tile of A and one tile of B; elements
        // past the matrix edge are zero-filled so the inner product below
        // needs no guards.
        if (row < Width && ph * TILE_WIDTH + tx < Width)
            Ads[ty][tx] = A[row * Width + ph * TILE_WIDTH + tx];
        else
            Ads[ty][tx] = 0.0f;
        if (ph * TILE_WIDTH + ty < Width && col < Width)
            Bds[ty][tx] = B[(ph * TILE_WIDTH + ty) * Width + col];
        else
            Bds[ty][tx] = 0.0f;
        __syncthreads();  // tiles fully populated before any thread reads them

        for (int i = 0; i < TILE_WIDTH; ++i) {
            Cvalue += Ads[ty][i] * Bds[i][tx];
        }
        __syncthreads();  // all reads done before the next phase overwrites tiles
    }

    if (row < Width && col < Width)
        C[row * Width + col] = Cvalue;
}
// Report-and-exit wrapper for CUDA runtime calls so failures are not
// silent (every CUDA API call returns a cudaError_t that must be checked).
#define CUDA_CHECK(call)                                                     \
    do {                                                                     \
        cudaError_t err_ = (call);                                           \
        if (err_ != cudaSuccess) {                                           \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,    \
                    cudaGetErrorString(err_));                               \
            return 1;                                                        \
        }                                                                    \
    } while (0)

// Print an n x n row-major matrix with a heading (debug output for small n).
static void printMatrix(const char* name, const float* M, int n) {
    printf("%s\n", name);
    for (int i = 0; i < n; ++i) {
        for (int j = 0; j < n; ++j) {
            printf("%.2f ", M[i * n + j]);
        }
        printf("\n");
    }
}

// Initialize two NxN matrices on the host, multiply them on the GPU with
// the tiled kernel, and print A, B, and C = A * B.
int main() {
    const int N = 16;  // Size of the NxN matrix
    const int size = N * N * sizeof(float);

    // Allocate host memory.
    float* A_h = (float*)malloc(size);
    float* B_h = (float*)malloc(size);
    float* C_h = (float*)malloc(size);
    if (A_h == NULL || B_h == NULL || C_h == NULL) {
        fprintf(stderr, "host allocation failed\n");
        return 1;
    }

    // Initialize host matrices with their linear index.
    for (int i = 0; i < N * N; ++i) {
        A_h[i] = i;
        B_h[i] = i;
    }

    // Allocate device memory.
    float *A_d, *B_d, *C_d;
    CUDA_CHECK(cudaMalloc(&A_d, size));
    CUDA_CHECK(cudaMalloc(&B_d, size));
    CUDA_CHECK(cudaMalloc(&C_d, size));

    // Copy input data from host to device.
    CUDA_CHECK(cudaMemcpy(A_d, A_h, size, cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(B_d, B_h, size, cudaMemcpyHostToDevice));

    // One thread per output element; ceil-div so the grid covers N even
    // when N is not a multiple of TILE_WIDTH.
    dim3 blockSize(TILE_WIDTH, TILE_WIDTH);
    dim3 gridSize((N + blockSize.x - 1) / blockSize.x,
                  (N + blockSize.y - 1) / blockSize.y);

    // Launch the matrix multiplication kernel. Launch-configuration errors
    // surface via cudaGetLastError(); in-kernel faults surface at the sync.
    matMulKernel<<<gridSize, blockSize>>>(A_d, B_d, C_d, N);
    CUDA_CHECK(cudaGetLastError());
    CUDA_CHECK(cudaDeviceSynchronize());

    // Copy the result back to host.
    CUDA_CHECK(cudaMemcpy(C_h, C_d, size, cudaMemcpyDeviceToHost));

    // Print the result (for small matrices).
    printMatrix("Matrix A:", A_h, N);
    printMatrix("\nMatrix B:", B_h, N);
    printMatrix("\nMatrix C:", C_h, N);

    // Free device memory.
    CUDA_CHECK(cudaFree(A_d));
    CUDA_CHECK(cudaFree(B_d));
    CUDA_CHECK(cudaFree(C_d));

    // Free host memory.
    free(A_h);
    free(B_h);
    free(C_h);
    return 0;
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment