Skip to content

Instantly share code, notes, and snippets.

@DWarez
Last active July 10, 2024 13:44
Show Gist options
  • Save DWarez/e5e0b9d30dc12b26c1a973271ee49a1f to your computer and use it in GitHub Desktop.
Matrix multiplications CUDA kernels
#include <stdio.h>
#define TILE_WIDTH 7
// Tiled matrix multiplication: C = A * B for square Width x Width row-major
// matrices. Expects a 2D launch with blockDim == (TILE_WIDTH, TILE_WIDTH).
// Handles Width that is not a multiple of TILE_WIDTH by zero-padding the
// partial tiles, so the inner product needs no bounds checks.
__global__ void matMulKernel(float* A, float* B, float* C, int Width) {
    // Per-block staging tiles of A and B in shared memory.
    __shared__ float Ads[TILE_WIDTH][TILE_WIDTH];
    __shared__ float Bds[TILE_WIDTH][TILE_WIDTH];

    int tx = threadIdx.x;
    int ty = threadIdx.y;
    int row = blockIdx.y * TILE_WIDTH + ty;  // output row this thread owns
    int col = blockIdx.x * TILE_WIDTH + tx;  // output column this thread owns

    float Cvalue = 0.0f;
    // Integer ceil-division replaces the original per-iteration
    // ceil(Width/float(TILE_WIDTH)): no double-precision ceil() evaluated on
    // every loop test, and no float-rounding risk for large Width.
    int numPhases = (Width + TILE_WIDTH - 1) / TILE_WIDTH;
    for (int ph = 0; ph < numPhases; ++ph) {
        // Load one tile of A and one of B, zero-filling out-of-range elements.
        if (row < Width && ph * TILE_WIDTH + tx < Width)
            Ads[ty][tx] = A[row * Width + ph * TILE_WIDTH + tx];
        else
            Ads[ty][tx] = 0.0f;
        if (ph * TILE_WIDTH + ty < Width && col < Width)
            Bds[ty][tx] = B[(ph * TILE_WIDTH + ty) * Width + col];
        else
            Bds[ty][tx] = 0.0f;
        __syncthreads();  // tiles fully written before any thread reads them

        for (int i = 0; i < TILE_WIDTH; ++i) {
            Cvalue += Ads[ty][i] * Bds[i][tx];
        }
        __syncthreads();  // all reads done before the next phase overwrites
    }

    // Threads mapped past the matrix edge contribute nothing.
    if (row < Width && col < Width)
        C[row * Width + col] = Cvalue;
}
// Main function to initialize data and launch the kernel
// Host driver: builds two N x N matrices, multiplies them with the tiled
// kernel, and prints A, B, and C. Returns nonzero on allocation or CUDA error.
int main() {
    const int N = 16;                        // matrix dimension (N x N)
    const int size = N * N * sizeof(float);  // bytes per matrix

    // Allocate host memory; malloc may fail, so check before use.
    float* A_h = (float*)malloc(size);
    float* B_h = (float*)malloc(size);
    float* C_h = (float*)malloc(size);
    if (!A_h || !B_h || !C_h) {
        fprintf(stderr, "Host allocation failed\n");
        free(A_h); free(B_h); free(C_h);
        return 1;
    }

    // Initialize host matrices with their flat index.
    for (int i = 0; i < N * N; ++i) {
        A_h[i] = i;
        B_h[i] = i;
    }

    // Allocate device memory.
    float *A_d, *B_d, *C_d;
    cudaError_t err = cudaMalloc(&A_d, size);
    if (err == cudaSuccess) err = cudaMalloc(&B_d, size);
    if (err == cudaSuccess) err = cudaMalloc(&C_d, size);
    if (err != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed: %s\n", cudaGetErrorString(err));
        free(A_h); free(B_h); free(C_h);
        return 1;
    }

    // Copy inputs from host to device.
    cudaMemcpy(A_d, A_h, size, cudaMemcpyHostToDevice);
    cudaMemcpy(B_d, B_h, size, cudaMemcpyHostToDevice);

    // One thread per output element; ceil-div grid covers N even when it is
    // not a multiple of TILE_WIDTH (the kernel bounds-checks the remainder).
    dim3 blockSize(TILE_WIDTH, TILE_WIDTH);
    dim3 gridSize((N + blockSize.x - 1) / blockSize.x,
                  (N + blockSize.y - 1) / blockSize.y);
    matMulKernel<<<gridSize, blockSize>>>(A_d, B_d, C_d, N);

    // Kernel launches are asynchronous and report nothing directly: catch bad
    // launch configs via cudaGetLastError and execution faults via a sync
    // before trusting the result.
    err = cudaGetLastError();
    if (err == cudaSuccess) err = cudaDeviceSynchronize();
    if (err != cudaSuccess) {
        fprintf(stderr, "Kernel failed: %s\n", cudaGetErrorString(err));
        cudaFree(A_d); cudaFree(B_d); cudaFree(C_d);
        free(A_h); free(B_h); free(C_h);
        return 1;
    }

    // Copy the result back to the host (blocking on the default stream).
    cudaMemcpy(C_h, C_d, size, cudaMemcpyDeviceToHost);

    // Print the matrices (only sensible for small N).
    printf("Matrix A:\n");
    for (int i = 0; i < N; ++i) {
        for (int j = 0; j < N; ++j) {
            printf("%.2f ", A_h[i * N + j]);
        }
        printf("\n");
    }
    printf("\nMatrix B:\n");
    for (int i = 0; i < N; ++i) {
        for (int j = 0; j < N; ++j) {
            printf("%.2f ", B_h[i * N + j]);
        }
        printf("\n");
    }
    printf("\nMatrix C:\n");
    for (int i = 0; i < N; ++i) {
        for (int j = 0; j < N; ++j) {
            printf("%.2f ", C_h[i * N + j]);
        }
        printf("\n");
    }

    // Free device memory.
    cudaFree(A_d);
    cudaFree(B_d);
    cudaFree(C_d);
    // Free host memory.
    free(A_h);
    free(B_h);
    free(C_h);
    return 0;
}
#include <stdio.h>
// Naive matrix multiplication: one thread computes one element of C = A * B
// for square Width x Width row-major matrices. Any 2D launch whose grid
// covers the matrix works; edge threads past the boundary do nothing.
__global__ void matMulKernel(float* A, float* B, float* C, int Width) {
    int r = blockIdx.y * blockDim.y + threadIdx.y;
    int c = blockIdx.x * blockDim.x + threadIdx.x;
    // Guard clause: drop the threads in partial blocks at the grid edge.
    if (r >= Width || c >= Width)
        return;
    float acc = 0.0f;
    for (int k = 0; k < Width; ++k)
        acc += A[r * Width + k] * B[k * Width + c];
    C[r * Width + c] = acc;
}
// Main function to initialize data and launch the kernel
// Host driver: builds two N x N matrices, multiplies them with the naive
// kernel, and prints A, B, and C. Returns nonzero on allocation or CUDA error.
int main() {
    const int N = 16;                        // matrix dimension (N x N)
    const int size = N * N * sizeof(float);  // bytes per matrix

    // Allocate host memory; malloc may fail, so check before use.
    float* A_h = (float*)malloc(size);
    float* B_h = (float*)malloc(size);
    float* C_h = (float*)malloc(size);
    if (!A_h || !B_h || !C_h) {
        fprintf(stderr, "Host allocation failed\n");
        free(A_h); free(B_h); free(C_h);
        return 1;
    }

    // Initialize host matrices with their flat index.
    for (int i = 0; i < N * N; ++i) {
        A_h[i] = i;
        B_h[i] = i;
    }

    // Allocate device memory.
    float *A_d, *B_d, *C_d;
    cudaError_t err = cudaMalloc(&A_d, size);
    if (err == cudaSuccess) err = cudaMalloc(&B_d, size);
    if (err == cudaSuccess) err = cudaMalloc(&C_d, size);
    if (err != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed: %s\n", cudaGetErrorString(err));
        free(A_h); free(B_h); free(C_h);
        return 1;
    }

    // Copy inputs from host to device.
    cudaMemcpy(A_d, A_h, size, cudaMemcpyHostToDevice);
    cudaMemcpy(B_d, B_h, size, cudaMemcpyHostToDevice);

    // One thread per output element; ceil-div grid covers N even when it is
    // not a multiple of the block size (the kernel bounds-checks the edge).
    dim3 blockSize(16, 16);
    dim3 gridSize((N + blockSize.x - 1) / blockSize.x,
                  (N + blockSize.y - 1) / blockSize.y);
    matMulKernel<<<gridSize, blockSize>>>(A_d, B_d, C_d, N);

    // Kernel launches are asynchronous and report nothing directly: catch bad
    // launch configs via cudaGetLastError and execution faults via a sync
    // before trusting the result.
    err = cudaGetLastError();
    if (err == cudaSuccess) err = cudaDeviceSynchronize();
    if (err != cudaSuccess) {
        fprintf(stderr, "Kernel failed: %s\n", cudaGetErrorString(err));
        cudaFree(A_d); cudaFree(B_d); cudaFree(C_d);
        free(A_h); free(B_h); free(C_h);
        return 1;
    }

    // Copy the result back to the host (blocking on the default stream).
    cudaMemcpy(C_h, C_d, size, cudaMemcpyDeviceToHost);

    // Print the matrices (only sensible for small N).
    printf("Matrix A:\n");
    for (int i = 0; i < N; ++i) {
        for (int j = 0; j < N; ++j) {
            printf("%.2f ", A_h[i * N + j]);
        }
        printf("\n");
    }
    printf("\nMatrix B:\n");
    for (int i = 0; i < N; ++i) {
        for (int j = 0; j < N; ++j) {
            printf("%.2f ", B_h[i * N + j]);
        }
        printf("\n");
    }
    printf("\nMatrix C:\n");
    for (int i = 0; i < N; ++i) {
        for (int j = 0; j < N; ++j) {
            printf("%.2f ", C_h[i * N + j]);
        }
        printf("\n");
    }

    // Free device memory.
    cudaFree(A_d);
    cudaFree(B_d);
    cudaFree(C_d);
    // Free host memory.
    free(A_h);
    free(B_h);
    free(C_h);
    return 0;
}
#include <stdio.h>
#define TILE_WIDTH 4
// Tiled matrix multiplication: C = A * B for square Width x Width row-major
// matrices. Expects a 2D launch with blockDim == (TILE_WIDTH, TILE_WIDTH).
// Fix over the original: the unguarded loads/stores read and wrote out of
// bounds and the floor-division phase count dropped the remainder tile
// whenever Width was not a multiple of TILE_WIDTH (it only worked here
// because 16 % 4 == 0). Partial tiles are now zero-padded and the output
// store is guarded, so any Width is handled correctly.
__global__ void matMulKernel(float* A, float* B, float* C, int Width) {
    // Per-block staging tiles of A and B in shared memory.
    __shared__ float Ads[TILE_WIDTH][TILE_WIDTH];
    __shared__ float Bds[TILE_WIDTH][TILE_WIDTH];

    int tx = threadIdx.x;
    int ty = threadIdx.y;
    int row = blockIdx.y * TILE_WIDTH + ty;  // output row this thread owns
    int col = blockIdx.x * TILE_WIDTH + tx;  // output column this thread owns

    float Cvalue = 0.0f;
    // Ceil-division so the final partial tile is processed too.
    int numPhases = (Width + TILE_WIDTH - 1) / TILE_WIDTH;
    for (int ph = 0; ph < numPhases; ++ph) {
        // Load one tile of A and one of B, zero-filling out-of-range elements
        // so the inner product below needs no bounds checks.
        if (row < Width && ph * TILE_WIDTH + tx < Width)
            Ads[ty][tx] = A[row * Width + ph * TILE_WIDTH + tx];
        else
            Ads[ty][tx] = 0.0f;
        if (ph * TILE_WIDTH + ty < Width && col < Width)
            Bds[ty][tx] = B[(ph * TILE_WIDTH + ty) * Width + col];
        else
            Bds[ty][tx] = 0.0f;
        __syncthreads();  // tiles fully written before any thread reads them

        for (int i = 0; i < TILE_WIDTH; ++i) {
            Cvalue += Ads[ty][i] * Bds[i][tx];
        }
        __syncthreads();  // all reads done before the next phase overwrites
    }

    // Threads mapped past the matrix edge contribute nothing.
    if (row < Width && col < Width)
        C[row * Width + col] = Cvalue;
}
// Main function to initialize data and launch the kernel
// Host driver: builds two N x N matrices, multiplies them with the tiled
// kernel, and prints A, B, and C. Returns nonzero on allocation or CUDA error.
int main() {
    const int N = 16;                        // matrix dimension (N x N)
    const int size = N * N * sizeof(float);  // bytes per matrix

    // Allocate host memory; malloc may fail, so check before use.
    float* A_h = (float*)malloc(size);
    float* B_h = (float*)malloc(size);
    float* C_h = (float*)malloc(size);
    if (!A_h || !B_h || !C_h) {
        fprintf(stderr, "Host allocation failed\n");
        free(A_h); free(B_h); free(C_h);
        return 1;
    }

    // Initialize host matrices with their flat index.
    for (int i = 0; i < N * N; ++i) {
        A_h[i] = i;
        B_h[i] = i;
    }

    // Allocate device memory.
    float *A_d, *B_d, *C_d;
    cudaError_t err = cudaMalloc(&A_d, size);
    if (err == cudaSuccess) err = cudaMalloc(&B_d, size);
    if (err == cudaSuccess) err = cudaMalloc(&C_d, size);
    if (err != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed: %s\n", cudaGetErrorString(err));
        free(A_h); free(B_h); free(C_h);
        return 1;
    }

    // Copy inputs from host to device.
    cudaMemcpy(A_d, A_h, size, cudaMemcpyHostToDevice);
    cudaMemcpy(B_d, B_h, size, cudaMemcpyHostToDevice);

    // One thread per output element; ceil-div grid covers N even when it is
    // not a multiple of TILE_WIDTH.
    dim3 blockSize(TILE_WIDTH, TILE_WIDTH);
    dim3 gridSize((N + blockSize.x - 1) / blockSize.x,
                  (N + blockSize.y - 1) / blockSize.y);
    matMulKernel<<<gridSize, blockSize>>>(A_d, B_d, C_d, N);

    // Kernel launches are asynchronous and report nothing directly: catch bad
    // launch configs via cudaGetLastError and execution faults via a sync
    // before trusting the result.
    err = cudaGetLastError();
    if (err == cudaSuccess) err = cudaDeviceSynchronize();
    if (err != cudaSuccess) {
        fprintf(stderr, "Kernel failed: %s\n", cudaGetErrorString(err));
        cudaFree(A_d); cudaFree(B_d); cudaFree(C_d);
        free(A_h); free(B_h); free(C_h);
        return 1;
    }

    // Copy the result back to the host (blocking on the default stream).
    cudaMemcpy(C_h, C_d, size, cudaMemcpyDeviceToHost);

    // Print the matrices (only sensible for small N).
    printf("Matrix A:\n");
    for (int i = 0; i < N; ++i) {
        for (int j = 0; j < N; ++j) {
            printf("%.2f ", A_h[i * N + j]);
        }
        printf("\n");
    }
    printf("\nMatrix B:\n");
    for (int i = 0; i < N; ++i) {
        for (int j = 0; j < N; ++j) {
            printf("%.2f ", B_h[i * N + j]);
        }
        printf("\n");
    }
    printf("\nMatrix C:\n");
    for (int i = 0; i < N; ++i) {
        for (int j = 0; j < N; ++j) {
            printf("%.2f ", C_h[i * N + j]);
        }
        printf("\n");
    }

    // Free device memory.
    cudaFree(A_d);
    cudaFree(B_d);
    cudaFree(C_d);
    // Free host memory.
    free(A_h);
    free(B_h);
    free(C_h);
    return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment