@Sid911 · Last active July 8, 2023
Matrix multiplication CPU GPU implementations

Matrix Mul tests

This Gist compares several matrix multiplication implementations on the CPU (AVX, SSE4.1, and plain loops) and on the GPU (cuBLAS and a hand-written CUDA kernel).

Note: this is not a rigorous benchmark, just an informal test of how the implementations perform in a general sense rather than a quantitative comparison.

Hardware Config

sid@sid-ubuntu ~> neofetch
OS: Ubuntu 22.04.2 LTS x86_64
Host: Vivobook_ASUSLaptop K6500ZE_K6500ZE 1.0
Kernel: 5.19.0-46-generic
Uptime: 4 hours, 48 mins
Packages: 2531 (dpkg), 21 (flatpak), 17 (snap)
Shell: fish 3.3.1
Resolution: 1920x1080
DE: GNOME 42.5
WM: Mutter
WM Theme: Adwaita
Theme: Fluent-Dark-compact [GTK2/3]
Icons: Fluent-green-dark [GTK2/3]
Terminal: xfce4-terminal
Terminal Font: FiraCode Nerd Font Mono
CPU: 12th Gen Intel i5-12450H (12) @ 4.400GHz
GPU: NVIDIA GeForce RTX 3050 Ti Mobile
GPU: Intel Alder Lake-P GT1 [UHD Graphics]
Memory: 5930MiB / 15709MiB

#!/bin/sh
# Build the GPU benchmark (requires the CUDA toolkit; links against cuBLAS)
nvcc -O3 ./matrix_mul.cu -lcublas -o matrix_multiply_cuda
# Build the CPU benchmark with SSE4.1, AVX, and FMA enabled
g++ -mavx -msse4.1 -O3 -mfma -o matrix_multiply matrix_mul_instrinsics.cpp
matrix_mul.cu

#include <iostream>
#include <cublas_v2.h>
#include <cstdlib>
#include <ctime>
// CUDA error-checking macro
#define CUDA_CHECK(status)                                           \
    {                                                                \
        cudaError_t error = status;                                  \
        if (error != cudaSuccess)                                    \
        {                                                            \
            std::cout << "CUDA Error: " << cudaGetErrorString(error) \
                      << " at line " << __LINE__ << std::endl;       \
            exit(1);                                                 \
        }                                                            \
    }
// cuBLAS error-checking macro
#define CUBLAS_CHECK(status)                                         \
    {                                                                \
        cublasStatus_t error = status;                               \
        if (error != CUBLAS_STATUS_SUCCESS)                          \
        {                                                            \
            std::cout << "cuBLAS Error: " << error                   \
                      << " at line " << __LINE__ << std::endl;       \
            exit(1);                                                 \
        }                                                            \
    }
// C = A * B for row-major matrices via cuBLAS, which is column-major:
// the row-major M x K product is the column-major K x M product B^T * A^T,
// so the operands are swapped and the dimensions passed as K, M, N.
void matrixMultiplyCuBLAS(float *A, float *B, float *C, int M, int N, int K, cublasHandle_t &handle)
{
    float alpha = 1.0f;
    float beta = 0.0f;
    CUBLAS_CHECK(cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, K, M, N,
                             &alpha, B, K, A, N, &beta, C, K));
}
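// For example (hypothetical dimensions, not used in this gist): with a
// row-major A of size 2 x 3 and B of size 3 x 4, the call above becomes
// cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, 4, 2, 3, &alpha, B, 4, A, 3, &beta, C, 4),
// i.e. cuBLAS computes the column-major 4 x 2 matrix C^T = B^T * A^T, whose
// memory layout is exactly the row-major 2 x 4 matrix C = A * B.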
// Naive CUDA kernel: one thread computes one element of C (M x K),
// reading a full row of A (M x N) and a full column of B (N x K).
__global__ void matrixMultiplyCUDA(float *A, float *B, float *C, int M, int N, int K)
{
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    int col = blockIdx.x * blockDim.x + threadIdx.x;
    if (row < M && col < K)
    {
        float sum = 0.0f;
        for (int i = 0; i < N; ++i)
        {
            sum += A[row * N + i] * B[i * K + col];
        }
        C[row * K + col] = sum;
    }
}
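// The gist stops at the naive kernel above. A common next step, not part of
// the original code and included here only as a sketch for comparison, is to
// tile the multiply through shared memory so each block loads a TILE x TILE
// fragment of A and B once per step instead of re-reading global memory.
#define TILE 32
__global__ void matrixMultiplyTiled(float *A, float *B, float *C, int M, int N, int K)
{
    __shared__ float As[TILE][TILE];
    __shared__ float Bs[TILE][TILE];
    int row = blockIdx.y * TILE + threadIdx.y;
    int col = blockIdx.x * TILE + threadIdx.x;
    float sum = 0.0f;
    for (int t = 0; t < (N + TILE - 1) / TILE; ++t)
    {
        int aCol = t * TILE + threadIdx.x;
        int bRow = t * TILE + threadIdx.y;
        // Stage one tile of A and B in shared memory (zero-pad out of range)
        As[threadIdx.y][threadIdx.x] = (row < M && aCol < N) ? A[row * N + aCol] : 0.0f;
        Bs[threadIdx.y][threadIdx.x] = (bRow < N && col < K) ? B[bRow * K + col] : 0.0f;
        __syncthreads();
        for (int i = 0; i < TILE; ++i)
            sum += As[threadIdx.y][i] * Bs[i][threadIdx.x];
        __syncthreads();
    }
    if (row < M && col < K)
        C[row * K + col] = sum;
}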
int main()
{
    const int M = 1000;
    const int N = 1000;
    const int K = 1000;
    // Allocate host memory for matrices
    float *A = new float[M * N];
    float *B = new float[N * K];
    float *C = new float[M * K];
    // Initialize input matrices with random values in [0, 1]
    srand(static_cast<unsigned int>(time(nullptr)));
    for (int i = 0; i < M * N; ++i)
    {
        A[i] = static_cast<float>(rand()) / RAND_MAX;
    }
    for (int i = 0; i < N * K; ++i)
    {
        B[i] = static_cast<float>(rand()) / RAND_MAX;
    }
    float *d_A;
    float *d_B;
    float *d_C;
    // Allocate device memory and copy the inputs over
    CUDA_CHECK(cudaMalloc((void **)&d_A, M * N * sizeof(float)));
    CUDA_CHECK(cudaMalloc((void **)&d_B, N * K * sizeof(float)));
    CUDA_CHECK(cudaMalloc((void **)&d_C, M * K * sizeof(float)));
    CUDA_CHECK(cudaMemcpy(d_A, A, M * N * sizeof(float), cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_B, B, N * K * sizeof(float), cudaMemcpyHostToDevice));
    // Set up the cuBLAS handle
    cublasHandle_t handle;
    CUBLAS_CHECK(cublasCreate(&handle));
    const int numIterations = 100;
    clock_t startTime = clock();
    for (int i = 0; i < numIterations; ++i)
    {
        matrixMultiplyCuBLAS(d_A, d_B, d_C, M, N, K, handle);
    }
    // cuBLAS launches are asynchronous; wait for the GEMMs to finish
    // before stopping the clock, otherwise only launch overhead is timed
    CUDA_CHECK(cudaDeviceSynchronize());
    clock_t endTime = clock();
    double elapsedTime = static_cast<double>(endTime - startTime) / CLOCKS_PER_SEC;
    double averageTime = elapsedTime / numIterations;
    std::cout << "Cublas time: " << averageTime << " seconds" << std::endl;
    // Destroy the cuBLAS handle
    CUBLAS_CHECK(cublasDestroy(handle));
    // The inputs are already on the device; reuse them for the custom kernel
    dim3 blockSize(32, 32);
    dim3 gridSize((K + blockSize.x - 1) / blockSize.x, (M + blockSize.y - 1) / blockSize.y);
    startTime = clock();
    for (int i = 0; i < numIterations; ++i)
    {
        matrixMultiplyCUDA<<<gridSize, blockSize>>>(d_A, d_B, d_C, M, N, K);
    }
    CUDA_CHECK(cudaDeviceSynchronize());
    endTime = clock();
    elapsedTime = static_cast<double>(endTime - startTime) / CLOCKS_PER_SEC;
    averageTime = elapsedTime / numIterations;
    std::cout << "CUDA time: " << averageTime << " seconds" << std::endl;
    CUDA_CHECK(cudaMemcpy(C, d_C, M * K * sizeof(float), cudaMemcpyDeviceToHost));
    CUDA_CHECK(cudaFree(d_A));
    CUDA_CHECK(cudaFree(d_B));
    CUDA_CHECK(cudaFree(d_C));
    // Clean up host memory
    delete[] A;
    delete[] B;
    delete[] C;
    return 0;
}
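clock() measures processor time on the host, which only approximates GPU time because each loop ends in a busy-waiting synchronize. A more direct way to time GPU work, not used in the gist, is CUDA event timing; a minimal sketch that could replace the clock()-based block in main above (same variables assumed):

// Hypothetical replacement for the clock()-based timing, using CUDA events
// to measure elapsed GPU time across the whole batch of launches.
cudaEvent_t start, stop;
CUDA_CHECK(cudaEventCreate(&start));
CUDA_CHECK(cudaEventCreate(&stop));
CUDA_CHECK(cudaEventRecord(start));
for (int i = 0; i < numIterations; ++i)
{
    matrixMultiplyCUDA<<<gridSize, blockSize>>>(d_A, d_B, d_C, M, N, K);
}
CUDA_CHECK(cudaEventRecord(stop));
CUDA_CHECK(cudaEventSynchronize(stop));
float ms = 0.0f;
CUDA_CHECK(cudaEventElapsedTime(&ms, start, stop));
std::cout << "CUDA time: " << ms / 1000.0 / numIterations << " seconds" << std::endl;
CUDA_CHECK(cudaEventDestroy(start));
CUDA_CHECK(cudaEventDestroy(stop));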
matrix_mul_instrinsics.cpp

#include <iostream>
#include <vector>
#include <memory>
#include <chrono>
#include <immintrin.h>
#include <ctime>
#include <cstdlib>
#include <cstring>
// SSE implementation: broadcasts one element of mat1 and accumulates a
// row of mat2 into a row of result, four floats at a time.
template <int M, int N, int X>
void matrixMultiplySSE(float (*mat1)[N], float (*mat2)[X], float (*result)[X])
{
    static_assert(M % 4 == 0 && N % 4 == 0 && X % 4 == 0,
                  "Matrix dimensions must be divisible by 4.");
    // result is accumulated into, so it must start zeroed
    std::memset(result, 0, sizeof(float) * M * X);
    __m128 v1, v2, v3, v4;
    for (int i = 0; i < M; ++i) {
        for (int j = 0; j < N; ++j) {
            v1 = _mm_set1_ps(mat1[i][j]);
            for (int k = 0; k < X; k += 4) {
                v2 = _mm_loadu_ps(&mat2[j][k]);
                v3 = _mm_loadu_ps(&result[i][k]);
                v4 = _mm_add_ps(v3, _mm_mul_ps(v1, v2));
                _mm_storeu_ps(&result[i][k], v4);
            }
        }
    }
}
// AVX implementation: same scheme as the SSE version, eight floats at a time.
template <int M, int N, int X>
void matrixMultiplyAVX(float (*mat1)[N], float (*mat2)[X], float (*result)[X])
{
    static_assert(M % 8 == 0 && N % 8 == 0 && X % 8 == 0,
                  "Matrix dimensions must be divisible by 8.");
    std::memset(result, 0, sizeof(float) * M * X);
    __m256 v1, v2, v3, v4;
    for (int i = 0; i < M; ++i) {
        for (int j = 0; j < N; ++j) {
            v1 = _mm256_set1_ps(mat1[i][j]);
            for (int k = 0; k < X; k += 8) {
                v2 = _mm256_loadu_ps(&mat2[j][k]);
                v3 = _mm256_loadu_ps(&result[i][k]);
                v4 = _mm256_add_ps(v3, _mm256_mul_ps(v1, v2));
                _mm256_storeu_ps(&result[i][k], v4);
            }
        }
    }
}
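// The build script enables -mfma, but the AVX loop above still uses a
// separate multiply and add. A fused-multiply-add variant (a sketch, not
// part of the original gist) folds them into one instruction per step:
template <int M, int N, int X>
void matrixMultiplyFMA(float (*mat1)[N], float (*mat2)[X], float (*result)[X])
{
    static_assert(M % 8 == 0 && N % 8 == 0 && X % 8 == 0,
                  "Matrix dimensions must be divisible by 8.");
    std::memset(result, 0, sizeof(float) * M * X);
    for (int i = 0; i < M; ++i) {
        for (int j = 0; j < N; ++j) {
            __m256 v1 = _mm256_set1_ps(mat1[i][j]);
            for (int k = 0; k < X; k += 8) {
                __m256 acc = _mm256_loadu_ps(&result[i][k]);
                // acc = v1 * mat2[j][k..k+7] + acc, fused in one instruction
                acc = _mm256_fmadd_ps(v1, _mm256_loadu_ps(&mat2[j][k]), acc);
                _mm256_storeu_ps(&result[i][k], acc);
            }
        }
    }
}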
// Regular implementation: textbook triple loop
template <int M, int N, int X>
void matrixMultiply(float (*A)[N], float (*B)[X], float (*C)[X])
{
    for (int i = 0; i < M; ++i) {
        for (int j = 0; j < X; ++j) {
            C[i][j] = 0.0f;
            for (int k = 0; k < N; ++k) {
                C[i][j] += A[i][k] * B[k][j];
            }
        }
    }
}
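// One reason the SIMD versions win by more than the vector width alone: the
// ijk order above walks B down a column (stride of X floats per step), which
// is cache-unfriendly. Reordering to ikj (a sketch, not in the original gist)
// streams through B and C row-wise and usually helps even without intrinsics:
template <int M, int N, int X>
void matrixMultiplyIKJ(float (*A)[N], float (*B)[X], float (*C)[X])
{
    std::memset(C, 0, sizeof(float) * M * X);
    for (int i = 0; i < M; ++i) {
        for (int k = 0; k < N; ++k) {
            float a = A[i][k];              // reuse A[i][k] across the row
            for (int j = 0; j < X; ++j) {
                C[i][j] += a * B[k][j];     // sequential access to B and C
            }
        }
    }
}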
int main() {
    const int M = 1000;
    const int N = 1000;
    const int X = 1000;
    // Allocate memory for matrices
    float (*A)[N] = new float[M][N];
    float (*B)[X] = new float[N][X];
    float (*C)[X] = new float[M][X];
    float (*C_SSE)[X] = new float[M][X];
    float (*C_AVX)[X] = new float[M][X];
    // Generate random values for matrices A and B
    srand(static_cast<unsigned int>(time(nullptr)));
    for (int i = 0; i < M; ++i) {
        for (int j = 0; j < N; ++j) {
            A[i][j] = static_cast<float>(rand()) / RAND_MAX;
        }
    }
    for (int i = 0; i < N; ++i) {
        for (int j = 0; j < X; ++j) {
            B[i][j] = static_cast<float>(rand()) / RAND_MAX;
        }
    }
    // Perform matrix multiplication using the regular implementation
    clock_t startTime = clock();
    matrixMultiply<M, N, X>(A, B, C);
    clock_t endTime = clock();
    double elapsedTime = static_cast<double>(endTime - startTime) / CLOCKS_PER_SEC;
    std::cout << "Regular implementation time: " << elapsedTime << " seconds" << std::endl;
    // Perform matrix multiplication using the SSE implementation
    startTime = clock();
    matrixMultiplySSE<M, N, X>(A, B, C_SSE);
    endTime = clock();
    elapsedTime = static_cast<double>(endTime - startTime) / CLOCKS_PER_SEC;
    std::cout << "SSE implementation time: " << elapsedTime << " seconds" << std::endl;
    // Perform matrix multiplication using the AVX implementation
    startTime = clock();
    matrixMultiplyAVX<M, N, X>(A, B, C_AVX);
    endTime = clock();
    elapsedTime = static_cast<double>(endTime - startTime) / CLOCKS_PER_SEC;
    std::cout << "AVX implementation time: " << elapsedTime << " seconds" << std::endl;
    // Free memory
    delete[] A;
    delete[] B;
    delete[] C;
    delete[] C_SSE;
    delete[] C_AVX;
    return 0;
}
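The gist never checks that the three CPU variants actually agree. A quick element-wise comparison is easy to add; a minimal sketch (not part of the original file; maxAbsDiff is a hypothetical helper that would sit above main(), with the two printed calls added before the delete[] lines):

// Largest absolute difference between two M x X result matrices
template <int M, int X>
float maxAbsDiff(float (*a)[X], float (*b)[X])
{
    float maxDiff = 0.0f;
    for (int i = 0; i < M; ++i) {
        for (int j = 0; j < X; ++j) {
            float d = a[i][j] - b[i][j];
            if (d < 0.0f) d = -d;
            if (d > maxDiff) maxDiff = d;
        }
    }
    return maxDiff;
}
// Usage inside main(), after the three multiplications:
//   std::cout << "SSE vs regular: " << maxAbsDiff<M, X>(C_SSE, C) << std::endl;
//   std::cout << "AVX vs regular: " << maxAbsDiff<M, X>(C_AVX, C) << std::endl;
// Small nonzero differences are expected, since vectorized accumulation
// reassociates the floating-point additions.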
Results

sid@sid-ubuntu ~/D/G/Gists> time ./matrix_multiply_cuda
Cublas time: 6.63e-06 seconds
CUDA time: 0.0054958 seconds
________________________________________________________
Executed in    2.79 secs      fish           external
   usr time    1.15 secs    794.00 micros    1.14 secs
   sys time    1.36 secs    397.00 micros    1.36 secs
sid@sid-ubuntu ~/D/G/Gists> time ./matrix_multiply
Regular implementation time: 2.22722 seconds
SSE implementation time: 0.768816 seconds
AVX implementation time: 0.457482 seconds
________________________________________________________
Executed in    3.49 secs      fish           external
   usr time    3.49 secs      0.00 micros    3.49 secs
   sys time    0.00 secs    746.00 micros    0.00 secs