Skip to content

Instantly share code, notes, and snippets.

@olivatooo
Created December 1, 2019 16:07
Show Gist options
  • Save olivatooo/20ea58208a8063d1ee29d9bb3fda3bb9 to your computer and use it in GitHub Desktop.
Save olivatooo/20ea58208a8063d1ee29d9bb3fda3bb9 to your computer and use it in GitHub Desktop.
Matrix multiplication using CUDA
#include <stdio.h>
#include <stdlib.h>
#define SIZE 10
#define VALUE 5
#define BLOCK_SIZE 16
// Naive square integer matrix product: c = a * b.
//
// All three matrices are dim_matrix x dim_matrix and are indexed row-major
// (element (r, k) lives at r * dim_matrix + k). Each thread produces at most
// one element of c: the x dimension of the launch selects the column and the
// y dimension selects the row. Threads whose (row, col) falls outside the
// matrix write nothing, so the grid may be larger than the matrix.
//
// Parameters:
//   a, b        input matrices (read only by this kernel)
//   c           output matrix (each in-range element written exactly once)
//   dim_matrix  number of rows/columns of every matrix
__global__ void gpu_matrix_mult(int *a,int *b, int *c, int dim_matrix)
{
    const int col = threadIdx.x + blockDim.x * blockIdx.x;
    const int row = threadIdx.y + blockDim.y * blockIdx.y;

    // Threads mapped past the matrix edge have no element to compute.
    if (row >= dim_matrix || col >= dim_matrix)
    {
        return;
    }

    // Walk along one row of a and down one column of b.
    const int *a_row = a + row * dim_matrix;
    const int *b_col = b + col;

    int acc = 0;
    for (int k = 0; k < dim_matrix; ++k)
    {
        acc += a_row[k] * b_col[k * dim_matrix];
    }

    c[row * dim_matrix + col] = acc;
}
// Evaluates a CUDA runtime call and terminates the program with a file/line
// diagnostic if the call did not return cudaSuccess.
#define CUDA_CHECK(call)                                                    \
    do {                                                                    \
        cudaError_t cuda_check_err_ = (call);                               \
        if (cuda_check_err_ != cudaSuccess) {                               \
            fprintf(stderr, "CUDA error at %s:%d: %s\n", __FILE__,          \
                    __LINE__, cudaGetErrorString(cuda_check_err_));         \
            exit(EXIT_FAILURE);                                             \
        }                                                                   \
    } while (0)

// Fills two SIZE x SIZE integer matrices with pseudo-random values, multiplies
// them on the GPU with gpu_matrix_mult, copies the product back to the host
// and prints the elapsed time measured with CUDA events.
//
// Returns 0 on success; exits with EXIT_FAILURE if any CUDA call fails.
int main(int argc, char const *argv[])
{
    // Command-line arguments are not used.
    (void)argc;
    (void)argv;

    const int dim_matrix = SIZE;
    const size_t elem_count = (size_t)dim_matrix * dim_matrix;
    const size_t bytes = sizeof(int) * elem_count;

    // Fixed seed so every run multiplies the same matrices.
    srand(1);

    // Host buffers allocated through the CUDA runtime.
    int *host_matrix_a, *host_matrix_b, *host_matrix_c;
    CUDA_CHECK(cudaMallocHost((void **) &host_matrix_a, bytes));
    CUDA_CHECK(cudaMallocHost((void **) &host_matrix_b, bytes));
    CUDA_CHECK(cudaMallocHost((void **) &host_matrix_c, bytes));

    // Randomize matrix A (values in [0, 1023]).
    for (size_t idx = 0; idx < elem_count; ++idx) {
        host_matrix_a[idx] = rand() % 1024;
    }
    // Randomize matrix B (values in [0, 1023]).
    for (size_t idx = 0; idx < elem_count; ++idx) {
        host_matrix_b[idx] = rand() % 1024;
    }

    float gpu_elapsed_time;
    // Events used for the time measurement.
    cudaEvent_t start, stop;
    CUDA_CHECK(cudaEventCreate(&start));
    CUDA_CHECK(cudaEventCreate(&stop));

    // Start of the timed region. Note that it covers the device allocations
    // and both transfer directions, not only the kernel.
    CUDA_CHECK(cudaEventRecord(start, 0));

    // Allocate space on the device.
    int *device_matrix_a, *device_matrix_b, *device_matrix_c;
    CUDA_CHECK(cudaMalloc((void **) &device_matrix_a, bytes));
    CUDA_CHECK(cudaMalloc((void **) &device_matrix_b, bytes));
    CUDA_CHECK(cudaMalloc((void **) &device_matrix_c, bytes));

    // Copy matrices A and B from the host to the device.
    CUDA_CHECK(cudaMemcpy(device_matrix_a, host_matrix_a, bytes, cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(device_matrix_b, host_matrix_b, bytes, cudaMemcpyHostToDevice));

    // Ceil-divide so the grid covers the matrix even when dim_matrix is not
    // a multiple of BLOCK_SIZE; the kernel bounds-checks the excess threads.
    unsigned int grid_rows = (dim_matrix + BLOCK_SIZE - 1) / BLOCK_SIZE;
    unsigned int grid_cols = (dim_matrix + BLOCK_SIZE - 1) / BLOCK_SIZE;
    dim3 dimGrid(grid_cols, grid_rows);
    dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE);

    gpu_matrix_mult<<<dimGrid, dimBlock>>>(device_matrix_a, device_matrix_b, device_matrix_c, dim_matrix);
    // A kernel launch returns no status; fetch launch-configuration errors explicitly.
    CUDA_CHECK(cudaGetLastError());

    // Transfer the result to the host. This blocking copy on the default
    // stream returns only after the kernel and the copy have finished, so no
    // separate device synchronization is needed (and execution errors from
    // the kernel are reported here).
    CUDA_CHECK(cudaMemcpy(host_matrix_c, device_matrix_c, bytes, cudaMemcpyDeviceToHost));

    // End of the timed region.
    CUDA_CHECK(cudaEventRecord(stop, 0));
    CUDA_CHECK(cudaEventSynchronize(stop));

    // Elapsed time in milliseconds.
    CUDA_CHECK(cudaEventElapsedTime(&gpu_elapsed_time, start, stop));
    // Exactly one argument per conversion: two ints for %dx%d, one float for %f.
    printf("Time elapsed on matrix multiplication %dx%d on GPU: %f ms.\n\n", dim_matrix, dim_matrix, gpu_elapsed_time);

    // Release events and memory.
    CUDA_CHECK(cudaEventDestroy(start));
    CUDA_CHECK(cudaEventDestroy(stop));
    CUDA_CHECK(cudaFree(device_matrix_a));
    CUDA_CHECK(cudaFree(device_matrix_b));
    CUDA_CHECK(cudaFree(device_matrix_c));
    CUDA_CHECK(cudaFreeHost(host_matrix_a));
    CUDA_CHECK(cudaFreeHost(host_matrix_b));
    CUDA_CHECK(cudaFreeHost(host_matrix_c));
    return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment