Simple matrix-vector multiplication
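Each row of the matrix is assigned to one GPU thread, which computes the dot product of that row with the input vector, so c[i] is the sum over j of a[i * size + j] * b[j]. As a rough guide (the filename matvec.cu is an assumption, not part of the original gist), the listing can be built and run with the CUDA toolkit's nvcc compiler:

nvcc -o matvec matvec.cu
./matvec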
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
cudaError_t multiplyWithCuda(float *c, const float *a, const float *b, unsigned int size);
__global__ void multiplyKernel(float *c, const float *a, const float *b, const int size) {
int index = threadIdx.x + blockIdx.x * blockDim.x;
c[index] = 0;
for (int j = 0; j < size; ++j)
c[index] += a[index * size + j] * b[index];
}
int main()
{
    const int arraySize = 1024 * 8;
    float *a;
    float b[arraySize];
    float c[arraySize] = { 0 };
    int nIter = 1;

    // The matrix is too large for the stack, so allocate it on the heap.
    a = (float*)malloc(sizeof(float) * arraySize * arraySize);

    // Initialize the host input matrix
    for (int i = 0; i < arraySize; ++i) {
        for (int j = 0; j < arraySize; ++j) {
            a[i * arraySize + j] = (float)(i * j);
        }
    }

    // Initialize the host input vector
    for (int i = 0; i < arraySize; ++i) {
        b[i] = (float)i;
    }
    // Allocate CUDA events that we'll use for timing
    cudaEvent_t start;
    cudaEventCreate(&start);
    cudaEvent_t stop;
    cudaEventCreate(&stop);

    // Record the start event
    cudaEventRecord(start, NULL);

    // Execute the kernel
    for (int j = 0; j < nIter; j++) {
        multiplyWithCuda(c, a, b, arraySize);
    }

    // Record the stop event
    cudaEventRecord(stop, NULL);

    // Wait for the stop event to complete
    cudaEventSynchronize(stop);

    float msecTotal = 0.0f;
    cudaEventElapsedTime(&msecTotal, start, stop);
    printf("Time= %.3f msec\n", msecTotal);

    cudaEventDestroy(start);
    cudaEventDestroy(stop);
    free(a);

    return 0;
}
// Helper function for using CUDA to multiply a matrix by a vector in parallel.
cudaError_t multiplyWithCuda(float *c, const float *a, const float *b, unsigned int size)
{
    float *dev_a = 0;
    float *dev_b = 0;
    float *dev_c = 0;
    cudaError_t cudaStatus;

    // Choose which GPU to run on, change this on a multi-GPU system.
    cudaStatus = cudaSetDevice(0);

    // Allocate GPU buffers for the matrix, the input vector and the output vector.
    cudaStatus = cudaMalloc((void**)&dev_c, size * sizeof(float));
    cudaStatus = cudaMalloc((void**)&dev_a, size * size * sizeof(float));
    cudaStatus = cudaMalloc((void**)&dev_b, size * sizeof(float));

    // Copy the input matrix and vector from host memory to GPU buffers.
    cudaStatus = cudaMemcpy(dev_a, a, size * size * sizeof(float), cudaMemcpyHostToDevice);
    cudaStatus = cudaMemcpy(dev_b, b, size * sizeof(float), cudaMemcpyHostToDevice);

    // Launch a kernel on the GPU with one thread for each output element.
    // A block size of 256 keeps the launch within the 1024-threads-per-block limit.
    const int threadsPerBlock = 256;
    const int blocks = (size + threadsPerBlock - 1) / threadsPerBlock;
    multiplyKernel<<<blocks, threadsPerBlock>>>(dev_c, dev_a, dev_b, size);

    // cudaDeviceSynchronize waits for the kernel to finish, and returns
    // any errors encountered during the launch.
    cudaDeviceSynchronize();

    // Copy the output vector from the GPU buffer back to host memory.
    cudaStatus = cudaMemcpy(c, dev_c, size * sizeof(float), cudaMemcpyDeviceToHost);

    cudaFree(dev_c);
    cudaFree(dev_a);
    cudaFree(dev_b);

    return cudaStatus;
}
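To sanity-check the GPU output, a plain CPU reference implementation can be compared against c after the call to multiplyWithCuda. The sketch below is not part of the original gist; the function name multiplyCPU and the tolerance are assumptions.

// CPU reference: c[i] = sum over j of a[i * size + j] * b[j]
void multiplyCPU(float *c, const float *a, const float *b, int size)
{
    for (int i = 0; i < size; ++i) {
        float sum = 0.0f;
        for (int j = 0; j < size; ++j)
            sum += a[i * size + j] * b[j];
        c[i] = sum;
    }
}

// Example usage in main(), after multiplyWithCuda(c, a, b, arraySize)
// (fabsf requires <math.h>):
//   float *ref = (float*)malloc(sizeof(float) * arraySize);
//   multiplyCPU(ref, a, b, arraySize);
//   for (int i = 0; i < arraySize; ++i)
//       if (fabsf(c[i] - ref[i]) > 1e-3f * fabsf(ref[i]))
//           printf("Mismatch at %d: GPU %f vs CPU %f\n", i, c[i], ref[i]);
//   free(ref);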