# johnwalley/MatrixVectorMultiply1.cu Created Aug 10, 2014

Simple matrix-vector multiplication
 #include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include <limits.h>
#include <stdio.h>
#include <stdlib.h>

cudaError_t multiplyWithCuda(float *c, const float *a, const float *b, unsigned int size);

// Runs a CUDA runtime call inside multiplyWithCuda: stores the result in the
// local `cudaStatus`, and on failure reports it and jumps to the local
// `cleanup` label so that every resource acquired so far is released.
#define CUDA_TRY(call)                                                       \
    do {                                                                     \
        cudaStatus = (call);                                                 \
        if (cudaStatus != cudaSuccess) {                                     \
            fprintf(stderr, "CUDA error at %s:%d: %s failed: %s\n",          \
                    __FILE__, __LINE__, #call,                               \
                    cudaGetErrorString(cudaStatus));                         \
            goto cleanup;                                                    \
        }                                                                    \
    } while (0)

// Matrix-vector product c = A * b.
//
//   c    : output vector, `size` elements (device memory)
//   a    : row-major `size` x `size` matrix (device memory)
//   b    : input vector, `size` elements (device memory)
//   size : matrix dimension; values <= 0 make the kernel a no-op
//
// Launch layout: 1D grid of 1D blocks, no shared memory. Each thread computes
// whole rows and strides by the total thread count, so any grid/block shape
// produces a complete result.
//
// Note: a thread walks one contiguous row, so neighbouring threads touch
// addresses `size` floats apart. That is simple rather than
// bandwidth-optimal.
__global__ void multiplyKernel(float *c, const float *a, const float *b, const int size)
{
    const size_t n = size > 0 ? (size_t)size : 0;
    const size_t stride = (size_t)gridDim.x * blockDim.x;

    for (size_t row = (size_t)blockIdx.x * blockDim.x + threadIdx.x; row < n; row += stride) {
        const float *rowPtr = a + row * n;

        // Accumulate in a register and store once instead of doing a
        // read-modify-write on global memory for every term.
        float sum = 0.0f;
        for (size_t j = 0; j < n; ++j)
            sum += rowPtr[j] * b[j];

        c[row] = sum;
    }
}

// Builds a test matrix and vector on the host, multiplies them on the GPU and
// returns 0 on success or 1 if a host allocation or any CUDA call failed.
int main()
{
    const int arraySize = 1024 * 8;
    const int nIter = 1;
    int exitCode = 0;

    // sizeof() comes first so the products are evaluated in size_t.
    float *a = (float *)malloc(sizeof(float) * arraySize * arraySize);
    float *b = (float *)malloc(sizeof(float) * arraySize);
    float *c = (float *)calloc(arraySize, sizeof(float));
    if (a == NULL || b == NULL || c == NULL) {
        fprintf(stderr, "Host allocation failed\n");
        free(a);
        free(b);
        free(c);
        return 1;
    }

    // Initialize the host input matrix.
    for (int i = 0; i < arraySize; ++i) {
        for (int j = 0; j < arraySize; ++j) {
            a[i * arraySize + j] = (float)(i * j);
        }
    }

    // Initialize the host input vector. This must fill `b`; writing to `a`
    // here would clobber the first matrix row and leave `b` uninitialized.
    for (int i = 0; i < arraySize; ++i) {
        b[i] = (float)i;
    }

    // Execute the kernel.
    for (int iter = 0; iter < nIter; iter++) {
        if (multiplyWithCuda(c, a, b, arraySize) != cudaSuccess) {
            fprintf(stderr, "multiplyWithCuda failed\n");
            exitCode = 1;
            break;
        }
    }

    free(a);
    free(b);
    free(c);

    return exitCode;
}

// Helper function for using CUDA to multiply a matrix by a vector in parallel.
//
//   c    : host output vector, `size` elements
//   a    : host row-major `size` x `size` matrix
//   b    : host input vector, `size` elements
//   size : matrix dimension
//
// Prints the kernel execution time and returns cudaSuccess, or the first CUDA
// error encountered. All device buffers and events are released on every
// path. A `size` of 0 is a successful no-op; a `size` that does not fit the
// kernel's `int` parameter yields cudaErrorInvalidValue.
cudaError_t multiplyWithCuda(float *c, const float *a, const float *b, unsigned int size)
{
    // Everything is declared up front so that `goto cleanup` never jumps
    // across an initialization.
    float *dev_a = 0;
    float *dev_b = 0;
    float *dev_c = 0;
    cudaEvent_t start = 0;
    cudaEvent_t stop = 0;
    cudaError_t cudaStatus = cudaSuccess;
    float msecTotal = 0.0f;

    // Widen before multiplying so size * size cannot wrap in 32 bits.
    const size_t vecBytes = (size_t)size * sizeof(float);
    const size_t matBytes = (size_t)size * size * sizeof(float);

    // 256 threads per block stays well inside the 1024-thread hardware limit;
    // the ceiling division makes the grid cover every row.
    const unsigned int threadsPerBlock = 256;
    const unsigned int blocks = (size + threadsPerBlock - 1) / threadsPerBlock;

    if (size == 0)
        return cudaSuccess;
    if (size > (unsigned int)INT_MAX)
        return cudaErrorInvalidValue;

    // Choose which GPU to run on, change this on a multi-GPU system.
    CUDA_TRY(cudaSetDevice(0));

    // Allocate GPU buffers for the output vector, the matrix and the input vector.
    CUDA_TRY(cudaMalloc((void **)&dev_c, vecBytes));
    CUDA_TRY(cudaMalloc((void **)&dev_a, matBytes));
    CUDA_TRY(cudaMalloc((void **)&dev_b, vecBytes));

    // Copy the inputs from host memory to the GPU buffers.
    CUDA_TRY(cudaMemcpy(dev_a, a, matBytes, cudaMemcpyHostToDevice));
    CUDA_TRY(cudaMemcpy(dev_b, b, vecBytes, cudaMemcpyHostToDevice));

    // Create the CUDA events used for timing.
    CUDA_TRY(cudaEventCreate(&start));
    CUDA_TRY(cudaEventCreate(&stop));

    // Record the start event.
    CUDA_TRY(cudaEventRecord(start, NULL));

    // Launch the kernel on the GPU.
    multiplyKernel<<<blocks, threadsPerBlock>>>(dev_c, dev_a, dev_b, (int)size);

    // A launch does not return a status; configuration errors show up here.
    CUDA_TRY(cudaGetLastError());

    // Record the stop event and wait for it. This also waits for the kernel
    // and reports any error raised while it was executing.
    CUDA_TRY(cudaEventRecord(stop, NULL));
    CUDA_TRY(cudaEventSynchronize(stop));

    CUDA_TRY(cudaEventElapsedTime(&msecTotal, start, stop));
    printf("Time= %.3f msec\n", msecTotal);

    // Copy the output vector from the GPU buffer to host memory.
    CUDA_TRY(cudaMemcpy(c, dev_c, vecBytes, cudaMemcpyDeviceToHost));

cleanup:
    // Release resources without overwriting the status being returned.
    if (start)
        cudaEventDestroy(start);
    if (stop)
        cudaEventDestroy(stop);
    cudaFree(dev_c);
    cudaFree(dev_a);
    cudaFree(dev_b);

    return cudaStatus;
}