Skip to content

Instantly share code, notes, and snippets.

@amroamroamro
Created July 22, 2015 17:45
Show Gist options
  • Star 3 You must be signed in to star a gist
  • Fork 2 You must be signed in to fork a gist
  • Save amroamroamro/1f926d9d298cf2e5f573 to your computer and use it in GitHub Desktop.
Save amroamroamro/1f926d9d298cf2e5f573 to your computer and use it in GitHub Desktop.
[MATLAB] Example MEX-function with CUDA

Example of using CUDA in a MATLAB MEX-function.

Tested on Windows 8.1 x64, MATLAB R2015a, CUDA 6.5, Visual Studio 2013.

Steps to compile and test:

C:\> call "c:\Program Files (x86)\Microsoft Visual Studio 12.0\VC\vcvarsall.bat" amd64
C:\> set CUDA_PATH=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v6.5
C:\> set PATH=%PATH%;%CUDA_PATH%\bin;%CUDA_PATH%\lib;%CUDA_PATH%\lib64
C:\> nvcc -c -m64 add.cu
C:\> matlab
>> mex -largeArrayDims main_mex.cpp add.obj -L"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v6.5\lib\x64" -lcudart
>> a = rand([5 5], 'single'); b = rand([5 5], 'single');
>> c1 = a+b;
>> c2 = main_mex(a,b)
#include <cstdio>

#include "cuda_runtime.h"
#include "add_wrapper.hpp"
// Element-wise addition kernel: c[i] = a[i] + b[i] for every i in [0, n).
//
// Launch layout: 1-D grid of 1-D blocks, any size. Each thread starts at its
// flat global index and advances by the total number of launched threads, so
// all n elements are covered even when n exceeds the thread count, and no
// thread ever touches an index >= n.
// Uses no shared memory. a, b and c must be device pointers to at least n floats.
__global__ void addKernel(float *c, const float *a, const float *b, const size_t n)
{
    const size_t stride = (size_t)gridDim.x * blockDim.x;
    for (size_t i = (size_t)blockIdx.x * blockDim.x + threadIdx.x; i < n; i += stride)
        c[i] = a[i] + b[i];
}

// Reports a failed CUDA runtime call on stderr; returns true when status is cudaSuccess.
static bool cudaSucceeded(const cudaError_t status, const char *what)
{
    if (status == cudaSuccess)
        return true;
    fprintf(stderr, "addWithCUDA: %s failed: %s\n", what, cudaGetErrorString(status));
    return false;
}

// C = A + B
//
// Adds two host arrays of sz floats on GPU device 0 and stores the result in
// the host array cpuC.
//
//   cpuC  [out] host buffer receiving sz floats
//   cpuA  [in]  host buffer holding sz floats
//   cpuB  [in]  host buffer holding sz floats
//   sz    [in]  number of elements; 0 is a no-op
//
// If any CUDA call fails, a message is written to stderr, the remaining steps
// are skipped, cpuC is left unmodified, and all device buffers are released.
void addWithCUDA(float *cpuC, const float *cpuA, const float *cpuB, const size_t sz)
{
    // nothing to compute; also avoids a zero-sized launch, which is an invalid configuration
    if (sz == 0)
        return;

    const size_t bytes = sz * sizeof(float);

    // initialised to NULL so the cleanup below is safe after a partial failure
    float *gpuA = NULL, *gpuB = NULL, *gpuC = NULL;

    // choose which GPU to run on
    bool ok = cudaSucceeded(cudaSetDevice(0), "cudaSetDevice");

    // allocate GPU buffers
    ok = ok && cudaSucceeded(cudaMalloc((void**)&gpuA, bytes), "cudaMalloc(A)");
    ok = ok && cudaSucceeded(cudaMalloc((void**)&gpuB, bytes), "cudaMalloc(B)");
    ok = ok && cudaSucceeded(cudaMalloc((void**)&gpuC, bytes), "cudaMalloc(C)");

    // copy input vectors from host memory to GPU buffers
    ok = ok && cudaSucceeded(cudaMemcpy(gpuA, cpuA, bytes, cudaMemcpyHostToDevice), "cudaMemcpy(A)");
    ok = ok && cudaSucceeded(cudaMemcpy(gpuB, cpuB, bytes, cudaMemcpyHostToDevice), "cudaMemcpy(B)");

    if (ok) {
        // Spread the work over several blocks instead of a single block of sz
        // threads (which fails once sz exceeds the per-block thread limit).
        // The grid is capped at 65535 blocks; the kernel's stride loop picks
        // up whatever the capped grid does not cover directly.
        const unsigned int threadsPerBlock = 256;
        const size_t maxBlocks = 65535;
        size_t numBlocks = (sz + threadsPerBlock - 1) / threadsPerBlock;
        if (numBlocks > maxBlocks)
            numBlocks = maxBlocks;

        addKernel<<<(unsigned int)numBlocks, threadsPerBlock>>>(gpuC, gpuA, gpuB, sz);

        // a launch returns no status itself; configuration errors surface here
        ok = cudaSucceeded(cudaGetLastError(), "addKernel launch");
    }

    // wait for the kernel to finish (errors raised during execution surface here)
    ok = ok && cudaSucceeded(cudaDeviceSynchronize(), "cudaDeviceSynchronize");

    // copy output vector from GPU buffer to host memory
    ok = ok && cudaSucceeded(cudaMemcpy(cpuC, gpuC, bytes, cudaMemcpyDeviceToHost), "cudaMemcpy(C)");

    // cleanup (runs on every path; freeing a NULL pointer is a no-op)
    cudaFree(gpuA);
    cudaFree(gpuB);
    cudaFree(gpuC);
}
// Resets the current CUDA device via cudaDeviceReset().
// A failure is reported on stderr rather than being silently dropped.
void resetDevice()
{
    const cudaError_t status = cudaDeviceReset();
    if (status != cudaSuccess)
        fprintf(stderr, "resetDevice: cudaDeviceReset failed: %s\n", cudaGetErrorString(status));
}
#ifndef ADD_WRAPPER_HPP
#define ADD_WRAPPER_HPP

// size_t is used in the declarations below; include its definition here so the
// header compiles regardless of what the including file pulled in beforehand.
#include <stddef.h>

// Host-callable wrappers around the CUDA code. They use only plain C++ types,
// so a translation unit built by the regular host compiler can call them.

// C = A + B: adds two host arrays of floats element-wise on the GPU.
// Arguments, in order: output array, first input, second input, element count.
void addWithCUDA(float *, const float *, const float *, const size_t);

// Resets the current CUDA device.
void resetDevice();

#endif
#include "mex.h"
#include "add_wrapper.hpp"
void mexFunction(int nlhs, mxArray *plhs[], int nrhs, const mxArray *prhs[])
{
// input validation
if (nrhs != 2 || nlhs > 1)
mexErrMsgTxt("Wrong number of input/output arguments.");
if (!mxIsSingle(prhs[0]) || !mxIsSingle(prhs[1]))
mexErrMsgTxt("Inputs must be single arrays.");
if (mxIsComplex(prhs[0]) || mxIsComplex(prhs[1]))
mexErrMsgTxt("Inputs must be real arrays.");
if (mxIsSparse(prhs[0]) || mxIsSparse(prhs[1]))
mexErrMsgTxt("Inputs must be dense arrays.");
if (mxGetNumberOfElements(prhs[0]) != mxGetNumberOfElements(prhs[1]))
mexErrMsgTxt("Inputs must have the same size.");
// create ouput array
mwSize numel = mxGetNumberOfElements(prhs[0]);
mwSize ndims = mxGetNumberOfDimensions(prhs[0]);
const mwSize *dims = mxGetDimensions(prhs[0]);
plhs[0] = mxCreateNumericArray(ndims, dims, mxSINGLE_CLASS, mxREAL);
// get pointers to data
float *c = (float*) mxGetData(plhs[0]);
float *a = (float*) mxGetData(prhs[0]);
float *b = (float*) mxGetData(prhs[1]);
// perform addition on the GPU: c = a + b
addWithCUDA(c, a, b, numel);
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment