amroamroamro/ README.md

## README.md

      
    Raw
  

               README.md
            
          
    Example of using CUDA in a MATLAB MEX-function.
Tested on Windows 8.1 x64, MATLAB R2015a, CUDA 6.5, Visual Studio 2013.
Steps to compile and test:
C:\> call "c:\Program Files (x86)\Microsoft Visual Studio 12.0\VC\vcvarsall.bat" amd64
C:\> set CUDA_PATH=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v6.5
C:\> set PATH=%PATH%;%CUDA_PATH%\bin;%CUDA_PATH%\lib;%CUDA_PATH%\lib64
C:\> nvcc -c -m64 add.cu
C:\> matlab
>> mex -largeArrayDims main_mex.cpp add.obj -L"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v6.5\lib\x64" -lcudart
>> a = rand([5 5], 'single'); b = rand([5 5], 'single');
>> c1 = a+b;
>> c2 = main_mex(a,b)

  
## add.cu
#include <stdexcept>
#include <string>

#include "cuda_runtime.h"
#include "add_wrapper.hpp"

// Element-wise vector addition: c[i] = a[i] + b[i] for every i in [0, n).
//
// Launch layout: 1-D grid of 1-D blocks of any size. Each thread starts at
// its global index and advances by the total number of launched threads, so
// every element is covered no matter how many blocks were launched.
// Uses no shared memory. a, b and c must be device pointers to at least n
// floats each.
__global__ void addKernel(float *c, const float *a, const float *b, const size_t n)
{
    const size_t stride = (size_t)gridDim.x * blockDim.x;
    for (size_t i = (size_t)blockIdx.x * blockDim.x + threadIdx.x; i < n; i += stride)
        c[i] = a[i] + b[i];
}

// Throws std::runtime_error naming the failed step when status is not cudaSuccess.
static void checkCuda(const cudaError_t status, const char *step)
{
    if (status != cudaSuccess)
        throw std::runtime_error(std::string(step) + " failed: " + cudaGetErrorString(status));
}

// C = A + B
//
// Adds two host arrays of sz floats on GPU device 0 and writes the sums to
// cpuC. All three pointers are host pointers; the data is staged through
// temporary device buffers that are released before returning.
//
// sz == 0 is a no-op. Any failing CUDA call releases the device buffers and
// throws std::runtime_error describing the failed step.
void addWithCUDA(float *cpuC, const float *cpuA, const float *cpuB, const size_t sz)
{
    // nothing to add; also avoids an invalid zero-sized kernel launch
    if (sz == 0)
        return;

    const size_t bytes = sz * sizeof(float);

    // launch geometry: fixed block size, grid capped at 65535 blocks (the
    // smallest x-dimension grid limit of any CUDA GPU); the kernel's stride
    // loop picks up whatever the capped grid does not cover in one pass
    const unsigned int threadsPerBlock = 256;
    const size_t maxBlocks = 65535;
    const size_t wantedBlocks = (sz - 1) / threadsPerBlock + 1;
    const unsigned int blocks =
        (unsigned int)(wantedBlocks < maxBlocks ? wantedBlocks : maxBlocks);

    float *gpuA = 0, *gpuB = 0, *gpuC = 0;
    try {
        // choose which GPU to run on
        checkCuda(cudaSetDevice(0), "cudaSetDevice");

        // allocate GPU buffers
        checkCuda(cudaMalloc((void**)&gpuA, bytes), "cudaMalloc(A)");
        checkCuda(cudaMalloc((void**)&gpuB, bytes), "cudaMalloc(B)");
        checkCuda(cudaMalloc((void**)&gpuC, bytes), "cudaMalloc(C)");

        // copy input vectors from host memory to GPU buffers
        checkCuda(cudaMemcpy(gpuA, cpuA, bytes, cudaMemcpyHostToDevice), "cudaMemcpy(A, host to device)");
        checkCuda(cudaMemcpy(gpuB, cpuB, bytes, cudaMemcpyHostToDevice), "cudaMemcpy(B, host to device)");

        // launch the kernel; a bad launch configuration is only reported
        // through cudaGetLastError(), not by the launch statement itself
        addKernel<<<blocks, threadsPerBlock>>>(gpuC, gpuA, gpuB, sz);
        checkCuda(cudaGetLastError(), "addKernel launch");

        // wait for the kernel to finish; faults raised while it ran surface here
        checkCuda(cudaDeviceSynchronize(), "addKernel execution");

        // copy output vector from GPU buffer to host memory
        checkCuda(cudaMemcpy(cpuC, gpuC, bytes, cudaMemcpyDeviceToHost), "cudaMemcpy(C, device to host)");
    } catch (...) {
        // release whatever was allocated (cudaFree accepts a null pointer),
        // then let the caller see the original error
        cudaFree(gpuA);
        cudaFree(gpuB);
        cudaFree(gpuC);
        throw;
    }

    // cleanup: release all three buffers before reporting any failure
    const cudaError_t freedA = cudaFree(gpuA);
    const cudaError_t freedB = cudaFree(gpuB);
    const cudaError_t freedC = cudaFree(gpuC);
    checkCuda(freedA, "cudaFree(A)");
    checkCuda(freedB, "cudaFree(B)");
    checkCuda(freedC, "cudaFree(C)");
}

// Resets the current CUDA device by calling cudaDeviceReset().
// The status returned by the runtime is deliberately discarded, so this
// function never reports failure to its caller.
void resetDevice()
{
    (void)cudaDeviceReset();
}

## add_wrapper.hpp
#ifndef ADD_WRAPPER_HPP
#define ADD_WRAPPER_HPP

// size_t is used in the prototype below; include its declaration here so the
// header compiles regardless of what the including file pulled in before it
#include <stddef.h>

// Element-wise sum of two float arrays on the GPU: cpuC[i] = cpuA[i] + cpuB[i]
// for i in [0, sz). All three pointers refer to host memory holding at least
// sz floats.
void addWithCUDA(float *cpuC, const float *cpuA, const float *cpuB, const size_t sz);

// Resets the current CUDA device (wraps cudaDeviceReset()).
void resetDevice();

#endif

## main_mex.cpp
#include "mex.h"
#include "add_wrapper.hpp"

void mexFunction(int nlhs, mxArray *plhs[], int nrhs, const mxArray *prhs[])
{
    // input validation
    if (nrhs != 2 || nlhs > 1)
        mexErrMsgTxt("Wrong number of input/output arguments.");
    if (!mxIsSingle(prhs[0]) || !mxIsSingle(prhs[1]))
        mexErrMsgTxt("Inputs must be single arrays.");
    if (mxIsComplex(prhs[0]) || mxIsComplex(prhs[1]))
        mexErrMsgTxt("Inputs must be real arrays.");
    if (mxIsSparse(prhs[0]) || mxIsSparse(prhs[1]))
        mexErrMsgTxt("Inputs must be dense arrays.");
    if (mxGetNumberOfElements(prhs[0]) != mxGetNumberOfElements(prhs[1]))
        mexErrMsgTxt("Inputs must have the same size.");

    // create ouput array
    mwSize numel = mxGetNumberOfElements(prhs[0]);
    mwSize ndims = mxGetNumberOfDimensions(prhs[0]);
    const mwSize *dims = mxGetDimensions(prhs[0]);
    plhs[0] = mxCreateNumericArray(ndims, dims, mxSINGLE_CLASS, mxREAL);

    // get pointers to data
    float *c = (float*) mxGetData(plhs[0]);
    float *a = (float*) mxGetData(prhs[0]);
    float *b = (float*) mxGetData(prhs[1]);

    // perform addition on the GPU: c = a + b
    addWithCUDA(c, a, b, numel);
}
	#include "cuda_runtime.h"
	#include "add_wrapper.hpp"

	// Element-wise vector addition: c[i] = a[i] + b[i] for every i in [0, n).
	//
	// Launch layout: 1-D grid of 1-D blocks of any size. Each thread starts at
	// its global index and advances by the total number of launched threads, so
	// every element is covered no matter how many blocks were launched.
	// Uses no shared memory. a, b and c must be device pointers to at least n
	// floats each.
	__global__ void addKernel(float *c, const float *a, const float *b, const size_t n)
	{
	    const size_t stride = (size_t)gridDim.x * blockDim.x;
	    for (size_t i = (size_t)blockIdx.x * blockDim.x + threadIdx.x; i < n; i += stride)
	        c[i] = a[i] + b[i];
	}

	// Throws std::runtime_error naming the failed step when status is not cudaSuccess.
	static void checkCuda(const cudaError_t status, const char *step)
	{
	    if (status != cudaSuccess)
	        throw std::runtime_error(std::string(step) + " failed: " + cudaGetErrorString(status));
	}

	// C = A + B
	//
	// Adds two host arrays of sz floats on GPU device 0 and writes the sums to
	// cpuC. All three pointers are host pointers; the data is staged through
	// temporary device buffers that are released before returning.
	//
	// sz == 0 is a no-op. Any failing CUDA call releases the device buffers and
	// throws std::runtime_error describing the failed step.
	void addWithCUDA(float *cpuC, const float *cpuA, const float *cpuB, const size_t sz)
	{
	    // nothing to add; also avoids an invalid zero-sized kernel launch
	    if (sz == 0)
	        return;

	    const size_t bytes = sz * sizeof(float);

	    // launch geometry: fixed block size, grid capped at 65535 blocks (the
	    // smallest x-dimension grid limit of any CUDA GPU); the kernel's stride
	    // loop picks up whatever the capped grid does not cover in one pass
	    const unsigned int threadsPerBlock = 256;
	    const size_t maxBlocks = 65535;
	    const size_t wantedBlocks = (sz - 1) / threadsPerBlock + 1;
	    const unsigned int blocks =
	        (unsigned int)(wantedBlocks < maxBlocks ? wantedBlocks : maxBlocks);

	    float *gpuA = 0, *gpuB = 0, *gpuC = 0;
	    try {
	        // choose which GPU to run on
	        checkCuda(cudaSetDevice(0), "cudaSetDevice");

	        // allocate GPU buffers
	        checkCuda(cudaMalloc((void**)&gpuA, bytes), "cudaMalloc(A)");
	        checkCuda(cudaMalloc((void**)&gpuB, bytes), "cudaMalloc(B)");
	        checkCuda(cudaMalloc((void**)&gpuC, bytes), "cudaMalloc(C)");

	        // copy input vectors from host memory to GPU buffers
	        checkCuda(cudaMemcpy(gpuA, cpuA, bytes, cudaMemcpyHostToDevice), "cudaMemcpy(A, host to device)");
	        checkCuda(cudaMemcpy(gpuB, cpuB, bytes, cudaMemcpyHostToDevice), "cudaMemcpy(B, host to device)");

	        // launch the kernel; a bad launch configuration is only reported
	        // through cudaGetLastError(), not by the launch statement itself
	        addKernel<<<blocks, threadsPerBlock>>>(gpuC, gpuA, gpuB, sz);
	        checkCuda(cudaGetLastError(), "addKernel launch");

	        // wait for the kernel to finish; faults raised while it ran surface here
	        checkCuda(cudaDeviceSynchronize(), "addKernel execution");

	        // copy output vector from GPU buffer to host memory
	        checkCuda(cudaMemcpy(cpuC, gpuC, bytes, cudaMemcpyDeviceToHost), "cudaMemcpy(C, device to host)");
	    } catch (...) {
	        // release whatever was allocated (cudaFree accepts a null pointer),
	        // then let the caller see the original error
	        cudaFree(gpuA);
	        cudaFree(gpuB);
	        cudaFree(gpuC);
	        throw;
	    }

	    // cleanup: release all three buffers before reporting any failure
	    const cudaError_t freedA = cudaFree(gpuA);
	    const cudaError_t freedB = cudaFree(gpuB);
	    const cudaError_t freedC = cudaFree(gpuC);
	    checkCuda(freedA, "cudaFree(A)");
	    checkCuda(freedB, "cudaFree(B)");
	    checkCuda(freedC, "cudaFree(C)");
	}

	// Resets the current CUDA device by calling cudaDeviceReset().
	// The status returned by the runtime is deliberately discarded, so this
	// function never reports failure to its caller.
	void resetDevice()
	{
	    (void)cudaDeviceReset();
	}
	#ifndef ADD_WRAPPER_HPP
	#define ADD_WRAPPER_HPP

	// size_t is used in the prototype below; include its declaration here so the
	// header compiles regardless of what the including file pulled in before it
	#include <stddef.h>

	// Element-wise sum of two float arrays on the GPU: cpuC[i] = cpuA[i] + cpuB[i]
	// for i in [0, sz). All three pointers refer to host memory holding at least
	// sz floats.
	void addWithCUDA(float *cpuC, const float *cpuA, const float *cpuB, const size_t sz);

	// Resets the current CUDA device (wraps cudaDeviceReset()).
	void resetDevice();

	#endif
	#include "mex.h"
	#include "add_wrapper.hpp"

	// MEX gateway: c = main_mex(a, b)
	//
	// Takes exactly two inputs and produces at most one output. Both inputs must
	// be real, dense, single-precision arrays with the same number of elements;
	// anything else aborts the call through mexErrMsgTxt. Note that only element
	// counts are compared, not shapes. The output is a real single array with the
	// dimensions of the first input, filled by addWithCUDA with a + b.
	void mexFunction(int nlhs, mxArray *plhs[], int nrhs, const mxArray *prhs[])
	{
	    // input validation (argument counts first, so prhs[] is safe to index)
	    if (nrhs != 2 || nlhs > 1)
	        mexErrMsgTxt("Wrong number of input/output arguments.");
	    if (!mxIsSingle(prhs[0]) || !mxIsSingle(prhs[1]))
	        mexErrMsgTxt("Inputs must be single arrays.");
	    if (mxIsComplex(prhs[0]) || mxIsComplex(prhs[1]))
	        mexErrMsgTxt("Inputs must be real arrays.");
	    if (mxIsSparse(prhs[0]) || mxIsSparse(prhs[1]))
	        mexErrMsgTxt("Inputs must be dense arrays.");
	    if (mxGetNumberOfElements(prhs[0]) != mxGetNumberOfElements(prhs[1]))
	        mexErrMsgTxt("Inputs must have the same size.");

	    // create output array with the dimensions of the first input
	    mwSize numel = mxGetNumberOfElements(prhs[0]);
	    mwSize ndims = mxGetNumberOfDimensions(prhs[0]);
	    const mwSize *dims = mxGetDimensions(prhs[0]);
	    plhs[0] = mxCreateNumericArray(ndims, dims, mxSINGLE_CLASS, mxREAL);

	    // get pointers to data
	    float *c = (float*) mxGetData(plhs[0]);
	    float *a = (float*) mxGetData(prhs[0]);
	    float *b = (float*) mxGetData(prhs[1]);

	    // perform addition on the GPU: c = a + b
	    addWithCUDA(c, a, b, numel);
	}