Skip to content

Instantly share code, notes, and snippets.

@machinaut
Created July 26, 2021 23:08
Show Gist options
  • Save machinaut/30b365d31abb4941fc838e0acb9e5db3 to your computer and use it in GitHub Desktop.
Save machinaut/30b365d31abb4941fc838e0acb9e5db3 to your computer and use it in GitHub Desktop.
Trying a bare cuda vector add against pytorch and triton
Display the source blob
Display the rendered blob
Raw
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
# Builds vadd.so, a shared library containing the code compiled from vadd.cu.
# CUDA_PATH may be overridden from the environment or the make command line.
CUDA_PATH ?= /usr/local/cuda

.PHONY: clean

# Link the object file into a shared library.
vadd.so: vadd.o
	nvcc -shared $^ -o $@ -lcuda

# Compile only (-c). The object file has to be named with -o; without it the
# target name is handed to nvcc as a second input file rather than the output.
vadd.o: vadd.cu
	nvcc -I $(CUDA_PATH)/include -I$(CUDA_PATH)/samples/common/inc -arch=sm_70 --compiler-options '-fPIC' -c $< -o $@

# Remove all build products.
clean:
	rm -f *.o *.so
// For the CUDA runtime routines (prefixed with "cuda")
// #include <cuda.h>
#include <cuda_runtime.h>
namespace
{
    // Element-wise addition kernel: each thread handles at most one element.
    // A thread derives its index from its block and thread coordinates along
    // x, and exits without touching memory when that index is n or greater.
    __global__ void _vadd(const float *A, const float *B, float *C, int n)
    {
        const int idx = threadIdx.x + blockIdx.x * blockDim.x;
        if (idx >= n)
        {
            return;
        }
        C[idx] = A[idx] + B[idx];
    }
} // namespace
// C-linkage entry point that launches the _vadd kernel so that
// C[i] = A[i] + B[i] for every i in [0, n).
//
//   A, B, C  - forwarded unchanged to the kernel
//              (presumably device-accessible buffers of at least n floats;
//              confirm against callers)
//   n        - number of elements to add
//   threads  - threads per block used for the launch
//
// Returns immediately without launching when n or threads is not positive;
// previously threads == 0 divided by zero and n <= 0 requested an empty grid.
// No synchronization or error check is performed after the launch.
extern "C" void vadd(const float *A, const float *B, float *C, int n, int threads)
{
    if (n <= 0 || threads <= 0)
    {
        return;
    }
    // Ceiling of n / threads, written as (n - 1) / threads + 1 so the
    // intermediate value cannot overflow int the way n + threads - 1 can.
    const int blocks = (n - 1) / threads + 1;
    _vadd<<<blocks, threads>>>(A, B, C, n);
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment