Skip to content

Instantly share code, notes, and snippets.

@jfacoustic
Created December 22, 2019 17:32
Show Gist options
  • Save jfacoustic/bcc2bdc47dd721b197f984c70c50e300 to your computer and use it in GitHub Desktop.
Save jfacoustic/bcc2bdc47dd721b197f984c70c50e300 to your computer and use it in GitHub Desktop.
Function Ptr Cuda Kernel
// CUDA function ptr example
// For basic vector operations
// Based on Chapter 2 of Programming Massively Parallel Processors 3rd Edition Kirk & Hwu
#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>
// Element-wise vector addition kernel: the thread with flat index idx writes
// c[idx] = a[idx] + b[idx]. The index is built from the x components of
// blockDim, blockIdx and threadIdx only.
//   a, b : input vectors, read at idx
//   c    : output vector, written at idx
//   n    : element count; threads whose index is n or larger do nothing
__global__ void addKernel(float * a, float * b, float * c, int n) {
    const int idx = threadIdx.x + blockIdx.x * blockDim.x;
    // Guard the grid tail: the launch may contain more threads than elements.
    if (idx >= n) {
        return;
    }
    c[idx] = a[idx] + b[idx];
}
// Exercises:
// Subtraction
// Dot Product
// Add two indices simultaneously
// Runs a binary element-wise vector kernel on the device.
//
// Copies the host vectors h_a and h_b to freshly allocated device buffers,
// launches `kernel` with 256 threads per block and enough blocks to cover n
// elements, then copies the device result back into h_c.
//
//   kernel        : __global__ function taking (a, b, c, n) device pointers/count
//   h_a, h_b      : host input vectors, n floats each
//   h_c           : host output vector, n floats
//   n             : number of elements
//
// If kernel is NULL or n <= 0 nothing is done and h_c is left untouched.
// On any CUDA failure a message is written to stderr, h_c is not written by
// the remaining steps, and all device buffers allocated so far are released.
void vec_op(void (*kernel)(float*, float*, float*, int), float * h_a, float * h_b, float * h_c, int n) {
    if (n <= 0) {
        return;  // empty vector: nothing to compute, and a 0-block launch is invalid
    }
    if (kernel == NULL) {
        fprintf(stderr, "vec_op: kernel pointer is NULL\n");
        return;
    }

    const int threadsPerBlock = 256;
    // Integer ceil-div written so that it cannot overflow for large n.
    const int blocks = (n - 1) / threadsPerBlock + 1;
    // size_t so the byte count does not wrap when n * 4 exceeds INT_MAX.
    const size_t size = (size_t) n * sizeof(float);

    // Start at NULL so the unconditional cudaFree calls below are safe no-ops
    // for buffers that were never allocated.
    float *d_a = NULL, *d_b = NULL, *d_c = NULL;

    cudaError_t err = cudaMalloc((void**) &d_a, size);
    if (err == cudaSuccess) {
        err = cudaMemcpy(d_a, h_a, size, cudaMemcpyHostToDevice);
    }
    if (err == cudaSuccess) {
        err = cudaMalloc((void**) &d_b, size);
    }
    if (err == cudaSuccess) {
        err = cudaMemcpy(d_b, h_b, size, cudaMemcpyHostToDevice);
    }
    if (err == cudaSuccess) {
        err = cudaMalloc((void**) &d_c, size);
    }
    if (err == cudaSuccess) {
        kernel<<<blocks, threadsPerBlock>>>(d_a, d_b, d_c, n);
        // A launch does not return a status; configuration errors surface here.
        err = cudaGetLastError();
    }
    if (err == cudaSuccess) {
        // Blocking copy: also reports faults raised while the kernel executed.
        err = cudaMemcpy(h_c, d_c, size, cudaMemcpyDeviceToHost);
    }
    if (err != cudaSuccess) {
        fprintf(stderr, "vec_op: CUDA error: %s\n", cudaGetErrorString(err));
    }

    cudaFree(d_a);
    cudaFree(d_b);
    cudaFree(d_c);
}
// Demo entry point: fills two 100-element host vectors, passes them to vec_op
// together with addKernel, and prints every element of the result vector.
// Returns 0 on success, 1 if a host allocation fails.
int main(void) {
    const int n = 100;
    float * a_h = (float*) malloc(sizeof(float) * n);
    float * b_h = (float*) malloc(sizeof(float) * n);
    // calloc so that zeros, not uninitialized memory, are printed if vec_op
    // is unable to fill the result buffer.
    float * c_h = (float*) calloc(n, sizeof(float));
    if (a_h == NULL || b_h == NULL || c_h == NULL) {
        fprintf(stderr, "main: host allocation failed\n");
        free(a_h);  // free(NULL) is a no-op
        free(b_h);
        free(c_h);
        return 1;
    }

    // TODO: randomize vector creation (currently a deterministic ramp)
    for (int i = 0; i < n; i++) {
        a_h[i] = 1.5f * i;
        b_h[i] = 2.0f * i;
    }

    vec_op(&addKernel, a_h, b_h, c_h, n);

    for (int i = 0; i < n; i++) {
        printf("%f \n", c_h[i]);
    }

    free(a_h);
    free(b_h);
    free(c_h);
    return 0;
}
// By Joshua Mathews
// CUDA function ptr example
// For basic vector operations
// Based on Chapter 2 of Programming Massively Parallel Processors 3rd Edition Kirk & Hwu
#include <stdio.h>
#include <cuda.h>
// Element-wise vector addition kernel: the thread with flat index idx writes
// c[idx] = a[idx] + b[idx]. The index is built from the x components of
// blockDim, blockIdx and threadIdx only.
//   a, b : input vectors, read at idx
//   c    : output vector, written at idx
//   n    : element count; threads whose index is n or larger do nothing
__global__ void addKernel(float * a, float * b, float * c, int n) {
    const int idx = threadIdx.x + blockIdx.x * blockDim.x;
    // Guard the grid tail: the launch may contain more threads than elements.
    if (idx >= n) {
        return;
    }
    c[idx] = a[idx] + b[idx];
}
// Exercises:
// Subtraction
// Dot Product
// Add two indices simultaneously
// Runs a binary element-wise vector kernel on the device.
//
// Copies the host vectors h_a and h_b to freshly allocated device buffers,
// launches `kernel` with 256 threads per block and enough blocks to cover n
// elements, then copies the device result back into h_c.
//
//   kernel        : __global__ function taking (a, b, c, n) device pointers/count
//   h_a, h_b      : host input vectors, n floats each
//   h_c           : host output vector, n floats
//   n             : number of elements
//
// If kernel is NULL or n <= 0 nothing is done and h_c is left untouched.
// On any CUDA failure a message is written to stderr, h_c is not written by
// the remaining steps, and all device buffers allocated so far are released.
void vec_op(void (*kernel)(float*, float*, float*, int), float * h_a, float * h_b, float * h_c, int n) {
    if (n <= 0) {
        return;  // empty vector: nothing to compute, and a 0-block launch is invalid
    }
    if (kernel == NULL) {
        fprintf(stderr, "vec_op: kernel pointer is NULL\n");
        return;
    }

    const int threadsPerBlock = 256;
    // Integer ceil-div written so that it cannot overflow for large n.
    const int blocks = (n - 1) / threadsPerBlock + 1;
    // size_t so the byte count does not wrap when n * 4 exceeds INT_MAX.
    const size_t size = (size_t) n * sizeof(float);

    // Start at NULL so the unconditional cudaFree calls below are safe no-ops
    // for buffers that were never allocated.
    float *d_a = NULL, *d_b = NULL, *d_c = NULL;

    cudaError_t err = cudaMalloc((void**) &d_a, size);
    if (err == cudaSuccess) {
        err = cudaMemcpy(d_a, h_a, size, cudaMemcpyHostToDevice);
    }
    if (err == cudaSuccess) {
        err = cudaMalloc((void**) &d_b, size);
    }
    if (err == cudaSuccess) {
        err = cudaMemcpy(d_b, h_b, size, cudaMemcpyHostToDevice);
    }
    if (err == cudaSuccess) {
        err = cudaMalloc((void**) &d_c, size);
    }
    if (err == cudaSuccess) {
        kernel<<<blocks, threadsPerBlock>>>(d_a, d_b, d_c, n);
        // A launch does not return a status; configuration errors surface here.
        err = cudaGetLastError();
    }
    if (err == cudaSuccess) {
        // Blocking copy: also reports faults raised while the kernel executed.
        err = cudaMemcpy(h_c, d_c, size, cudaMemcpyDeviceToHost);
    }
    if (err != cudaSuccess) {
        fprintf(stderr, "vec_op: CUDA error: %s\n", cudaGetErrorString(err));
    }

    cudaFree(d_a);
    cudaFree(d_b);
    cudaFree(d_c);
}
// Demo entry point: fills two 100-element host vectors, passes them to vec_op
// together with addKernel, and prints every element of the result vector.
// Returns 0 on success, 1 if a host allocation fails.
int main(void) {
    const int n = 100;
    float * a_h = (float*) malloc(sizeof(float) * n);
    float * b_h = (float*) malloc(sizeof(float) * n);
    // calloc so that zeros, not uninitialized memory, are printed if vec_op
    // is unable to fill the result buffer.
    float * c_h = (float*) calloc(n, sizeof(float));
    if (a_h == NULL || b_h == NULL || c_h == NULL) {
        fprintf(stderr, "main: host allocation failed\n");
        free(a_h);  // free(NULL) is a no-op
        free(b_h);
        free(c_h);
        return 1;
    }

    // TODO: randomize vector creation (currently a deterministic ramp)
    for (int i = 0; i < n; i++) {
        a_h[i] = 1.5f * i;
        b_h[i] = 2.0f * i;
    }

    vec_op(&addKernel, a_h, b_h, c_h, n);

    for (int i = 0; i < n; i++) {
        printf("%f \n", c_h[i]);
    }

    free(a_h);
    free(b_h);
    free(c_h);
    return 0;
}
// By Joshua Mathews
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment