Skip to content

Instantly share code, notes, and snippets.

View allanmac's full-sized avatar

Allan MacKinnon allanmac

  • Dispatch3 Inc.
  • South Florida, USA
  • 22:43 (UTC -04:00)
  • X @pixelio
View GitHub Profile
@allanmac
allanmac / threadedCode.cu
Created December 9, 2012 05:42
A primitive example of threaded code in CUDA.
#include <stdio.h>
//
//
//
// LAUNCH_BOUNDS currently expands to nothing: the __launch_bounds__(512)
// attribute sits behind a // comment, so it is disabled. Move it in front of
// the comment marker to enable it.
#define LAUNCH_BOUNDS // __launch_bounds__(512)
// Qualifier macro that expands to __device__.
#define DEVICE_FUNCTION_QUALIFIERS __device__
@allanmac
allanmac / fmuladd.cu
Last active December 10, 2015 08:38
__global__
void fmuladdTest(float* const values)
{
const unsigned int tidx = threadIdx.x;
const float b = values[ tidx];
float a = values[2*tidx];
a = __fmul_rn(a, b);
a = __fadd_rn(a, 0.73f);
__global__
void fmaTest(float* const values)
{
const unsigned int tidx = threadIdx.x;
const float b = values[ tidx];
float a = values[2*tidx];
a = __fmaf_rn(a, b, 0.73f);
a = __fmaf_rn(a, b, 0.37f);
@allanmac
allanmac / gmem.cu
Created February 6, 2013 20:20
Exercise vector load and store. Also evaluate LDG.CT.
//
//
//
// The constant 32 under the name WARP_SIZE -- presumably the number of
// threads per warp; its uses are outside this view.
#define WARP_SIZE 32
// Shorthand for the compiler's __restrict pointer qualifier.
#define RESTRICT __restrict
//
@allanmac
allanmac / smid.cu
Last active December 12, 2015 09:19
Probe the CUDA special registers %smid and %nsmid.
#include <stdio.h>
//
//
//
// Qualifiers for device-side helper functions: device-callable (__device__)
// and always inlined (__forceinline__).
#define DEVICE_INTRINSIC_QUALIFIERS __device__ __forceinline__
DEVICE_INTRINSIC_QUALIFIERS
unsigned int
@allanmac
allanmac / geff.cu
Last active December 14, 2015 01:29
Example kernel used to observe Global Load/Store Efficiency metrics in the Visual Profiler.
#include <stdio.h>
//
//
//
// TYPE expands to `unsigned int` and REPS to 1.
// NOTE(review): their uses are outside this view -- presumably the element
// type and a repetition count for the kernel; confirm against the full file.
#define TYPE unsigned int
#define REPS 1
@allanmac
allanmac / natural.cu
Last active December 14, 2015 06:38
The `setp` and `selp` instructions are your friends.
extern "C"
__global__
void natural(const unsigned int b,
const unsigned int c,
const unsigned int y,
const unsigned int z,
const unsigned int id,
unsigned int* const out)
{
const bool flag = (id == 1);
@allanmac
allanmac / fdimf.cu
Last active December 14, 2015 17:49
Inspecting fdimf() output.
// Qualifiers placed before each kernel: __global__ marks a kernel entry point
// and extern "C" gives it C linkage (an unmangled symbol name).
#define KERNEL_QUALIFIERS extern "C" __global__
//
// Evaluates fdimf(x, y) and stores the result in fout at the slot indexed by
// the calling thread's threadIdx.x.
//
//   x, y : operands passed straight to fdimf()
//   fout : output buffer; must be indexable by every threadIdx.x in the
//          launch -- no bounds check is performed here
//
KERNEL_QUALIFIERS
void fdimfTest(const float x, const float y, float* const fout)
{
  const float result = fdimf(x, y);

  fout[threadIdx.x] = result;
}
KERNEL_QUALIFIERS
void fdimfTest2(const float x, const float y, float* const fout)
@allanmac
allanmac / shflmax.cu
Created March 14, 2013 20:38
Butterfly max.
// Qualifiers placed before each kernel: __global__ marks a kernel entry point
// and extern "C" gives it C linkage (an unmangled symbol name).
#define KERNEL_QUALIFIERS extern "C" __global__
KERNEL_QUALIFIERS
void shflmax(const int* const vin, int* const vout)
{
int v = vin[threadIdx.x];
v = max(v,__shfl_xor(v,16));
v = max(v,__shfl_xor(v, 8));
@allanmac
allanmac / shflrot.cu
Last active September 21, 2023 22:59
Experiments with shfl.idx/up/down to see how negative indices or offsets are handled. The shuffled value and its predicate are returned.
#include <stdio.h>
//
//
//
// The constant 32 under the name WARP_SIZE -- presumably the number of
// threads per warp; its uses are outside this view.
#define WARP_SIZE 32
// Kernel qualifiers: __global__ entry point with C linkage (unmangled name).
#define KERNEL_QUALIFIERS extern "C" __global__
// Qualifiers for device-side helper functions: device-callable and always
// inlined.
#define DEVICE_INTRINSIC_QUALIFIERS __device__ __forceinline__