Allan MacKinnon allanmac

## fma.cu

__global__
void fmaTest(float* const values)
{
  const unsigned int tidx = threadIdx.x;
  const float        b    = values[  tidx];
  float              a    = values[2*tidx];

  a = __fmaf_rn(a, b, 0.73f);
  a = __fmaf_rn(a, b, 0.37f);

## gmem.cu

//
//
//

#define WARP_SIZE 32

#define RESTRICT  __restrict

//

## smid.cu
#include <stdio.h>

//
//
//

#define DEVICE_INTRINSIC_QUALIFIERS   __device__ __forceinline__

DEVICE_INTRINSIC_QUALIFIERS
unsigned int

## geff.cu

#include <stdio.h>

//
//
//

#define TYPE                   unsigned int

#define REPS                   1

## natural.cu
extern "C"
__global__
void natural(const unsigned int  b,
             const unsigned int  c,
             const unsigned int  y,
             const unsigned int  z,
             const unsigned int  id,
             unsigned int* const out)
{
  const bool flag = (id == 1);

## fdimf.cu
#define KERNEL_QUALIFIERS extern "C" __global__

//
// Kernel: every thread evaluates fdimf(x,y) on the two scalar kernel
// arguments and stores the result in the slot of `fout` selected by
// its threadIdx.x.
//
//   x, y : scalar operands passed by value to every thread
//   fout : output array, indexed by threadIdx.x only (blockIdx is
//          not used, so threads with equal threadIdx.x in different
//          blocks write the same element)
//
KERNEL_QUALIFIERS
void fdimfTest(const float x, const float y, float* const fout)
{
  const unsigned int slot = threadIdx.x;
  const float        diff = fdimf(x, y);

  fout[slot] = diff;
}

KERNEL_QUALIFIERS
void fdimfTest2(const float x, const float y, float* const fout)

## shflmax.cu

#define KERNEL_QUALIFIERS extern "C" __global__

KERNEL_QUALIFIERS
void shflmax(const int* const vin, int* const vout)
{
  int v = vin[threadIdx.x];

  v = max(v,__shfl_xor(v,16));
  v = max(v,__shfl_xor(v, 8));

## bfe64.cu
#include <stdio.h>

//
//
//

#define DEVICE_INTRINSIC_QUALIFIERS   __device__ __forceinline__

//
//

## Makefile
# Default target: compile blocks.cu into the `blocks` executable with nvcc,
# using -m 32, forwarding -v and -abi=no to ptxas, and generating code for
# each -gencode target listed below.
#
# NOTE: the first line of a recipe must begin with a TAB character; make
# rejects space-indented recipe lines with "missing separator".
all:
	nvcc -m 32 -Xptxas=-v,-abi=no \
	-gencode=arch=compute_11,code=sm_11 \
	-gencode=arch=compute_12,code=sm_12 \
	-gencode=arch=compute_20,code=sm_21 \
	-gencode=arch=compute_30,code=sm_30 \
	-gencode=arch=compute_35,code=sm_35 \
	blocks.cu -o blocks

## mem4.cu
#define KERNEL_QUALIFIERS  extern "C" __global__

//
//
//

#define REPEAT1()                               \
  REPS(0)

#define REPEAT4()                               \

	__global__
	void fmaTest(float* const values)
	{
	const unsigned int tidx = threadIdx.x;
	const float b = values[ tidx];
	float a = values[2*tidx];

	a = __fmaf_rn(a, b, 0.73f);
	a = __fmaf_rn(a, b, 0.37f);
	#include <stdio.h>

	//
	//
	//

	#define DEVICE_INTRINSIC_QUALIFIERS __device__ __forceinline__

	DEVICE_INTRINSIC_QUALIFIERS
	unsigned int

	#include <stdio.h>

	//
	//
	//

	#define TYPE unsigned int

	#define REPS 1
	extern "C"
	__global__
	void natural(const unsigned int b,
	const unsigned int c,
	const unsigned int y,
	const unsigned int z,
	const unsigned int id,
	unsigned int* const out)
	{
	const bool flag = (id == 1);
	#define KERNEL_QUALIFIERS extern "C" __global__

	//
	// Kernel: every thread evaluates fdimf(x,y) on the two scalar kernel
	// arguments and stores the result in the slot of `fout` selected by
	// its threadIdx.x (blockIdx is not used).
	//
	KERNEL_QUALIFIERS
	void fdimfTest(const float x, const float y, float* const fout)
	{
	  const unsigned int slot = threadIdx.x;
	  const float        diff = fdimf(x, y);

	  fout[slot] = diff;
	}

	KERNEL_QUALIFIERS
	void fdimfTest2(const float x, const float y, float* const fout)

	#define KERNEL_QUALIFIERS extern "C" __global__

	KERNEL_QUALIFIERS
	void shflmax(const int* const vin, int* const vout)
	{
	int v = vin[threadIdx.x];

	v = max(v,__shfl_xor(v,16));
	v = max(v,__shfl_xor(v, 8));
	all:
	nvcc -m 32 -Xptxas=-v,-abi=no \
	-gencode=arch=compute_11,code=sm_11 \
	-gencode=arch=compute_12,code=sm_12 \
	-gencode=arch=compute_20,code=sm_21 \
	-gencode=arch=compute_30,code=sm_30 \
	-gencode=arch=compute_35,code=sm_35 \
	blocks.cu -o blocks
	#define KERNEL_QUALIFIERS extern "C" __global__

	//
	//
	//

	#define REPEAT1() \
	REPS(0)

	#define REPEAT4() \