Allan MacKinnon allanmac

## malloc.cu
#include <stdio.h>

//
//
//

static
void
cuda_assert(const cudaError_t code, const char* const file, const int line, const bool abort)
{

## short4.cu
// -*- compile-command: "nvcc -m 32 -arch sm_35 -Xptxas=-v,-abi=no -cubin short4.cu"; -*-

#include <stdint.h>

typedef uint32_t u32;
typedef uint64_t u64;

typedef union
{
  short4 s16v4;

## sha256.cu
// -*- compile-command: "nvcc -m 32 -arch sm_35 -Xptxas=-v,-abi=no -cubin sha256.cu"; -*-

//
// Copyright 2013 Allan MacKinnon <allanmac@alum.mit.edu>
//
// Permission is hereby granted, free of charge, to any person obtaining
// a copy of this software and associated documentation files (the
// "Software"), to deal in the Software without restriction, including
// without limitation the rights to use, copy, modify, merge, publish,
// distribute, sublicense, and/or sell copies of the Software, and to

## float3 SoA to AoS
===============================================================================================

Load three arrays (x, y and z) in SoA order, repack them and store them in AoS order.

Strategy: each warp permutes its load lane with:

   (rowNum + (laneId() * 3)) & 31

This will convert SoA into AoS but with x/y/z staggered across rows of registers.

## warp_scan.inl
#pragma once

//
//
//

#define PXL_WARP_SCAN_SHFL(_op,_vT,_opT,_regC,_exc,_exc0,_excP)

////////////////////////////////////////////////////////////////////////
//

## peer.cu

#include <stdio.h>
#include <cuda.h>

int main(int argc, char** argv)
{
  cuInit(0);

  int count;

## symarg.cu
//
//
//

#define KERNEL_QUALIFIERS   extern "C" __global__

//
//
//

## scan.cu

#include <stdio.h>

//
//
//

#define WARP_SIZE                     32
#define VOLATILE                      volatile

## namespace.cu

#define KERNEL_QUALIFIERS                   __global__
#define VOLATILE                            volatile

#define DEVICE_INTRINSIC_QUALIFIERS         __device__ __forceinline__

#define DEVICE_STATIC_FUNCTION_QUALIFIERS   static DEVICE_FUNCTION_QUALIFIERS
#define DEVICE_STATIC_INTRINSIC_QUALIFIERS  static DEVICE_INTRINSIC_QUALIFIERS

//

## sync.cu
//
//
//

#define KERNEL_QUALIFIERS  extern "C" __global__

//
//
//
	#include <stdio.h>

	//
	//
	//

	static
	void
	cuda_assert(const cudaError_t code, const char* const file, const int line, const bool abort)
	{
	// -*- compile-command: "nvcc -m 32 -arch sm_35 -Xptxas=-v,-abi=no -cubin short4.cu"; -*-

	#include <stdint.h>

	typedef uint32_t u32;
	typedef uint64_t u64;

	typedef union
	{
	short4 s16v4;
	// -*- compile-command: "nvcc -m 32 -arch sm_35 -Xptxas=-v,-abi=no -cubin sha256.cu"; -*-

	//
	// Copyright 2013 Allan MacKinnon <allanmac@alum.mit.edu>
	//
	// Permission is hereby granted, free of charge, to any person obtaining
	// a copy of this software and associated documentation files (the
	// "Software"), to deal in the Software without restriction, including
	// without limitation the rights to use, copy, modify, merge, publish,
	// distribute, sublicense, and/or sell copies of the Software, and to
	===============================================================================================

	Load three arrays (x, y and z) in SoA order, repack them and store them in AoS order.

	Strategy: each warp permutes its load lane with:

	(rowNum + (laneId() * 3)) & 31

	This will convert SoA into AoS but with x/y/z staggered across rows of registers.
	#pragma once

	//
	//
	//

	#define PXL_WARP_SCAN_SHFL(_op,_vT,_opT,_regC,_exc,_exc0,_excP)

	////////////////////////////////////////////////////////////////////////
	//

	#include <stdio.h>
	#include <cuda.h>

	int main(int argc, char** argv)
	{
	cuInit(0);

	int count;

	#include <stdio.h>

	//
	//
	//

	#define WARP_SIZE 32
	#define VOLATILE volatile

	#define KERNEL_QUALIFIERS __global__
	#define VOLATILE volatile

	#define DEVICE_INTRINSIC_QUALIFIERS __device__ __forceinline__

	#define DEVICE_STATIC_FUNCTION_QUALIFIERS static DEVICE_FUNCTION_QUALIFIERS
	#define DEVICE_STATIC_INTRINSIC_QUALIFIERS static DEVICE_INTRINSIC_QUALIFIERS

	//