Trevor Killeen killeent

## dumb_kernel.cuh
// Super Dumb Kernel

__device__ __forceinline__ long calculateOffset(
  long index,                  // index to calculate offset for
  int ndim,                   // number of dimensions in Tensor
  long sizes[8],              // sizes for Tensor dims (either from the Tensor, or the size of the adv indexer at that dim)
  long strides[8],            // strides for Tensor
  bool adv[8],                // which Tensors are advanced indexers
  long *advIndexTensors[8],   // Adv Indexing Tensors
)

## 2a.txt
In [1]: import torch

In [2]: x = torch.arange(0, 64).view(8, 8)

In [3]: x
Out[3]:

    0     1     2     3     4     5     6     7
    8     9    10    11    12    13    14    15
   16    17    18    19    20    21    22    23

## TypeCheck.cpp
  // Case 1: arg is a non-tuple sequence object
  if (PySequence_Check(arg) && !PyTuple_Check(arg)) return true;

#ifdef WITH_NUMPY
  // Case 2: arg is an nd-array with type integer or bool
  if (PyArray_Check(arg) && (PyArray_TYPE((PyArrayObject*)arg) == NPY_INT64 || PyArray_TYPE((PyArrayObject*)arg) == NPY_BOOL)) return true;
#endif

  // Case 3: arg is a tuple containing at least one sequence object, ndarray, or LongTensor
  if (PyTuple_Check(arg)) {

## Tensor.cpp
#ifndef TH_GENERIC_FILE
#define TH_GENERIC_FILE "generic/Tensor.cpp"
#else

#ifdef WITH_NUMPY

#ifdef TH_REAL_IS_DOUBLE
#define NUMPY_TYPE_ENUM NPY_DOUBLE
#endif
#ifdef TH_REAL_IS_FLOAT

## check.cpp
bool THPUtils_checkAdvancedIndexing(PyObject *arg) {
  // Checks whether the specified selection object should trigger advanced
  // indexing

  // Case 1: arg is a list or range object (tuples are handled separately)
  if (PyList_Check(arg) || PyRange_Check(arg)) return true;

#ifdef WITH_NUMPY
  // Case 2: arg is an nd-array with type integer or bool
  if (PyArray_Check(arg) && (PyArray_TYPE((PyArrayObject*)arg) == NPY_INT64 || PyArray_TYPE((PyArrayObject*)arg) == NPY_BOOL)) return true;

## reduce.cuh
#ifndef THC_REDUCE_APPLY_UTILS_INC
#define THC_REDUCE_APPLY_UTILS_INC

#include <algorithm>
#include <cuda.h>
#include <assert.h>
#include "THCGeneral.h"
#include "THCTensor.h"
#include "THCDeviceUtils.cuh"
#include "THCTensorInfo.cuh"

## results-updated.txt
Testing average duration for 10 loops
Testing 1D Tensor of size 8: 6 usec (TH), 51 usec (THC)
Testing 1D Tensor of size 16: 1 usec (TH), 43 usec (THC)
Testing 1D Tensor of size 32: 1 usec (TH), 42 usec (THC)
Testing 1D Tensor of size 64: 1 usec (TH), 57 usec (THC)
Testing 1D Tensor of size 128: 3 usec (TH), 60 usec (THC)
Testing 1D Tensor of size 256: 4 usec (TH), 99 usec (THC)
Testing 1D Tensor of size 512: 10 usec (TH), 128 usec (THC)
Testing 1D Tensor of size 1024: 24 usec (TH), 130 usec (THC)
Testing 1D Tensor of size 2048: 52 usec (TH), 1723 usec (THC)

## reduce.cuh
// Block-wide reduction in shared memory helper; only threadIdx.x == 0 will
// return the reduced value
template <typename T, typename ReduceOp>
__device__ T reduceBlock(T* smem,
                         int numVals,
                         T threadVal,
                         ReduceOp reduceOp,
                         T init) {
  if (numVals == 0) {
    return init;

## reduceN.cuh
// Block-wide reduction where each thread locally reduces N
// values before letting a single warp take over
template <typename T, typename ReduceOp, int N>
__device__ T reduceBlockN(T *smem,
                         int numVals,
                         ReduceOp reduceOp,
                         T init) {
  T local = threadIdx.x < numVals ? smem[threadIdx.x] : init;

#pragma unroll

## results.txt
Testing average duration for 10 loops
Testing 1D Tensor of size 8: 6 usec (TH), 49 usec (THC)
Testing 1D Tensor of size 16: 1 usec (TH), 39 usec (THC)
Testing 1D Tensor of size 32: 1 usec (TH), 41 usec (THC)
Testing 1D Tensor of size 64: 1 usec (TH), 51 usec (THC)
Testing 1D Tensor of size 128: 2 usec (TH), 53 usec (THC)
Testing 1D Tensor of size 256: 6 usec (TH), 83 usec (THC)
Testing 1D Tensor of size 512: 10 usec (TH), 108 usec (THC)
Testing 1D Tensor of size 1024: 23 usec (TH), 109 usec (THC)
Testing 1D Tensor of size 2048: 48 usec (TH), 1370 usec (THC)
	// Super Dumb Kernel

	__device__ __forceinline__ long calculateOffset(
	long index, // index to calculate offset for
	int ndim, // number of dimensions in Tensor
	long sizes[8], // sizes for Tensor dims (either from the Tensor, or the size of the adv indexer at that dim)
	long strides[8], // strides for Tensor
	bool adv[8], // which Tensors are advanced indexers
	long *advIndexTensors[8], // Adv Indexing Tensors
	)
	In [1]: import torch

	In [2]: x = torch.arange(0, 64).view(8, 8)

	In [3]: x
	Out[3]:

	0 1 2 3 4 5 6 7
	8 9 10 11 12 13 14 15
	16 17 18 19 20 21 22 23
	// Case 1: arg is a non-tuple sequence object
	if (PySequence_Check(arg) && !PyTuple_Check(arg)) return true;

	#ifdef WITH_NUMPY
	// Case 2: arg is an nd-array with type integer or bool
	if (PyArray_Check(arg) && (PyArray_TYPE((PyArrayObject*)arg) == NPY_INT64 || PyArray_TYPE((PyArrayObject*)arg) == NPY_BOOL)) return true;
	#endif

	// Case 3: arg is a tuple containing at least one sequence object, ndarray, or LongTensor
	if (PyTuple_Check(arg)) {
	#ifndef TH_GENERIC_FILE
	#define TH_GENERIC_FILE "generic/Tensor.cpp"
	#else

	#ifdef WITH_NUMPY

	#ifdef TH_REAL_IS_DOUBLE
	#define NUMPY_TYPE_ENUM NPY_DOUBLE
	#endif
	#ifdef TH_REAL_IS_FLOAT
	bool THPUtils_checkAdvancedIndexing(PyObject *arg) {
	// Checks whether the specified selection object should trigger advanced
	// indexing

	// Case 1: arg is a list or range object (tuples are handled separately)
	if (PyList_Check(arg) || PyRange_Check(arg)) return true;

	#ifdef WITH_NUMPY
	// Case 2: arg is an nd-array with type integer or bool
	if (PyArray_Check(arg) && (PyArray_TYPE((PyArrayObject*)arg) == NPY_INT64 || PyArray_TYPE((PyArrayObject*)arg) == NPY_BOOL)) return true;
	#ifndef THC_REDUCE_APPLY_UTILS_INC
	#define THC_REDUCE_APPLY_UTILS_INC

	#include <algorithm>
	#include <cuda.h>
	#include <assert.h>
	#include "THCGeneral.h"
	#include "THCTensor.h"
	#include "THCDeviceUtils.cuh"
	#include "THCTensorInfo.cuh"
	Testing average duration for 10 loops
	Testing 1D Tensor of size 8: 6 usec (TH), 51 usec (THC)
	Testing 1D Tensor of size 16: 1 usec (TH), 43 usec (THC)
	Testing 1D Tensor of size 32: 1 usec (TH), 42 usec (THC)
	Testing 1D Tensor of size 64: 1 usec (TH), 57 usec (THC)
	Testing 1D Tensor of size 128: 3 usec (TH), 60 usec (THC)
	Testing 1D Tensor of size 256: 4 usec (TH), 99 usec (THC)
	Testing 1D Tensor of size 512: 10 usec (TH), 128 usec (THC)
	Testing 1D Tensor of size 1024: 24 usec (TH), 130 usec (THC)
	Testing 1D Tensor of size 2048: 52 usec (TH), 1723 usec (THC)
	// Block-wide reduction in shared memory helper; only threadIdx.x == 0 will
	// return the reduced value
	template <typename T, typename ReduceOp>
	__device__ T reduceBlock(T* smem,
	int numVals,
	T threadVal,
	ReduceOp reduceOp,
	T init) {
	if (numVals == 0) {
	return init;
	// Block-wide reduction where each thread locally reduces N
	// values before letting a single warp take over
	template <typename T, typename ReduceOp, int N>
	__device__ T reduceBlockN(T *smem,
	int numVals,
	ReduceOp reduceOp,
	T init) {
	T local = threadIdx.x < numVals ? smem[threadIdx.x] : init;

	#pragma unroll