Skip to content

Instantly share code, notes, and snippets.

View killeent's full-sized avatar

Trevor Killeen killeent

View GitHub Profile
// Super Dumb Kernel
__device__ __forceinline__ long calculateOffset(
long index, // index to calculate offset for
int ndim, // number of dimensions in Tensor
long sizes[8], // sizes for Tensor dims (either from the Tensor, or the size of the adv indexer at that dim)
long strides[8], // strides for Tensor
bool adv[8], // which Tensors are advanced indexers
long *advIndexTensors[8], // Adv Indexing Tensors
)
In [1]: import torch
In [2]: x = torch.arange(0, 64).view(8, 8)
In [3]: x
Out[3]:
0 1 2 3 4 5 6 7
8 9 10 11 12 13 14 15
16 17 18 19 20 21 22 23
// Case 1: arg is a non-tuple sequence object
if (PySequence_Check(arg) && !PyTuple_Check(arg)) return true;
#ifdef WITH_NUMPY
// Case 2: arg is an ndarray whose dtype is int64 or bool
if (PyArray_Check(arg) && (PyArray_TYPE((PyArrayObject*)arg) == NPY_INT64 || PyArray_TYPE((PyArrayObject*)arg) == NPY_BOOL)) return true;
#endif
// Case 3: arg is a tuple containing at least one sequence object, ndarray, or LongTensor
if (PyTuple_Check(arg)) {
#ifndef TH_GENERIC_FILE
#define TH_GENERIC_FILE "generic/Tensor.cpp"
#else
#ifdef WITH_NUMPY
#ifdef TH_REAL_IS_DOUBLE
#define NUMPY_TYPE_ENUM NPY_DOUBLE
#endif
#ifdef TH_REAL_IS_FLOAT
bool THPUtils_checkAdvancedIndexing(PyObject *arg) {
// Checks whether the specified selection object should trigger advanced
// indexing
// Case 1: arg is a list or range object
if (PyList_Check(arg) || PyRange_Check(arg)) return true;
#ifdef WITH_NUMPY
// Case 2: arg is an ndarray whose dtype is int64 or bool
if (PyArray_Check(arg) && (PyArray_TYPE((PyArrayObject*)arg) == NPY_INT64 || PyArray_TYPE((PyArrayObject*)arg) == NPY_BOOL)) return true;
#ifndef THC_REDUCE_APPLY_UTILS_INC
#define THC_REDUCE_APPLY_UTILS_INC
#include <algorithm>
#include <cuda.h>
#include <assert.h>
#include "THCGeneral.h"
#include "THCTensor.h"
#include "THCDeviceUtils.cuh"
#include "THCTensorInfo.cuh"
Testing average duration for 10 loops
Testing 1D Tensor of size 8: 6 usec (TH), 51 usec (THC)
Testing 1D Tensor of size 16: 1 usec (TH), 43 usec (THC)
Testing 1D Tensor of size 32: 1 usec (TH), 42 usec (THC)
Testing 1D Tensor of size 64: 1 usec (TH), 57 usec (THC)
Testing 1D Tensor of size 128: 3 usec (TH), 60 usec (THC)
Testing 1D Tensor of size 256: 4 usec (TH), 99 usec (THC)
Testing 1D Tensor of size 512: 10 usec (TH), 128 usec (THC)
Testing 1D Tensor of size 1024: 24 usec (TH), 130 usec (THC)
Testing 1D Tensor of size 2048: 52 usec (TH), 1723 usec (THC)
// Block-wide reduction in shared memory helper; only threadIdx.x == 0 will
// return the reduced value
template <typename T, typename ReduceOp>
__device__ T reduceBlock(T* smem,
int numVals,
T threadVal,
ReduceOp reduceOp,
T init) {
if (numVals == 0) {
return init;
// Block-wide reduction where each thread locally reduces N
// values before letting a single warp take over
template <typename T, typename ReduceOp, int N>
__device__ T reduceBlockN(T *smem,
int numVals,
ReduceOp reduceOp,
T init) {
T local = threadIdx.x < numVals ? smem[threadIdx.x] : init;
#pragma unroll
Testing average duration for 10 loops
Testing 1D Tensor of size 8: 6 usec (TH), 49 usec (THC)
Testing 1D Tensor of size 16: 1 usec (TH), 39 usec (THC)
Testing 1D Tensor of size 32: 1 usec (TH), 41 usec (THC)
Testing 1D Tensor of size 64: 1 usec (TH), 51 usec (THC)
Testing 1D Tensor of size 128: 2 usec (TH), 53 usec (THC)
Testing 1D Tensor of size 256: 6 usec (TH), 83 usec (THC)
Testing 1D Tensor of size 512: 10 usec (TH), 108 usec (THC)
Testing 1D Tensor of size 1024: 23 usec (TH), 109 usec (THC)
Testing 1D Tensor of size 2048: 48 usec (TH), 1370 usec (THC)