Trevor Killeen (killeent)
diff --git a/aten/CMakeLists.txt b/aten/CMakeLists.txt
index 9cccd34..136ce27 100644
--- a/aten/CMakeLists.txt
+++ b/aten/CMakeLists.txt
@@ -70,5 +70,10 @@ include_directories(
 ${CMAKE_CURRENT_SOURCE_DIR}/src
 ${CMAKE_CURRENT_BINARY_DIR}/src/ATen)
 add_subdirectory(src/ATen/test)
-add_subdirectory(contrib/data)
-add_subdirectory(contrib/meter)
diff --git a/torch/csrc/distributed/Module.cpp b/torch/csrc/distributed/Module.cpp
index a985509..293a4e1 100644
--- a/torch/csrc/distributed/Module.cpp
+++ b/torch/csrc/distributed/Module.cpp
@@ -186,8 +186,8 @@ THDTensorDescriptor THDPModule_makeDescriptor(PyObject *obj)
 PyObject *type = (PyObject*)Py_TYPE(obj);
 #define REGISTER_TH_DESCRIPTOR(TYPE, REAL) \
   if (type == THP##TYPE##Class) \
- return at::CPU(REAL).unsafeTensorFromTH(((THP##TYPE*)obj)->cdata, true);
#!/usr/bin/env bash
set -e
PYCMD=${PYCMD:="python"}
COVERAGE=0
while [[ "$#" -gt 0 ]]; do
case "$1" in
-p|--python) PYCMD=$2; shift 2 ;;
-c|--coverage) COVERAGE=1; shift 1;;
--) shift; break ;;
*) break ;;  # assumed default: stop parsing at the first unrecognized argument
esac
done
// Assumes:
// - input is (N, C, H, W)
// - gradOutput is (N, C, goH, goW)
// - gradWeight is (C, 1, kH, kW) --> (C, kH, kW)
// Naive loop: stride, padding, and dilation are not handled
// These three loops would be parallelized, such that each gradWeight position is computed by a single block
for (int ch = 0; ch < C; ++ch) {
  for (int gw_h_offset = 0; gw_h_offset < kH; ++gw_h_offset) {

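For illustration, here is a plain Python sketch of the same reduction (stride 1, no padding or dilation, per the assumptions above). The function name and the pure-Python loops are my own restatement of the math, not the kernel itself:

import torch

def naive_depthwise_grad_weight(input, gradOutput, kH, kW):
    N, C, H, W = input.size()
    _, _, goH, goW = gradOutput.size()
    gradWeight = torch.zeros(C, kH, kW)
    for ch in range(C):
        for gw_h_offset in range(kH):
            for gw_w_offset in range(kW):
                acc = 0.0
                # this (N, goH, goW) reduction is what the threads within a block would perform
                for n in range(N):
                    for go_h in range(goH):
                        for go_w in range(goW):
                            acc += gradOutput[n][ch][go_h][go_w] * \
                                   input[n][ch][go_h + gw_h_offset][go_w + gw_w_offset]
                gradWeight[ch][gw_h_offset][gw_w_offset] = acc
    return gradWeight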
PyTorch now supports a subset of NumPy-style advanced indexing. This allows users to select arbitrary indices at each dimension of a Tensor, including non-adjacent and duplicate indices, using the same []-style operation. This provides a more flexible indexing strategy without requiring calls to PyTorch's Index[Select, Add, ...] functions.

x = torch.Tensor(5, 5, 5)

# Pure Integer Array Indexing - specify arbitrary indices at each dim
x[[1, 2], [3, 2], [1, 0]] 
--> yields a 2-element Tensor (x[1][3][1], x[2][2][0])

# also supports broadcasting, duplicates
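# duplicate indices simply select the same element more than once, and index
# arrays broadcast against one another (NumPy-style semantics; the results below
# are worked out by hand for illustration, not captured from a session)
x[[1, 1], [3, 3], [0, 0]]
--> yields a 2-element Tensor (x[1][3][0], x[1][3][0])

x[[[1], [2]], [3, 2], [1, 0]]
--> yields a 2x2 Tensor (x[1][3][1], x[1][2][0],
                         x[2][3][1], x[2][2][0])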

Advanced Indexing

// ATen's generated CPUByteType implementation of cat_out: check and unwrap the
// ATen Tensors into the underlying TH tensors, then call into the TH C library.
Tensor & CPUByteType::cat_out(TensorList tensors, int dim, Tensor & self) {
  auto self_ = checked_cast<CPUByteTensor>(self.pImpl,"self",0);
  auto tensors_ = tensor_list_checked_cast<CPUByteTensor, Tensor, THByteTensor>(tensors,"tensors",1);
  THByteTensor_catArray(self_->tensor, tensors_.data(), tensors_.size(), dim);
  return self;
}
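For context, the Python-level operation this implements is concatenation into a caller-provided output tensor. A small usage sketch (the variable names are mine, and the dispatch machinery between torch.cat and this C++ function is omitted):

import torch

a = torch.ByteTensor(2, 3).zero_()
b = torch.ByteTensor(4, 3).zero_()
out = torch.ByteTensor()
torch.cat([a, b], 0, out=out)  # out is resized to 6x3 and holds the concatenation of a and b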
// Super Dumb Kernel
__device__ __forceinline__ long calculateOffset(
    long index,               // index to calculate the offset for
    int ndim,                 // number of dimensions in the Tensor
    long sizes[8],            // sizes for the Tensor dims (either from the Tensor, or the size of the adv indexer at that dim)
    long strides[8],          // strides for the Tensor
    bool adv[8],              // which dims are indexed by advanced indexers
    long *advIndexTensors[8]) // the advanced indexing Tensors
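A hedged Python restatement of what an offset calculation like this has to do (this is my own sketch of the idea, not the kernel's actual code): peel one coordinate per dimension off the linear index, swap in the value from the indexing tensor for advanced dims, and accumulate coordinate * stride.

def calculate_offset(index, ndim, sizes, strides, adv, adv_index_tensors):
    offset = 0
    for d in range(ndim - 1, -1, -1):  # walk dims from innermost to outermost
        coord = index % sizes[d]       # this element's coordinate along dim d
        index = index // sizes[d]
        if adv[d]:
            # for an advanced dim, the coordinate selects an entry of the indexing
            # tensor, and that entry is the real position along this dim of the source
            coord = adv_index_tensors[d][coord]
        offset += coord * strides[d]
    return offset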
In [1]: import torch
In [2]: x = torch.arange(0, 64).view(8, 8)
In [3]: x
Out[3]:
0 1 2 3 4 5 6 7
8 9 10 11 12 13 14 15
16 17 18 19 20 21 22 23

WORK IN PROGRESS

PyTorch Internals Part II - The Build System

In the first post I explained how we generate a torch.Tensor object that you can use in your Python interpreter. Next, I will explore the build system for PyTorch. The PyTorch codebase has a variety of components:

  • The core Torch libraries: TH, THC, THNN, THCUNN
  • Vendor libraries: cuDNN, NCCL
  • Python Extension libraries
  • Additional third-party libraries: NumPy, MKL, LAPACK