Scott Gray (scott-gray)

  • OpenAI
  • San Francisco, CA

#!/usr/bin/env python
import pycuda.driver as drv
from pycuda.autoinit import context, device  # importing autoinit creates the context
from pycuda.compiler import SourceModule

# Number of streaming multiprocessors (SMs) on the current device.
SMs = device.get_attribute(drv.device_attribute.MULTIPROCESSOR_COUNT)
print(device.name())
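
(A follow-on sketch, not part of the gist: the SM count queried above is typically used to size a grid, e.g. one block per SM. The trivial fill kernel and host calls below are my own illustration.)

mod = SourceModule("""
__global__ void fill(float* out, float val)
{
    out[blockIdx.x * blockDim.x + threadIdx.x] = val;
}
""")

import numpy as np
out = np.empty(SMs * 32, dtype=np.float32)
# Launch one 32-thread block per SM.
mod.get_function("fill")(drv.Out(out), np.float32(1.0), block=(32, 1, 1), grid=(SMs, 1))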

// A case for making the compiler more threadIdx aware in conditional code.
// Proposed solution:
// Walk the dependencies of any predicate gating a shfl.sync to look for threadIdx.
// Simulate all 1024 values of threadIdx with the full predicate expression to see if it's warp uniform.
// Or you can also check if only a single thread is active, for other optimizations (like in that atomic add).
// This can't be that complicated to do.
__device__ __forceinline__ float shfl_xor(float var, int laneMask)
{
    return __shfl_xor_sync(0xffffffff, var, laneMask);
}
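
(A hedged illustration of the case described above, not from the gist: the guard below references threadIdx.x, yet evaluates identically for all 32 lanes of any warp, so the shuffle never executes divergently. Kernel and host code are my own sketch.)

#!/usr/bin/env python
import numpy as np
import pycuda.driver as drv
import pycuda.autoinit
from pycuda.compiler import SourceModule

mod = SourceModule("""
__global__ void warp_uniform_guard(float* out)
{
    float v = out[threadIdx.x];
    // (threadIdx.x & 32) is the same for every lane of a given warp, so
    // this branch never splits a warp and the shfl below always has full
    // participation. Walking the predicate's threadIdx dependencies (or
    // brute-force simulating all 1024 threadIdx values) would prove this.
    if ((threadIdx.x & 32) == 0)
    {
        for (int mask = 16; mask > 0; mask >>= 1)
            v += __shfl_xor_sync(0xffffffff, v, mask);
    }
    out[threadIdx.x] = v;
}
""")

out = np.arange(64, dtype=np.float32)
mod.get_function("warp_uniform_guard")(drv.InOut(out), block=(64, 1, 1), grid=(1, 1))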

import numpy as np

def ceil_div(x, y):
    return -(-x // y)

def out_dim(S, X, padding, strides):
    # Output length for a size-S filter over a size-X input with zero padding.
    return ceil_div(X - S + 1 + 2*padding, strides)

def fconv_slice(q, S, X, padding, strides):
    # (the gist preview cuts off here; see the reconstruction sketch below)
    ...
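
(A hedged reconstruction of the truncated body, assuming fconv_slice mirrors the forward-conv slice helpers in Scott Gray's neon reference code: for output index q it returns the filter slice, the input slice, and their overlap length, clipped at the zero-padded borders.)

def fconv_slice(q, S, X, padding, strides):
    # Sketch only: map output position q to the overlapping filter/input
    # ranges, clipping where the filter hangs off the padded input edges.
    f1 = 0
    f2 = S - 1
    x1 = q * strides - padding
    x2 = x1 + f2
    if x1 < 0:
        f1 = -x1
        x1 = 0
    if x2 >= X:
        f2 -= x2 - X + 1
        x2 = X - 1
    return slice(f1, f2 + 1), slice(x1, x2 + 1), f2 - f1 + 1

# Example: a 3-wide filter over an 8-wide input with padding 1, stride 1
# gives out_dim(3, 8, 1, 1) == 8, and fconv_slice(0, 3, 8, 1, 1) clips the
# first filter tap: (slice(1, 3), slice(0, 2), 2).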

scott-gray / gaussian_pool.py
Last active September 11, 2016 23:21
Custom pooling kernels

#!/usr/bin/python
import numpy as np
import pycuda.driver as drv
from pycuda.tools import context_dependent_memoize
from pycuda.compiler import SourceModule

class GaussianPool(object):
    # (the gist preview cuts off here; see the sketch below)
    pass
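
(The class body is truncated in the preview. As a hedged sketch of the pattern these imports suggest, with names and kernel of my own invention: PyCUDA's context_dependent_memoize is typically used to compile a pooling kernel once per CUDA context and cache the compiled function.)

@context_dependent_memoize
def _get_gaussian_pool_kernel():
    # Compiled on first use, then cached per context by the decorator.
    mod = SourceModule("""
    __global__ void gaussian_pool(float* out, const float* in,
                                  const float* weights, int N, int W)
    {
        int n = blockIdx.x * blockDim.x + threadIdx.x;
        if (n >= N) return;
        float sum = 0.0f;
        for (int w = 0; w < W; w++)              // Gaussian-weighted window
            sum += weights[w] * in[n * W + w];
        out[n] = sum;
    }
    """)
    return mod.get_function("gaussian_pool")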