Scott Gray (scott-gray)

  • OpenAI
  • San Francisco, CA

#!/usr/bin/env python
import pycuda.driver as drv
from pycuda.autoinit import context, device  # importing autoinit creates the context
from pycuda.compiler import SourceModule

# Number of streaming multiprocessors (SMs) on the current device.
SMs = device.get_attribute(drv.device_attribute.MULTIPROCESSOR_COUNT)
print(device.name())
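
(A follow-on sketch, not part of the gist: the SM count queried above is typically used to size a grid, e.g. one block per SM. The trivial fill kernel and host calls below are my own illustration.)

mod = SourceModule("""
__global__ void fill(float* out, float val)
{
    out[blockIdx.x * blockDim.x + threadIdx.x] = val;
}
""")

import numpy as np
out = np.empty(SMs * 32, dtype=np.float32)
# Launch one 32-thread block per SM.
mod.get_function("fill")(drv.Out(out), np.float32(1.0), block=(32, 1, 1), grid=(SMs, 1))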

// A case for making the compiler more threadIdx aware in conditional code.
// Proposed solution:
// Walk the dependencies of any predicate gating a shfl.sync to look for threadIdx.
// Simulate all 1024 values of threadIdx with the full predicate expression to see if it's warp uniform.
// Or you can also check if only a single thread is active, for other optimizations (like in that atomic add).
// This can't be that complicated to do.
__device__ __forceinline__ float shfl_xor(float var, int laneMask)
{
    return __shfl_xor_sync(0xffffffff, var, laneMask);
}
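
(A hedged illustration of the case described above, not from the gist: the guard below references threadIdx.x, yet evaluates identically for all 32 lanes of any warp, so the shuffle never executes divergently. Kernel and host code are my own sketch.)

#!/usr/bin/env python
import numpy as np
import pycuda.driver as drv
import pycuda.autoinit
from pycuda.compiler import SourceModule

mod = SourceModule("""
__global__ void warp_uniform_guard(float* out)
{
    float v = out[threadIdx.x];
    // (threadIdx.x & 32) is the same for every lane of a given warp, so
    // this branch never splits a warp and the shfl below always has full
    // participation. Walking the predicate's threadIdx dependencies (or
    // brute-force simulating all 1024 threadIdx values) would prove this.
    if ((threadIdx.x & 32) == 0)
    {
        for (int mask = 16; mask > 0; mask >>= 1)
            v += __shfl_xor_sync(0xffffffff, v, mask);
    }
    out[threadIdx.x] = v;
}
""")

out = np.arange(64, dtype=np.float32)
mod.get_function("warp_uniform_guard")(drv.InOut(out), block=(64, 1, 1), grid=(1, 1))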

import numpy as np

def ceil_div(x, y):
    return -(-x // y)

def out_dim(S, X, padding, strides):
    # Output length for a size-S filter over a size-X input with zero padding.
    return ceil_div(X - S + 1 + 2*padding, strides)

def fconv_slice(q, S, X, padding, strides):
    # (the gist preview cuts off here; see the reconstruction sketch below)
    ...
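
(A hedged reconstruction of the truncated body, assuming fconv_slice mirrors the forward-conv slice helpers in Scott Gray's neon reference code: for output index q it returns the filter slice, the input slice, and their overlap length, clipped at the zero-padded borders.)

def fconv_slice(q, S, X, padding, strides):
    # Sketch only: map output position q to the overlapping filter/input
    # ranges, clipping where the filter hangs off the padded input edges.
    f1 = 0
    f2 = S - 1
    x1 = q * strides - padding
    x2 = x1 + f2
    if x1 < 0:
        f1 = -x1
        x1 = 0
    if x2 >= X:
        f2 -= x2 - X + 1
        x2 = X - 1
    return slice(f1, f2 + 1), slice(x1, x2 + 1), f2 - f1 + 1

# Example: a 3-wide filter over an 8-wide input with padding 1, stride 1
# gives out_dim(3, 8, 1, 1) == 8, and fconv_slice(0, 3, 8, 1, 1) clips the
# first filter tap: (slice(1, 3), slice(0, 2), 2).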

scott-gray / gaussian_pool.py
Last active September 11, 2016 23:21
Custom pooling kernels

#!/usr/bin/python
import numpy as np
import pycuda.driver as drv
from pycuda.tools import context_dependent_memoize
from pycuda.compiler import SourceModule

class GaussianPool(object):
    # (the gist preview cuts off here; see the sketch below)
    pass
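
(The class body is truncated in the preview. As a hedged sketch of the pattern these imports suggest, with names and kernel of my own invention: PyCUDA's context_dependent_memoize is typically used to compile a pooling kernel once per CUDA context and cache the compiled function.)

@context_dependent_memoize
def _get_gaussian_pool_kernel():
    # Compiled on first use, then cached per context by the decorator.
    mod = SourceModule("""
    __global__ void gaussian_pool(float* out, const float* in,
                                  const float* weights, int N, int W)
    {
        int n = blockIdx.x * blockDim.x + threadIdx.x;
        if (n >= N) return;
        float sum = 0.0f;
        for (int w = 0; w < W; w++)              // Gaussian-weighted window
            sum += weights[w] * in[n * W + w];
        out[n] = sum;
    }
    """)
    return mod.get_function("gaussian_pool")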