Peter Andreas Entschev pentschev

@pentschev
pentschev / dask_cupy_custom_kernel_output.py
Created August 9, 2019 20:27
Blog Post - Parallelizing Custom CuPy Kernels with Dask - Output
res.compute()
array([[0.000000e+00, 2.000000e+00, 4.000000e+00, ..., 2.042000e+03,
        2.044000e+03, 2.046000e+03],
       [1.024000e+03, 1.026000e+03, 1.028000e+03, ..., 3.066000e+03,
        3.068000e+03, 3.070000e+03],
       [2.048000e+03, 2.050000e+03, 2.052000e+03, ..., 4.090000e+03,
        4.092000e+03, 4.094000e+03],
       ...,
       [4.191232e+06, 4.191234e+06, 4.191236e+06, ..., 4.193274e+06,
        4.193276e+06, 4.193278e+06],
@pentschev
pentschev / map_blocks_dispatch_add_broadcast.py
Created August 9, 2019 20:26
Blog Post - Parallelizing Custom CuPy Kernels with Dask - Map Blocks
res = da.map_blocks(dispatch_add_broadcast, dx, dy, dtype=cupy.float32)
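As a sketch of what `da.map_blocks` does here, without needing Dask or a GPU: the function is applied to each chunk independently and the per-chunk results are reassembled. The following pure-NumPy emulation (NumPy stands in for CuPy; the block function and small shapes are illustrative, not from the original post) shows the pattern:

```python
import numpy as np

def add_broadcast_block(x_block, y_block):
    # Per-block analogue of dispatch_add_broadcast: add a 1-D row
    # vector to every row of a 2-D block.
    return x_block + y_block

x = np.arange(8 * 4, dtype=np.float32).reshape(8, 4)
y = np.arange(4, dtype=np.float32)

# Emulate map_blocks over row-chunks of height 2: apply the function
# to each block independently, then reassemble the results.
blocks = [add_broadcast_block(x[i:i + 2], y) for i in range(0, 8, 2)]
res = np.concatenate(blocks, axis=0)

assert np.array_equal(res, x + y)  # same result as whole-array addition
```

Because the broadcast addition is independent per row, chunking along rows does not change the result.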
@pentschev
pentschev / cupy_backed_dask_arrays.py
Created August 9, 2019 20:25
Blog Post - Parallelizing Custom CuPy Kernels with Dask - Create Dask CuPy-backed Dask Arrays
import dask.array as da
dx = da.from_array(x, chunks=(1024, 512), asarray=False)
dy = da.from_array(y, chunks=(1, 512), asarray=False)
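The chunk arithmetic for `dx` above can be checked in plain Python (no Dask needed): a (4096, 1024) array split into (1024, 512) chunks yields a 4 × 2 grid of blocks, each handed to the kernel dispatcher separately.

```python
# Chunk-grid arithmetic for dx: how many blocks map_blocks will see.
shape = (4096, 1024)
chunks = (1024, 512)
grid = tuple(s // c for s, c in zip(shape, chunks))
print(grid)               # (4, 2)
print(grid[0] * grid[1])  # 8 blocks in total
```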
@pentschev
pentschev / call_dispatch_add_broadcast.py
Created August 9, 2019 20:23
Blog Post - Parallelizing Custom CuPy Kernels with Dask - Call Dispatcher
res_add_broadcast = dispatch_add_broadcast(x, y)
@pentschev
pentschev / dispatch_add_broadcast.py
Created August 9, 2019 20:22
Blog Post - Parallelizing Custom CuPy Kernels with Dask - Kernel Dispatcher
def dispatch_add_broadcast(x, y):
    block_size = (32, 32)
    grid_size = (x.shape[1] // block_size[1], x.shape[0] // block_size[0])
    z = cupy.empty(x.shape, x.dtype)
    # Leading-dimension strides converted from bytes to elements
    xdim0 = x.strides[0] // x.strides[1]
    zdim0 = z.strides[0] // z.strides[1]
    add_broadcast_kernel(grid_size, block_size, (x, y, z, xdim0, zdim0))
    return z
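The stride and launch-geometry arithmetic in the dispatcher can be worked through in plain Python for the shapes used in this post (a C-contiguous (4096, 1024) float32 array; the values below are derived from those assumptions, not measured on a GPU):

```python
# Stride arithmetic from dispatch_add_broadcast, worked by hand.
shape = (4096, 1024)
itemsize = 4                               # float32
strides = (shape[1] * itemsize, itemsize)  # C-contiguous byte strides

# xdim0 converts the row stride from bytes to elements: the distance
# between x[i][j] and x[i+1][j] in the flat buffer.
xdim0 = strides[0] // strides[1]
print(xdim0)  # 1024

block_size = (32, 32)
grid_size = (shape[1] // block_size[1], shape[0] // block_size[0])
print(grid_size)  # (32, 128): 32*128 blocks of 32*32 threads each
```

Passing the stride in elements lets the same kernel handle views whose rows are not contiguous, such as the chunks Dask slices out of a larger array.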
@pentschev
pentschev / cupy_add_broadcast_kernel.py
Created August 9, 2019 20:21
Blog Post - Parallelizing Custom CuPy Kernels with Dask - CuPy Kernel
add_broadcast_kernel = cupy.RawKernel(
    r'''
    extern "C" __global__
    void add_broadcast_kernel(
        const float* x, const float* y, float* z,
        const int xdim0, const int zdim0)
    {
        int idx0 = blockIdx.x * blockDim.x + threadIdx.x;
        int idx1 = blockIdx.y * blockDim.y + threadIdx.y;
        z[idx1 * zdim0 + idx0] = x[idx1 * xdim0 + idx0] + y[idx0];
    }
    ''',
    'add_broadcast_kernel'
)
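The kernel's flat-index arithmetic can be emulated in pure Python on a tiny example (the 2 × 3 shapes and values here are made up for illustration): `idx1 * dim0 + idx0` turns a (row, column) pair into an offset in the 1-D buffer, and `y` is indexed by column only, which is what broadcasts it across rows.

```python
# Pure-Python emulation of the kernel's indexing on a flattened 2x3 matrix.
xdim0 = zdim0 = 3          # row stride in elements
x = [0, 1, 2, 3, 4, 5]     # flattened 2x3 matrix
y = [10, 20, 30]           # row vector broadcast over both rows
z = [0] * 6

for idx1 in range(2):      # row index (blockIdx.y / threadIdx.y)
    for idx0 in range(3):  # column index (blockIdx.x / threadIdx.x)
        z[idx1 * zdim0 + idx0] = x[idx1 * xdim0 + idx0] + y[idx0]

print(z)  # [10, 21, 32, 13, 24, 35]
```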
@pentschev
pentschev / cupy_simple_addition.py
Last active August 9, 2019 20:19
Blog Post - Parallelizing Custom CuPy Kernels with Dask - CuPy Simple Addition
import cupy
x = cupy.arange(4096 * 1024, dtype=cupy.float32).reshape((4096, 1024))
y = cupy.arange(1024, dtype=cupy.float32)
res_cupy = x + y
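The same snippet runs unchanged with NumPy swapped in for CuPy (the broadcasting rules are identical), which is a convenient way to sanity-check the expected values without a GPU:

```python
import numpy as np

# NumPy analogue of the CuPy snippet above.
x = np.arange(4096 * 1024, dtype=np.float32).reshape((4096, 1024))
y = np.arange(1024, dtype=np.float32)
res = x + y  # y, shape (1024,), broadcasts across all 4096 rows

print(res.shape)   # (4096, 1024)
print(res[0, :3])  # [0. 2. 4.] -- matches the res.compute() output
print(res[1, 0])   # 1024.0
```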
@pentschev
pentschev / copy_to_host_pool_numpy1.16.ipynb
Created August 5, 2019 15:34
Benchmark Numba copy_to_host with NumPy pool
@pentschev
pentschev / cython_malloc_benchmark.ipynb
Last active July 31, 2019 18:43
Cython Malloc Benchmark
@pentschev
pentschev / dask-cuda-multistream-timing.ipynb
Created July 24, 2019 14:03
Timing multi-stream dask-cuda