Peter Andreas Entschev (pentschev)
pentschev / cupy_backed_dask_arrays.py
Created August 9, 2019 20:25
Blog Post - Parallelizing Custom CuPy Kernels with Dask - Create CuPy-backed Dask Arrays
import dask.array as da

# asarray=False keeps each chunk a CuPy array (x and y come from
# cupy_simple_addition.py below) instead of coercing chunks to NumPy.
dx = da.from_array(x, chunks=(1024, 512), asarray=False)
dy = da.from_array(y, chunks=(1, 512), asarray=False)
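A quick way to confirm the chunk layout and that chunks stay on the GPU (a sketch, assuming the arrays above):

print(dx.chunks)                        # ((1024, 1024, 1024, 1024), (512, 512))
print(type(dx.blocks[0, 0].compute()))  # cupy.ndarray, not numpy.ndarray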
pentschev / call_dispatch_add_broadcast.py
Created August 9, 2019 20:23
Blog Post - Parallelizing Custom CuPy Kernels with Dask - Call Dispatcher
res_add_broadcast = dispatch_add_broadcast(x, y)
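A sanity check against CuPy's built-in broadcasting (a sketch; res_cupy comes from the cupy_simple_addition.py snippet further down):

# Sketch: the custom kernel should match CuPy's native broadcasting,
# where res_cupy = x + y (see cupy_simple_addition.py below).
assert (res_add_broadcast == res_cupy).all()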
pentschev / dispatch_add_broadcast.py
Created August 9, 2019 20:22
Blog Post - Parallelizing Custom CuPy Kernels with Dask - Kernel Dispatcher
import cupy

def dispatch_add_broadcast(x, y):
    # Launch one 32x32 block per tile; the integer division assumes both
    # dimensions of x are exact multiples of 32.
    block_size = (32, 32)
    grid_size = (x.shape[1] // block_size[1], x.shape[0] // block_size[0])
    z = cupy.empty(x.shape, x.dtype)
    # Row strides in elements (not bytes), for 2-D indexing in the kernel.
    xdim0 = x.strides[0] // x.strides[1]
    zdim0 = z.strides[0] // z.strides[1]
    add_broadcast_kernel(grid_size, block_size, (x, y, z, xdim0, zdim0))
    return z
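The integer division above only covers the whole array when both dimensions are multiples of 32. A variant for arbitrary shapes could round the grid up (grid_for is a hypothetical helper, not part of the gist; the kernel would then also need bounds checks):

import math

def grid_for(shape, block_size=(32, 32)):
    # Hypothetical helper: round up so partial tiles still get a block.
    # The kernel would then need `if (idx0 < dim1 && idx1 < dim0)` guards.
    return (math.ceil(shape[1] / block_size[1]),
            math.ceil(shape[0] / block_size[0]))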
pentschev / cupy_add_broadcast_kernel.py
Created August 9, 2019 20:21
Blog Post - Parallelizing Custom CuPy Kernels with Dask - CuPy Kernel
import cupy

# z[i, j] = x[i, j] + y[j]: broadcasts the 1-D vector y across the rows
# of x. xdim0 and zdim0 are the row strides of x and z in elements.
add_broadcast_kernel = cupy.RawKernel(
r'''
extern "C" __global__
void add_broadcast_kernel(
    const float* x, const float* y, float* z,
    const int xdim0, const int zdim0)
{
    int idx0 = blockIdx.x * blockDim.x + threadIdx.x;
    int idx1 = blockIdx.y * blockDim.y + threadIdx.y;
    z[idx1 * zdim0 + idx0] = x[idx1 * xdim0 + idx0] + y[idx0];
}
''', 'add_broadcast_kernel')
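A standalone smoke test for the raw kernel (a sketch; the sizes are assumptions, chosen as exact multiples of the 32x32 block so no bounds check is needed):

x = cupy.arange(64 * 32, dtype=cupy.float32).reshape((64, 32))
y = cupy.arange(32, dtype=cupy.float32)
z = cupy.empty_like(x)
xdim0 = x.strides[0] // x.strides[1]  # row stride in elements
zdim0 = z.strides[0] // z.strides[1]
add_broadcast_kernel((1, 2), (32, 32), (x, y, z, xdim0, zdim0))  # grid covers 32 cols, 64 rows
assert (z == x + y).all()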
pentschev / cupy_simple_addition.py
Last active August 9, 2019 20:19
Blog Post - Parallelizing Custom CuPy Kernels with Dask - CuPy Simple Addition
import cupy
x = cupy.arange(4096 * 1024, dtype=cupy.float32).reshape((4096, 1024))
y = cupy.arange(1024, dtype=cupy.float32)
res_cupy = x + y
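Here y has shape (1024,), so CuPy broadcasts it across each of the 4096 rows of x; a quick check (sketch):

# Row 0 of x is also 0..1023, so the first result row equals 2 * y.
assert res_cupy.shape == (4096, 1024)
assert (res_cupy[0] == 2 * y).all()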
pentschev / dask_cupy_custom_kernel.py
Created August 9, 2019 20:15
Blog Post - Parallelizing Custom CuPy Kernels with Dask - Complete
from dask.distributed import Client
from dask_cuda import LocalCUDACluster
from dask.array.utils import assert_eq
import dask.array as da
import cupy
add_broadcast_kernel = cupy.RawKernel(
r'''
extern "C" __global__
--- ucx_info_b 2019-08-08 11:01:32.820637475 -0700
+++ ucx_from_source_info_b 2019-08-08 11:02:15.976537113 -0700
@@ -8,6 +8,7 @@
#define HAVE_CUDA 1
#define HAVE_CUDA_H 1
#define HAVE_CUDA_RUNTIME_H 1
+#define HAVE_DC_EXP 1
#define HAVE_DECL_ASPRINTF 1
#define HAVE_DECL_BASENAME 1
#define HAVE_DECL_CPU_ISSET 1
pentschev / copy_to_host_pool_numpy1.16.ipynb
Created August 5, 2019 15:34
Benchmark Numba copy_to_host with NumPy pool
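The notebook preview is unavailable, but its title describes a measurable technique: copying device data into a preallocated host buffer instead of allocating a fresh NumPy array per transfer. A minimal sketch of that idea (array size and buffer names are assumptions, not the notebook's contents):

import time
import numpy as np
from numba import cuda

a = np.random.random(2 ** 24)   # ~128 MiB of float64
d_a = cuda.to_device(a)
out = np.empty_like(a)          # preallocated host buffer acting as a "pool"

t0 = time.perf_counter()
d_a.copy_to_host(out)           # reuses the existing buffer
print(f"pooled buffer:    {time.perf_counter() - t0:.4f} s")

t0 = time.perf_counter()
_ = d_a.copy_to_host()          # allocates a fresh NumPy array every call
print(f"fresh allocation: {time.perf_counter() - t0:.4f} s")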
pentschev / dask-cuda-multistream-timing.ipynb
Created July 24, 2019 14:03
Timing multi-stream dask-cuda
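This preview is also unavailable; a sketch of the general pattern the title suggests, timing work issued on several CUDA streams with CuPy events (the stream count and workload are assumptions, not the notebook's contents):

import cupy

streams = [cupy.cuda.Stream(non_blocking=True) for _ in range(4)]
xs = [cupy.ones((4096, 4096), dtype=cupy.float32) for _ in streams]

start, end = cupy.cuda.Event(), cupy.cuda.Event()
start.record()
for stream, x in zip(streams, xs):
    with stream:              # issue each matmul on its own stream
        x @ x
for stream in streams:
    stream.synchronize()      # drain all streams before stopping the clock
end.record()
end.synchronize()
print(f"elapsed: {cupy.cuda.get_elapsed_time(start, end):.2f} ms")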