
@pentschev
Last active August 16, 2019 10:06
UCX map overlap sample
import asyncio
import time
import numpy as np
import cupy
import numba
import dask.array as da
from dask_cuda import DGX, LocalCUDACluster
from dask.distributed import Client, wait
@numba.cuda.jit
def _smooth_gpu(x, out):
    # 3x3 box filter: each interior cell becomes the mean of its 9-cell neighborhood
    i, j = numba.cuda.grid(2)
    n, m = x.shape
    if 1 <= i < n - 1 and 1 <= j < m - 1:
        out[i, j] = (x[i - 1, j - 1] + x[i - 1, j] + x[i - 1, j + 1] +
                     x[i    , j - 1] + x[i    , j] + x[i    , j + 1] +
                     x[i + 1, j - 1] + x[i + 1, j] + x[i + 1, j + 1]) / 9
def smooth_gpu(x, out):
    # Launch the stencil kernel over a 2D grid of 16x16 thread blocks
    import math
    threadsperblock = (16, 16)
    blockspergrid_x = math.ceil(x.shape[0] / threadsperblock[0])
    blockspergrid_y = math.ceil(x.shape[1] / threadsperblock[1])
    blockspergrid = (blockspergrid_x, blockspergrid_y)
    _smooth_gpu[blockspergrid, threadsperblock](x, out)
def dispatch_smooth_gpu(x):
    # Called by map_overlap on each (halo-padded) CuPy chunk
    out = cupy.zeros(x.shape, x.dtype)
    smooth_gpu(x, out)
    return out
async def f():
    # async with LocalCUDACluster(asynchronous=True) as cluster:
    async with DGX(asynchronous=True, silence_logs=True) as cluster:
        async with Client(cluster, asynchronous=True) as client:
            # Create a simple random array
            rs = da.random.RandomState(RandomState=cupy.random.RandomState)
            x = rs.random((80000, 80000), chunks=(10000, 10000)).persist()
            await wait(x)
            t = time.time()
            y = x.map_overlap(dispatch_smooth_gpu, depth=1).persist()
            await wait(y)
            print("Time:", time.time() - t)
if __name__ == '__main__':
    asyncio.get_event_loop().run_until_complete(f())
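For reference, below is a minimal CPU-only sketch of the same map_overlap pattern, assuming a plain NumPy stencil in place of the Numba CUDA kernel and a much smaller, illustrative array size; it shows how depth=1 gives each chunk the one-cell halo the 3x3 stencil needs across chunk boundaries.

# Hypothetical CPU-only illustration (not part of the original gist):
# map_overlap(depth=1) hands each chunk a one-cell halo from its neighbors,
# so the 3x3 stencil also sees correct values at chunk boundaries.
import numpy as np
import dask.array as da

def smooth_cpu(x):
    # Same 3x3 box filter as _smooth_gpu, expressed with NumPy slicing
    out = np.zeros_like(x)
    out[1:-1, 1:-1] = (x[:-2, :-2] + x[:-2, 1:-1] + x[:-2, 2:] +
                       x[1:-1, :-2] + x[1:-1, 1:-1] + x[1:-1, 2:] +
                       x[2:, :-2] + x[2:, 1:-1] + x[2:, 2:]) / 9
    return out

x = da.random.random((4000, 4000), chunks=(1000, 1000))
y = x.map_overlap(smooth_cpu, depth=1)
y.compute()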