quasiben/ucx_map_overlap_smooth.py

## ucx_map_overlap_smooth.py
import asyncio
import time
import numpy as np
import cupy
import numba

import dask.array as da
from dask_cuda import LocalCUDACluster
from dask.distributed import Client, wait


@numba.cuda.jit
def _smooth_gpu(x, out):
    """CUDA kernel: write the 3x3 neighbourhood mean of ``x`` into ``out``.

    Each thread takes the element at its 2-D grid position.  Positions on
    the outer border of ``x`` (or outside it) are skipped, so the
    corresponding entries of ``out`` are left as the caller provided them.
    """
    row, col = numba.cuda.grid(2)
    n_rows, n_cols = x.shape

    # Only interior points have a complete 3x3 neighbourhood.
    if row < 1 or row >= n_rows - 1 or col < 1 or col >= n_cols - 1:
        return

    above, below = row - 1, row + 1
    left, right = col - 1, col + 1

    # Accumulate row by row, left to right.
    total = x[above, left]
    total += x[above, col]
    total += x[above, right]
    total += x[row, left]
    total += x[row, col]
    total += x[row, right]
    total += x[below, left]
    total += x[below, col]
    total += x[below, right]

    out[row, col] = total / 9


def smooth_gpu(x, out):
    """Launch the ``_smooth_gpu`` kernel over ``x``, writing into ``out``.

    The launch uses 16 x 16 thread blocks, with enough blocks along each of
    the first two axes of ``x`` to cover its full extent.
    """
    import math

    block_dim = (16, 16)
    n_rows, n_cols = x.shape[0], x.shape[1]
    # Round up so a partially filled block still covers the trailing
    # rows/columns.
    grid_dim = (
        math.ceil(n_rows / block_dim[0]),
        math.ceil(n_cols / block_dim[1]),
    )

    _smooth_gpu[grid_dim, block_dim](x, out)


def dispatch_smooth_gpu(x):
    """Return a smoothed copy of ``x``.

    Allocates a zero-filled CuPy array with the shape and dtype of ``x``,
    passes both arrays to ``smooth_gpu`` and returns the newly allocated
    array.
    """
    smoothed = cupy.zeros(x.shape, x.dtype)
    smooth_gpu(x, smoothed)
    return smoothed


async def f():
    """Time a GPU ``map_overlap`` smoothing pass on a local UCX cluster.

    Starts a ``LocalCUDACluster`` using the ``"ucx"`` protocol, connects an
    asynchronous ``Client``, persists an 80000 x 80000 random dask array
    (10000 x 10000 chunks) generated through ``cupy.random.RandomState``,
    and prints the elapsed time of
    ``x.map_overlap(dispatch_smooth_gpu, depth=1)`` on that array.
    """
    # Network interface handed to the cluster: "enp134s0f1" on a DGX-2;
    # use "enp1s0f0" on a DGX-1.
    interface = "enp134s0f1"
    enable_infiniband = False
    kwargs = {
        "protocol": "ucx",
        "interface": interface,
        "enable_tcp_over_ucx": True,
        "enable_nvlink": True,
        "enable_infiniband": enable_infiniband,
        "rmm_pool_size": "24GB",
        "ucx_net_devices": "auto" if enable_infiniband else None,
    }

    async with LocalCUDACluster(
        asynchronous=True, silence_logs=True, local_directory="/tmp/bzaitlen", **kwargs
    ) as cluster:
        async with Client(cluster, asynchronous=True):
            # Create a simple random array and wait until it is fully
            # persisted, so its creation is excluded from the timing below.
            rs = da.random.RandomState(RandomState=cupy.random.RandomState)
            x = rs.random((80000, 80000), chunks=(10000, 10000)).persist()
            await wait(x)

            # perf_counter() is monotonic and high resolution, unlike the
            # wall clock returned by time.time(), which may be adjusted
            # while the benchmark runs.
            start = time.perf_counter()
            y = x.map_overlap(dispatch_smooth_gpu, depth=1)
            await y.persist()
            print("Time:", time.perf_counter() - start)


if __name__ == "__main__":
    # asyncio.run() creates a fresh event loop, runs f() to completion and
    # closes the loop afterwards.  Calling asyncio.get_event_loop() without
    # a running loop is deprecated in recent Python versions.
    asyncio.run(f())
	import asyncio
	import time
	import numpy as np
	import cupy
	import numba

	import dask.array as da
	from dask_cuda import LocalCUDACluster
	from dask.distributed import Client, wait


	@numba.cuda.jit
	def _smooth_gpu(x, out):
	    """CUDA kernel: write the 3x3 neighbourhood mean of ``x`` into ``out``.

	    Each thread handles the element at its 2-D grid position.  Only
	    interior positions are written; border entries of ``out`` are left
	    untouched.
	    """
	    i, j = numba.cuda.grid(2)
	    n, m = x.shape
	    # Skip the border (and any thread outside the array): those points
	    # have no complete 3x3 neighbourhood.
	    if 1 <= i < n - 1 and 1 <= j < m - 1:
	        out[i, j] = (
	            x[i - 1, j - 1]
	            + x[i - 1, j]
	            + x[i - 1, j + 1]
	            + x[i, j - 1]
	            + x[i, j]
	            + x[i, j + 1]
	            + x[i + 1, j - 1]
	            + x[i + 1, j]
	            + x[i + 1, j + 1]
	        ) / 9


	def smooth_gpu(x, out):
	    """Launch the ``_smooth_gpu`` kernel over ``x``, writing into ``out``.

	    Uses 16 x 16 thread blocks and enough blocks along each of the first
	    two axes of ``x`` to cover its full extent.
	    """
	    import math

	    threadsperblock = (16, 16)
	    # Round up so a partially filled block covers the trailing
	    # rows/columns.
	    blockspergrid_x = math.ceil(x.shape[0] / threadsperblock[0])
	    blockspergrid_y = math.ceil(x.shape[1] / threadsperblock[1])
	    blockspergrid = (blockspergrid_x, blockspergrid_y)

	    _smooth_gpu[blockspergrid, threadsperblock](x, out)


	def dispatch_smooth_gpu(x):
	    """Return a smoothed copy of ``x``.

	    Allocates a zero-filled CuPy array with the shape and dtype of ``x``,
	    passes both arrays to ``smooth_gpu`` and returns the new array.
	    """
	    out = cupy.zeros(x.shape, x.dtype)
	    smooth_gpu(x, out)
	    return out


	async def f():
	    """Time a GPU ``map_overlap`` smoothing pass on a local UCX cluster.

	    Starts a ``LocalCUDACluster`` using the ``"ucx"`` protocol, connects an
	    asynchronous ``Client``, persists an 80000 x 80000 random dask array
	    (10000 x 10000 chunks) generated through ``cupy.random.RandomState``,
	    and prints the elapsed time of
	    ``x.map_overlap(dispatch_smooth_gpu, depth=1)`` on that array.
	    """
	    # Network interface handed to the cluster: "enp134s0f1" on a DGX-2;
	    # use "enp1s0f0" on a DGX-1.
	    interface = "enp134s0f1"
	    enable_infiniband = False
	    kwargs = {
	        "protocol": "ucx",
	        "interface": interface,
	        "enable_tcp_over_ucx": True,
	        "enable_nvlink": True,
	        "enable_infiniband": enable_infiniband,
	        "rmm_pool_size": "24GB",
	        "ucx_net_devices": "auto" if enable_infiniband else None,
	    }

	    async with LocalCUDACluster(
	        asynchronous=True, silence_logs=True, local_directory="/tmp/bzaitlen", **kwargs
	    ) as cluster:
	        async with Client(cluster, asynchronous=True):
	            # Create a simple random array and wait until it is fully
	            # persisted, so its creation is excluded from the timing below.
	            rs = da.random.RandomState(RandomState=cupy.random.RandomState)
	            x = rs.random((80000, 80000), chunks=(10000, 10000)).persist()
	            await wait(x)

	            # perf_counter() is monotonic and high resolution, unlike the
	            # wall clock returned by time.time().
	            start = time.perf_counter()
	            y = x.map_overlap(dispatch_smooth_gpu, depth=1)
	            await y.persist()
	            print("Time:", time.perf_counter() - start)


	if __name__ == "__main__":
	    # asyncio.run() creates a fresh event loop, runs f() to completion and
	    # closes the loop afterwards.
	    asyncio.run(f())