# yjmade/cuda_shm.py

## cuda_shm.py
# Share one CUDA allocation between PyTorch, Numba, CuPy and NumPy through a
# CUDA IPC memory handle.
#
# The script is written as two halves that are meant to run in different
# processes.  The values produced by the first half (deviceId, handle_ptr,
# size, offset and shape) have to reach the second half somehow; the
# transport is not shown here.

# ---------------------------------------------------------------------------
# In process 1: allocate the tensor and export its IPC handle.
# ---------------------------------------------------------------------------
import torch

shape = (100000,)
# share_memory_() is documented as a no-op for CUDA tensors; the call is kept
# so the line reads like the usual CPU shared-memory idiom.
a = torch.rand(shape, device="cuda:0").share_memory_()
# _share_cuda_() is a private torch API.  Only the first four fields of its
# result are used here; the remaining ones are discarded.
# NOTE(review): size/offset are presumably expressed in bytes -- confirm
# against the torch version in use.
deviceId, handle_ptr, size, offset, *_ = a.storage()._share_cuda_()


# ---------------------------------------------------------------------------
# In process 2: open the handle and wrap the memory as arrays.
# ---------------------------------------------------------------------------
import cupy  # required by cupy.asarray() below
import numba.cuda
import numpy as np

# Element type of the shared buffer.  It must match the tensor created in
# process 1 (torch.rand yields float32 unless torch's default dtype was
# changed).
dtype = np.float32
# Byte stride of the 1-D, contiguous array: 4 for float32, 8 for float64.
itemsize = np.dtype(dtype).itemsize

context = numba.cuda.current_context(deviceId)
h1 = numba.cuda.IpcHandle(
    None,
    # Rebuild the driver-level handle structure from the raw handle bytes.
    numba.cuda.cudadrv.drvapi.cu_ipc_mem_handle(*handle_ptr),
    size,
    source_info=context.device.get_device_identity(),
    offset=offset,
)
h2 = numba.cuda.cudadrv.devicearray.IpcArrayHandle(
    h1,
    {
        "shape": shape,
        "strides": (itemsize,),
        "dtype": dtype,
    },
)
# NOTE(review): the arrays below are views onto the opened IPC memory, so they
# should only be used inside this with-block -- confirm the handle's close
# semantics in the Numba documentation.
with h2 as nd_array:
    # nd_array now refers to the same device memory as the torch tensor `a`.
    # https://docs.cupy.dev/en/stable/user_guide/interoperability.html
    cupy_array = cupy.asarray(nd_array)  # shares the same memory as torch
    np_array = np.array(nd_array)  # clone to CPU
    # Shares the same memory as torch.  No device index is given: torch
    # identifies the device by itself.
    torch_tensor = torch.as_tensor(nd_array, device="cuda")