airMeng/2d memcpy. opencl VS sycl.md

## 2d memcpy. opencl VS sycl.md

      
    Raw
  

              2d memcpy. opencl VS sycl.md
            
          
    So lucky we are that we have a genius team like oneAPI compiler team. One of their great contribution is that they never obey any common sense or ease-to-use, just not stingy with their talents.
2D load/store API is the one of examples that we should be grateful indeed especially after several hours' failed attempts.
The definition of 2d memcpy in OpenCL
// Enqueue command to write a 2D or 3D rectangular region to a buffer object from host memory.
cl_int clEnqueueWriteBufferRect(cl_command_queue command_queue,
                                cl_mem buffer,
                                cl_bool blocking_write,
                                // buffer offset, up to 3D
                                const size_t * buffer_origin,
                                // host offset, up to 3D
                                const size_t * host_origin,
                                // The (width in bytes, height in rows, depth in slices) of the 2D or 3D rectangle being read or written
                                const size_t *region,
                                // buffer stride of the inner dimensions
                                size_t buffer_row_pitch,
                                // buffer stride of the 2nd dimension
                                size_t buffer_slice_pitch,
                                // host stride of the inner dimensions
                                size_t host_row_pitch,
                                // host stride of the 2nd dimensions
                                size_t host_slice_pitch,
                                void *ptr,
                                cl_uint num_events_in_wait_list,
                                const cl_event *event_wait_list,
                                cl_event *event)

It is worth emphasizing that pitch in GPU world is a quite common concepts which is narrowly equal to stride in BLAS world, which will be your first but not last surprise you will be nice to have as new comers in GPU world.
It's implementation in CUDA will be cudaMemcpy2D, which you can see mostly same as OpenCL except no offset needed (which I think quite unnecessary personally)
cudaMemcpy2D ( void* dst, size_t dpitch, const void* src, size_t spitch, size_t width, size_t height, cudaMemcpyKind kind )
Then we have the big one. How can you implement 2D memcpy in oneAPI/SYCL. You might search memcpy2d if you are a hot head, then the incredible website shows you nothing. Then you will doubt yourself and waste a long time on browsing the whole web.

Finally you got your answer! If you are hard enough! If you are lucky! If your colleage happend to walk though these miracles!
void sycl::_V1::handler::ext_oneapi_memcpy2d	(	void * 	Dest,
                                                size_t 	DestPitch,
                                                const void * 	Src,
                                                size_t 	SrcPitch,
                                                size_t 	Width,
                                                size_t 	Height 
)	

The explanations of parameters are almost the same with CUDA so I will not give comments here. I must remind the readers, you still have time to express your sincerely to SYCL teams' naming.

  An E2E example to show the usage and comparsion
#define DEVICE_MEM_ALIGNMENT (64)

#include <sycl/sycl.hpp>

#include <CL/cl.h>

void sycl_copy() {
  sycl::queue q{sycl::property::queue::in_order()};

  std::cout << "Running on "
            << q.get_device().get_info<sycl::info::device::name>() << "\n";
  int m = 16, n = 16;
  int *host_ptr = (int *)malloc(m * n * sizeof(int));
  for (int i = 0; i < m * n; ++i) {
    host_ptr[i] = 0;
  }
  int *device_ptr = (int *)sycl::aligned_alloc_device(DEVICE_MEM_ALIGNMENT,
                                                      m * n * sizeof(int), q);
  q.memcpy(device_ptr, host_ptr, m * n * sizeof(int)).wait();
  for (int i = 0; i < m * n; ++i) {
    host_ptr[i] = i;
  }
  const size_t dst_off = n * sizeof(int);
  const size_t src_off = n * sizeof(int);
  auto code = q.ext_oneapi_memcpy2d(device_ptr + (1 * n + 1), dst_off, host_ptr,
                                    src_off, 6 * sizeof(int), 6);
  code.wait();
  q.memcpy(host_ptr, device_ptr, m * n * sizeof(int)).wait();
  for (int i = 0; i < m * n; ++i) {
    if (i % n == 0) {
      printf("\n %d: ", i / 16);
    }
    printf("%d ", host_ptr[i]);
  }
  printf("\n");
  free(host_ptr);
  sycl::free(device_ptr, q);
}

void opencl_copy() {
  cl_platform_id platform;
  cl_device_id device;
  cl_context context;
  cl_command_queue queue;
  cl_int err;
  cl_event *ev;

  struct cl_device;
  struct cl_platform {
    cl_platform_id id;
    unsigned number;
    char name[128];
    char vendor[128];
    struct cl_device *devices;
    unsigned n_devices;
    struct cl_device *default_device;
  };

  struct cl_device {
    struct cl_platform *platform;
    cl_device_id id;
    unsigned number;
     cl_device_type type;
    char name[128];
  };

  enum { NPLAT = 16, NDEV = 16 };

  struct cl_platform platforms[NPLAT];
  unsigned n_platforms = 0;
  struct cl_device devices[NDEV];
  unsigned n_devices = 0;
  struct cl_device *default_device = NULL;

  platform = NULL;
  device = NULL;

  cl_platform_id platform_ids[NPLAT];
  (clGetPlatformIDs(NPLAT, platform_ids, &n_platforms));

  for (unsigned i = 0; i < n_platforms; i++) {
    struct cl_platform *p = &platforms[i];
    p->number = i;
    p->id = platform_ids[i];
    (clGetPlatformInfo(p->id, CL_PLATFORM_NAME, sizeof(p->name), &p->name,
                       NULL));
    (clGetPlatformInfo(p->id, CL_PLATFORM_VENDOR, sizeof(p->vendor), &p->vendor,
                       NULL));

    cl_device_id device_ids[NDEV];
    cl_int clGetDeviceIDsError = clGetDeviceIDs(p->id, CL_DEVICE_TYPE_ALL, NDEV,
                                                device_ids, &p->n_devices);
    if (clGetDeviceIDsError == CL_DEVICE_NOT_FOUND) {
      p->n_devices = 0;
    } else {
      (clGetDeviceIDsError);
    }
    p->devices = p->n_devices > 0 ? &devices[n_devices] : NULL;
    p->default_device = NULL;

    for (unsigned j = 0; j < p->n_devices; j++) {
      struct cl_device *d = &devices[n_devices];
      d->number = n_devices++;
      d->id = device_ids[j];
      d->platform = p;
      (clGetDeviceInfo(d->id, CL_DEVICE_NAME, sizeof(d->name), &d->name, NULL));
      (clGetDeviceInfo(d->id, CL_DEVICE_TYPE, sizeof(d->type), &d->type, NULL));

      if (p->default_device == NULL && d->type == CL_DEVICE_TYPE_GPU) {
        p->default_device = d;
      }
    }

    if (default_device == NULL && p->default_device != NULL) {
      default_device = p->default_device;
    }
  }

  if (n_devices == 0) {
    fprintf(stderr, "ggml_opencl: could find any OpenCL devices.\n");
    exit(1);
  }
  fprintf(stderr, "opencl: selecting platform: '%s'\n",
            default_device->platform->name);
  fprintf(stderr, "opencl: selecting device: '%s'\n", default_device->name);

  platform = default_device->platform->id;
  device = default_device->id;

  cl_context_properties properties[] = {(intptr_t)CL_CONTEXT_PLATFORM,
                                        (intptr_t)platform, 0};
  context = clCreateContext(properties, 1, &device, NULL, NULL, NULL);
  queue = clCreateCommandQueue(context, device, 0, NULL);

  static size_t m = 16, n = 16;
  int host_data[m * n];
  int device_data[m * n];
  for (int i = 0; i < m * n; ++i) {
    host_data[i] = i;
  }
  cl_mem device_ptr = clCreateBuffer(context, CL_MEM_READ_WRITE,
                                     m * n * sizeof(int), NULL, NULL);
  const size_t buffer_origin[3] = {4 * sizeof(int), 3, 0};
  const size_t host_origin[3] = {0, 0, 0};
  const size_t region[3] = {6 * sizeof(int), 5, 1};
  clEnqueueWriteBufferRect(queue, device_ptr, true, buffer_origin, host_origin,
                           region, n * sizeof(int), 0, n * sizeof(int), 0,
                           host_data, 0, NULL, NULL);
  clFinish(queue);
  clEnqueueReadBuffer(queue, device_ptr, true, 0, m * n * sizeof(int),
                      device_data, 0, NULL, NULL);
  for (int i = 0; i < m * n; ++i) {
    if (i % n == 0) {
      printf("\n %d: ", i / 16);
    }
    printf("%d ", device_data[i]);
  }
  printf("\n");
  clReleaseMemObject(device_ptr);
  clReleaseCommandQueue(queue);
  clReleaseContext(context);
  free(ev);
}

int main() {
  sycl_copy();
  opencl_copy();
  return 0;
}