keisukefukuda/simple_multiformat.cu

## simple_multiformat.cu
/*
  Copyright (c) 2012, Keisuke Fukuda. All rights reserved.
  License : new BSD license

  # The Makefile looks like this:
  NVCC=nvcc
  CUDA_SDK=${HOME}/NVIDIA_GPU_Computing_SDK
  CUTIL_INC=${CUDA_SDK}/C/common/inc

  simple_multiformat: simple_multiformat.cu
	  ${NVCC} -arch sm_20 -I${CUTIL_INC} `pkg-config --cflags libstarpu` -o $@ $< `pkg-config --libs libstarpu`

  NOTE:
  In this example, CPU-to-CUDA and CUDA-to-CPU formatting functions are both implemented on
  CPU and CUDA. However, as of StarPU 1.0 release, CPU-to-CUDA formatting must be on CUDA and
  CUDA-to-CPU formatting must be on CPU. We expect this limitation will be removed.
*/

#include <cmath>
#include <iostream>
#include <cutil.h>
#include <starpu.h>

// CUDA codelet function and CUDA kernel
void cuda_func(void *buffers[], void *arg);
static __global__ void kernel(double *xs, double *ys, double *zs, int nx, int r);

// CPU codelet function
void cpu_func(void *buffers[], void *arg);

// format functions
void cpu_to_cuda_on_cpu(void *buffers[], void *arg);
void cuda_to_cpu_on_cpu(void *buffers[], void *arg);
void cuda_to_cpu_on_cuda(void *buffers[], void *arg);

typedef struct _Point {
  double x; // input value
  double y; // expected to be x * 1.001 ^ R
  double z; // expected to be x * 1.002 ^ R
} Point;

// formatting function(CPU -> CUDA) on CPU
void cpu_to_cuda_on_cpu(void *buffers[], void *arg) {
  // This should be called?
  std::cerr << "CPU to CUDA conversion(on CPU) is called." << std::endl;

  // From array of structure (for CPU) to structure of array (CUDA).
  Point *ary_of_struct = (Point*)STARPU_MULTIFORMAT_GET_CPU_PTR(buffers[0]);
  double *struct_of_ary = (double*)STARPU_MULTIFORMAT_GET_CUDA_PTR(buffers[0]);
  int nx = STARPU_MULTIFORMAT_GET_NX(buffers[0]);

  double *xs = struct_of_ary;
  double *ys = struct_of_ary + nx;
  double *zs = struct_of_ary + nx*2;

  for (int i = 0; i < nx; i++) {
    xs[i] = ary_of_struct[i].x;
    ys[i] = ary_of_struct[i].y;
    zs[i] = ary_of_struct[i].z;
  }
}

__global__ void cpu_to_cuda_kernel(Point *aos, double *xs, double *ys, double *zs, int nx) {
  int i = blockDim.x * blockIdx.x + threadIdx.x;

  if (i < nx) {
    Point *p = &aos[i];
    xs[i] = p->x;
    ys[i] = p->y;
    zs[i] = p->z;
  }
}

void cpu_to_cuda_on_cuda(void *buffers[], void *arg) {
  std::cerr << "CPU to CUDA conversion (on CUDA) is called." << std::endl;

  // From array of structure (for CPU) to structure of array (for CUDA)
  Point *ary_of_struct = (Point*)STARPU_MULTIFORMAT_GET_CPU_PTR(buffers[0]);
  double *struct_of_ary = (double*)STARPU_MULTIFORMAT_GET_CUDA_PTR(buffers[0]);
  int nx = STARPU_MULTIFORMAT_GET_NX(buffers[0]);

  double *xs = struct_of_ary;
  double *ys = struct_of_ary + nx;
  double *zs = struct_of_ary + nx*2;

  dim3 gridDim = nx/1024 + 1;
  dim3 blockDim = nx % 1024;

  cpu_to_cuda_kernel<<<blockDim, gridDim>>>(ary_of_struct, xs, ys, zs, nx);

  CUDA_SAFE_THREAD_SYNC();
}

// formatting function (CUDA -> CPU) on CPU
void cuda_to_cpu_on_cpu(void *buffers[], void *arg) {
  // This should be called?
  std::cerr << "CUDA to CPU conversion(on CPU) is called." << std::endl;

  // From structure of array(for CUDA) to array of structure (for CPU).
  Point *ary_of_struct = (Point*)STARPU_MULTIFORMAT_GET_CPU_PTR(buffers[0]);
  double *struct_of_ary = (double*)STARPU_MULTIFORMAT_GET_CUDA_PTR(buffers[0]);
  int nx = STARPU_MULTIFORMAT_GET_NX(buffers[0]);

  double *xs = struct_of_ary;
  double *ys = struct_of_ary + nx;
  double *zs = struct_of_ary + nx*2;

  for (int i = 0; i < nx; i++) {
    ary_of_struct[i].x = xs[i];
    ary_of_struct[i].y = ys[i];
    ary_of_struct[i].z = zs[i];
  }
}

__global__ void cuda_to_cpu_kernel(Point *aos, double *xs, double *ys, double *zs, int nx) {
  int i = blockDim.x * blockIdx.x + threadIdx.x;

  if (i < nx) {
    Point *p = &aos[i];
    p->x = xs[i];
    p->y = ys[i];
    p->z = zs[i];
  }
}

void cuda_to_cpu_on_cuda(void *buffers[], void *arg) {
  // This should be called
  std::cerr << "CUDA to CPU conversion is called." << std::endl;

  double *struct_of_array = (double*)STARPU_MULTIFORMAT_GET_CUDA_PTR(buffers[0]);
  Point *ary_of_struct = (Point*)STARPU_MULTIFORMAT_GET_CPU_PTR(buffers[0]);
  int nx = STARPU_MULTIFORMAT_GET_NX(buffers[0]);

  double *xs = struct_of_array;
  double *ys = xs + nx;
  double *zs = ys + nx;

  dim3 gridDim = nx/1024 + 1;
  dim3 blockDim = nx % 1024;

  cuda_to_cpu_kernel<<<blockDim, gridDim>>>(ary_of_struct, xs, ys, zs, nx);

  CUDA_SAFE_THREAD_SYNC();
}


////////////////////////////////////////////////////////////////////////////////
// CPU codelet function
////////////////////////////////////////////////////////////////////////////////
void cpu_func(void *buffers[], void *arg) {
  Point *ary = (Point*)STARPU_MULTIFORMAT_GET_CPU_PTR(buffers[0]);
  int nx = STARPU_MULTIFORMAT_GET_NX(buffers[0]);
  int r = *(int*)arg;

  for (int i = 0; i < nx; i++) {
    double y,z;
    y = z = ary[i].x;
    for (int j = 0; j < r; j++) y *= 1.001;
    for (int j = 0; j < r; j++) z *= 1.002;

    ary[i].y = y;
    ary[i].z = z;
  }
}

////////////////////////////////////////////////////////////////////////////////
// CUDA codelet function
////////////////////////////////////////////////////////////////////////////////

void cuda_func(void *buffers[], void *arg) {
  double *arrays = (double*)STARPU_MULTIFORMAT_GET_CUDA_PTR(buffers[0]);
  int nx = STARPU_MULTIFORMAT_GET_NX(buffers[0]);
  int r = *(int*)arg;

  double *xs = arrays;
  double *ys = xs + nx;
  double *zs = ys + nx;

  dim3 blockDim = nx % 1024;
  dim3 gridDim = nx / 1024 + 1;

  std::cerr << "CUDA kernel. NX = " << nx << std::endl;
  std::cerr << "blockDim = " << blockDim.x << ", "
            << "gridDim = " << gridDim.x
            << std::endl;

  kernel<<<blockDim, gridDim>>>(xs, ys, zs, nx, r);

  CUDA_SAFE_THREAD_SYNC();

  std::cerr << "CUDA kernel done" << std::endl;
}

static __global__ void kernel(double *xs, double *ys, double *zs, int nx, int r) {
  int i = blockDim.x * blockIdx.x + threadIdx.x;

  if (i < nx) {
    double x = xs[i];
    double y = x, z = x;
    for (int j = 0; j < r; j++) y *= 1.001;
    for (int j = 0; j < r; j++) z *= 1.002;
    ys[i] = y;
    zs[i] = z;
  }

  return;
}

////////////////////////////////////////////////////////////////////////////////
// Main function
////////////////////////////////////////////////////////////////////////////////

starpu_conf conf;
starpu_codelet cpu_to_cuda_cl;
starpu_codelet cuda_to_cpu_cl;
starpu_codelet cl;
starpu_data_handle_t handle;
starpu_multiformat_data_interface_ops fmt_ops;
starpu_task *task;

int main(int argc, char** argv) {
  starpu_conf_init(&conf);
  conf.sched_policy_name = "heft"; // multiformat is suppoted only in HEFT scheduling poilicy.

  int ret = starpu_init(&conf);
  STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");

  bzero(&cpu_to_cuda_cl, sizeof(cpu_to_cuda_cl));
  bzero(&cuda_to_cpu_cl, sizeof(cuda_to_cpu_cl));
  bzero(&fmt_ops, sizeof(fmt_ops));

  // CPU to CUDA format on CPU codelet
  cpu_to_cuda_cl.where = STARPU_CUDA | STARPU_CPU;
  cpu_to_cuda_cl.cpu_funcs[0] = cpu_to_cuda_on_cpu;
  cpu_to_cuda_cl.cuda_funcs[0] = cpu_to_cuda_on_cuda;
  cpu_to_cuda_cl.nbuffers = 1;
  cpu_to_cuda_cl.name = "codelet_cpu_to_cuda";

  // CUDA to CPU format on CPU codelet
  cuda_to_cpu_cl.where = STARPU_CUDA | STARPU_CPU;
  cuda_to_cpu_cl.cpu_funcs[0] = cuda_to_cpu_on_cpu;
  cuda_to_cpu_cl.cuda_funcs[0] = cuda_to_cpu_on_cuda;
  cuda_to_cpu_cl.nbuffers = 1;
  cuda_to_cpu_cl.name = "codelet_cuda_to_cpu";

  const int LENGTH = 10;
  const int R = 1000;
  const double YR = pow(1.001, R); // answer
  const double ZR = pow(1.002, R); // answer

  fmt_ops.cpu_elemsize = sizeof(Point);
  fmt_ops.cuda_elemsize = sizeof(double) * 3;
  fmt_ops.cpu_to_cuda_cl = &cpu_to_cuda_cl;
  fmt_ops.cuda_to_cpu_cl = &cuda_to_cpu_cl;

  // Allocate and initialize data
  Point *data = new Point[LENGTH];

  std::cerr << "Initializing data... NX = " << LENGTH << std::endl;
  for (int i = 0; i < LENGTH; i++) {
    data[i].x = i + 1;
    data[i].y = 0.0;
    data[i].z = 0.0;
  }

  starpu_multiformat_data_register(&handle, 0, data, LENGTH, &fmt_ops);

  // build codelet
  bzero(&cl, sizeof(cl));
  cl.where = STARPU_CUDA;
  cl.cpu_funcs[0] = cpu_func;
  cl.cuda_funcs[0] = cuda_func;
  cl.nbuffers = 1;
  cl.modes[0] = STARPU_RW;
  cl.name = "cl";

  // build task
  task = starpu_task_create();
  assert(task);
  task->cl = &cl;
  task->synchronous = 1;
  task->handles[0] = handle;
  task->cl_arg = (void*)&R;
  task->cl_arg_size = sizeof(R);

  ret = starpu_task_submit(task);
  STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");

  starpu_data_unregister(handle);

  // error check
  int err = 0;
  for (int i = 0; i < LENGTH; i++) {
    double x = data[i].x;
    if (abs(data[i].y - x * YR) > 1e-5) err++;
    if (abs(data[i].z - x * ZR) > 1e-5) err++;
  }

  std::cerr << err << " errors found." << std::endl;

  starpu_shutdown();

  delete[] data;

  return 0;
}
	/*
	Copyright (c) 2012, Keisuke Fukuda. All rights reserved.
	License : new BSD license

	# The Makefile looks like this:
	NVCC=nvcc
	CUDA_SDK=${HOME}/NVIDIA_GPU_Computing_SDK
	CUTIL_INC=${CUDA_SDK}/C/common/inc

	simple_multiformat: simple_multiformat.cu
	${NVCC} -arch sm_20 -I${CUTIL_INC} `pkg-config --cflags libstarpu` -o $@ $< `pkg-config --libs libstarpu`

	NOTE:
	In this example, CPU-to-CUDA and CUDA-to-CPU formatting functions are both implemented on
	CPU and CUDA. However, as of StarPU 1.0 release, CPU-to-CUDA formatting must be on CUDA and
	CUDA-to-CPU formatting must be on CPU. We expect this limitation will be removed.
	*/

	#include <cmath>
	#include <iostream>
	#include <cutil.h>
	#include <starpu.h>

	// CUDA codelet function and CUDA kernel
	void cuda_func(void buffers[], void arg);
	static __global__ void kernel(double xs, double ys, double *zs, int nx, int r);

	// CPU codelet function
	void cpu_func(void buffers[], void arg);

	// format functions
	void cpu_to_cuda_on_cpu(void buffers[], void arg);
	void cuda_to_cpu_on_cpu(void buffers[], void arg);
	void cuda_to_cpu_on_cuda(void buffers[], void arg);

	typedef struct _Point {
	double x; // input value
	double y; // expected to be x * 1.001 ^ R
	double z; // expected to be x * 1.002 ^ R
	} Point;

	// formatting function(CPU -> CUDA) on CPU
	void cpu_to_cuda_on_cpu(void buffers[], void arg) {
	// This should be called?
	std::cerr << "CPU to CUDA conversion(on CPU) is called." << std::endl;

	// From array of structure (for CPU) to structure of array (CUDA).
	Point ary_of_struct = (Point)STARPU_MULTIFORMAT_GET_CPU_PTR(buffers[0]);
	double struct_of_ary = (double)STARPU_MULTIFORMAT_GET_CUDA_PTR(buffers[0]);
	int nx = STARPU_MULTIFORMAT_GET_NX(buffers[0]);

	double *xs = struct_of_ary;
	double *ys = struct_of_ary + nx;
	double zs = struct_of_ary + nx2;

	for (int i = 0; i < nx; i++) {
	xs[i] = ary_of_struct[i].x;
	ys[i] = ary_of_struct[i].y;
	zs[i] = ary_of_struct[i].z;
	}
	}

	__global__ void cpu_to_cuda_kernel(Point aos, double xs, double ys, double zs, int nx) {
	int i = blockDim.x * blockIdx.x + threadIdx.x;

	if (i < nx) {
	Point *p = &aos[i];
	xs[i] = p->x;
	ys[i] = p->y;
	zs[i] = p->z;
	}
	}

	void cpu_to_cuda_on_cuda(void buffers[], void arg) {
	std::cerr << "CPU to CUDA conversion (on CUDA) is called." << std::endl;

	// From array of structure (for CPU) to structure of array (for CUDA)
	Point ary_of_struct = (Point)STARPU_MULTIFORMAT_GET_CPU_PTR(buffers[0]);
	double struct_of_ary = (double)STARPU_MULTIFORMAT_GET_CUDA_PTR(buffers[0]);
	int nx = STARPU_MULTIFORMAT_GET_NX(buffers[0]);

	double *xs = struct_of_ary;
	double *ys = struct_of_ary + nx;
	double zs = struct_of_ary + nx2;

	dim3 gridDim = nx/1024 + 1;
	dim3 blockDim = nx % 1024;

	cpu_to_cuda_kernel<<<blockDim, gridDim>>>(ary_of_struct, xs, ys, zs, nx);

	CUDA_SAFE_THREAD_SYNC();
	}

	// formatting function (CUDA -> CPU) on CPU
	void cuda_to_cpu_on_cpu(void buffers[], void arg) {
	// This should be called?
	std::cerr << "CUDA to CPU conversion(on CPU) is called." << std::endl;

	// From structure of array(for CUDA) to array of structure (for CPU).
	Point ary_of_struct = (Point)STARPU_MULTIFORMAT_GET_CPU_PTR(buffers[0]);
	double struct_of_ary = (double)STARPU_MULTIFORMAT_GET_CUDA_PTR(buffers[0]);
	int nx = STARPU_MULTIFORMAT_GET_NX(buffers[0]);

	double *xs = struct_of_ary;
	double *ys = struct_of_ary + nx;
	double zs = struct_of_ary + nx2;

	for (int i = 0; i < nx; i++) {
	ary_of_struct[i].x = xs[i];
	ary_of_struct[i].y = ys[i];
	ary_of_struct[i].z = zs[i];
	}
	}

	__global__ void cuda_to_cpu_kernel(Point aos, double xs, double ys, double zs, int nx) {
	int i = blockDim.x * blockIdx.x + threadIdx.x;

	if (i < nx) {
	Point *p = &aos[i];
	p->x = xs[i];
	p->y = ys[i];
	p->z = zs[i];
	}
	}

	void cuda_to_cpu_on_cuda(void buffers[], void arg) {
	// This should be called
	std::cerr << "CUDA to CPU conversion is called." << std::endl;

	double struct_of_array = (double)STARPU_MULTIFORMAT_GET_CUDA_PTR(buffers[0]);
	Point ary_of_struct = (Point)STARPU_MULTIFORMAT_GET_CPU_PTR(buffers[0]);
	int nx = STARPU_MULTIFORMAT_GET_NX(buffers[0]);

	double *xs = struct_of_array;
	double *ys = xs + nx;
	double *zs = ys + nx;

	dim3 gridDim = nx/1024 + 1;
	dim3 blockDim = nx % 1024;

	cuda_to_cpu_kernel<<<blockDim, gridDim>>>(ary_of_struct, xs, ys, zs, nx);

	CUDA_SAFE_THREAD_SYNC();
	}


	////////////////////////////////////////////////////////////////////////////////
	// CPU codelet function
	////////////////////////////////////////////////////////////////////////////////
	void cpu_func(void buffers[], void arg) {
	Point ary = (Point)STARPU_MULTIFORMAT_GET_CPU_PTR(buffers[0]);
	int nx = STARPU_MULTIFORMAT_GET_NX(buffers[0]);
	int r = (int)arg;

	for (int i = 0; i < nx; i++) {
	double y,z;
	y = z = ary[i].x;
	for (int j = 0; j < r; j++) y *= 1.001;
	for (int j = 0; j < r; j++) z *= 1.002;

	ary[i].y = y;
	ary[i].z = z;
	}
	}

	////////////////////////////////////////////////////////////////////////////////
	// CUDA codelet function
	////////////////////////////////////////////////////////////////////////////////

	void cuda_func(void buffers[], void arg) {
	double arrays = (double)STARPU_MULTIFORMAT_GET_CUDA_PTR(buffers[0]);
	int nx = STARPU_MULTIFORMAT_GET_NX(buffers[0]);
	int r = (int)arg;

	double *xs = arrays;
	double *ys = xs + nx;
	double *zs = ys + nx;

	dim3 blockDim = nx % 1024;
	dim3 gridDim = nx / 1024 + 1;

	std::cerr << "CUDA kernel. NX = " << nx << std::endl;
	std::cerr << "blockDim = " << blockDim.x << ", "
	<< "gridDim = " << gridDim.x
	<< std::endl;

	kernel<<<blockDim, gridDim>>>(xs, ys, zs, nx, r);

	CUDA_SAFE_THREAD_SYNC();

	std::cerr << "CUDA kernel done" << std::endl;
	}

	static __global__ void kernel(double xs, double ys, double *zs, int nx, int r) {
	int i = blockDim.x * blockIdx.x + threadIdx.x;

	if (i < nx) {
	double x = xs[i];
	double y = x, z = x;
	for (int j = 0; j < r; j++) y *= 1.001;
	for (int j = 0; j < r; j++) z *= 1.002;
	ys[i] = y;
	zs[i] = z;
	}

	return;
	}

	////////////////////////////////////////////////////////////////////////////////
	// Main function
	////////////////////////////////////////////////////////////////////////////////

	starpu_conf conf;
	starpu_codelet cpu_to_cuda_cl;
	starpu_codelet cuda_to_cpu_cl;
	starpu_codelet cl;
	starpu_data_handle_t handle;
	starpu_multiformat_data_interface_ops fmt_ops;
	starpu_task *task;

	int main(int argc, char** argv) {
	starpu_conf_init(&conf);
	conf.sched_policy_name = "heft"; // multiformat is suppoted only in HEFT scheduling poilicy.

	int ret = starpu_init(&conf);
	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");

	bzero(&cpu_to_cuda_cl, sizeof(cpu_to_cuda_cl));
	bzero(&cuda_to_cpu_cl, sizeof(cuda_to_cpu_cl));
	bzero(&fmt_ops, sizeof(fmt_ops));

	// CPU to CUDA format on CPU codelet
	cpu_to_cuda_cl.where = STARPU_CUDA \| STARPU_CPU;
	cpu_to_cuda_cl.cpu_funcs[0] = cpu_to_cuda_on_cpu;
	cpu_to_cuda_cl.cuda_funcs[0] = cpu_to_cuda_on_cuda;
	cpu_to_cuda_cl.nbuffers = 1;
	cpu_to_cuda_cl.name = "codelet_cpu_to_cuda";

	// CUDA to CPU format on CPU codelet
	cuda_to_cpu_cl.where = STARPU_CUDA \| STARPU_CPU;
	cuda_to_cpu_cl.cpu_funcs[0] = cuda_to_cpu_on_cpu;
	cuda_to_cpu_cl.cuda_funcs[0] = cuda_to_cpu_on_cuda;
	cuda_to_cpu_cl.nbuffers = 1;
	cuda_to_cpu_cl.name = "codelet_cuda_to_cpu";

	const int LENGTH = 10;
	const int R = 1000;
	const double YR = pow(1.001, R); // answer
	const double ZR = pow(1.002, R); // answer

	fmt_ops.cpu_elemsize = sizeof(Point);
	fmt_ops.cuda_elemsize = sizeof(double) * 3;
	fmt_ops.cpu_to_cuda_cl = &cpu_to_cuda_cl;
	fmt_ops.cuda_to_cpu_cl = &cuda_to_cpu_cl;

	// Allocate and initialize data
	Point *data = new Point[LENGTH];

	std::cerr << "Initializing data... NX = " << LENGTH << std::endl;
	for (int i = 0; i < LENGTH; i++) {
	data[i].x = i + 1;
	data[i].y = 0.0;
	data[i].z = 0.0;
	}

	starpu_multiformat_data_register(&handle, 0, data, LENGTH, &fmt_ops);

	// build codelet
	bzero(&cl, sizeof(cl));
	cl.where = STARPU_CUDA;
	cl.cpu_funcs[0] = cpu_func;
	cl.cuda_funcs[0] = cuda_func;
	cl.nbuffers = 1;
	cl.modes[0] = STARPU_RW;
	cl.name = "cl";

	// build task
	task = starpu_task_create();
	assert(task);
	task->cl = &cl;
	task->synchronous = 1;
	task->handles[0] = handle;
	task->cl_arg = (void*)&R;
	task->cl_arg_size = sizeof(R);

	ret = starpu_task_submit(task);
	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");

	starpu_data_unregister(handle);

	// error check
	int err = 0;
	for (int i = 0; i < LENGTH; i++) {
	double x = data[i].x;
	if (abs(data[i].y - x * YR) > 1e-5) err++;
	if (abs(data[i].z - x * ZR) > 1e-5) err++;
	}

	std::cerr << err << " errors found." << std::endl;

	starpu_shutdown();

	delete[] data;

	return 0;
	}