Skip to content

Instantly share code, notes, and snippets.

@keisukefukuda
Created April 29, 2012 09:12
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save keisukefukuda/2548888 to your computer and use it in GitHub Desktop.
Save keisukefukuda/2548888 to your computer and use it in GitHub Desktop.
An example of using StarPU's multiformat feature.
/*
Copyright (c) 2012, Keisuke Fukuda. All rights reserved.
License : new BSD license
# The Makefile looks like this:
NVCC=nvcc
CUDA_SDK=${HOME}/NVIDIA_GPU_Computing_SDK
CUTIL_INC=${CUDA_SDK}/C/common/inc
simple_multiformat: simple_multiformat.cu
${NVCC} -arch sm_20 -I${CUTIL_INC} `pkg-config --cflags libstarpu` -o $@ $< `pkg-config --libs libstarpu`
NOTE:
In this example, CPU-to-CUDA and CUDA-to-CPU formatting functions are both implemented on
CPU and CUDA. However, as of StarPU 1.0 release, CPU-to-CUDA formatting must be on CUDA and
CUDA-to-CPU formatting must be on CPU. We expect this limitation will be removed.
*/
#include <cmath>
#include <iostream>
#include <cutil.h>
#include <starpu.h>
// CUDA codelet function and CUDA kernel
void cuda_func(void *buffers[], void *arg);
static __global__ void kernel(double *xs, double *ys, double *zs, int nx, int r);
// CPU codelet function
void cpu_func(void *buffers[], void *arg);
// format functions
void cpu_to_cuda_on_cpu(void *buffers[], void *arg);
void cuda_to_cpu_on_cpu(void *buffers[], void *arg);
void cuda_to_cpu_on_cuda(void *buffers[], void *arg);
typedef struct _Point {
double x; // input value
double y; // expected to be x * 1.001 ^ R
double z; // expected to be x * 1.002 ^ R
} Point;
// formatting function(CPU -> CUDA) on CPU
void cpu_to_cuda_on_cpu(void *buffers[], void *arg) {
// This should be called?
std::cerr << "CPU to CUDA conversion(on CPU) is called." << std::endl;
// From array of structure (for CPU) to structure of array (CUDA).
Point *ary_of_struct = (Point*)STARPU_MULTIFORMAT_GET_CPU_PTR(buffers[0]);
double *struct_of_ary = (double*)STARPU_MULTIFORMAT_GET_CUDA_PTR(buffers[0]);
int nx = STARPU_MULTIFORMAT_GET_NX(buffers[0]);
double *xs = struct_of_ary;
double *ys = struct_of_ary + nx;
double *zs = struct_of_ary + nx*2;
for (int i = 0; i < nx; i++) {
xs[i] = ary_of_struct[i].x;
ys[i] = ary_of_struct[i].y;
zs[i] = ary_of_struct[i].z;
}
}
__global__ void cpu_to_cuda_kernel(Point *aos, double *xs, double *ys, double *zs, int nx) {
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < nx) {
Point *p = &aos[i];
xs[i] = p->x;
ys[i] = p->y;
zs[i] = p->z;
}
}
void cpu_to_cuda_on_cuda(void *buffers[], void *arg) {
std::cerr << "CPU to CUDA conversion (on CUDA) is called." << std::endl;
// From array of structure (for CPU) to structure of array (for CUDA)
Point *ary_of_struct = (Point*)STARPU_MULTIFORMAT_GET_CPU_PTR(buffers[0]);
double *struct_of_ary = (double*)STARPU_MULTIFORMAT_GET_CUDA_PTR(buffers[0]);
int nx = STARPU_MULTIFORMAT_GET_NX(buffers[0]);
double *xs = struct_of_ary;
double *ys = struct_of_ary + nx;
double *zs = struct_of_ary + nx*2;
dim3 gridDim = nx/1024 + 1;
dim3 blockDim = nx % 1024;
cpu_to_cuda_kernel<<<blockDim, gridDim>>>(ary_of_struct, xs, ys, zs, nx);
CUDA_SAFE_THREAD_SYNC();
}
// formatting function (CUDA -> CPU) on CPU
void cuda_to_cpu_on_cpu(void *buffers[], void *arg) {
// This should be called?
std::cerr << "CUDA to CPU conversion(on CPU) is called." << std::endl;
// From structure of array(for CUDA) to array of structure (for CPU).
Point *ary_of_struct = (Point*)STARPU_MULTIFORMAT_GET_CPU_PTR(buffers[0]);
double *struct_of_ary = (double*)STARPU_MULTIFORMAT_GET_CUDA_PTR(buffers[0]);
int nx = STARPU_MULTIFORMAT_GET_NX(buffers[0]);
double *xs = struct_of_ary;
double *ys = struct_of_ary + nx;
double *zs = struct_of_ary + nx*2;
for (int i = 0; i < nx; i++) {
ary_of_struct[i].x = xs[i];
ary_of_struct[i].y = ys[i];
ary_of_struct[i].z = zs[i];
}
}
__global__ void cuda_to_cpu_kernel(Point *aos, double *xs, double *ys, double *zs, int nx) {
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < nx) {
Point *p = &aos[i];
p->x = xs[i];
p->y = ys[i];
p->z = zs[i];
}
}
void cuda_to_cpu_on_cuda(void *buffers[], void *arg) {
// This should be called
std::cerr << "CUDA to CPU conversion is called." << std::endl;
double *struct_of_array = (double*)STARPU_MULTIFORMAT_GET_CUDA_PTR(buffers[0]);
Point *ary_of_struct = (Point*)STARPU_MULTIFORMAT_GET_CPU_PTR(buffers[0]);
int nx = STARPU_MULTIFORMAT_GET_NX(buffers[0]);
double *xs = struct_of_array;
double *ys = xs + nx;
double *zs = ys + nx;
dim3 gridDim = nx/1024 + 1;
dim3 blockDim = nx % 1024;
cuda_to_cpu_kernel<<<blockDim, gridDim>>>(ary_of_struct, xs, ys, zs, nx);
CUDA_SAFE_THREAD_SYNC();
}
////////////////////////////////////////////////////////////////////////////////
// CPU codelet function
////////////////////////////////////////////////////////////////////////////////
void cpu_func(void *buffers[], void *arg) {
Point *ary = (Point*)STARPU_MULTIFORMAT_GET_CPU_PTR(buffers[0]);
int nx = STARPU_MULTIFORMAT_GET_NX(buffers[0]);
int r = *(int*)arg;
for (int i = 0; i < nx; i++) {
double y,z;
y = z = ary[i].x;
for (int j = 0; j < r; j++) y *= 1.001;
for (int j = 0; j < r; j++) z *= 1.002;
ary[i].y = y;
ary[i].z = z;
}
}
////////////////////////////////////////////////////////////////////////////////
// CUDA codelet function
////////////////////////////////////////////////////////////////////////////////
void cuda_func(void *buffers[], void *arg) {
double *arrays = (double*)STARPU_MULTIFORMAT_GET_CUDA_PTR(buffers[0]);
int nx = STARPU_MULTIFORMAT_GET_NX(buffers[0]);
int r = *(int*)arg;
double *xs = arrays;
double *ys = xs + nx;
double *zs = ys + nx;
dim3 blockDim = nx % 1024;
dim3 gridDim = nx / 1024 + 1;
std::cerr << "CUDA kernel. NX = " << nx << std::endl;
std::cerr << "blockDim = " << blockDim.x << ", "
<< "gridDim = " << gridDim.x
<< std::endl;
kernel<<<blockDim, gridDim>>>(xs, ys, zs, nx, r);
CUDA_SAFE_THREAD_SYNC();
std::cerr << "CUDA kernel done" << std::endl;
}
static __global__ void kernel(double *xs, double *ys, double *zs, int nx, int r) {
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < nx) {
double x = xs[i];
double y = x, z = x;
for (int j = 0; j < r; j++) y *= 1.001;
for (int j = 0; j < r; j++) z *= 1.002;
ys[i] = y;
zs[i] = z;
}
return;
}
////////////////////////////////////////////////////////////////////////////////
// Main function
////////////////////////////////////////////////////////////////////////////////
starpu_conf conf;
starpu_codelet cpu_to_cuda_cl;
starpu_codelet cuda_to_cpu_cl;
starpu_codelet cl;
starpu_data_handle_t handle;
starpu_multiformat_data_interface_ops fmt_ops;
starpu_task *task;
int main(int argc, char** argv) {
starpu_conf_init(&conf);
conf.sched_policy_name = "heft"; // multiformat is suppoted only in HEFT scheduling poilicy.
int ret = starpu_init(&conf);
STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
bzero(&cpu_to_cuda_cl, sizeof(cpu_to_cuda_cl));
bzero(&cuda_to_cpu_cl, sizeof(cuda_to_cpu_cl));
bzero(&fmt_ops, sizeof(fmt_ops));
// CPU to CUDA format on CPU codelet
cpu_to_cuda_cl.where = STARPU_CUDA | STARPU_CPU;
cpu_to_cuda_cl.cpu_funcs[0] = cpu_to_cuda_on_cpu;
cpu_to_cuda_cl.cuda_funcs[0] = cpu_to_cuda_on_cuda;
cpu_to_cuda_cl.nbuffers = 1;
cpu_to_cuda_cl.name = "codelet_cpu_to_cuda";
// CUDA to CPU format on CPU codelet
cuda_to_cpu_cl.where = STARPU_CUDA | STARPU_CPU;
cuda_to_cpu_cl.cpu_funcs[0] = cuda_to_cpu_on_cpu;
cuda_to_cpu_cl.cuda_funcs[0] = cuda_to_cpu_on_cuda;
cuda_to_cpu_cl.nbuffers = 1;
cuda_to_cpu_cl.name = "codelet_cuda_to_cpu";
const int LENGTH = 10;
const int R = 1000;
const double YR = pow(1.001, R); // answer
const double ZR = pow(1.002, R); // answer
fmt_ops.cpu_elemsize = sizeof(Point);
fmt_ops.cuda_elemsize = sizeof(double) * 3;
fmt_ops.cpu_to_cuda_cl = &cpu_to_cuda_cl;
fmt_ops.cuda_to_cpu_cl = &cuda_to_cpu_cl;
// Allocate and initialize data
Point *data = new Point[LENGTH];
std::cerr << "Initializing data... NX = " << LENGTH << std::endl;
for (int i = 0; i < LENGTH; i++) {
data[i].x = i + 1;
data[i].y = 0.0;
data[i].z = 0.0;
}
starpu_multiformat_data_register(&handle, 0, data, LENGTH, &fmt_ops);
// build codelet
bzero(&cl, sizeof(cl));
cl.where = STARPU_CUDA;
cl.cpu_funcs[0] = cpu_func;
cl.cuda_funcs[0] = cuda_func;
cl.nbuffers = 1;
cl.modes[0] = STARPU_RW;
cl.name = "cl";
// build task
task = starpu_task_create();
assert(task);
task->cl = &cl;
task->synchronous = 1;
task->handles[0] = handle;
task->cl_arg = (void*)&R;
task->cl_arg_size = sizeof(R);
ret = starpu_task_submit(task);
STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
starpu_data_unregister(handle);
// error check
int err = 0;
for (int i = 0; i < LENGTH; i++) {
double x = data[i].x;
if (abs(data[i].y - x * YR) > 1e-5) err++;
if (abs(data[i].z - x * ZR) > 1e-5) err++;
}
std::cerr << err << " errors found." << std::endl;
starpu_shutdown();
delete[] data;
return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment