Created
April 29, 2012 09:12
-
-
Save keisukefukuda/2548888 to your computer and use it in GitHub Desktop.
An example of using StarPU's multiformat feature.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/* | |
Copyright (c) 2012, Keisuke Fukuda. All rights reserved. | |
License : new BSD license | |
# The Makefile looks like this: | |
NVCC=nvcc | |
CUDA_SDK=${HOME}/NVIDIA_GPU_Computing_SDK | |
CUTIL_INC=${CUDA_SDK}/C/common/inc | |
simple_multiformat: simple_multiformat.cu | |
${NVCC} -arch sm_20 -I${CUTIL_INC} `pkg-config --cflags libstarpu` -o $@ $< `pkg-config --libs libstarpu` | |
NOTE: | |
In this example, CPU-to-CUDA and CUDA-to-CPU formatting functions are both implemented on | |
CPU and CUDA. However, as of StarPU 1.0 release, CPU-to-CUDA formatting must be on CUDA and | |
CUDA-to-CPU formatting must be on CPU. We expect this limitation will be removed. | |
*/ | |
#include <cmath> | |
#include <iostream> | |
#include <cutil.h> | |
#include <starpu.h> | |
// CUDA codelet function and CUDA kernel | |
void cuda_func(void *buffers[], void *arg); | |
static __global__ void kernel(double *xs, double *ys, double *zs, int nx, int r); | |
// CPU codelet function | |
void cpu_func(void *buffers[], void *arg); | |
// format functions | |
void cpu_to_cuda_on_cpu(void *buffers[], void *arg); | |
void cuda_to_cpu_on_cpu(void *buffers[], void *arg); | |
void cuda_to_cpu_on_cuda(void *buffers[], void *arg); | |
typedef struct _Point { | |
double x; // input value | |
double y; // expected to be x * 1.001 ^ R | |
double z; // expected to be x * 1.002 ^ R | |
} Point; | |
// formatting function(CPU -> CUDA) on CPU | |
void cpu_to_cuda_on_cpu(void *buffers[], void *arg) { | |
// This should be called? | |
std::cerr << "CPU to CUDA conversion(on CPU) is called." << std::endl; | |
// From array of structure (for CPU) to structure of array (CUDA). | |
Point *ary_of_struct = (Point*)STARPU_MULTIFORMAT_GET_CPU_PTR(buffers[0]); | |
double *struct_of_ary = (double*)STARPU_MULTIFORMAT_GET_CUDA_PTR(buffers[0]); | |
int nx = STARPU_MULTIFORMAT_GET_NX(buffers[0]); | |
double *xs = struct_of_ary; | |
double *ys = struct_of_ary + nx; | |
double *zs = struct_of_ary + nx*2; | |
for (int i = 0; i < nx; i++) { | |
xs[i] = ary_of_struct[i].x; | |
ys[i] = ary_of_struct[i].y; | |
zs[i] = ary_of_struct[i].z; | |
} | |
} | |
__global__ void cpu_to_cuda_kernel(Point *aos, double *xs, double *ys, double *zs, int nx) { | |
int i = blockDim.x * blockIdx.x + threadIdx.x; | |
if (i < nx) { | |
Point *p = &aos[i]; | |
xs[i] = p->x; | |
ys[i] = p->y; | |
zs[i] = p->z; | |
} | |
} | |
void cpu_to_cuda_on_cuda(void *buffers[], void *arg) { | |
std::cerr << "CPU to CUDA conversion (on CUDA) is called." << std::endl; | |
// From array of structure (for CPU) to structure of array (for CUDA) | |
Point *ary_of_struct = (Point*)STARPU_MULTIFORMAT_GET_CPU_PTR(buffers[0]); | |
double *struct_of_ary = (double*)STARPU_MULTIFORMAT_GET_CUDA_PTR(buffers[0]); | |
int nx = STARPU_MULTIFORMAT_GET_NX(buffers[0]); | |
double *xs = struct_of_ary; | |
double *ys = struct_of_ary + nx; | |
double *zs = struct_of_ary + nx*2; | |
dim3 gridDim = nx/1024 + 1; | |
dim3 blockDim = nx % 1024; | |
cpu_to_cuda_kernel<<<blockDim, gridDim>>>(ary_of_struct, xs, ys, zs, nx); | |
CUDA_SAFE_THREAD_SYNC(); | |
} | |
// formatting function (CUDA -> CPU) on CPU | |
void cuda_to_cpu_on_cpu(void *buffers[], void *arg) { | |
// This should be called? | |
std::cerr << "CUDA to CPU conversion(on CPU) is called." << std::endl; | |
// From structure of array(for CUDA) to array of structure (for CPU). | |
Point *ary_of_struct = (Point*)STARPU_MULTIFORMAT_GET_CPU_PTR(buffers[0]); | |
double *struct_of_ary = (double*)STARPU_MULTIFORMAT_GET_CUDA_PTR(buffers[0]); | |
int nx = STARPU_MULTIFORMAT_GET_NX(buffers[0]); | |
double *xs = struct_of_ary; | |
double *ys = struct_of_ary + nx; | |
double *zs = struct_of_ary + nx*2; | |
for (int i = 0; i < nx; i++) { | |
ary_of_struct[i].x = xs[i]; | |
ary_of_struct[i].y = ys[i]; | |
ary_of_struct[i].z = zs[i]; | |
} | |
} | |
__global__ void cuda_to_cpu_kernel(Point *aos, double *xs, double *ys, double *zs, int nx) { | |
int i = blockDim.x * blockIdx.x + threadIdx.x; | |
if (i < nx) { | |
Point *p = &aos[i]; | |
p->x = xs[i]; | |
p->y = ys[i]; | |
p->z = zs[i]; | |
} | |
} | |
void cuda_to_cpu_on_cuda(void *buffers[], void *arg) { | |
// This should be called | |
std::cerr << "CUDA to CPU conversion is called." << std::endl; | |
double *struct_of_array = (double*)STARPU_MULTIFORMAT_GET_CUDA_PTR(buffers[0]); | |
Point *ary_of_struct = (Point*)STARPU_MULTIFORMAT_GET_CPU_PTR(buffers[0]); | |
int nx = STARPU_MULTIFORMAT_GET_NX(buffers[0]); | |
double *xs = struct_of_array; | |
double *ys = xs + nx; | |
double *zs = ys + nx; | |
dim3 gridDim = nx/1024 + 1; | |
dim3 blockDim = nx % 1024; | |
cuda_to_cpu_kernel<<<blockDim, gridDim>>>(ary_of_struct, xs, ys, zs, nx); | |
CUDA_SAFE_THREAD_SYNC(); | |
} | |
//////////////////////////////////////////////////////////////////////////////// | |
// CPU codelet function | |
//////////////////////////////////////////////////////////////////////////////// | |
void cpu_func(void *buffers[], void *arg) { | |
Point *ary = (Point*)STARPU_MULTIFORMAT_GET_CPU_PTR(buffers[0]); | |
int nx = STARPU_MULTIFORMAT_GET_NX(buffers[0]); | |
int r = *(int*)arg; | |
for (int i = 0; i < nx; i++) { | |
double y,z; | |
y = z = ary[i].x; | |
for (int j = 0; j < r; j++) y *= 1.001; | |
for (int j = 0; j < r; j++) z *= 1.002; | |
ary[i].y = y; | |
ary[i].z = z; | |
} | |
} | |
//////////////////////////////////////////////////////////////////////////////// | |
// CUDA codelet function | |
//////////////////////////////////////////////////////////////////////////////// | |
void cuda_func(void *buffers[], void *arg) { | |
double *arrays = (double*)STARPU_MULTIFORMAT_GET_CUDA_PTR(buffers[0]); | |
int nx = STARPU_MULTIFORMAT_GET_NX(buffers[0]); | |
int r = *(int*)arg; | |
double *xs = arrays; | |
double *ys = xs + nx; | |
double *zs = ys + nx; | |
dim3 blockDim = nx % 1024; | |
dim3 gridDim = nx / 1024 + 1; | |
std::cerr << "CUDA kernel. NX = " << nx << std::endl; | |
std::cerr << "blockDim = " << blockDim.x << ", " | |
<< "gridDim = " << gridDim.x | |
<< std::endl; | |
kernel<<<blockDim, gridDim>>>(xs, ys, zs, nx, r); | |
CUDA_SAFE_THREAD_SYNC(); | |
std::cerr << "CUDA kernel done" << std::endl; | |
} | |
static __global__ void kernel(double *xs, double *ys, double *zs, int nx, int r) { | |
int i = blockDim.x * blockIdx.x + threadIdx.x; | |
if (i < nx) { | |
double x = xs[i]; | |
double y = x, z = x; | |
for (int j = 0; j < r; j++) y *= 1.001; | |
for (int j = 0; j < r; j++) z *= 1.002; | |
ys[i] = y; | |
zs[i] = z; | |
} | |
return; | |
} | |
//////////////////////////////////////////////////////////////////////////////// | |
// Main function | |
//////////////////////////////////////////////////////////////////////////////// | |
starpu_conf conf; | |
starpu_codelet cpu_to_cuda_cl; | |
starpu_codelet cuda_to_cpu_cl; | |
starpu_codelet cl; | |
starpu_data_handle_t handle; | |
starpu_multiformat_data_interface_ops fmt_ops; | |
starpu_task *task; | |
int main(int argc, char** argv) { | |
starpu_conf_init(&conf); | |
conf.sched_policy_name = "heft"; // multiformat is suppoted only in HEFT scheduling poilicy. | |
int ret = starpu_init(&conf); | |
STARPU_CHECK_RETURN_VALUE(ret, "starpu_init"); | |
bzero(&cpu_to_cuda_cl, sizeof(cpu_to_cuda_cl)); | |
bzero(&cuda_to_cpu_cl, sizeof(cuda_to_cpu_cl)); | |
bzero(&fmt_ops, sizeof(fmt_ops)); | |
// CPU to CUDA format on CPU codelet | |
cpu_to_cuda_cl.where = STARPU_CUDA | STARPU_CPU; | |
cpu_to_cuda_cl.cpu_funcs[0] = cpu_to_cuda_on_cpu; | |
cpu_to_cuda_cl.cuda_funcs[0] = cpu_to_cuda_on_cuda; | |
cpu_to_cuda_cl.nbuffers = 1; | |
cpu_to_cuda_cl.name = "codelet_cpu_to_cuda"; | |
// CUDA to CPU format on CPU codelet | |
cuda_to_cpu_cl.where = STARPU_CUDA | STARPU_CPU; | |
cuda_to_cpu_cl.cpu_funcs[0] = cuda_to_cpu_on_cpu; | |
cuda_to_cpu_cl.cuda_funcs[0] = cuda_to_cpu_on_cuda; | |
cuda_to_cpu_cl.nbuffers = 1; | |
cuda_to_cpu_cl.name = "codelet_cuda_to_cpu"; | |
const int LENGTH = 10; | |
const int R = 1000; | |
const double YR = pow(1.001, R); // answer | |
const double ZR = pow(1.002, R); // answer | |
fmt_ops.cpu_elemsize = sizeof(Point); | |
fmt_ops.cuda_elemsize = sizeof(double) * 3; | |
fmt_ops.cpu_to_cuda_cl = &cpu_to_cuda_cl; | |
fmt_ops.cuda_to_cpu_cl = &cuda_to_cpu_cl; | |
// Allocate and initialize data | |
Point *data = new Point[LENGTH]; | |
std::cerr << "Initializing data... NX = " << LENGTH << std::endl; | |
for (int i = 0; i < LENGTH; i++) { | |
data[i].x = i + 1; | |
data[i].y = 0.0; | |
data[i].z = 0.0; | |
} | |
starpu_multiformat_data_register(&handle, 0, data, LENGTH, &fmt_ops); | |
// build codelet | |
bzero(&cl, sizeof(cl)); | |
cl.where = STARPU_CUDA; | |
cl.cpu_funcs[0] = cpu_func; | |
cl.cuda_funcs[0] = cuda_func; | |
cl.nbuffers = 1; | |
cl.modes[0] = STARPU_RW; | |
cl.name = "cl"; | |
// build task | |
task = starpu_task_create(); | |
assert(task); | |
task->cl = &cl; | |
task->synchronous = 1; | |
task->handles[0] = handle; | |
task->cl_arg = (void*)&R; | |
task->cl_arg_size = sizeof(R); | |
ret = starpu_task_submit(task); | |
STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit"); | |
starpu_data_unregister(handle); | |
// error check | |
int err = 0; | |
for (int i = 0; i < LENGTH; i++) { | |
double x = data[i].x; | |
if (abs(data[i].y - x * YR) > 1e-5) err++; | |
if (abs(data[i].z - x * ZR) > 1e-5) err++; | |
} | |
std::cerr << err << " errors found." << std::endl; | |
starpu_shutdown(); | |
delete[] data; | |
return 0; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment