Skip to content

Instantly share code, notes, and snippets.

@keisukefukuda
Created April 25, 2012 08:40
Show Gist options
  • Save keisukefukuda/2488259 to your computer and use it in GitHub Desktop.
Save keisukefukuda/2488259 to your computer and use it in GitHub Desktop.
Simple code to test StarPU's combined worker feature.
// Simple program using combined worker.
#include <climits>
#include <cstdint>
#include <cstring>
#include <iostream>
#include <vector>

#include <omp.h>
#include <starpu.h>
// Base vector length; main() scales it by a per-iteration factor to get the
// size of each registered buffer.
const int LENGTH = 2 * 8192;
// Number of multiplications performed per vector element (dummy computing load).
const int REPEAT = 15000;

// StarPU start-up configuration, filled in by main().
starpu_conf conf;

// Performance models attached to the serial and the OpenMP codelet.
starpu_perfmodel perf_model;
starpu_perfmodel perf_model_omp;

// Codelets describing the serial kernel and the OpenMP kernel.
starpu_codelet cl;
starpu_codelet cl_omp;

// Handle of the vector currently registered with StarPU.
starpu_data_handle_t handle;

// Tasks most recently created for the serial and the OpenMP codelet.
starpu_task *task = NULL;
starpu_task *task_omp = NULL;
// CPU kernel parallelised with OpenMP.
//
// buffers[0] is accessed through the StarPU vector interface; every element is
// multiplied by 1.0001 REPEAT times, in place. The OpenMP team size is taken
// from starpu_combined_worker_get_size(). The elapsed wall-clock time, the
// combined worker size and omp_get_max_threads() are printed to std::cout.
// `arg` is not used.
void kernel_omp(void *buffers[], void *arg) {
    double *data = reinterpret_cast<double *>(STARPU_VECTOR_GET_PTR(buffers[0]));
    const int count = static_cast<int>(STARPU_VECTOR_GET_NX(buffers[0]));

    const double started = omp_get_wtime();

    // Elements are independent of each other, so the outer loop is shared
    // between the threads of the team.
    #pragma omp parallel for num_threads(starpu_combined_worker_get_size())
    for (int idx = 0; idx < count; ++idx) {
        double acc = data[idx];
        for (int remaining = REPEAT; remaining > 0; --remaining) {
            acc *= 1.0001;
        }
        data[idx] = acc;
    }

    const double elapsed = omp_get_wtime() - started;

    std::cout << "OpenMP task : " << elapsed << " [s]\t";
    std::cout << "worker size = " << starpu_combined_worker_get_size() << ", ";
    std::cout << "OpenMP threads = " << omp_get_max_threads();
    std::cout << std::endl;
}
// Serial CPU kernel, used as the baseline for comparison with kernel_omp.
//
// buffers[0] is accessed through the StarPU vector interface; every element is
// multiplied by 1.0001 REPEAT times, in place, on the calling thread only. The
// elapsed wall-clock time is printed to std::cout. `arg` is not used.
void kernel(void *buffers[], void *arg) {
    double *data = reinterpret_cast<double *>(STARPU_VECTOR_GET_PTR(buffers[0]));
    const int count = static_cast<int>(STARPU_VECTOR_GET_NX(buffers[0]));

    const double started = omp_get_wtime();

    // Same heavy per-element computation as the OpenMP kernel, but serial.
    int idx = 0;
    while (idx < count) {
        double acc = data[idx];
        for (int remaining = REPEAT; remaining > 0; --remaining) {
            acc *= 1.0001;
        }
        data[idx] = acc;
        ++idx;
    }

    const double elapsed = omp_get_wtime() - started;

    std::cout << "Serial task : " << elapsed << " [s]" << std::endl;
}
// Benchmark driver.
//
// Initialises StarPU with the "pheft" scheduling policy and
// single_combined_worker enabled, sets up one serial codelet (kernel) and one
// fork-join codelet (kernel_omp), then for 40 vector sizes (1.0x to 4.9x
// LENGTH in 0.1x steps) submits one task of each kind on the same vector and
// waits for both to finish.
//
// Returns 0 on success; STARPU_CHECK_RETURN_VALUE handles any StarPU call that
// reports an error.
int main() {
    starpu_conf_init(&conf);
    // NOTE(review): per the StarPU documentation this flag restricts StarPU to
    // a single kind of combined worker, which OpenMP runtimes that cannot run
    // concurrent parallel regions need - confirm against the installed version.
    conf.single_combined_worker = 1;
    conf.sched_policy_name = "pheft";

    int ret = starpu_init(&conf);
    STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");

    // Regression-based performance models, one per codelet.
    perf_model.type = STARPU_REGRESSION_BASED;
    perf_model.symbol = "simple";
    perf_model_omp.type = STARPU_REGRESSION_BASED;
    perf_model_omp.symbol = "simple_omp";

    // Serial codelet.
    std::memset(&cl, 0, sizeof(cl));
    cl.where = STARPU_CPU;
    cl.cpu_funcs[0] = kernel;
    cl.nbuffers = 1;
    cl.modes[0] = STARPU_RW;
    cl.model = &perf_model;
    cl.max_parallelism = INT_MAX;

    // OpenMP codelet: identical set-up except for the fork-join type and the
    // kernel / performance model it points at.
    std::memset(&cl_omp, 0, sizeof(cl_omp));
    cl_omp.where = STARPU_CPU;
    cl_omp.type = STARPU_FORKJOIN;
    cl_omp.cpu_funcs[0] = kernel_omp;
    cl_omp.nbuffers = 1;
    cl_omp.modes[0] = STARPU_RW;
    cl_omp.model = &perf_model_omp;
    cl_omp.max_parallelism = INT_MAX;

    // Launch many tasks while changing the buffer size. The scale factor is
    // counted in integer tenths: a float counter stepped by 0.1 accumulates
    // rounding error, which makes both the iteration count and the resulting
    // lengths depend on that error.
    for (int tenths = 10; tenths < 50; ++tenths) {
        const int length = LENGTH * tenths / 10;

        // The actual values do not matter for the benchmark, but they must be
        // initialised: reading indeterminate doubles is undefined behaviour,
        // and NaN/Inf/denormal garbage can distort the timings.
        std::vector<double> ary(length, 1.0);

        starpu_vector_data_register(&handle, 0,
                                    reinterpret_cast<uintptr_t>(ary.data()),
                                    length, sizeof(ary[0]));

        // Both tasks use the same handle in STARPU_RW mode; no additional
        // synchronisation between them is set up here.
        task = starpu_task_create();
        task->cl = &cl;
        task->handles[0] = handle;

        task_omp = starpu_task_create();
        task_omp->cl = &cl_omp;
        task_omp->handles[0] = handle;

        ret = starpu_task_submit(task);
        STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
        ret = starpu_task_submit(task_omp);
        STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");

        starpu_task_wait_for_all();

        // Unregister before `ary` goes out of scope so that StarPU no longer
        // refers to the buffer when the vector releases it.
        starpu_data_unregister(handle);
    }

    starpu_shutdown();
    return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment