Skip to content

Instantly share code, notes, and snippets.

@TApplencourt
Last active February 24, 2022 19:55
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save TApplencourt/cdb8d5b3de9899cab7ce2da7529d9e94 to your computer and use it in GitHub Desktop.
Save TApplencourt/cdb8d5b3de9899cab7ce2da7529d9e94 to your computer and use it in GitHub Desktop.
#define MAD_4(x, y) \
x = y * x + y; \
y = x * y + x; \
x = y * x + y; \
y = x * y + x;
#define MAD_16(x, y) \
MAD_4(x, y); \
MAD_4(x, y); \
MAD_4(x, y); \
MAD_4(x, y);
#define MAD_64(x, y) \
MAD_16(x, y); \
MAD_16(x, y); \
MAD_16(x, y); \
MAD_16(x, y);
#include <chrono>
#include <iostream>
#include <sycl/sycl.hpp>
template <class T> T busy_wait(long N, T i) {
T x = 1.3f;
T y = (T)i;
for (long j = 0; j < N; j++) {
MAD_64(x, y);
}
return y;
}
template <class T>
void out_of_order_queue(std::vector<std::string> modes, long C_tripcount, bool enable_profiling,
bool in_order, int n_queues, bool serial, long *total_cpu_time,
int *max_index_cpu_tine_command = NULL, long *max_cpu_time_command = NULL,
bool *concurent = NULL) {
const sycl::device D{sycl::gpu_selector()};
const int globalWIs = D.get_info<sycl::info::device::sub_group_sizes>()[0];
const int N = D.get_info<sycl::info::device::max_mem_alloc_size>() / sizeof(T);
const sycl::context C(D);
if (n_queues == -1)
n_queues = modes.size();
sycl::property_list pl;
if (enable_profiling && in_order)
pl = sycl::property_list{sycl::property::queue::in_order{},
sycl::property::queue::enable_profiling{}};
else if (enable_profiling)
pl = sycl::property_list{sycl::property::queue::enable_profiling{}};
else if (in_order)
pl = sycl::property_list{sycl::property::queue::in_order{}};
std::vector<sycl::queue> Qs;
for (int i = 0; i < n_queues; i++)
Qs.push_back(sycl::queue(C, D, pl));
std::vector<std::vector<T *>> buffers;
for (auto &mode : modes) {
std::vector<T *> buffer;
for (const char &t : mode) {
T *b;
if (t == 'D') {
b = sycl::malloc_device<T>(N, D, C);
} else if (t == 'H') {
b = sycl::malloc_host<T>(N, C);
} else if (t == 'M') {
b = static_cast<T *>(malloc(N * sizeof(T)));
std::fill(b, b + N, 0);
} else if (t == 'C') {
b = sycl::malloc_device<T>(globalWIs, D, C);
}
buffer.push_back(b);
}
buffers.push_back(buffer);
}
std::vector<long> cpu_times;
std::vector<sycl::event> events;
std::chrono::system_clock::time_point s0;
for (int r = 0; r < 2; r++) {
cpu_times.clear();
events.clear();
s0 = std::chrono::high_resolution_clock::now();
for (int i = 0; i < modes.size(); i++) {
const auto s = std::chrono::high_resolution_clock::now();
sycl::queue Q = Qs[i % n_queues];
sycl::event event;
if (modes[i] == "C") {
T *ptr = buffers[i][0];
event = Q.parallel_for(globalWIs, [ptr, C_tripcount](sycl::item<1> j) {
ptr[j] = busy_wait(C_tripcount, (T)j);
});
} else {
event = Q.copy(buffers[i][1], buffers[i][0], N);
}
if (enable_profiling)
events.push_back(event);
if (serial) {
Q.wait();
const auto e = std::chrono::high_resolution_clock::now();
cpu_times.push_back(std::chrono::duration_cast<std::chrono::microseconds>(e - s).count());
}
}
for (auto &Q : Qs)
Q.wait();
}
const auto e0 = std::chrono::high_resolution_clock::now();
for (const auto &buffer : buffers)
for (auto &ptr : buffer)
(sycl::get_pointer_type(ptr, C) != sycl::usm::alloc::unknown) ? sycl::free(ptr, C)
: free(ptr);
*total_cpu_time = {std::chrono::duration_cast<std::chrono::microseconds>(e0 - s0).count()};
if (serial) {
*max_index_cpu_tine_command =
std::distance(cpu_times.begin(), std::max_element(cpu_times.begin(), cpu_times.end()));
*max_cpu_time_command = cpu_times[*max_index_cpu_tine_command];
}
if (enable_profiling) {
*concurent = false;
for (int i = 0; i < events.size(); i++) {
for (int j = 0; j < events.size(); j++) {
if (i == j)
continue;
const auto starti =
events[i].get_profiling_info<sycl::info::event_profiling::command_start>();
const auto startj =
events[j].get_profiling_info<sycl::info::event_profiling::command_start>();
const auto endj = events[j].get_profiling_info<sycl::info::event_profiling::command_end>();
if ((startj <= starti) && (starti <= endj)) {
*concurent = true;
return;
}
}
}
}
}
int main(int argc, char *argv[]) {
long C_tripcount = atoi(argv[1]);
bool enable_profiling = (std::string{argv[2]} == "enable_profiling");
bool in_order = (std::string{argv[3]} == "in_order");
int n_queues = atoi(argv[4]);
std::vector<std::string> modes(argv + 5, argv + argc);
long serial_total_cpu_time;
int serial_max_cpu_time_index_command;
long serial_max_cpu_time_command;
bool serial_concurent;
out_of_order_queue<float>(modes, C_tripcount, enable_profiling, in_order, n_queues, true,
&serial_total_cpu_time, &serial_max_cpu_time_index_command,
&serial_max_cpu_time_command, &serial_concurent);
std::cout << "Total serial (us): " << serial_total_cpu_time << " (max commands (us) was "
<< modes[serial_max_cpu_time_index_command] << ": " << serial_max_cpu_time_command
<< ")" << std::endl;
long concurent_total_cpu_time;
bool concurent;
out_of_order_queue<float>(modes, C_tripcount, enable_profiling, in_order, n_queues, false,
&concurent_total_cpu_time, NULL, NULL, &concurent);
std::cout << "Total // (us): " << concurent_total_cpu_time << std::endl;
std::cout << "Got " << 1. * serial_total_cpu_time / concurent_total_cpu_time
<< "x speed-up relative to serial," << std::endl;
std::cout << "was expecting (assuming maximun concurency and negligeable runtime overhead) "
<< 1. * serial_total_cpu_time / serial_max_cpu_time_command << "x" << std::endl;
if (enable_profiling && concurent) {
std::cout << "SUCCESS: SYCL Event show concurentcy" << std::endl;
return 0;
}
if (concurent_total_cpu_time <= 1.20 * serial_max_cpu_time_command) {
std::cout << "SUCCESS: Possible Concurent execution" << std::endl;
return 0;
}
std::cout << "FAILURE: No Concurent Execution" << std::endl;
return 1;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment