Skip to content

Instantly share code, notes, and snippets.

@TApplencourt
Last active May 6, 2022 22:02
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save TApplencourt/35d124cf1cf74d240d2c499cb070fbd8 to your computer and use it in GitHub Desktop.
Save TApplencourt/35d124cf1cf74d240d2c499cb070fbd8 to your computer and use it in GitHub Desktop.
L0 concurrent
#include <level_zero/ze_api.h>
#include <iostream>
#include <fstream>
#include <memory>
#include <vector>
#include <chrono>
#include <limits>
#include <unistd.h>
// Checks a Level Zero call's result; on failure prints the failing
// expression, enclosing function, line number, and the result code, then
// aborts.  BUGFIX: the trailing semicolon after `while (0)` was removed —
// with it, `zeCall(x);` expanded to two statements and broke use inside
// unbraced `if`/`else` bodies (the whole point of the do/while(0) idiom).
#define zeCall(myZeCall) \
do { \
ze_result_t zeResult_ = (myZeCall); \
if (zeResult_ != ZE_RESULT_SUCCESS) { \
std::cout << "Error at " \
<< #myZeCall << ": " \
<< __FUNCTION__ << ": " \
<< std::dec \
<< __LINE__ \
<< " (result 0x" << std::hex << zeResult_ << std::dec << ")\n"; \
std::terminate(); \
} \
} while (0)
auto bench_multiple_queue(int n,
ze_context_handle_t context,
ze_device_handle_t device,
ze_kernel_handle_t kernel) {
std::vector<ze_command_list_handle_t> lists= {};
std::vector<ze_event_handle_t> events= {};
std::vector<void *> ptrs= {};
ze_event_pool_desc_t eventPoolDesc = {
ZE_STRUCTURE_TYPE_EVENT_POOL_DESC, nullptr,
ZE_EVENT_POOL_FLAG_HOST_VISIBLE, // all events in pool are visible to Host
static_cast<uint32_t>(n) // count
};
ze_event_pool_handle_t eventPool;
zeCall(zeEventPoolCreate(context, &eventPoolDesc, 0, nullptr, &eventPool));
for (int i=0; i<n; i++) {
// Create an immediate command list for direct submission
ze_command_queue_desc_t cmdQueueDesc = {};
cmdQueueDesc.stype = ZE_STRUCTURE_TYPE_COMMAND_QUEUE_DESC;
cmdQueueDesc.index = i % 4; // 4 CCSs
ze_command_list_handle_t cmdList = {};
zeCall(zeCommandListCreateImmediate(context, device, &cmdQueueDesc, &cmdList));
lists.push_back(cmdList);
ze_event_desc_t eventDesc = {
ZE_STRUCTURE_TYPE_EVENT_DESC, nullptr,
static_cast<uint32_t>(i), // index
0, // no additional memory/cache coherency required on signal
ZE_EVENT_SCOPE_FLAG_HOST // ensure memory coherency across device and Host
// after event completes
};
ze_event_handle_t e;
zeCall(zeEventCreate(eventPool, &eventDesc, &e));
events.push_back(e);
void *ptr = nullptr;
ze_device_mem_alloc_desc_t deviceDesc = {};
ze_host_mem_alloc_desc_t hostDesc = {};
zeCall(zeMemAllocShared(context, &deviceDesc, &hostDesc, 64, 0, device, &ptr));
ptrs.push_back(ptr);
}
std::chrono::high_resolution_clock::time_point timeAtStart = std::chrono::high_resolution_clock::now();
for (int i=0; i < n; i++) {
auto list = lists[i];
const auto e = events[i];
auto ptr = ptrs[i];
ze_group_count_t threadGroupCount = {};
threadGroupCount.groupCountX = 1u;
threadGroupCount.groupCountY = 1u;
threadGroupCount.groupCountZ = 1u;
zeCall(zeKernelSetArgumentValue(kernel, 0, 8, &ptr));
zeCall(zeCommandListAppendLaunchKernel(list, kernel, &threadGroupCount, e, 0 , nullptr));
}
for (int i=0; i < n; i++) {
const auto e = events[i];
zeCall(zeEventHostSynchronize(e, std::numeric_limits<uint64_t>::max()));
}
std::chrono::high_resolution_clock::time_point timeAtEnd = std::chrono::high_resolution_clock::now();
for (int i=0; i<n; i++) {
#ifndef FAST_BUT_ERROR
auto ptr = ptrs[i];
zeCall(zeMemFree(context, ptr));
auto list = lists[i];
zeCall(zeCommandListDestroy(list));
#endif
}
return std::chrono::duration_cast<std::chrono::milliseconds>(timeAtEnd - timeAtStart).count();
}
// Measures the speedup of N_kernel concurrent "sleep" launches vs a single
// launch, using a pre-built native module ("kernel.ar").  Exits 1 when the
// best observed speedup indicates the launches did not actually overlap.
// Build with -DT1_AFTER to take the single-kernel baseline after the
// concurrent runs instead of before them.
int main(int argc, char *argv[]) {
  zeCall(zeInit(ZE_INIT_FLAG_GPU_ONLY));

  uint32_t driverCount = 0;
  zeCall(zeDriverGet(&driverCount, nullptr));
  // BUGFIX: clamp the count to 1 — storage for a single handle is passed
  // below, so forwarding the full count would overflow the stack on systems
  // exposing more than one driver.
  driverCount = 1;
  ze_driver_handle_t driverHandle;
  zeCall(zeDriverGet(&driverCount, &driverHandle));

  ze_context_handle_t context;
  ze_context_desc_t contextDesc = {};
  zeCall(zeContextCreate(driverHandle, &contextDesc, &context));

  // Get the root devices; only the first one is benchmarked.
  uint32_t deviceCount = 0;
  zeCall(zeDeviceGet(driverHandle, &deviceCount, nullptr));
  if (deviceCount == 0) {
    std::cout << "No devices found \n";
    std::terminate();
  }
  ze_device_handle_t device;
  deviceCount = 1;
  zeCall(zeDeviceGet(driverHandle, &deviceCount, &device));

  // Load the pre-compiled native module and extract the "sleep" kernel.
  std::string kernelFile = "kernel.ar"; //"kernel_XE_HP_COREcore.spv";
  ze_module_format_t kernelFormat = ZE_MODULE_FORMAT_NATIVE; //ZE_MODULE_FORMAT_IL_SPIRV;
  std::ifstream file(kernelFile, std::ios_base::in | std::ios_base::binary);
  if (false == file.good()) {
    std::cout << kernelFile << " file not found\n";
    std::terminate();
  }
  file.seekg(0, file.end);
  // BUGFIX: was declared uint32_t — size_t avoids truncating large modules.
  const size_t spirvSize = static_cast<size_t>(file.tellg());
  file.seekg(0, file.beg);
  auto spirvModule = std::make_unique<char[]>(spirvSize);
  file.read(spirvModule.get(), static_cast<std::streamsize>(spirvSize));

  ze_module_handle_t module;
  ze_module_desc_t moduleDesc = {};
  moduleDesc.format = kernelFormat;
  moduleDesc.pInputModule = reinterpret_cast<const uint8_t *>(spirvModule.get());
  moduleDesc.inputSize = spirvSize;
  zeCall(zeModuleCreate(context, device, &moduleDesc, &module, nullptr));
  ze_kernel_handle_t kernel;
  ze_kernel_desc_t kernelDesc = {};
  kernelDesc.pKernelName = "sleep";
  zeCall(zeKernelCreate(module, &kernelDesc, &kernel));
  zeCall(zeKernelSetGroupSize(kernel, 1, 1, 1));

  const int N_kernel = 4;
#ifndef T1_AFTER
  bench_multiple_queue(1, context, device, kernel); // warm-up run, discarded
  const auto t1 = bench_multiple_queue(1, context, device, kernel);
  std::cout << "1 kernel " << t1 << "ms" << std::endl;
#endif

  long tN_min = std::numeric_limits<long>::max();
  long tN_sum = 0;
  const int N = 10; // repetitions of the concurrent measurement
  for (auto i = 0; i < N; i++) {
    const auto tN = bench_multiple_queue(N_kernel, context, device, kernel);
    std::cout << " " << i << " t" << N_kernel << " " << tN << "ms" << std::endl;
    tN_min = std::min(tN_min, tN);
    tN_sum += tN;
  }
  const auto tN_avg = (1. * tN_sum) / N;

#ifdef T1_AFTER
  bench_multiple_queue(1, context, device, kernel); // warm-up run, discarded
  const auto t1 = bench_multiple_queue(1, context, device, kernel);
  std::cout << "1 kernel " << t1 << "ms" << std::endl;
#endif

  // Perfect overlap would give a speedup of N_kernel; require > 0.8 of it.
  const auto max_speedup = (1. * N_kernel * t1) / tN_min;
  const auto avg_speedup = (1. * N_kernel * t1) / tN_avg;
  std::cout << "4 kernels | Average: " << 1. * tN_sum / N << " ms | Min: " << tN_min << " ms" << std::endl;
  std::cout << "Maximum speedup " << max_speedup << "x" << std::endl; // BUGFIX: typo "Maximun"
  std::cout << "Avg speedup " << avg_speedup << "x" << std::endl;
  if (max_speedup <= 0.8 * N_kernel) {
    std::cout << "Not enough parallelism" << std::endl;
    return 1;
  }
  return 0;
}
#include <level_zero/ze_api.h>
#include <iostream>
#include <fstream>
#include <memory>
#include <vector>
#include <chrono>
#include <limits>
// Checks a Level Zero call's result; on failure prints the failing
// expression, enclosing function, line number, and the result code, then
// aborts.  BUGFIX: the trailing semicolon after `while (0)` was removed —
// with it, `zeCall(x);` expanded to two statements and broke use inside
// unbraced `if`/`else` bodies (the whole point of the do/while(0) idiom).
#define zeCall(myZeCall) \
do { \
ze_result_t zeResult_ = (myZeCall); \
if (zeResult_ != ZE_RESULT_SUCCESS) { \
std::cout << "Error at " \
<< #myZeCall << ": " \
<< __FUNCTION__ << ": " \
<< std::dec \
<< __LINE__ \
<< " (result 0x" << std::hex << zeResult_ << std::dec << ")\n"; \
std::terminate(); \
} \
} while (0)
auto bench_multiple_queue(int n,
ze_context_handle_t context,
ze_device_handle_t device,
ze_kernel_handle_t kernel) {
// Some magic number
const int computeOrdinal = 0;
const int n_index = 4;
std::vector<ze_command_queue_handle_t> queues= {};
std::vector<ze_command_list_handle_t> lists= {};
for (int index=0; index<n_index; index++) {
ze_command_queue_desc_t cmdQueueDesc = {};
cmdQueueDesc.mode = ZE_COMMAND_QUEUE_MODE_ASYNCHRONOUS;
cmdQueueDesc.ordinal = computeOrdinal;
cmdQueueDesc.index = index;
ze_command_queue_handle_t queue;
zeCall(zeCommandQueueCreate(context, device, &cmdQueueDesc, &queue));
queues.push_back(queue);
ze_command_list_desc_t listDesc = {};
listDesc.commandQueueGroupOrdinal = computeOrdinal;
ze_command_list_handle_t list;
zeCall(zeCommandListCreate(context, device, &listDesc, &list));
lists.push_back(list);
}
std::chrono::high_resolution_clock::time_point timeAtStart = std::chrono::high_resolution_clock::now();
for (int i=0; i < n; i++) {
const auto queue = queues[i];
auto list = lists[i];
ze_group_count_t threadGroupCount = {};
threadGroupCount.groupCountX = 1u;
threadGroupCount.groupCountY = 1u;
threadGroupCount.groupCountZ = 1u;
zeCall(zeCommandListAppendLaunchKernel(list, kernel, &threadGroupCount, nullptr, 0 , nullptr));
zeCall(zeCommandListClose(list));
zeCall(zeCommandQueueExecuteCommandLists(queue,1, &list, nullptr));
}
for (int i=0; i < n; i++) {
const auto queue = queues[i];
zeCall(zeCommandQueueSynchronize(queue, std::numeric_limits<uint64_t>::max()));
}
std::chrono::high_resolution_clock::time_point timeAtEnd = std::chrono::high_resolution_clock::now();
return std::chrono::duration_cast<std::chrono::milliseconds>(timeAtEnd - timeAtStart).count();
}
// Compares one "sleep" kernel launch against four concurrent launches via
// separate queues and reports the slowdown.  Loads the kernel from a
// SPIR-V module.  Exits 1 when 4 launches take >= 1.5x one launch (i.e.
// the hardware/driver did not overlap them).
int main(int argc, char *argv[]) {
  zeCall(zeInit(ZE_INIT_FLAG_GPU_ONLY));

  uint32_t driverCount = 0;
  zeCall(zeDriverGet(&driverCount, nullptr));
  // BUGFIX: clamp the count to 1 — storage for a single handle is passed
  // below, so forwarding the full count would overflow the stack on systems
  // exposing more than one driver.
  driverCount = 1;
  ze_driver_handle_t driverHandle;
  zeCall(zeDriverGet(&driverCount, &driverHandle));

  ze_context_handle_t context;
  ze_context_desc_t contextDesc = {};
  zeCall(zeContextCreate(driverHandle, &contextDesc, &context));

  // Get the root devices; only the first one is used.
  uint32_t deviceCount = 0;
  zeCall(zeDeviceGet(driverHandle, &deviceCount, nullptr));
  if (deviceCount == 0) {
    std::cout << "No devices found \n";
    std::terminate();
  }
  ze_device_handle_t device;
  deviceCount = 1;
  zeCall(zeDeviceGet(driverHandle, &deviceCount, &device));

  // Build the module from SPIR-V and create the "sleep" kernel.
  std::string kernelFile = "kernel_XE_HP_COREcore.spv";
  ze_module_format_t kernelFormat = ZE_MODULE_FORMAT_IL_SPIRV;
  std::ifstream file(kernelFile, std::ios_base::in | std::ios_base::binary);
  if (false == file.good()) {
    std::cout << kernelFile << " file not found\n";
    std::terminate();
  }
  file.seekg(0, file.end);
  // BUGFIX: was declared uint32_t — size_t avoids truncating large modules.
  const size_t spirvSize = static_cast<size_t>(file.tellg());
  file.seekg(0, file.beg);
  auto spirvModule = std::make_unique<char[]>(spirvSize);
  file.read(spirvModule.get(), static_cast<std::streamsize>(spirvSize));

  ze_module_handle_t module;
  ze_module_desc_t moduleDesc = {};
  moduleDesc.format = kernelFormat;
  moduleDesc.pInputModule = reinterpret_cast<const uint8_t *>(spirvModule.get());
  moduleDesc.inputSize = spirvSize;
  zeCall(zeModuleCreate(context, device, &moduleDesc, &module, nullptr));
  ze_kernel_handle_t kernel;
  ze_kernel_desc_t kernelDesc = {};
  kernelDesc.pKernelName = "sleep";
  zeCall(zeKernelCreate(module, &kernelDesc, &kernel));
  zeCall(zeKernelSetGroupSize(kernel, 1, 1, 1));

  // The kernel's single argument: a 64-byte shared allocation.
  void *ptr = nullptr;
  ze_device_mem_alloc_desc_t deviceDesc = {};
  ze_host_mem_alloc_desc_t hostDesc = {};
  zeCall(zeMemAllocShared(context, &deviceDesc, &hostDesc, 64, 0, device, &ptr));
  // BUGFIX: sizeof(ptr) instead of the hard-coded 8.
  zeCall(zeKernelSetArgumentValue(kernel, 0, sizeof(ptr), &ptr));

  bench_multiple_queue(1, context, device, kernel); // warm-up run, discarded
  const auto t1 = bench_multiple_queue(1, context, device, kernel);
  const auto t4 = bench_multiple_queue(4, context, device, kernel);
  const auto slowdown = 1. * t4 / t1;
  std::cout << "1 kernel " << t1 << "ms" << std::endl;
  std::cout << "4 kernels " << t4 << "ms" << std::endl;
  std::cout << "Slowdown " << slowdown << std::endl;
  if (slowdown >= 1.5) {
    std::cout << "Not enough parallelism" << std::endl;
    return 1;
  }
  return 0;
}
# Setup: load the THAPI Level Zero Ruby bindings and build the global
# objects (device, context, kernel) shared by every benchmark below.
thapi_root = ENV["THAPI_ROOT"]
require "/#{thapi_root}/share/ze_bindings"
# NOTE(review): this assignment is dead — `mode` is re-read from ARGV near
# the bottom of the script before it is ever used.
mode = ARGV.first
driver = ZE::drivers.first
DEVICE = driver.devices.first
CONTEXT = driver.context_create
GROUP_COUNT=1
GROUP_SIZE=256
# 64-byte device allocation used as the kernel's only argument.
ptr = CONTEXT.mem_alloc_device(64, DEVICE)
# Pre-built native binary (produced with ocloc) containing the "sleep" kernel.
input_module = File.read("kernel_XE_HP_COREcore.bin",mode: "rb")
mod, log = CONTEXT.module_create(DEVICE, input_module, format: :ZE_MODULE_FORMAT_NATIVE)
KERNEL = mod.kernel_create("sleep")
KERNEL.set_group_size(GROUP_SIZE)
# Wrap the raw device pointer so it can be passed to set_argument_value.
p_ptr = FFI::MemoryPointer.new(:pointer)
p_ptr.write_pointer(ptr)
KERNEL.set_argument_value(0, p_ptr)
# Serial baseline: one command queue and one command list reused for all n
# kernels (the pattern OpenMP / SYCL runtimes use to avoid creating a list
# per submission).  Each iteration submits one kernel and waits for it, so
# the launches cannot overlap.  Returns the elapsed wall-clock seconds.
def bench_serial(n)
  queue = CONTEXT.command_queue_create(DEVICE)
  list = CONTEXT.command_list_create(DEVICE)
  start = Process.clock_gettime(Process::CLOCK_MONOTONIC)
  n.times do
    list.append_launch_kernel(KERNEL, GROUP_COUNT)
    list.close
    queue.execute_command_lists(list)
    queue.synchronize
    list.reset
  end
  Process.clock_gettime(Process::CLOCK_MONOTONIC) - start
end
# Single immediate (asynchronous) command list: every appended kernel is
# submitted right away, so the n launches may run concurrently.  A final
# barrier signals a host-visible event that we wait on.  Pass serial=true
# to insert a barrier after every kernel and force serialization.
# Returns the elapsed wall-clock seconds.
def bench_immediate(n, serial = false)
  list = CONTEXT.command_list_create_immediate(DEVICE, mode: :ZE_COMMAND_QUEUE_MODE_ASYNCHRONOUS, flags: :ZE_COMMAND_QUEUE_FLAG_EXPLICIT_ONLY)
  pool = CONTEXT.event_pool_create(1, flags: [:ZE_EVENT_POOL_FLAG_HOST_VISIBLE])
  done = pool.event_create(0)
  start = Process.clock_gettime(Process::CLOCK_MONOTONIC)
  n.times do
    list.append_launch_kernel(KERNEL, GROUP_COUNT)
    list.append_barrier if serial
  end
  list.append_barrier(signal_event: done)
  done.host_synchronize(timeout: ZE::UINT64_MAX)
  Process.clock_gettime(Process::CLOCK_MONOTONIC) - start
end
# One shared queue, n distinct command lists submitted together in a single
# execute_command_lists call.  Returns the elapsed wall-clock seconds.
def bench_multiple_list(n)
  queue = CONTEXT.command_queue_create(DEVICE)
  lists = Array.new(n) { CONTEXT.command_list_create(DEVICE) }
  start = Process.clock_gettime(Process::CLOCK_MONOTONIC)
  lists.each do |list|
    list.append_launch_kernel(KERNEL, GROUP_COUNT)
    list.close
  end
  queue.execute_command_lists(lists)
  queue.synchronize
  lists.each(&:reset)
  Process.clock_gettime(Process::CLOCK_MONOTONIC) - start
end
# One (command queue, command list) pair per kernel — mimics the
# one-stream-per-kernel pattern CUDA users rely on.  All n pairs are
# submitted first, then synchronized.  Returns the elapsed wall-clock
# seconds.
def bench_multiple_queue(n)
  pairs = Array.new(n) { [CONTEXT.command_queue_create(DEVICE), CONTEXT.command_list_create(DEVICE)] }
  start = Process.clock_gettime(Process::CLOCK_MONOTONIC)
  pairs.each do |queue, list|
    list.append_launch_kernel(KERNEL, GROUP_COUNT)
    list.close
    queue.execute_command_lists(list)
  end
  pairs.each do |queue, list|
    queue.synchronize
    list.reset
  end
  Process.clock_gettime(Process::CLOCK_MONOTONIC) - start
end
# Batched submission, the way the Level Zero API was designed: append all n
# kernels to one command list, close it, and execute it once.  Hard for a
# runtime to do automatically without user guidance.  Returns the elapsed
# wall-clock seconds.
def bench_batched(n)
  queue = CONTEXT.command_queue_create(DEVICE)
  list = CONTEXT.command_list_create(DEVICE)
  start = Process.clock_gettime(Process::CLOCK_MONOTONIC)
  n.times { list.append_launch_kernel(KERNEL, GROUP_COUNT) }
  list.close
  queue.execute_command_lists(list)
  queue.synchronize
  list.reset
  Process.clock_gettime(Process::CLOCK_MONOTONIC) - start
end
def wake_up()
# Wake the GPU by submitting one trivial kernel, so the first measured
# benchmark does not include device power-up latency.
bench_serial(1)
end
# Runs benchmark `f` once with a single kernel and once with n kernels.
# The benchmarks are judged "concurrent" when the two wall-times differ by
# at most 20% (overlapping launches cost roughly the same as one).
# Prints the measurements and returns the verdict.
def concurent?(f, n)
  time_one = send(f, 1)
  time_n = send(f, n)
  verdict = ((time_n - time_one) / time_n).abs <= 0.20
  p [f, "n", n, "t1", time_one, "tN", time_n, verdict]
  verdict
end
# CLI entry: first argument names the benchmark method, optional second
# argument is the kernel count (default 5).  Exit status reflects the
# concurrency verdict.
bench_name = ARGV[0].to_s
kernel_count = ARGV.size >= 2 ? ARGV[1].to_i : 5
wake_up
exit(concurent?(bench_name, kernel_count))
// Chains of dependent fused multiply-adds used as a pure-compute delay:
// MAD_4 issues 4 mads, MAD_16 = 4 x MAD_4, MAD_64 = 4 x MAD_16.  Each mad
// depends on the previous one, so they cannot be reordered away.
#define MAD_4(x, y) x = mad(y, x, y); y = mad(x, y, x); x = mad(y, x, y); y = mad(x, y, x);
#define MAD_16(x, y) MAD_4(x, y); MAD_4(x, y); MAD_4(x, y); MAD_4(x, y);
#define MAD_64(x, y) MAD_16(x, y); MAD_16(x, y); MAD_16(x, y); MAD_16(x, y);
// "sleep" kernel: keeps each work-item busy with 1024*64 iterations of 64
// dependent mads, then stores the result so the compiler cannot eliminate
// the loop as dead code.
// NOTE(review): assumes double-precision support (cl_khr_fp64) on the
// target device — confirm before porting.
__kernel void sleep(__global double *ptr) {
double x = (double)get_local_id(1);
double y = (double)get_local_id(0);
for(int i=0; i<1024*64; i++)
{
MAD_64(x, y);
}
ptr[0] = y;
}
ocloc compile -file kernel.cl -device xe_hp_sdv
ruby concurrent.rb bench_immediate
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment