Last active
May 6, 2022 22:02
-
-
Save TApplencourt/35d124cf1cf74d240d2c499cb070fbd8 to your computer and use it in GitHub Desktop.
L0 concurrent
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <level_zero/ze_api.h> | |
#include <iostream> | |
#include <fstream> | |
#include <memory> | |
#include <vector> | |
#include <chrono> | |
#include <limits> | |
#include <unistd.h> | |
// Check a Level Zero API call's return code; on failure print the failing
// expression, enclosing function, and line, then abort via std::terminate().
// The do { ... } while (0) wrapper is deliberately left WITHOUT a trailing
// semicolon so that `zeCall(x);` is a single statement and works inside an
// unbraced if/else (the original macro's trailing `;` broke that).
#define zeCall(myZeCall)                 \
  do {                                   \
    if (myZeCall != ZE_RESULT_SUCCESS) { \
      std::cout << "Error at "           \
                << #myZeCall << ": "     \
                << __FUNCTION__ << ": "  \
                << std::dec              \
                << __LINE__ << "\n";     \
      std::terminate();                  \
    }                                    \
  } while (0)
auto bench_multiple_queue(int n, | |
ze_context_handle_t context, | |
ze_device_handle_t device, | |
ze_kernel_handle_t kernel) { | |
std::vector<ze_command_list_handle_t> lists= {}; | |
std::vector<ze_event_handle_t> events= {}; | |
std::vector<void *> ptrs= {}; | |
ze_event_pool_desc_t eventPoolDesc = { | |
ZE_STRUCTURE_TYPE_EVENT_POOL_DESC, nullptr, | |
ZE_EVENT_POOL_FLAG_HOST_VISIBLE, // all events in pool are visible to Host | |
static_cast<uint32_t>(n) // count | |
}; | |
ze_event_pool_handle_t eventPool; | |
zeCall(zeEventPoolCreate(context, &eventPoolDesc, 0, nullptr, &eventPool)); | |
for (int i=0; i<n; i++) { | |
// Create an immediate command list for direct submission | |
ze_command_queue_desc_t cmdQueueDesc = {}; | |
cmdQueueDesc.stype = ZE_STRUCTURE_TYPE_COMMAND_QUEUE_DESC; | |
cmdQueueDesc.index = i % 4; // 4 CCSs | |
ze_command_list_handle_t cmdList = {}; | |
zeCall(zeCommandListCreateImmediate(context, device, &cmdQueueDesc, &cmdList)); | |
lists.push_back(cmdList); | |
ze_event_desc_t eventDesc = { | |
ZE_STRUCTURE_TYPE_EVENT_DESC, nullptr, | |
static_cast<uint32_t>(i), // index | |
0, // no additional memory/cache coherency required on signal | |
ZE_EVENT_SCOPE_FLAG_HOST // ensure memory coherency across device and Host | |
// after event completes | |
}; | |
ze_event_handle_t e; | |
zeCall(zeEventCreate(eventPool, &eventDesc, &e)); | |
events.push_back(e); | |
void *ptr = nullptr; | |
ze_device_mem_alloc_desc_t deviceDesc = {}; | |
ze_host_mem_alloc_desc_t hostDesc = {}; | |
zeCall(zeMemAllocShared(context, &deviceDesc, &hostDesc, 64, 0, device, &ptr)); | |
ptrs.push_back(ptr); | |
} | |
std::chrono::high_resolution_clock::time_point timeAtStart = std::chrono::high_resolution_clock::now(); | |
for (int i=0; i < n; i++) { | |
auto list = lists[i]; | |
const auto e = events[i]; | |
auto ptr = ptrs[i]; | |
ze_group_count_t threadGroupCount = {}; | |
threadGroupCount.groupCountX = 1u; | |
threadGroupCount.groupCountY = 1u; | |
threadGroupCount.groupCountZ = 1u; | |
zeCall(zeKernelSetArgumentValue(kernel, 0, 8, &ptr)); | |
zeCall(zeCommandListAppendLaunchKernel(list, kernel, &threadGroupCount, e, 0 , nullptr)); | |
} | |
for (int i=0; i < n; i++) { | |
const auto e = events[i]; | |
zeCall(zeEventHostSynchronize(e, std::numeric_limits<uint64_t>::max())); | |
} | |
std::chrono::high_resolution_clock::time_point timeAtEnd = std::chrono::high_resolution_clock::now(); | |
for (int i=0; i<n; i++) { | |
#ifndef FAST_BUT_ERROR | |
auto ptr = ptrs[i]; | |
zeCall(zeMemFree(context, ptr)); | |
auto list = lists[i]; | |
zeCall(zeCommandListDestroy(list)); | |
#endif | |
} | |
return std::chrono::duration_cast<std::chrono::milliseconds>(timeAtEnd - timeAtStart).count(); | |
} | |
int main(int argc, char *argv[]) { | |
zeCall(zeInit(ZE_INIT_FLAG_GPU_ONLY)); | |
uint32_t driverCount = 0; | |
zeCall(zeDriverGet(&driverCount, nullptr)); | |
ze_driver_handle_t driverHandle; | |
zeCall(zeDriverGet(&driverCount, &driverHandle)); | |
ze_context_handle_t context; | |
ze_context_desc_t contextDesc = {}; | |
zeCall(zeContextCreate(driverHandle, &contextDesc, &context)); | |
// Get the root devices | |
uint32_t deviceCount = 0; | |
zeCall(zeDeviceGet(driverHandle, &deviceCount, nullptr)); | |
if (deviceCount == 0) { | |
std::cout << "No devices found \n"; | |
std::terminate(); | |
} | |
ze_device_handle_t device; | |
deviceCount = 1; | |
zeCall(zeDeviceGet(driverHandle, &deviceCount, &device)); | |
// Create kernel | |
std::string kernelFile = "kernel.ar"; //"kernel_XE_HP_COREcore.spv"; | |
ze_module_format_t kernelFormat = ZE_MODULE_FORMAT_NATIVE; //ZE_MODULE_FORMAT_IL_SPIRV; | |
std::ifstream file(kernelFile, std::ios_base::in | std::ios_base::binary); | |
if (false == file.good()) { | |
std::cout << kernelFile << " file not found\n"; | |
std::terminate(); | |
} | |
uint32_t spirvSize = 0; | |
file.seekg(0, file.end); | |
spirvSize = static_cast<size_t>(file.tellg()); | |
file.seekg(0, file.beg); | |
auto spirvModule = std::make_unique<char[]>(spirvSize); | |
file.read(spirvModule.get(), spirvSize); | |
ze_module_handle_t module; | |
ze_module_desc_t moduleDesc = {}; | |
moduleDesc.format = kernelFormat; | |
moduleDesc.pInputModule = reinterpret_cast<const uint8_t *>(spirvModule.get()); | |
moduleDesc.inputSize = spirvSize; | |
zeCall(zeModuleCreate(context, device, &moduleDesc, &module, nullptr)); | |
ze_kernel_handle_t kernel; | |
ze_kernel_desc_t kernelDesc = {}; | |
kernelDesc.pKernelName = "sleep"; | |
zeCall(zeKernelCreate(module, &kernelDesc, &kernel)); | |
zeCall(zeKernelSetGroupSize(kernel, 1, 1 ,1 )); | |
const int N_kernel = 4; | |
#ifndef T1_AFTER | |
bench_multiple_queue(1, context, device, kernel); | |
const auto t1 = bench_multiple_queue(1, context, device, kernel); | |
std::cout << "1 kernel " << t1 << "ms" << std::endl; | |
#endif | |
long tN_min = std::numeric_limits<long>::max(); | |
long tN_sum = 0; | |
const int N = 10; | |
for (auto i=0; i < N; i++) { | |
const auto tN = bench_multiple_queue(N_kernel, context, device, kernel); | |
std::cout << " "<< i << " t" << N_kernel << " "<< tN << "ms" << std::endl; | |
tN_min = std::min(tN_min,tN); | |
tN_sum += tN; | |
} | |
const auto tN_avg = (1.*tN_sum) / N; | |
#ifdef T1_AFTER | |
bench_multiple_queue(1, context, device, kernel); | |
const auto t1 = bench_multiple_queue(1, context, device, kernel); | |
std::cout << "1 kernel " << t1 << "ms" << std::endl; | |
#endif | |
const auto max_speedup = (1.* N_kernel*t1) / tN_min; | |
const auto avg_speedup = (1.* N_kernel*t1) / tN_avg; | |
std::cout << "4 kernels | Average: " << 1.*tN_sum / N << " ms | Min: " << tN_min << " ms" << std::endl; | |
std::cout << "Maximun speedup " << max_speedup << "x" << std::endl; | |
std::cout << "Avg speedup " << avg_speedup << "x" << std::endl; | |
if (max_speedup <= 0.8*N_kernel) { | |
std::cout << "Not enough parallelism" << std::endl; | |
return 1; | |
} | |
return 0; | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <level_zero/ze_api.h> | |
#include <iostream> | |
#include <fstream> | |
#include <memory> | |
#include <vector> | |
#include <chrono> | |
#include <limits> | |
// Check a Level Zero API call's return code; on failure print the failing
// expression, enclosing function, and line, then abort via std::terminate().
// The do { ... } while (0) wrapper is deliberately left WITHOUT a trailing
// semicolon so that `zeCall(x);` is a single statement and works inside an
// unbraced if/else (the original macro's trailing `;` broke that).
#define zeCall(myZeCall)                 \
  do {                                   \
    if (myZeCall != ZE_RESULT_SUCCESS) { \
      std::cout << "Error at "           \
                << #myZeCall << ": "     \
                << __FUNCTION__ << ": "  \
                << std::dec              \
                << __LINE__ << "\n";     \
      std::terminate();                  \
    }                                    \
  } while (0)
auto bench_multiple_queue(int n, | |
ze_context_handle_t context, | |
ze_device_handle_t device, | |
ze_kernel_handle_t kernel) { | |
// Some magic number | |
const int computeOrdinal = 0; | |
const int n_index = 4; | |
std::vector<ze_command_queue_handle_t> queues= {}; | |
std::vector<ze_command_list_handle_t> lists= {}; | |
for (int index=0; index<n_index; index++) { | |
ze_command_queue_desc_t cmdQueueDesc = {}; | |
cmdQueueDesc.mode = ZE_COMMAND_QUEUE_MODE_ASYNCHRONOUS; | |
cmdQueueDesc.ordinal = computeOrdinal; | |
cmdQueueDesc.index = index; | |
ze_command_queue_handle_t queue; | |
zeCall(zeCommandQueueCreate(context, device, &cmdQueueDesc, &queue)); | |
queues.push_back(queue); | |
ze_command_list_desc_t listDesc = {}; | |
listDesc.commandQueueGroupOrdinal = computeOrdinal; | |
ze_command_list_handle_t list; | |
zeCall(zeCommandListCreate(context, device, &listDesc, &list)); | |
lists.push_back(list); | |
} | |
std::chrono::high_resolution_clock::time_point timeAtStart = std::chrono::high_resolution_clock::now(); | |
for (int i=0; i < n; i++) { | |
const auto queue = queues[i]; | |
auto list = lists[i]; | |
ze_group_count_t threadGroupCount = {}; | |
threadGroupCount.groupCountX = 1u; | |
threadGroupCount.groupCountY = 1u; | |
threadGroupCount.groupCountZ = 1u; | |
zeCall(zeCommandListAppendLaunchKernel(list, kernel, &threadGroupCount, nullptr, 0 , nullptr)); | |
zeCall(zeCommandListClose(list)); | |
zeCall(zeCommandQueueExecuteCommandLists(queue,1, &list, nullptr)); | |
} | |
for (int i=0; i < n; i++) { | |
const auto queue = queues[i]; | |
zeCall(zeCommandQueueSynchronize(queue, std::numeric_limits<uint64_t>::max())); | |
} | |
std::chrono::high_resolution_clock::time_point timeAtEnd = std::chrono::high_resolution_clock::now(); | |
return std::chrono::duration_cast<std::chrono::milliseconds>(timeAtEnd - timeAtStart).count(); | |
} | |
int main(int argc, char *argv[]) { | |
zeCall(zeInit(ZE_INIT_FLAG_GPU_ONLY)); | |
uint32_t driverCount = 0; | |
zeCall(zeDriverGet(&driverCount, nullptr)); | |
ze_driver_handle_t driverHandle; | |
zeCall(zeDriverGet(&driverCount, &driverHandle)); | |
ze_context_handle_t context; | |
ze_context_desc_t contextDesc = {}; | |
zeCall(zeContextCreate(driverHandle, &contextDesc, &context)); | |
// Get the root devices | |
uint32_t deviceCount = 0; | |
zeCall(zeDeviceGet(driverHandle, &deviceCount, nullptr)); | |
if (deviceCount == 0) { | |
std::cout << "No devices found \n"; | |
std::terminate(); | |
} | |
ze_device_handle_t device; | |
deviceCount = 1; | |
zeCall(zeDeviceGet(driverHandle, &deviceCount, &device)); | |
// Create kernel | |
std::string kernelFile = "kernel_XE_HP_COREcore.spv"; | |
ze_module_format_t kernelFormat = ZE_MODULE_FORMAT_IL_SPIRV; | |
std::ifstream file(kernelFile, std::ios_base::in | std::ios_base::binary); | |
if (false == file.good()) { | |
std::cout << kernelFile << " file not found\n"; | |
std::terminate(); | |
} | |
uint32_t spirvSize = 0; | |
file.seekg(0, file.end); | |
spirvSize = static_cast<size_t>(file.tellg()); | |
file.seekg(0, file.beg); | |
auto spirvModule = std::make_unique<char[]>(spirvSize); | |
file.read(spirvModule.get(), spirvSize); | |
ze_module_handle_t module; | |
ze_module_desc_t moduleDesc = {}; | |
moduleDesc.format = kernelFormat; | |
moduleDesc.pInputModule = reinterpret_cast<const uint8_t *>(spirvModule.get()); | |
moduleDesc.inputSize = spirvSize; | |
zeCall(zeModuleCreate(context, device, &moduleDesc, &module, nullptr)); | |
ze_kernel_handle_t kernel; | |
ze_kernel_desc_t kernelDesc = {}; | |
kernelDesc.pKernelName = "sleep"; | |
zeCall(zeKernelCreate(module, &kernelDesc, &kernel)); | |
zeCall(zeKernelSetGroupSize(kernel, 1, 1 ,1 )); | |
void *ptr = nullptr; | |
ze_device_mem_alloc_desc_t deviceDesc = {}; | |
ze_host_mem_alloc_desc_t hostDesc = {}; | |
zeCall(zeMemAllocShared(context, &deviceDesc, &hostDesc, 64, 0, device, &ptr)); | |
zeCall(zeKernelSetArgumentValue(kernel, 0, 8, &ptr)); | |
bench_multiple_queue(1, context, device, kernel); | |
const auto t1 = bench_multiple_queue(1, context, device, kernel); | |
const auto t4 = bench_multiple_queue(4, context, device, kernel); | |
const auto slowdown = 1.*t4/t1; | |
std::cout << "1 kernel " << t1 << "ms" << std::endl; | |
std::cout << "4 kernels " << t4 << "ms" << std::endl; | |
std::cout << "Slowdown " << slowdown << std::endl; | |
if (slowdown >= 1.5) { | |
std::cout << "Not enough parallelism" << std::endl; | |
return 1; | |
} | |
return 0; | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Level Zero concurrency micro-benchmarks, driven through THAPI's ruby
# ze bindings. Sets up the device, context, kernel, and its output argument.
thapi_root = ENV["THAPI_ROOT"]
require "/#{thapi_root}/share/ze_bindings"

# NOTE: the original assigned `mode = ARGV.first` here; it was dead code,
# unconditionally reassigned at the bottom of the script before any use.

driver = ZE::drivers.first
DEVICE = driver.devices.first
CONTEXT = driver.context_create

GROUP_COUNT = 1
GROUP_SIZE = 256

# 64-byte device allocation the kernel writes its result to.
ptr = CONTEXT.mem_alloc_device(64, DEVICE)

# Load the pre-compiled (ocloc) native binary and build the "sleep" kernel.
input_module = File.read("kernel_XE_HP_COREcore.bin", mode: "rb")
mod, log = CONTEXT.module_create(DEVICE, input_module, format: :ZE_MODULE_FORMAT_NATIVE)
KERNEL = mod.kernel_create("sleep")
KERNEL.set_group_size(GROUP_SIZE)

# Bind the output pointer as kernel argument 0.
p_ptr = FFI::MemoryPointer.new(:pointer)
p_ptr.write_pointer(ptr)
KERNEL.set_argument_value(0, p_ptr)
def bench_serial(n)
  # Mimics what the OpenMP and SYCL runtimes do: a single command list is
  # reused for every launch, and each launch is synchronized before the
  # next — avoids per-kernel list creation, but is of course fully serial.
  queue = CONTEXT.command_queue_create(DEVICE)
  list = CONTEXT.command_list_create(DEVICE)
  start = Process.clock_gettime(Process::CLOCK_MONOTONIC)
  n.times do
    list.append_launch_kernel(KERNEL, GROUP_COUNT)
    list.close
    queue.execute_command_lists(list)
    queue.synchronize
    list.reset
  end
  Process.clock_gettime(Process::CLOCK_MONOTONIC) - start
end
def bench_immediate(n, serial = false)
  # What the OpenMP/SYCL runtimes could do instead: one immediate (async)
  # command list, which lets the n kernels execute concurrently. Passing
  # serial: true appends a barrier after each launch, forcing serial order.
  list = CONTEXT.command_list_create_immediate(DEVICE, mode: :ZE_COMMAND_QUEUE_MODE_ASYNCHRONOUS, flags: :ZE_COMMAND_QUEUE_FLAG_EXPLICIT_ONLY)
  pool = CONTEXT.event_pool_create(1, flags: [:ZE_EVENT_POOL_FLAG_HOST_VISIBLE])
  done = pool.event_create(0)
  start = Process.clock_gettime(Process::CLOCK_MONOTONIC)
  n.times do
    list.append_launch_kernel(KERNEL, GROUP_COUNT)
    list.append_barrier if serial
  end
  # Final barrier signals `done` once everything appended above completed.
  list.append_barrier(signal_event: done)
  done.host_synchronize(timeout: ZE::UINT64_MAX)
  Process.clock_gettime(Process::CLOCK_MONOTONIC) - start
end
def bench_multiple_list(n)
  # One queue fed n distinct command lists in a single execute call.
  queue = CONTEXT.command_queue_create(DEVICE)
  lists = Array.new(n) { CONTEXT.command_list_create(DEVICE) }
  start = Process.clock_gettime(Process::CLOCK_MONOTONIC)
  lists.each do |list|
    list.append_launch_kernel(KERNEL, GROUP_COUNT)
    list.close
  end
  queue.execute_command_lists(lists)
  queue.synchronize
  lists.each(&:reset)
  Process.clock_gettime(Process::CLOCK_MONOTONIC) - start
end
def bench_multiple_queue(n)
  # Mimics the CUDA multi-stream pattern: one (queue, command list) pair per
  # kernel, submitted independently and synchronized afterwards.
  pairs = Array.new(n) { [CONTEXT.command_queue_create(DEVICE), CONTEXT.command_list_create(DEVICE)] }
  start = Process.clock_gettime(Process::CLOCK_MONOTONIC)
  pairs.each do |queue, list|
    list.append_launch_kernel(KERNEL, GROUP_COUNT)
    list.close
    queue.execute_command_lists(list)
  end
  pairs.each do |queue, list|
    queue.synchronize
    list.reset
  end
  Process.clock_gettime(Process::CLOCK_MONOTONIC) - start
end
def bench_batched(n)
  # The way the Level Zero API was designed to be used: batch all n kernels
  # into one command list and execute it once. Sadly this seems hard for a
  # runtime to exploit without user guidance.
  queue = CONTEXT.command_queue_create(DEVICE)
  list = CONTEXT.command_list_create(DEVICE)
  start = Process.clock_gettime(Process::CLOCK_MONOTONIC)
  n.times { list.append_launch_kernel(KERNEL, GROUP_COUNT) }
  list.close
  queue.execute_command_lists(list)
  queue.synchronize
  list.reset
  Process.clock_gettime(Process::CLOCK_MONOTONIC) - start
end
def wake_up()
  # Warm up the GPU (clock ramp-up, first-submission overheads) with one
  # throwaway serial launch before taking any measurements.
  bench_serial(1)
end
# A benchmark method `f` is judged concurrent when running n kernels takes
# about as long as running a single one (within a 20% tolerance).
def concurent?(f, n)
  t1 = send(f, 1)
  tN = send(f, n)
  ok = ((tN - t1) / tN).abs <= 0.20
  p [f, "n", n, "t1", t1, "tN", tN, ok]
  ok
end
# Usage: ruby concurrent.rb <bench_method> [n]
# Exit status 0 when the chosen benchmark ran its n kernels concurrently.
mode = ARGV[0].to_s
n = ARGV.size >= 2 ? ARGV[1].to_i: 5
wake_up
exit(concurent?(mode,n))
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Chains of dependent fused multiply-adds; each macro quadruples the work of
// the previous one. The x/y cross-dependency forms one long serial chain, so
// the compiler cannot parallelize or shorten it.
#define MAD_4(x, y) x = mad(y, x, y); y = mad(x, y, x); x = mad(y, x, y); y = mad(x, y, x);
#define MAD_16(x, y) MAD_4(x, y); MAD_4(x, y); MAD_4(x, y); MAD_4(x, y);
#define MAD_64(x, y) MAD_16(x, y); MAD_16(x, y); MAD_16(x, y); MAD_16(x, y);

// "sleep": a compute-bound busy-wait kernel (64Ki iterations of 64 dependent
// mads) used to keep the device occupied for a measurable time. The final
// store to ptr keeps the compiler from eliminating the loop as dead code.
__kernel void sleep(__global double *ptr) {
  double x = (double)get_local_id(1);
  double y = (double)get_local_id(0);
  for(int i=0; i<1024*64; i++)
  {
    MAD_64(x, y);
  }
  ptr[0] = y;
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Compile kernel.cl to a native device binary for XeHP SDV with ocloc,
# then run the immediate-command-list concurrency benchmark.
ocloc compile -file kernel.cl -device xe_hp_sdv
ruby concurrent.rb bench_immediate
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment