From d4700e59f0ae2bcd031c0e504e6d7967fd7c6c3a Mon Sep 17 00:00:00 2001 From: Uwe Dolinsky Date: Thu, 27 Feb 2025 11:02:25 +0000 Subject: [PATCH 01/95] [NATIVECPU] faster enqueue for larger ranges --- .../source/adapters/native_cpu/enqueue.cpp | 173 +++++++++++------- 1 file changed, 104 insertions(+), 69 deletions(-) diff --git a/unified-runtime/source/adapters/native_cpu/enqueue.cpp b/unified-runtime/source/adapters/native_cpu/enqueue.cpp index 6cd1f6af8e660..40232868ea65e 100644 --- a/unified-runtime/source/adapters/native_cpu/enqueue.cpp +++ b/unified-runtime/source/adapters/native_cpu/enqueue.cpp @@ -63,6 +63,29 @@ static native_cpu::state getResizedState(const native_cpu::NDRDescT &ndr, } #endif +using IndexT = std::array; +using RangeT = native_cpu::NDRDescT::RangeT; + +static inline void execute_range(native_cpu::state &state, + const ur_kernel_handle_t_ &hKernel, + const std::vector &args, IndexT first, + IndexT lastPlusOne) { + for (size_t g2 = first[2]; g2 < lastPlusOne[2]; g2++) { + for (size_t g1 = first[1]; g1 < lastPlusOne[1]; g1++) { + for (size_t g0 = first[0]; g0 < lastPlusOne[0]; g0 += 1) { + state.update(g0, g1, g2); + hKernel._subhandler(args.data(), &state); + } + } + } +} + +static inline void execute_range(native_cpu::state &state, + const ur_kernel_handle_t_ &hKernel, + IndexT first, IndexT lastPlusOne) { + execute_range(state, hKernel, hKernel.getArgs(), first, lastPlusOne); +} + UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( ur_queue_handle_t hQueue, ur_kernel_handle_t hKernel, uint32_t workDim, const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize, @@ -158,89 +181,101 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( size_t new_num_work_groups_0 = numParallelThreads; size_t itemsPerThread = ndr.GlobalSize[0] / numParallelThreads; - for (unsigned g2 = 0; g2 < numWG2; g2++) { - for (unsigned g1 = 0; g1 < numWG1; g1++) { - for (unsigned g0 = 0; g0 < new_num_work_groups_0; g0 += 1) { - futures.emplace_back(tp.schedule_task( - [ndr, itemsPerThread, &kernel = *kernel, g0, g1, g2](size_t) { - native_cpu::state resized_state = - getResizedState(ndr, itemsPerThread); - resized_state.update(g0, g1, g2); - kernel._subhandler(kernel.getArgs().data(), &resized_state); - })); - } - // Peel the remaining work items. Since the local size is 1, we iterate - // over the work groups. - for (unsigned g0 = new_num_work_groups_0 * itemsPerThread; g0 < numWG0; - g0++) { - state.update(g0, g1, g2); - kernel->_subhandler(kernel->getArgs().data(), &state); - } - } + for (size_t t = 0; t < numParallelThreads;) { + IndexT first = {t, 0, 0}; + IndexT last = {++t, numWG1, numWG2}; + futures.emplace_back(tp.schedule_task( + [ndr, itemsPerThread, &kernel = *kernel, first, last](size_t) { + native_cpu::state resized_state = + getResizedState(ndr, itemsPerThread); + execute_range(resized_state, kernel, first, last); + })); + } + + size_t start_wg0_remainder = new_num_work_groups_0 * itemsPerThread; + if (start_wg0_remainder < numWG0) { + // Peel the remaining work items. Since the local size is 1, we iterate + // over the work groups. 
+ futures.emplace_back( + tp.schedule_task([state, &kernel = *kernel, start_wg0_remainder, + numWG0, numWG1, numWG2](size_t) mutable { + IndexT first = {start_wg0_remainder, 0, 0}; + IndexT last = {numWG0, numWG1, numWG2}; + execute_range(state, kernel, first, last); + })); } } else { // We are running a parallel_for over an nd_range + const auto numWG0_per_thread = numWG0 / numParallelThreads; - if (numWG1 * numWG2 >= numParallelThreads) { - // Dimensions 1 and 2 have enough work, split them across the threadpool - for (unsigned g2 = 0; g2 < numWG2; g2++) { - for (unsigned g1 = 0; g1 < numWG1; g1++) { - futures.emplace_back( - tp.schedule_task([state, &kernel = *kernel, numWG0, g1, g2, - numParallelThreads](size_t threadId) mutable { - for (unsigned g0 = 0; g0 < numWG0; g0++) { - state.update(g0, g1, g2); - kernel._subhandler( - kernel.getArgs(numParallelThreads, threadId).data(), - &state); - } - })); - } + if (numWG0_per_thread) { + for (size_t t = 0, WG0_start = 0; t < numParallelThreads; t++) { + IndexT first = {WG0_start, 0, 0}; + WG0_start += numWG0_per_thread; + IndexT last = {WG0_start, numWG1, numWG2}; + futures.emplace_back( + tp.schedule_task([state, numParallelThreads, &kernel = *kernel, + first, last](size_t threadId) mutable { + execute_range(state, kernel, + kernel.getArgs(numParallelThreads, threadId), first, + last); + })); } + size_t start_wg0_remainder = numWG0_per_thread * numParallelThreads; + if (start_wg0_remainder < numWG0) { + IndexT first = {start_wg0_remainder, 0, 0}; + IndexT last = {numWG0, numWG1, numWG2}; + futures.emplace_back( + tp.schedule_task([state, numParallelThreads, &kernel = *kernel, + first, last](size_t threadId) mutable { + execute_range(state, kernel, + kernel.getArgs(numParallelThreads, threadId), first, + last); + })); + } + } else { - // Split dimension 0 across the threadpool // Here we try to create groups of workgroups in order to reduce // synchronization overhead - for (unsigned g2 = 0; g2 < numWG2; g2++) { - for (unsigned g1 = 0; g1 < numWG1; g1++) { - for (unsigned g0 = 0; g0 < numWG0; g0++) { - groups.push_back([state, g0, g1, g2, numParallelThreads]( - size_t threadId, - ur_kernel_handle_t_ &kernel) mutable { - state.update(g0, g1, g2); - kernel._subhandler( - kernel.getArgs(numParallelThreads, threadId).data(), &state); - }); - } - } - } - auto numGroups = groups.size(); + + // todo: deal with overflow + auto numGroups = numWG2 * numWG1 * numWG0; auto groupsPerThread = numGroups / numParallelThreads; + + IndexT first = {0, 0, 0}; + size_t counter = 0; if (groupsPerThread) { - for (unsigned thread = 0; thread < numParallelThreads; thread++) { - futures.emplace_back( - tp.schedule_task([groups, thread, groupsPerThread, - &kernel = *kernel](size_t threadId) { - for (unsigned i = 0; i < groupsPerThread; i++) { - auto index = thread * groupsPerThread + i; - groups[index](threadId, kernel); - } - })); + for (unsigned g2 = 0; g2 < numWG2; g2++) { + for (unsigned g1 = 0; g1 < numWG1; g1++) { + for (unsigned g0 = 0; g0 < numWG0; g0++) { + if (counter == 0) + first = {g0, g1, g2}; + if (++counter == groupsPerThread) { + IndexT last = {g0 + 1, g1 + 1, g2 + 1}; + futures.emplace_back(tp.schedule_task( + [state, numParallelThreads, &kernel = *kernel, first, + last](size_t threadId) mutable { + execute_range( + state, kernel, + kernel.getArgs(numParallelThreads, threadId), first, + last); + })); + counter = 0; + } + } + } } } - - // schedule the remaining tasks - auto remainder = numGroups % numParallelThreads; - if (remainder) { + if (numGroups 
% numParallelThreads) { + // we have a remainder + IndexT last = {numWG0, numWG1, numWG2}; futures.emplace_back( - tp.schedule_task([groups, remainder, - scheduled = numParallelThreads * groupsPerThread, - &kernel = *kernel](size_t threadId) { - for (unsigned i = 0; i < remainder; i++) { - auto index = scheduled + i; - groups[index](threadId, kernel); - } + tp.schedule_task([state, numParallelThreads, &kernel = *kernel, + first, last](size_t threadId) mutable { + execute_range(state, kernel, + kernel.getArgs(numParallelThreads, threadId), first, + last); })); } } From b3f221564d541b5f67279c9cde83d00897c2abee Mon Sep 17 00:00:00 2001 From: Uwe Dolinsky Date: Wed, 28 Aug 2024 10:55:46 +0100 Subject: [PATCH 02/95] [NATIVECPU] use size_t, reserve vector size --- unified-runtime/source/adapters/native_cpu/enqueue.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/unified-runtime/source/adapters/native_cpu/enqueue.cpp b/unified-runtime/source/adapters/native_cpu/enqueue.cpp index 6cd1f6af8e660..c650948b54c35 100644 --- a/unified-runtime/source/adapters/native_cpu/enqueue.cpp +++ b/unified-runtime/source/adapters/native_cpu/enqueue.cpp @@ -171,7 +171,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( } // Peel the remaining work items. Since the local size is 1, we iterate // over the work groups. - for (unsigned g0 = new_num_work_groups_0 * itemsPerThread; g0 < numWG0; + for (size_t g0 = new_num_work_groups_0 * itemsPerThread; g0 < numWG0; g0++) { state.update(g0, g1, g2); kernel->_subhandler(kernel->getArgs().data(), &state); @@ -202,6 +202,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( // Split dimension 0 across the threadpool // Here we try to create groups of workgroups in order to reduce // synchronization overhead + groups.reserve(numWG2 * numWG1 * numWG0); for (unsigned g2 = 0; g2 < numWG2; g2++) { for (unsigned g1 = 0; g1 < numWG1; g1++) { for (unsigned g0 = 0; g0 < numWG0; g0++) { From 780588c534f05520820c885ada6c3993131f35d3 Mon Sep 17 00:00:00 2001 From: Uwe Dolinsky Date: Tue, 22 Oct 2024 15:26:58 +0100 Subject: [PATCH 03/95] [NATIVECPU] added threadpool file to CMakeList --- unified-runtime/source/adapters/native_cpu/CMakeLists.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/unified-runtime/source/adapters/native_cpu/CMakeLists.txt b/unified-runtime/source/adapters/native_cpu/CMakeLists.txt index 17467bfdeff4a..eb4ac9a710546 100644 --- a/unified-runtime/source/adapters/native_cpu/CMakeLists.txt +++ b/unified-runtime/source/adapters/native_cpu/CMakeLists.txt @@ -35,6 +35,7 @@ add_ur_adapter(${TARGET_NAME} ${CMAKE_CURRENT_SOURCE_DIR}/queue.cpp ${CMAKE_CURRENT_SOURCE_DIR}/queue.hpp ${CMAKE_CURRENT_SOURCE_DIR}/sampler.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/threadpool.hpp ${CMAKE_CURRENT_SOURCE_DIR}/ur_interface_loader.cpp ${CMAKE_CURRENT_SOURCE_DIR}/usm_p2p.cpp ${CMAKE_CURRENT_SOURCE_DIR}/virtual_mem.cpp From db924f0fccf69da8f50cab8b0bd3647e32e0531b Mon Sep 17 00:00:00 2001 From: Uwe Dolinsky Date: Tue, 22 Oct 2024 17:32:26 +0100 Subject: [PATCH 04/95] [NATIVECPU] Simple TBB backend --- .../source/adapters/native_cpu/CMakeLists.txt | 41 ++++++++++++++++ .../source/adapters/native_cpu/device.hpp | 4 ++ .../source/adapters/native_cpu/enqueue.cpp | 27 +++++----- .../source/adapters/native_cpu/threadpool.hpp | 49 ++++++++++++++++++- 4 files changed, 107 insertions(+), 14 deletions(-) mode change 100644 => 100755 unified-runtime/source/adapters/native_cpu/enqueue.cpp mode change 100644 => 100755 
unified-runtime/source/adapters/native_cpu/threadpool.hpp diff --git a/unified-runtime/source/adapters/native_cpu/CMakeLists.txt b/unified-runtime/source/adapters/native_cpu/CMakeLists.txt index eb4ac9a710546..9ed5257fe44bd 100644 --- a/unified-runtime/source/adapters/native_cpu/CMakeLists.txt +++ b/unified-runtime/source/adapters/native_cpu/CMakeLists.txt @@ -50,6 +50,37 @@ set_target_properties(${TARGET_NAME} PROPERTIES SOVERSION "${PROJECT_VERSION_MAJOR}" ) +option(NATIVECPU_WITH_TBB "Use TBB as backend for Native CPU" ON) +if(NATIVECPU_WITH_TBB) + message(STATUS "Building Native CPU adapter with TBB backend.") + + include(FetchContent) + FetchContent_Declare( + tbb + GIT_REPOSITORY https://github.com/oneapi-src/oneTBB.git + GIT_TAG 42b833fe806606d05a5cad064b8b87365818d716 + CMAKE_ARGS "-DTBB_TEST:BOOL=OFF -DTBB_EXAMPLES:BOOL=OFF -DTBB_BENCH:BOOL=OFF" + GIT_SHALLOW ON + OVERRIDE_FIND_PACKAGE + ) + set(TBB_TEST OFF CACHE INTERNAL "" FORCE) + set(TBB_EXAMPLES OFF CACHE INTERNAL "" FORCE) + set(TBB_BENCH OFF CACHE INTERNAL "" FORCE) + set(TBB_BUILD ON CACHE INTERNAL "" FORCE) + set(TBB_FIND_PACKAGE OFF CACHE INTERNAL "" FORCE) + set(TBB_FUZZ_TESTING OFF CACHE INTERNAL "" FORCE) + set(TBB_INSTALL ON CACHE INTERNAL "" FORCE) + FetchContent_MakeAvailable(tbb) + + FetchContent_GetProperties(tbb) + + if(NOT tbb_POPULATED) + FetchContent_Populate(tbb) + endif() + set(TBB_SOURCE_DIR_INTERNAL ${tbb_SOURCE_DIR}/include) + set(TBB_BINARY_DIR_INTERNAL ${tbb_BINARY_DIR}) +endif() + find_package(Threads REQUIRED) target_link_libraries(${TARGET_NAME} PRIVATE @@ -62,3 +93,13 @@ target_link_libraries(${TARGET_NAME} PRIVATE target_include_directories(${TARGET_NAME} PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}/../../" ) + +if(NATIVECPU_WITH_TBB) + target_link_libraries(${TARGET_NAME} PRIVATE + TBB::tbb + ) + target_include_directories(${TARGET_NAME} PRIVATE + "${TBB_SOURCE_DIR_INTERNAL}" + ) + target_compile_definitions(${TARGET_NAME} PRIVATE NATIVECPU_USE_TBB) +endif() diff --git a/unified-runtime/source/adapters/native_cpu/device.hpp b/unified-runtime/source/adapters/native_cpu/device.hpp index 2308c1a7f4597..1a6b0d091acfa 100644 --- a/unified-runtime/source/adapters/native_cpu/device.hpp +++ b/unified-runtime/source/adapters/native_cpu/device.hpp @@ -14,7 +14,11 @@ #include struct ur_device_handle_t_ { +#ifdef NATIVECPU_USE_TBB + native_cpu::TBB_threadpool tp; +#else native_cpu::threadpool_t tp; +#endif ur_device_handle_t_(ur_platform_handle_t ArgPlt); const uint64_t mem_size; diff --git a/unified-runtime/source/adapters/native_cpu/enqueue.cpp b/unified-runtime/source/adapters/native_cpu/enqueue.cpp old mode 100644 new mode 100755 index c650948b54c35..849120e642d5c --- a/unified-runtime/source/adapters/native_cpu/enqueue.cpp +++ b/unified-runtime/source/adapters/native_cpu/enqueue.cpp @@ -106,8 +106,12 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( pLocalWorkSize); auto &tp = hQueue->getDevice()->tp; const size_t numParallelThreads = tp.num_threads(); - std::vector> futures; +//<<<<<<< HEAD:unified-runtime/source/adapters/native_cpu/enqueue.cpp +// std::vector> futures; std::vector> groups; +//======= + auto Tasks = native_cpu::getScheduler(tp); +//>>>>>>> 5406b39f26c6 ([NATIVECPU] Simple TBB backend):source/adapters/native_cpu/enqueue.cpp auto numWG0 = ndr.GlobalSize[0] / ndr.LocalSize[0]; auto numWG1 = ndr.GlobalSize[1] / ndr.LocalSize[1]; auto numWG2 = ndr.GlobalSize[2] / ndr.LocalSize[2]; @@ -161,13 +165,13 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( for (unsigned g2 = 0; g2 
< numWG2; g2++) { for (unsigned g1 = 0; g1 < numWG1; g1++) { for (unsigned g0 = 0; g0 < new_num_work_groups_0; g0 += 1) { - futures.emplace_back(tp.schedule_task( + Tasks.schedule( [ndr, itemsPerThread, &kernel = *kernel, g0, g1, g2](size_t) { native_cpu::state resized_state = getResizedState(ndr, itemsPerThread); resized_state.update(g0, g1, g2); kernel._subhandler(kernel.getArgs().data(), &resized_state); - })); + }); } // Peel the remaining work items. Since the local size is 1, we iterate // over the work groups. @@ -186,8 +190,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( // Dimensions 1 and 2 have enough work, split them across the threadpool for (unsigned g2 = 0; g2 < numWG2; g2++) { for (unsigned g1 = 0; g1 < numWG1; g1++) { - futures.emplace_back( - tp.schedule_task([state, &kernel = *kernel, numWG0, g1, g2, + Tasks.schedule([state, &kernel = *kernel, numWG0, g1, g2, numParallelThreads](size_t threadId) mutable { for (unsigned g0 = 0; g0 < numWG0; g0++) { state.update(g0, g1, g2); @@ -195,7 +198,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( kernel.getArgs(numParallelThreads, threadId).data(), &state); } - })); + }); } } } else { @@ -220,35 +223,33 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( auto groupsPerThread = numGroups / numParallelThreads; if (groupsPerThread) { for (unsigned thread = 0; thread < numParallelThreads; thread++) { - futures.emplace_back( - tp.schedule_task([groups, thread, groupsPerThread, + Tasks.schedule([groups, thread, groupsPerThread, &kernel = *kernel](size_t threadId) { for (unsigned i = 0; i < groupsPerThread; i++) { auto index = thread * groupsPerThread + i; groups[index](threadId, kernel); } - })); + }); } } // schedule the remaining tasks auto remainder = numGroups % numParallelThreads; if (remainder) { - futures.emplace_back( - tp.schedule_task([groups, remainder, + Tasks.schedule([groups, remainder, scheduled = numParallelThreads * groupsPerThread, &kernel = *kernel](size_t threadId) { for (unsigned i = 0; i < remainder; i++) { auto index = scheduled + i; groups[index](threadId, kernel); } - })); + }); } } } + Tasks.wait(); #endif // NATIVECPU_USE_OCK - event->set_futures(futures); if (phEvent) { *phEvent = event; diff --git a/unified-runtime/source/adapters/native_cpu/threadpool.hpp b/unified-runtime/source/adapters/native_cpu/threadpool.hpp old mode 100644 new mode 100755 index ea64acf1f227c..9b4c179a2d9b2 --- a/unified-runtime/source/adapters/native_cpu/threadpool.hpp +++ b/unified-runtime/source/adapters/native_cpu/threadpool.hpp @@ -208,6 +208,53 @@ template class threadpool_interface { } }; -using threadpool_t = threadpool_interface; +template class Scheduler { + std::vector> futures; + TP &TPref; + +public: + Scheduler(TP &ref) : TPref(ref) {} + + template void schedule(T &&task) { + futures.emplace_back(TPref.schedule_task(std::forward(task))); + } + inline void wait() { + for (auto &f : futures) + f.get(); + } +}; + +using simple_threadpool_t = threadpool_interface; +inline Scheduler getScheduler(simple_threadpool_t &tp) { + return Scheduler(tp); +} + +using threadpool_t = simple_threadpool_t; } // namespace native_cpu + +#ifdef NATIVECPU_USE_TBB +// Simple TBB backend +#include "oneapi/tbb.h" +namespace native_cpu { + +struct TBB_threadpool { + inline size_t num_threads() const noexcept { return 32; } +}; +template <> class Scheduler { + oneapi::tbb::task_group tasks; + +public: + inline void schedule(worker_task_t &&task) { + tasks.run([&]() { task(0); }); + } + inline void wait() { 
tasks.wait(); } +}; + +inline Scheduler getScheduler(TBB_threadpool &tp) { + return Scheduler(); +} + +} // namespace native_cpu + +#endif From 150965591b394ae7b568a0e2603bf5bf1b7c5d43 Mon Sep 17 00:00:00 2001 From: Uwe Dolinsky Date: Wed, 23 Oct 2024 17:32:29 +0100 Subject: [PATCH 05/95] [NATIVECPU] more shared code --- .../source/adapters/native_cpu/threadpool.hpp | 35 +++++++++---------- 1 file changed, 17 insertions(+), 18 deletions(-) diff --git a/unified-runtime/source/adapters/native_cpu/threadpool.hpp b/unified-runtime/source/adapters/native_cpu/threadpool.hpp index 9b4c179a2d9b2..75f7ad6d07ae6 100755 --- a/unified-runtime/source/adapters/native_cpu/threadpool.hpp +++ b/unified-runtime/source/adapters/native_cpu/threadpool.hpp @@ -208,25 +208,29 @@ template class threadpool_interface { } }; -template class Scheduler { - std::vector> futures; +template struct SchedulerBase { TP &TPref; + SchedulerBase(TP &ref) : TPref(ref) {} +}; -public: - Scheduler(TP &ref) : TPref(ref) {} +template struct Scheduler : SchedulerBase { + using SchedulerBase::SchedulerBase; template void schedule(T &&task) { - futures.emplace_back(TPref.schedule_task(std::forward(task))); + futures.emplace_back(this->TPref.schedule_task(std::forward(task))); } inline void wait() { for (auto &f : futures) f.get(); } + +private: + std::vector> futures; }; using simple_threadpool_t = threadpool_interface; -inline Scheduler getScheduler(simple_threadpool_t &tp) { - return Scheduler(tp); +template inline Scheduler getScheduler(TPType &tp) { + return Scheduler(tp); } using threadpool_t = simple_threadpool_t; @@ -239,22 +243,17 @@ using threadpool_t = simple_threadpool_t; namespace native_cpu { struct TBB_threadpool { + oneapi::tbb::task_group tasks; inline size_t num_threads() const noexcept { return 32; } }; -template <> class Scheduler { - oneapi::tbb::task_group tasks; - -public: - inline void schedule(worker_task_t &&task) { - tasks.run([&]() { task(0); }); +template <> struct Scheduler : SchedulerBase { + using SchedulerBase::SchedulerBase; + template inline void schedule(T &&task) { + TPref.tasks.run(std::function([=]() mutable { task(0); })); } - inline void wait() { tasks.wait(); } + inline void wait() { TPref.tasks.wait(); } }; -inline Scheduler getScheduler(TBB_threadpool &tp) { - return Scheduler(); -} - } // namespace native_cpu #endif From 45ee46cf11300a932402fde7360a17e01d2ee3c5 Mon Sep 17 00:00:00 2001 From: Uwe Dolinsky Date: Thu, 24 Oct 2024 09:55:56 +0100 Subject: [PATCH 06/95] [NATIVECPU] update oneTBB tag --- unified-runtime/source/adapters/native_cpu/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unified-runtime/source/adapters/native_cpu/CMakeLists.txt b/unified-runtime/source/adapters/native_cpu/CMakeLists.txt index 9ed5257fe44bd..282cac2afbca4 100644 --- a/unified-runtime/source/adapters/native_cpu/CMakeLists.txt +++ b/unified-runtime/source/adapters/native_cpu/CMakeLists.txt @@ -58,7 +58,7 @@ if(NATIVECPU_WITH_TBB) FetchContent_Declare( tbb GIT_REPOSITORY https://github.com/oneapi-src/oneTBB.git - GIT_TAG 42b833fe806606d05a5cad064b8b87365818d716 + GIT_TAG 377a91431ec62c5e296dbeca683c5d1e66d69f32 CMAKE_ARGS "-DTBB_TEST:BOOL=OFF -DTBB_EXAMPLES:BOOL=OFF -DTBB_BENCH:BOOL=OFF" GIT_SHALLOW ON OVERRIDE_FIND_PACKAGE From aa7dec8ff0573b9568789e0673f7ee5620ded69e Mon Sep 17 00:00:00 2001 From: Uwe Dolinsky Date: Thu, 24 Oct 2024 11:03:31 +0100 Subject: [PATCH 07/95] [NATIVECPU] added required include not needed by Windows --- 
unified-runtime/source/adapters/native_cpu/common.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/unified-runtime/source/adapters/native_cpu/common.cpp b/unified-runtime/source/adapters/native_cpu/common.cpp index ab7c7a07ea426..c283c16c11beb 100644 --- a/unified-runtime/source/adapters/native_cpu/common.cpp +++ b/unified-runtime/source/adapters/native_cpu/common.cpp @@ -9,6 +9,7 @@ //===----------------------------------------------------------------------===// #include "common.hpp" +#include // Global variables for UR_RESULT_ADAPTER_SPECIFIC_ERROR // See urGetLastResult From 29d11f96a5b9e1ff642106a869bd5e2668f78de8 Mon Sep 17 00:00:00 2001 From: Uwe Dolinsky Date: Thu, 24 Oct 2024 18:30:34 +0100 Subject: [PATCH 08/95] [NATIVECPU] added system headers first --- unified-runtime/source/adapters/native_cpu/CMakeLists.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/unified-runtime/source/adapters/native_cpu/CMakeLists.txt b/unified-runtime/source/adapters/native_cpu/CMakeLists.txt index 282cac2afbca4..99cafe4411d11 100644 --- a/unified-runtime/source/adapters/native_cpu/CMakeLists.txt +++ b/unified-runtime/source/adapters/native_cpu/CMakeLists.txt @@ -91,6 +91,7 @@ target_link_libraries(${TARGET_NAME} PRIVATE ) target_include_directories(${TARGET_NAME} PRIVATE + "${CMAKE_CXX_IMPLICIT_INCLUDE_DIRECTORIES}" "${CMAKE_CURRENT_SOURCE_DIR}/../../" ) From e202f8da3e3f7484c94988117c4a1f3bfd65559c Mon Sep 17 00:00:00 2001 From: Uwe Dolinsky Date: Thu, 24 Oct 2024 18:51:10 +0100 Subject: [PATCH 09/95] [NATIVECPU] cmake fix --- unified-runtime/source/adapters/native_cpu/CMakeLists.txt | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/unified-runtime/source/adapters/native_cpu/CMakeLists.txt b/unified-runtime/source/adapters/native_cpu/CMakeLists.txt index 99cafe4411d11..418922ba9e396 100644 --- a/unified-runtime/source/adapters/native_cpu/CMakeLists.txt +++ b/unified-runtime/source/adapters/native_cpu/CMakeLists.txt @@ -91,7 +91,6 @@ target_link_libraries(${TARGET_NAME} PRIVATE ) target_include_directories(${TARGET_NAME} PRIVATE - "${CMAKE_CXX_IMPLICIT_INCLUDE_DIRECTORIES}" "${CMAKE_CURRENT_SOURCE_DIR}/../../" ) @@ -99,8 +98,8 @@ if(NATIVECPU_WITH_TBB) target_link_libraries(${TARGET_NAME} PRIVATE TBB::tbb ) - target_include_directories(${TARGET_NAME} PRIVATE - "${TBB_SOURCE_DIR_INTERNAL}" - ) +# target_include_directories(${TARGET_NAME} PRIVATE +# "${TBB_SOURCE_DIR_INTERNAL}" +# ) target_compile_definitions(${TARGET_NAME} PRIVATE NATIVECPU_USE_TBB) endif() From fe8d099956a977203bd6e69012c37c4cfa5033cb Mon Sep 17 00:00:00 2001 From: Uwe Dolinsky Date: Fri, 25 Oct 2024 19:17:33 +0100 Subject: [PATCH 10/95] [NATIVECPU] removed GIT_SHALLOW --- unified-runtime/source/adapters/native_cpu/CMakeLists.txt | 1 - 1 file changed, 1 deletion(-) diff --git a/unified-runtime/source/adapters/native_cpu/CMakeLists.txt b/unified-runtime/source/adapters/native_cpu/CMakeLists.txt index 418922ba9e396..a92c222d31aa4 100644 --- a/unified-runtime/source/adapters/native_cpu/CMakeLists.txt +++ b/unified-runtime/source/adapters/native_cpu/CMakeLists.txt @@ -60,7 +60,6 @@ if(NATIVECPU_WITH_TBB) GIT_REPOSITORY https://github.com/oneapi-src/oneTBB.git GIT_TAG 377a91431ec62c5e296dbeca683c5d1e66d69f32 CMAKE_ARGS "-DTBB_TEST:BOOL=OFF -DTBB_EXAMPLES:BOOL=OFF -DTBB_BENCH:BOOL=OFF" - GIT_SHALLOW ON OVERRIDE_FIND_PACKAGE ) set(TBB_TEST OFF CACHE INTERNAL "" FORCE) From c2a3f573d6d37cfabbd9d75459766f7ceecc14f6 Mon Sep 17 00:00:00 2001 From: Uwe Dolinsky Date: Fri, 1 Nov 2024 15:41:31 +0000 Subject: [PATCH 
11/95] [NATIVECPU] turn CMAKE_INCLUDE_CURRENT_DIR off for tbb --- unified-runtime/source/adapters/native_cpu/CMakeLists.txt | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/unified-runtime/source/adapters/native_cpu/CMakeLists.txt b/unified-runtime/source/adapters/native_cpu/CMakeLists.txt index a92c222d31aa4..6e80dd15b7338 100644 --- a/unified-runtime/source/adapters/native_cpu/CMakeLists.txt +++ b/unified-runtime/source/adapters/native_cpu/CMakeLists.txt @@ -69,6 +69,7 @@ if(NATIVECPU_WITH_TBB) set(TBB_FIND_PACKAGE OFF CACHE INTERNAL "" FORCE) set(TBB_FUZZ_TESTING OFF CACHE INTERNAL "" FORCE) set(TBB_INSTALL ON CACHE INTERNAL "" FORCE) + set (CMAKE_INCLUDE_CURRENT_DIR OFF) FetchContent_MakeAvailable(tbb) FetchContent_GetProperties(tbb) @@ -97,8 +98,6 @@ if(NATIVECPU_WITH_TBB) target_link_libraries(${TARGET_NAME} PRIVATE TBB::tbb ) -# target_include_directories(${TARGET_NAME} PRIVATE -# "${TBB_SOURCE_DIR_INTERNAL}" -# ) + target_compile_definitions(${TARGET_NAME} PRIVATE NATIVECPU_USE_TBB) endif() From be5b1343fae33df91719e70ce0955d9eca9087ca Mon Sep 17 00:00:00 2001 From: Uwe Dolinsky Date: Fri, 1 Nov 2024 15:50:58 +0000 Subject: [PATCH 12/95] [NATIVECPU] workaround for oneTBB casting away const qualifiers --- unified-runtime/source/adapters/native_cpu/CMakeLists.txt | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/unified-runtime/source/adapters/native_cpu/CMakeLists.txt b/unified-runtime/source/adapters/native_cpu/CMakeLists.txt index 6e80dd15b7338..62d0ccd7bdca3 100644 --- a/unified-runtime/source/adapters/native_cpu/CMakeLists.txt +++ b/unified-runtime/source/adapters/native_cpu/CMakeLists.txt @@ -98,6 +98,11 @@ if(NATIVECPU_WITH_TBB) target_link_libraries(${TARGET_NAME} PRIVATE TBB::tbb ) + if (MSVC) + else() + # oneTBB currently casts away some const qualifiers + target_compile_options(tbb PRIVATE -Wno-cast-qual) + endif() target_compile_definitions(${TARGET_NAME} PRIVATE NATIVECPU_USE_TBB) endif() From b18401f5c3b29aaff3538ed8df90d0faa426e389 Mon Sep 17 00:00:00 2001 From: Uwe Dolinsky Date: Fri, 1 Nov 2024 17:25:51 +0000 Subject: [PATCH 13/95] [NATIVECPU] workaround for oneTBB casting away const qualifiers --- unified-runtime/source/adapters/native_cpu/CMakeLists.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/unified-runtime/source/adapters/native_cpu/CMakeLists.txt b/unified-runtime/source/adapters/native_cpu/CMakeLists.txt index 62d0ccd7bdca3..ef16981ea8d23 100644 --- a/unified-runtime/source/adapters/native_cpu/CMakeLists.txt +++ b/unified-runtime/source/adapters/native_cpu/CMakeLists.txt @@ -102,6 +102,7 @@ if(NATIVECPU_WITH_TBB) else() # oneTBB currently casts away some const qualifiers target_compile_options(tbb PRIVATE -Wno-cast-qual) + target_compile_options(tbbmalloc PRIVATE -Wno-cast-qual) endif() target_compile_definitions(${TARGET_NAME} PRIVATE NATIVECPU_USE_TBB) From 4bff038383efd33a51b2c261dd649b2132e049f7 Mon Sep 17 00:00:00 2001 From: Uwe Dolinsky Date: Fri, 1 Nov 2024 19:03:56 +0000 Subject: [PATCH 14/95] [NATIVECPU] remove potentially unneeded cmake --- unified-runtime/source/adapters/native_cpu/CMakeLists.txt | 8 -------- 1 file changed, 8 deletions(-) diff --git a/unified-runtime/source/adapters/native_cpu/CMakeLists.txt b/unified-runtime/source/adapters/native_cpu/CMakeLists.txt index ef16981ea8d23..7c91d50fff2f5 100644 --- a/unified-runtime/source/adapters/native_cpu/CMakeLists.txt +++ b/unified-runtime/source/adapters/native_cpu/CMakeLists.txt @@ -71,14 +71,6 @@ if(NATIVECPU_WITH_TBB) set(TBB_INSTALL ON CACHE INTERNAL "" 
FORCE) set (CMAKE_INCLUDE_CURRENT_DIR OFF) FetchContent_MakeAvailable(tbb) - - FetchContent_GetProperties(tbb) - - if(NOT tbb_POPULATED) - FetchContent_Populate(tbb) - endif() - set(TBB_SOURCE_DIR_INTERNAL ${tbb_SOURCE_DIR}/include) - set(TBB_BINARY_DIR_INTERNAL ${tbb_BINARY_DIR}) endif() find_package(Threads REQUIRED) From eacf52291b7c33daba499b96f3f412d212b9098c Mon Sep 17 00:00:00 2001 From: Uwe Dolinsky Date: Mon, 4 Nov 2024 12:51:03 +0000 Subject: [PATCH 15/95] [NATIVECPU] oneTBB disabled by default --- unified-runtime/source/adapters/native_cpu/CMakeLists.txt | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/unified-runtime/source/adapters/native_cpu/CMakeLists.txt b/unified-runtime/source/adapters/native_cpu/CMakeLists.txt index 7c91d50fff2f5..d3cbe735c1074 100644 --- a/unified-runtime/source/adapters/native_cpu/CMakeLists.txt +++ b/unified-runtime/source/adapters/native_cpu/CMakeLists.txt @@ -50,9 +50,10 @@ set_target_properties(${TARGET_NAME} PROPERTIES SOVERSION "${PROJECT_VERSION_MAJOR}" ) -option(NATIVECPU_WITH_TBB "Use TBB as backend for Native CPU" ON) +# oneTBB is an optional NativeCPU backend and disabled by default. +option(NATIVECPU_WITH_TBB "Use oneTBB as backend for Native CPU" OFF) if(NATIVECPU_WITH_TBB) - message(STATUS "Building Native CPU adapter with TBB backend.") + message(STATUS "Configuring Native CPU adapter with TBB backend.") include(FetchContent) FetchContent_Declare( From c2996ebfbcd7c6782ce83c997be87b2240e802dd Mon Sep 17 00:00:00 2001 From: Uwe Dolinsky Date: Mon, 4 Nov 2024 13:20:13 +0000 Subject: [PATCH 16/95] [NATIVECPU] tbb to oneTBB --- unified-runtime/source/adapters/native_cpu/CMakeLists.txt | 6 +++--- unified-runtime/source/adapters/native_cpu/device.hpp | 2 +- unified-runtime/source/adapters/native_cpu/threadpool.hpp | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/unified-runtime/source/adapters/native_cpu/CMakeLists.txt b/unified-runtime/source/adapters/native_cpu/CMakeLists.txt index d3cbe735c1074..f871881918879 100644 --- a/unified-runtime/source/adapters/native_cpu/CMakeLists.txt +++ b/unified-runtime/source/adapters/native_cpu/CMakeLists.txt @@ -51,8 +51,8 @@ set_target_properties(${TARGET_NAME} PROPERTIES ) # oneTBB is an optional NativeCPU backend and disabled by default. 
-option(NATIVECPU_WITH_TBB "Use oneTBB as backend for Native CPU" OFF) -if(NATIVECPU_WITH_TBB) +option(NATIVECPU_WITH_ONETBB "Use oneTBB as backend for Native CPU" OFF) +if(NATIVECPU_WITH_ONETBB) message(STATUS "Configuring Native CPU adapter with TBB backend.") include(FetchContent) @@ -98,5 +98,5 @@ if(NATIVECPU_WITH_TBB) target_compile_options(tbbmalloc PRIVATE -Wno-cast-qual) endif() - target_compile_definitions(${TARGET_NAME} PRIVATE NATIVECPU_USE_TBB) + target_compile_definitions(${TARGET_NAME} PRIVATE NATIVECPU_WITH_ONETBB) endif() diff --git a/unified-runtime/source/adapters/native_cpu/device.hpp b/unified-runtime/source/adapters/native_cpu/device.hpp index 1a6b0d091acfa..e9f760293004b 100644 --- a/unified-runtime/source/adapters/native_cpu/device.hpp +++ b/unified-runtime/source/adapters/native_cpu/device.hpp @@ -14,7 +14,7 @@ #include struct ur_device_handle_t_ { -#ifdef NATIVECPU_USE_TBB +#ifdef NATIVECPU_WITH_ONETBB native_cpu::TBB_threadpool tp; #else native_cpu::threadpool_t tp; diff --git a/unified-runtime/source/adapters/native_cpu/threadpool.hpp b/unified-runtime/source/adapters/native_cpu/threadpool.hpp index 75f7ad6d07ae6..2a100626ccdc1 100755 --- a/unified-runtime/source/adapters/native_cpu/threadpool.hpp +++ b/unified-runtime/source/adapters/native_cpu/threadpool.hpp @@ -237,7 +237,7 @@ using threadpool_t = simple_threadpool_t; } // namespace native_cpu -#ifdef NATIVECPU_USE_TBB +#ifdef NATIVECPU_WITH_ONETBB // Simple TBB backend #include "oneapi/tbb.h" namespace native_cpu { From 91a6a49069fce1044cdec735bfb215c02666a470 Mon Sep 17 00:00:00 2001 From: Uwe Dolinsky Date: Mon, 4 Nov 2024 14:52:10 +0000 Subject: [PATCH 17/95] [NATIVECPU] improved comment --- unified-runtime/source/adapters/native_cpu/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unified-runtime/source/adapters/native_cpu/CMakeLists.txt b/unified-runtime/source/adapters/native_cpu/CMakeLists.txt index f871881918879..aa6e3cc2a3e45 100644 --- a/unified-runtime/source/adapters/native_cpu/CMakeLists.txt +++ b/unified-runtime/source/adapters/native_cpu/CMakeLists.txt @@ -50,7 +50,7 @@ set_target_properties(${TARGET_NAME} PROPERTIES SOVERSION "${PROJECT_VERSION_MAJOR}" ) -# oneTBB is an optional NativeCPU backend and disabled by default. +# oneTBB is used as an optional NativeCPU backend and disabled by default. 
option(NATIVECPU_WITH_ONETBB "Use oneTBB as backend for Native CPU" OFF) if(NATIVECPU_WITH_ONETBB) message(STATUS "Configuring Native CPU adapter with TBB backend.") From c1745c702612aaa82ca861353c4f1e1b20331aab Mon Sep 17 00:00:00 2001 From: Uwe Dolinsky Date: Mon, 4 Nov 2024 15:07:17 +0000 Subject: [PATCH 18/95] [NATIVECPU] tbb to oneTBB --- unified-runtime/source/adapters/native_cpu/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unified-runtime/source/adapters/native_cpu/CMakeLists.txt b/unified-runtime/source/adapters/native_cpu/CMakeLists.txt index aa6e3cc2a3e45..66c412bb98b2c 100644 --- a/unified-runtime/source/adapters/native_cpu/CMakeLists.txt +++ b/unified-runtime/source/adapters/native_cpu/CMakeLists.txt @@ -87,7 +87,7 @@ target_include_directories(${TARGET_NAME} PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}/../../" ) -if(NATIVECPU_WITH_TBB) +if(NATIVECPU_WITH_ONETBB) target_link_libraries(${TARGET_NAME} PRIVATE TBB::tbb ) From 488504cb2bb94ae79b26d3dedb6e1e50dc591df8 Mon Sep 17 00:00:00 2001 From: Uwe Dolinsky Date: Mon, 4 Nov 2024 15:13:06 +0000 Subject: [PATCH 19/95] [NATIVECPU] tbb to oneTBB --- unified-runtime/source/adapters/native_cpu/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unified-runtime/source/adapters/native_cpu/CMakeLists.txt b/unified-runtime/source/adapters/native_cpu/CMakeLists.txt index 66c412bb98b2c..164db49d98065 100644 --- a/unified-runtime/source/adapters/native_cpu/CMakeLists.txt +++ b/unified-runtime/source/adapters/native_cpu/CMakeLists.txt @@ -53,7 +53,7 @@ set_target_properties(${TARGET_NAME} PROPERTIES # oneTBB is used as an optional NativeCPU backend and disabled by default. option(NATIVECPU_WITH_ONETBB "Use oneTBB as backend for Native CPU" OFF) if(NATIVECPU_WITH_ONETBB) - message(STATUS "Configuring Native CPU adapter with TBB backend.") + message(STATUS "Configuring Native CPU adapter with oneTBB backend.") include(FetchContent) FetchContent_Declare( From 53013d42dbc65e17d2b2b6fdcaa21274b9206f28 Mon Sep 17 00:00:00 2001 From: Uwe Dolinsky Date: Mon, 4 Nov 2024 15:26:45 +0000 Subject: [PATCH 20/95] [NATIVECPU] num_threads with oneTBB --- unified-runtime/source/adapters/native_cpu/threadpool.hpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/unified-runtime/source/adapters/native_cpu/threadpool.hpp b/unified-runtime/source/adapters/native_cpu/threadpool.hpp index 2a100626ccdc1..06889e99581f9 100755 --- a/unified-runtime/source/adapters/native_cpu/threadpool.hpp +++ b/unified-runtime/source/adapters/native_cpu/threadpool.hpp @@ -244,7 +244,9 @@ namespace native_cpu { struct TBB_threadpool { oneapi::tbb::task_group tasks; - inline size_t num_threads() const noexcept { return 32; } + inline size_t num_threads() const noexcept { + return oneapi::tbb::info::default_concurrency(); + } }; template <> struct Scheduler : SchedulerBase { using SchedulerBase::SchedulerBase; From e8d8ff4b9e1c256cd5dc997ee71cf9dbd95d945e Mon Sep 17 00:00:00 2001 From: Uwe Dolinsky Date: Wed, 6 Nov 2024 13:34:49 +0000 Subject: [PATCH 21/95] [NATIVECPU] added comment to cmake --- unified-runtime/source/adapters/native_cpu/CMakeLists.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/unified-runtime/source/adapters/native_cpu/CMakeLists.txt b/unified-runtime/source/adapters/native_cpu/CMakeLists.txt index 164db49d98065..8614272f95887 100644 --- a/unified-runtime/source/adapters/native_cpu/CMakeLists.txt +++ b/unified-runtime/source/adapters/native_cpu/CMakeLists.txt @@ -91,9 
+91,9 @@ if(NATIVECPU_WITH_ONETBB) target_link_libraries(${TARGET_NAME} PRIVATE TBB::tbb ) - if (MSVC) - else() + if (NOT MSVC) # oneTBB currently casts away some const qualifiers + # todo: check if compiler actually supports these options target_compile_options(tbb PRIVATE -Wno-cast-qual) target_compile_options(tbbmalloc PRIVATE -Wno-cast-qual) endif() From 99c76c9d65a89ef7ee3d708c1ab7289e42843592 Mon Sep 17 00:00:00 2001 From: Uwe Dolinsky Date: Wed, 6 Nov 2024 18:25:27 +0000 Subject: [PATCH 22/95] [NATIVECPU] using old task ids with tbb (WIP) --- unified-runtime/source/adapters/native_cpu/threadpool.hpp | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/unified-runtime/source/adapters/native_cpu/threadpool.hpp b/unified-runtime/source/adapters/native_cpu/threadpool.hpp index 06889e99581f9..cc98eec407b36 100755 --- a/unified-runtime/source/adapters/native_cpu/threadpool.hpp +++ b/unified-runtime/source/adapters/native_cpu/threadpool.hpp @@ -251,7 +251,12 @@ struct TBB_threadpool { template <> struct Scheduler : SchedulerBase { using SchedulerBase::SchedulerBase; template inline void schedule(T &&task) { - TPref.tasks.run(std::function([=]() mutable { task(0); })); + TPref.tasks.run(std::function([=]() mutable { + auto thread_id = tbb::this_task_arena::current_thread_index(); + assert(thread_id >= 0 && + thread_id < oneapi::tbb::info::default_concurrency()); + task(thread_id); + })); } inline void wait() { TPref.tasks.wait(); } }; From 9b400819bb639f65093d64838f7566e57fae617b Mon Sep 17 00:00:00 2001 From: Uwe Dolinsky Date: Wed, 13 Nov 2024 17:54:35 +0000 Subject: [PATCH 23/95] [NATIVECPU] fixed merge from main --- .../source/adapters/native_cpu/device.hpp | 7 +-- .../source/adapters/native_cpu/enqueue.cpp | 16 ++---- .../source/adapters/native_cpu/event.cpp | 4 +- .../source/adapters/native_cpu/event.hpp | 6 +- .../source/adapters/native_cpu/kernel.hpp | 4 +- .../adapters/native_cpu/nativecpu_state.hpp | 6 +- .../source/adapters/native_cpu/threadpool.hpp | 55 ++++++++++++------- 7 files changed, 53 insertions(+), 45 deletions(-) diff --git a/unified-runtime/source/adapters/native_cpu/device.hpp b/unified-runtime/source/adapters/native_cpu/device.hpp index e9f760293004b..e0cf3872b548c 100644 --- a/unified-runtime/source/adapters/native_cpu/device.hpp +++ b/unified-runtime/source/adapters/native_cpu/device.hpp @@ -14,11 +14,8 @@ #include struct ur_device_handle_t_ { -#ifdef NATIVECPU_WITH_ONETBB - native_cpu::TBB_threadpool tp; -#else - native_cpu::threadpool_t tp; -#endif + native_cpu::ThreadPoolType tp; + ur_device_handle_t_(ur_platform_handle_t ArgPlt); const uint64_t mem_size; diff --git a/unified-runtime/source/adapters/native_cpu/enqueue.cpp b/unified-runtime/source/adapters/native_cpu/enqueue.cpp index 849120e642d5c..937b500cf1e7b 100755 --- a/unified-runtime/source/adapters/native_cpu/enqueue.cpp +++ b/unified-runtime/source/adapters/native_cpu/enqueue.cpp @@ -26,8 +26,8 @@ struct NDRDescT { RangeT GlobalOffset; RangeT GlobalSize; RangeT LocalSize; - NDRDescT(uint32_t WorkDim, const size_t *GlobalWorkOffset, - const size_t *GlobalWorkSize, const size_t *LocalWorkSize) + inline NDRDescT(uint32_t WorkDim, const size_t *GlobalWorkOffset, + const size_t *GlobalWorkSize, const size_t *LocalWorkSize) : WorkDim(WorkDim) { for (uint32_t I = 0; I < WorkDim; I++) { GlobalOffset[I] = GlobalWorkOffset[I]; @@ -53,8 +53,8 @@ struct NDRDescT { } // namespace native_cpu #ifdef NATIVECPU_USE_OCK -static native_cpu::state getResizedState(const native_cpu::NDRDescT &ndr, - 
size_t itemsPerThread) { +static inline native_cpu::state getResizedState(const native_cpu::NDRDescT &ndr, + size_t itemsPerThread) { native_cpu::state resized_state( ndr.GlobalSize[0], ndr.GlobalSize[1], ndr.GlobalSize[2], itemsPerThread, ndr.LocalSize[1], ndr.LocalSize[2], ndr.GlobalOffset[0], @@ -106,12 +106,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( pLocalWorkSize); auto &tp = hQueue->getDevice()->tp; const size_t numParallelThreads = tp.num_threads(); -//<<<<<<< HEAD:unified-runtime/source/adapters/native_cpu/enqueue.cpp -// std::vector> futures; std::vector> groups; -//======= auto Tasks = native_cpu::getScheduler(tp); -//>>>>>>> 5406b39f26c6 ([NATIVECPU] Simple TBB backend):source/adapters/native_cpu/enqueue.cpp auto numWG0 = ndr.GlobalSize[0] / ndr.LocalSize[0]; auto numWG1 = ndr.GlobalSize[1] / ndr.LocalSize[1]; auto numWG2 = ndr.GlobalSize[2] / ndr.LocalSize[2]; @@ -185,7 +181,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( } else { // We are running a parallel_for over an nd_range - if (numWG1 * numWG2 >= numParallelThreads) { // Dimensions 1 and 2 have enough work, split them across the threadpool for (unsigned g2 = 0; g2 < numWG2; g2++) { @@ -232,7 +227,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( }); } } - // schedule the remaining tasks auto remainder = numGroups % numParallelThreads; if (remainder) { @@ -248,8 +242,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( } } - Tasks.wait(); #endif // NATIVECPU_USE_OCK + event->set_futures(Tasks.getTaskInfo()); if (phEvent) { *phEvent = event; diff --git a/unified-runtime/source/adapters/native_cpu/event.cpp b/unified-runtime/source/adapters/native_cpu/event.cpp index f981d24f42453..e81c44301dc2b 100644 --- a/unified-runtime/source/adapters/native_cpu/event.cpp +++ b/unified-runtime/source/adapters/native_cpu/event.cpp @@ -139,9 +139,7 @@ void ur_event_handle_t_::wait() { if (done) { return; } - for (auto &f : futures) { - f.wait(); - } + this->futures.wait(); queue->removeEvent(this); done = true; // The callback may need to acquire the lock, so we unlock it here diff --git a/unified-runtime/source/adapters/native_cpu/event.hpp b/unified-runtime/source/adapters/native_cpu/event.hpp index 479c671b38cd1..2c72475a38d0b 100644 --- a/unified-runtime/source/adapters/native_cpu/event.hpp +++ b/unified-runtime/source/adapters/native_cpu/event.hpp @@ -14,6 +14,7 @@ #include #include #include +#include "threadpool.hpp" struct ur_event_handle_t_ : RefCounted { @@ -42,7 +43,8 @@ struct ur_event_handle_t_ : RefCounted { ur_command_t getCommandType() const { return command_type; } - void set_futures(std::vector> &fs) { + // todo: get rid of this function + void set_futures(native_cpu::TasksInfoType &&fs) { std::lock_guard lock(mutex); futures = std::move(fs); } @@ -61,7 +63,7 @@ struct ur_event_handle_t_ : RefCounted { ur_command_t command_type; bool done; std::mutex mutex; - std::vector> futures; + native_cpu::TasksInfoType futures; std::packaged_task callback; uint64_t timestamp_start = 0; uint64_t timestamp_end = 0; diff --git a/unified-runtime/source/adapters/native_cpu/kernel.hpp b/unified-runtime/source/adapters/native_cpu/kernel.hpp index 9e13286f3ebb0..0466c1e0a1444 100644 --- a/unified-runtime/source/adapters/native_cpu/kernel.hpp +++ b/unified-runtime/source/adapters/native_cpu/kernel.hpp @@ -22,13 +22,13 @@ using nativecpu_task_t = std::function; struct local_arg_info_t { uint32_t argIndex; size_t argSize; - local_arg_info_t(uint32_t argIndex, size_t argSize) + 
inline local_arg_info_t(uint32_t argIndex, size_t argSize) : argIndex(argIndex), argSize(argSize) {} }; struct ur_kernel_handle_t_ : RefCounted { - ur_kernel_handle_t_(ur_program_handle_t hProgram, const char *name, + inline ur_kernel_handle_t_(ur_program_handle_t hProgram, const char *name, nativecpu_task_t subhandler) : hProgram(hProgram), _name{name}, _subhandler{std::move(subhandler)} {} diff --git a/unified-runtime/source/adapters/native_cpu/nativecpu_state.hpp b/unified-runtime/source/adapters/native_cpu/nativecpu_state.hpp index 9d6b4f4f06674..c8022293262ba 100644 --- a/unified-runtime/source/adapters/native_cpu/nativecpu_state.hpp +++ b/unified-runtime/source/adapters/native_cpu/nativecpu_state.hpp @@ -20,7 +20,7 @@ struct state { size_t MNumGroups[3]; size_t MGlobalOffset[3]; uint32_t NumSubGroups, SubGroup_id, SubGroup_local_id, SubGroup_size; - state(size_t globalR0, size_t globalR1, size_t globalR2, size_t localR0, + inline state(size_t globalR0, size_t globalR1, size_t globalR2, size_t localR0, size_t localR1, size_t localR2, size_t globalO0, size_t globalO1, size_t globalO2) : MGlobal_range{globalR0, globalR1, globalR2}, @@ -42,7 +42,7 @@ struct state { SubGroup_size = 1; } - void update(size_t group0, size_t group1, size_t group2, size_t local0, + inline void update(size_t group0, size_t group1, size_t group2, size_t local0, size_t local1, size_t local2) { MWorkGroup_id[0] = group0; MWorkGroup_id[1] = group1; @@ -58,7 +58,7 @@ struct state { MWorkGroup_size[2] * MWorkGroup_id[2] + MLocal_id[2] + MGlobalOffset[2]; } - void update(size_t group0, size_t group1, size_t group2) { + inline void update(size_t group0, size_t group1, size_t group2) { MWorkGroup_id[0] = group0; MWorkGroup_id[1] = group1; MWorkGroup_id[2] = group2; diff --git a/unified-runtime/source/adapters/native_cpu/threadpool.hpp b/unified-runtime/source/adapters/native_cpu/threadpool.hpp index cc98eec407b36..25d6d60e8da39 100755 --- a/unified-runtime/source/adapters/native_cpu/threadpool.hpp +++ b/unified-runtime/source/adapters/native_cpu/threadpool.hpp @@ -208,24 +208,30 @@ template class threadpool_interface { } }; -template struct SchedulerBase { - TP &TPref; - SchedulerBase(TP &ref) : TPref(ref) {} +class TasksInfo_TP { + using FType = std::future; + std::vector futures; +public: + inline void schedule(FType &&f) { futures.emplace_back(std::move(f)); } + inline void wait() { + for (auto &f : futures) + f.wait(); + } }; -template struct Scheduler : SchedulerBase { - using SchedulerBase::SchedulerBase; +template struct Scheduler_base { + TP &ref; + TaskInfo ti; + Scheduler_base(TP &ref_) : ref(ref_) {} + TaskInfo getTaskInfo() { return std::move(ti); } +}; + +template struct Scheduler : Scheduler_base { + using Scheduler_base::Scheduler_base; template void schedule(T &&task) { - futures.emplace_back(this->TPref.schedule_task(std::forward(task))); - } - inline void wait() { - for (auto &f : futures) - f.get(); + this->ti.schedule(this->ref.schedule_task(std::forward(task))); } - -private: - std::vector> futures; }; using simple_threadpool_t = threadpool_interface; @@ -233,8 +239,6 @@ template inline Scheduler getScheduler(TPType &tp) { return Scheduler(tp); } -using threadpool_t = simple_threadpool_t; - } // namespace native_cpu #ifdef NATIVECPU_WITH_ONETBB @@ -248,19 +252,32 @@ struct TBB_threadpool { return oneapi::tbb::info::default_concurrency(); } }; -template <> struct Scheduler : SchedulerBase { - using SchedulerBase::SchedulerBase; + +struct TBB_TasksInfo { + TBB_threadpool *tp; + inline void wait() { 
tp->tasks.wait(); } +}; + +template <> struct Scheduler : Scheduler_base { + using Scheduler_base::Scheduler_base; template inline void schedule(T &&task) { - TPref.tasks.run(std::function([=]() mutable { + ref.tasks.run(std::function([=]() mutable { auto thread_id = tbb::this_task_arena::current_thread_index(); assert(thread_id >= 0 && thread_id < oneapi::tbb::info::default_concurrency()); task(thread_id); })); } - inline void wait() { TPref.tasks.wait(); } }; +using TasksInfoType = TBB_TasksInfo; +using ThreadPoolType = TBB_threadpool; } // namespace native_cpu +#else +// The default backend +namespace native_cpu { +using TasksInfoType = TasksInfo_TP; +using ThreadPoolType = simple_threadpool_t; +} #endif From 07c178d8ab3075870af4e6f3bf9c2716842661e2 Mon Sep 17 00:00:00 2001 From: Uwe Dolinsky Date: Thu, 14 Nov 2024 09:56:49 +0000 Subject: [PATCH 24/95] [NATIVECPU] fix merge with events update --- .../source/adapters/native_cpu/device.hpp | 2 +- .../source/adapters/native_cpu/event.cpp | 3 ++- .../source/adapters/native_cpu/event.hpp | 4 ++-- .../source/adapters/native_cpu/threadpool.hpp | 17 ++++++++++------- 4 files changed, 15 insertions(+), 11 deletions(-) diff --git a/unified-runtime/source/adapters/native_cpu/device.hpp b/unified-runtime/source/adapters/native_cpu/device.hpp index e0cf3872b548c..358e9a37b60c2 100644 --- a/unified-runtime/source/adapters/native_cpu/device.hpp +++ b/unified-runtime/source/adapters/native_cpu/device.hpp @@ -14,7 +14,7 @@ #include struct ur_device_handle_t_ { - native_cpu::ThreadPoolType tp; + native_cpu::threadpool_t tp; ur_device_handle_t_(ur_platform_handle_t ArgPlt); diff --git a/unified-runtime/source/adapters/native_cpu/event.cpp b/unified-runtime/source/adapters/native_cpu/event.cpp index e81c44301dc2b..9c8f566e4156d 100644 --- a/unified-runtime/source/adapters/native_cpu/event.cpp +++ b/unified-runtime/source/adapters/native_cpu/event.cpp @@ -13,6 +13,7 @@ #include "common.hpp" #include "event.hpp" #include "queue.hpp" +#include "device.hpp" #include #include @@ -124,7 +125,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueTimestampRecordingExp( ur_event_handle_t_::ur_event_handle_t_(ur_queue_handle_t queue, ur_command_t command_type) : queue(queue), context(queue->getContext()), command_type(command_type), - done(false) { + done(false), futures(queue->getDevice()->tp) { this->queue->addEvent(this); } diff --git a/unified-runtime/source/adapters/native_cpu/event.hpp b/unified-runtime/source/adapters/native_cpu/event.hpp index 2c72475a38d0b..74fd74380857e 100644 --- a/unified-runtime/source/adapters/native_cpu/event.hpp +++ b/unified-runtime/source/adapters/native_cpu/event.hpp @@ -44,7 +44,7 @@ struct ur_event_handle_t_ : RefCounted { ur_command_t getCommandType() const { return command_type; } // todo: get rid of this function - void set_futures(native_cpu::TasksInfoType &&fs) { + void set_futures(native_cpu::tasksinfo_t &&fs) { std::lock_guard lock(mutex); futures = std::move(fs); } @@ -63,7 +63,7 @@ struct ur_event_handle_t_ : RefCounted { ur_command_t command_type; bool done; std::mutex mutex; - native_cpu::TasksInfoType futures; + native_cpu::tasksinfo_t futures; std::packaged_task callback; uint64_t timestamp_start = 0; uint64_t timestamp_end = 0; diff --git a/unified-runtime/source/adapters/native_cpu/threadpool.hpp b/unified-runtime/source/adapters/native_cpu/threadpool.hpp index 25d6d60e8da39..6c7c4a8abe4da 100755 --- a/unified-runtime/source/adapters/native_cpu/threadpool.hpp +++ 
b/unified-runtime/source/adapters/native_cpu/threadpool.hpp @@ -207,6 +207,7 @@ template class threadpool_interface { return ret; } }; +using simple_threadpool_t = threadpool_interface; class TasksInfo_TP { using FType = std::future; @@ -217,12 +218,13 @@ class TasksInfo_TP { for (auto &f : futures) f.wait(); } + TasksInfo_TP(simple_threadpool_t &) {} }; template struct Scheduler_base { TP &ref; TaskInfo ti; - Scheduler_base(TP &ref_) : ref(ref_) {} + Scheduler_base(TP &ref_) : ref(ref_), ti(ref_) {} TaskInfo getTaskInfo() { return std::move(ti); } }; @@ -234,7 +236,6 @@ template struct Scheduler : Scheduler_base { } }; -using simple_threadpool_t = threadpool_interface; template inline Scheduler getScheduler(TPType &tp) { return Scheduler(tp); } @@ -253,9 +254,11 @@ struct TBB_threadpool { } }; -struct TBB_TasksInfo { +class TBB_TasksInfo { TBB_threadpool *tp; +public: inline void wait() { tp->tasks.wait(); } + TBB_TasksInfo(TBB_threadpool &t) : tp(&t) {} }; template <> struct Scheduler : Scheduler_base { @@ -270,14 +273,14 @@ template <> struct Scheduler : Scheduler_base Date: Thu, 14 Nov 2024 12:29:39 +0000 Subject: [PATCH 25/95] [NATIVECPU] revert noise --- unified-runtime/source/adapters/native_cpu/device.hpp | 1 - 1 file changed, 1 deletion(-) diff --git a/unified-runtime/source/adapters/native_cpu/device.hpp b/unified-runtime/source/adapters/native_cpu/device.hpp index 358e9a37b60c2..2308c1a7f4597 100644 --- a/unified-runtime/source/adapters/native_cpu/device.hpp +++ b/unified-runtime/source/adapters/native_cpu/device.hpp @@ -15,7 +15,6 @@ struct ur_device_handle_t_ { native_cpu::threadpool_t tp; - ur_device_handle_t_(ur_platform_handle_t ArgPlt); const uint64_t mem_size; From 59d731a03655f5fee8e2806711a13b7339207976 Mon Sep 17 00:00:00 2001 From: Uwe Dolinsky Date: Thu, 14 Nov 2024 12:41:46 +0000 Subject: [PATCH 26/95] [NATIVECPU] fix integer size warnings --- unified-runtime/source/adapters/native_cpu/context.hpp | 2 +- unified-runtime/source/adapters/native_cpu/enqueue.cpp | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/unified-runtime/source/adapters/native_cpu/context.hpp b/unified-runtime/source/adapters/native_cpu/context.hpp index b9d2d22dd1565..8168e0d10eaab 100644 --- a/unified-runtime/source/adapters/native_cpu/context.hpp +++ b/unified-runtime/source/adapters/native_cpu/context.hpp @@ -116,7 +116,7 @@ struct ur_context_handle_t_ : RefCounted { // We need to ensure that we align to at least alignof(usm_alloc_info), // otherwise its start address may be unaligned. 
alignment = - std::max(alignment, alignof(native_cpu::usm_alloc_info)); + std::max(alignment, alignof(native_cpu::usm_alloc_info)); void *alloc = native_cpu::malloc_impl(alignment, size); if (!alloc) return nullptr; diff --git a/unified-runtime/source/adapters/native_cpu/enqueue.cpp b/unified-runtime/source/adapters/native_cpu/enqueue.cpp index 937b500cf1e7b..65c47713d241a 100755 --- a/unified-runtime/source/adapters/native_cpu/enqueue.cpp +++ b/unified-runtime/source/adapters/native_cpu/enqueue.cpp @@ -460,7 +460,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferFill( // TODO: error checking // TODO: handle async void *startingPtr = hBuffer->_mem + offset; - unsigned steps = size / patternSize; + size_t steps = size / patternSize; for (unsigned i = 0; i < steps; i++) { memcpy(static_cast(startingPtr) + i * patternSize, pPattern, patternSize); @@ -601,7 +601,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMFill( break; } default: { - for (unsigned int step{0}; step < size; step += patternSize) { + for (size_t step{0}; step < size; step += patternSize) { auto *dest = reinterpret_cast( reinterpret_cast(ptr) + step); memcpy(dest, pPattern, patternSize); From e0341ef6868e27bc0feeda97e492068738817252 Mon Sep 17 00:00:00 2001 From: Uwe Dolinsky Date: Tue, 26 Nov 2024 17:26:27 +0000 Subject: [PATCH 27/95] [NATIVECPU] update oneTBB tag --- unified-runtime/source/adapters/native_cpu/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unified-runtime/source/adapters/native_cpu/CMakeLists.txt b/unified-runtime/source/adapters/native_cpu/CMakeLists.txt index 8614272f95887..612331d49abbb 100644 --- a/unified-runtime/source/adapters/native_cpu/CMakeLists.txt +++ b/unified-runtime/source/adapters/native_cpu/CMakeLists.txt @@ -59,7 +59,7 @@ if(NATIVECPU_WITH_ONETBB) FetchContent_Declare( tbb GIT_REPOSITORY https://github.com/oneapi-src/oneTBB.git - GIT_TAG 377a91431ec62c5e296dbeca683c5d1e66d69f32 + GIT_TAG bef1519a4216d77042637c3f48af2c060a5213d1 CMAKE_ARGS "-DTBB_TEST:BOOL=OFF -DTBB_EXAMPLES:BOOL=OFF -DTBB_BENCH:BOOL=OFF" OVERRIDE_FIND_PACKAGE ) From e719ec006f6ceaffd5264ff48d53991fd3e67cee Mon Sep 17 00:00:00 2001 From: Uwe Dolinsky Date: Fri, 29 Nov 2024 10:56:53 +0000 Subject: [PATCH 28/95] [NATIVECPU] use oneTBB UXL github --- unified-runtime/source/adapters/native_cpu/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unified-runtime/source/adapters/native_cpu/CMakeLists.txt b/unified-runtime/source/adapters/native_cpu/CMakeLists.txt index 612331d49abbb..21b77e948f3c2 100644 --- a/unified-runtime/source/adapters/native_cpu/CMakeLists.txt +++ b/unified-runtime/source/adapters/native_cpu/CMakeLists.txt @@ -58,7 +58,7 @@ if(NATIVECPU_WITH_ONETBB) include(FetchContent) FetchContent_Declare( tbb - GIT_REPOSITORY https://github.com/oneapi-src/oneTBB.git + GIT_REPOSITORY https://github.com/uxlfoundation/oneTBB.git GIT_TAG bef1519a4216d77042637c3f48af2c060a5213d1 CMAKE_ARGS "-DTBB_TEST:BOOL=OFF -DTBB_EXAMPLES:BOOL=OFF -DTBB_BENCH:BOOL=OFF" OVERRIDE_FIND_PACKAGE From 81c3c8210edce370decacac0ceb1c6ac9890c674 Mon Sep 17 00:00:00 2001 From: Uwe Dolinsky Date: Thu, 12 Dec 2024 19:07:34 +0000 Subject: [PATCH 29/95] [NATIVECPU] undefine _DEBUG in release builds for tbb --- unified-runtime/source/adapters/native_cpu/CMakeLists.txt | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/unified-runtime/source/adapters/native_cpu/CMakeLists.txt b/unified-runtime/source/adapters/native_cpu/CMakeLists.txt index 21b77e948f3c2..ad20f54825ae0 100644 
--- a/unified-runtime/source/adapters/native_cpu/CMakeLists.txt +++ b/unified-runtime/source/adapters/native_cpu/CMakeLists.txt @@ -98,5 +98,11 @@ if(NATIVECPU_WITH_ONETBB) target_compile_options(tbbmalloc PRIVATE -Wno-cast-qual) endif() + # Undefine _DEBUG option in release builds to find + # release tbbbind + if (NOT uppercase_CMAKE_BUILD_TYPE STREQUAL "DEBUG") + target_compile_options(tbb PRIVATE -U_DEBUG) + endif() + target_compile_definitions(${TARGET_NAME} PRIVATE NATIVECPU_WITH_ONETBB) endif() From ecaf51b67fe88800f96adbae19391f56b3059b5d Mon Sep 17 00:00:00 2001 From: Uwe Dolinsky Date: Mon, 27 Jan 2025 15:25:39 +0000 Subject: [PATCH 30/95] [NATIVECPU] oneTBB bump --- unified-runtime/source/adapters/native_cpu/CMakeLists.txt | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/unified-runtime/source/adapters/native_cpu/CMakeLists.txt b/unified-runtime/source/adapters/native_cpu/CMakeLists.txt index ad20f54825ae0..50559d57ba9da 100644 --- a/unified-runtime/source/adapters/native_cpu/CMakeLists.txt +++ b/unified-runtime/source/adapters/native_cpu/CMakeLists.txt @@ -59,7 +59,11 @@ if(NATIVECPU_WITH_ONETBB) FetchContent_Declare( tbb GIT_REPOSITORY https://github.com/uxlfoundation/oneTBB.git - GIT_TAG bef1519a4216d77042637c3f48af2c060a5213d1 +#commit 9d4578723827f31defd79389819a5fbf659577f7 (HEAD -> master, origin/master, origin/HEAD) +#Author: Konstantin Boyarinov +#Date: Fri Jan 24 23:23:59 2025 +0200 +# Add explicit deduction guides for blocked_nd_range (#1525) + GIT_TAG 9d4578723827f31defd79389819a5fbf659577f7 CMAKE_ARGS "-DTBB_TEST:BOOL=OFF -DTBB_EXAMPLES:BOOL=OFF -DTBB_BENCH:BOOL=OFF" OVERRIDE_FIND_PACKAGE ) From f5d6547c26cb33741b94c53951fe502e5d8d5a38 Mon Sep 17 00:00:00 2001 From: Uwe Dolinsky Date: Tue, 28 Jan 2025 10:09:32 +0000 Subject: [PATCH 31/95] [NATIVECPU] clang-format and removed one inline --- unified-runtime/source/adapters/native_cpu/event.cpp | 2 +- unified-runtime/source/adapters/native_cpu/event.hpp | 2 +- unified-runtime/source/adapters/native_cpu/kernel.hpp | 2 +- .../source/adapters/native_cpu/nativecpu_state.hpp | 6 +++--- unified-runtime/source/adapters/native_cpu/threadpool.hpp | 6 +++++- 5 files changed, 11 insertions(+), 7 deletions(-) diff --git a/unified-runtime/source/adapters/native_cpu/event.cpp b/unified-runtime/source/adapters/native_cpu/event.cpp index 9c8f566e4156d..57d8359b2eda1 100644 --- a/unified-runtime/source/adapters/native_cpu/event.cpp +++ b/unified-runtime/source/adapters/native_cpu/event.cpp @@ -11,9 +11,9 @@ #include "ur_api.h" #include "common.hpp" +#include "device.hpp" #include "event.hpp" #include "queue.hpp" -#include "device.hpp" #include #include diff --git a/unified-runtime/source/adapters/native_cpu/event.hpp b/unified-runtime/source/adapters/native_cpu/event.hpp index 74fd74380857e..876936284ffe5 100644 --- a/unified-runtime/source/adapters/native_cpu/event.hpp +++ b/unified-runtime/source/adapters/native_cpu/event.hpp @@ -9,12 +9,12 @@ //===----------------------------------------------------------------------===// #pragma once #include "common.hpp" +#include "threadpool.hpp" #include "ur_api.h" #include #include #include #include -#include "threadpool.hpp" struct ur_event_handle_t_ : RefCounted { diff --git a/unified-runtime/source/adapters/native_cpu/kernel.hpp b/unified-runtime/source/adapters/native_cpu/kernel.hpp index 0466c1e0a1444..3e5d07eed6ee5 100644 --- a/unified-runtime/source/adapters/native_cpu/kernel.hpp +++ b/unified-runtime/source/adapters/native_cpu/kernel.hpp @@ -28,7 +28,7 @@ struct 
local_arg_info_t { struct ur_kernel_handle_t_ : RefCounted { - inline ur_kernel_handle_t_(ur_program_handle_t hProgram, const char *name, + ur_kernel_handle_t_(ur_program_handle_t hProgram, const char *name, nativecpu_task_t subhandler) : hProgram(hProgram), _name{name}, _subhandler{std::move(subhandler)} {} diff --git a/unified-runtime/source/adapters/native_cpu/nativecpu_state.hpp b/unified-runtime/source/adapters/native_cpu/nativecpu_state.hpp index c8022293262ba..b9109f647ecc7 100644 --- a/unified-runtime/source/adapters/native_cpu/nativecpu_state.hpp +++ b/unified-runtime/source/adapters/native_cpu/nativecpu_state.hpp @@ -20,9 +20,9 @@ struct state { size_t MNumGroups[3]; size_t MGlobalOffset[3]; uint32_t NumSubGroups, SubGroup_id, SubGroup_local_id, SubGroup_size; - inline state(size_t globalR0, size_t globalR1, size_t globalR2, size_t localR0, - size_t localR1, size_t localR2, size_t globalO0, size_t globalO1, - size_t globalO2) + inline state(size_t globalR0, size_t globalR1, size_t globalR2, + size_t localR0, size_t localR1, size_t localR2, size_t globalO0, + size_t globalO1, size_t globalO2) : MGlobal_range{globalR0, globalR1, globalR2}, MWorkGroup_size{localR0, localR1, localR2}, MNumGroups{globalR0 / localR0, globalR1 / localR1, globalR2 / localR2}, diff --git a/unified-runtime/source/adapters/native_cpu/threadpool.hpp b/unified-runtime/source/adapters/native_cpu/threadpool.hpp index 6c7c4a8abe4da..6f08389287179 100755 --- a/unified-runtime/source/adapters/native_cpu/threadpool.hpp +++ b/unified-runtime/source/adapters/native_cpu/threadpool.hpp @@ -212,6 +212,7 @@ using simple_threadpool_t = threadpool_interface; class TasksInfo_TP { using FType = std::future; std::vector futures; + public: inline void schedule(FType &&f) { futures.emplace_back(std::move(f)); } inline void wait() { @@ -256,12 +257,15 @@ struct TBB_threadpool { class TBB_TasksInfo { TBB_threadpool *tp; + public: inline void wait() { tp->tasks.wait(); } TBB_TasksInfo(TBB_threadpool &t) : tp(&t) {} }; -template <> struct Scheduler : Scheduler_base { +template <> +struct Scheduler + : Scheduler_base { using Scheduler_base::Scheduler_base; template inline void schedule(T &&task) { ref.tasks.run(std::function([=]() mutable { From e975e77db9c00aa6a02fa2b2986f2459e597af58 Mon Sep 17 00:00:00 2001 From: Uwe Dolinsky Date: Tue, 28 Jan 2025 12:10:04 +0000 Subject: [PATCH 32/95] [NATIVECPU] clang-format --- unified-runtime/source/adapters/native_cpu/nativecpu_state.hpp | 2 +- unified-runtime/source/adapters/native_cpu/threadpool.hpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/unified-runtime/source/adapters/native_cpu/nativecpu_state.hpp b/unified-runtime/source/adapters/native_cpu/nativecpu_state.hpp index b9109f647ecc7..68743c33cf65a 100644 --- a/unified-runtime/source/adapters/native_cpu/nativecpu_state.hpp +++ b/unified-runtime/source/adapters/native_cpu/nativecpu_state.hpp @@ -43,7 +43,7 @@ struct state { } inline void update(size_t group0, size_t group1, size_t group2, size_t local0, - size_t local1, size_t local2) { + size_t local1, size_t local2) { MWorkGroup_id[0] = group0; MWorkGroup_id[1] = group1; MWorkGroup_id[2] = group2; diff --git a/unified-runtime/source/adapters/native_cpu/threadpool.hpp b/unified-runtime/source/adapters/native_cpu/threadpool.hpp index 6f08389287179..a9a5c7f1b4260 100755 --- a/unified-runtime/source/adapters/native_cpu/threadpool.hpp +++ b/unified-runtime/source/adapters/native_cpu/threadpool.hpp @@ -286,5 +286,5 @@ using threadpool_t = TBB_threadpool; namespace 
native_cpu { using tasksinfo_t = TasksInfo_TP; using threadpool_t = simple_threadpool_t; -} +} // namespace native_cpu #endif From 26a5bd055e7a06ef19de8198f0b9b21391d2456e Mon Sep 17 00:00:00 2001 From: Uwe Dolinsky Date: Tue, 28 Jan 2025 12:47:34 +0000 Subject: [PATCH 33/95] [NATIVECPU] removed inline --- unified-runtime/source/adapters/native_cpu/enqueue.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/unified-runtime/source/adapters/native_cpu/enqueue.cpp b/unified-runtime/source/adapters/native_cpu/enqueue.cpp index 65c47713d241a..d94e5502721e1 100755 --- a/unified-runtime/source/adapters/native_cpu/enqueue.cpp +++ b/unified-runtime/source/adapters/native_cpu/enqueue.cpp @@ -26,8 +26,8 @@ struct NDRDescT { RangeT GlobalOffset; RangeT GlobalSize; RangeT LocalSize; - inline NDRDescT(uint32_t WorkDim, const size_t *GlobalWorkOffset, - const size_t *GlobalWorkSize, const size_t *LocalWorkSize) + NDRDescT(uint32_t WorkDim, const size_t *GlobalWorkOffset, + const size_t *GlobalWorkSize, const size_t *LocalWorkSize) : WorkDim(WorkDim) { for (uint32_t I = 0; I < WorkDim; I++) { GlobalOffset[I] = GlobalWorkOffset[I]; From 38a91f71a27cfce075dcecb8c8504f387c1ad8cb Mon Sep 17 00:00:00 2001 From: Uwe Dolinsky Date: Tue, 28 Jan 2025 13:08:22 +0000 Subject: [PATCH 34/95] [NATIVECPU] renamed wait to wait_all --- unified-runtime/source/adapters/native_cpu/event.cpp | 2 +- unified-runtime/source/adapters/native_cpu/threadpool.hpp | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/unified-runtime/source/adapters/native_cpu/event.cpp b/unified-runtime/source/adapters/native_cpu/event.cpp index 57d8359b2eda1..953779ad33966 100644 --- a/unified-runtime/source/adapters/native_cpu/event.cpp +++ b/unified-runtime/source/adapters/native_cpu/event.cpp @@ -140,7 +140,7 @@ void ur_event_handle_t_::wait() { if (done) { return; } - this->futures.wait(); + this->futures.wait_all(); queue->removeEvent(this); done = true; // The callback may need to acquire the lock, so we unlock it here diff --git a/unified-runtime/source/adapters/native_cpu/threadpool.hpp b/unified-runtime/source/adapters/native_cpu/threadpool.hpp index a9a5c7f1b4260..0a224ce544406 100755 --- a/unified-runtime/source/adapters/native_cpu/threadpool.hpp +++ b/unified-runtime/source/adapters/native_cpu/threadpool.hpp @@ -215,7 +215,7 @@ class TasksInfo_TP { public: inline void schedule(FType &&f) { futures.emplace_back(std::move(f)); } - inline void wait() { + inline void wait_all() { for (auto &f : futures) f.wait(); } @@ -259,7 +259,7 @@ class TBB_TasksInfo { TBB_threadpool *tp; public: - inline void wait() { tp->tasks.wait(); } + inline void wait_all() { tp->tasks.wait(); } TBB_TasksInfo(TBB_threadpool &t) : tp(&t) {} }; From b31bd4433264dc423194e59d69f78229570ce653 Mon Sep 17 00:00:00 2001 From: Uwe Dolinsky Date: Mon, 3 Feb 2025 20:30:51 +0000 Subject: [PATCH 35/95] [NATIVECPU] move --- unified-runtime/source/adapters/native_cpu/threadpool.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/unified-runtime/source/adapters/native_cpu/threadpool.hpp b/unified-runtime/source/adapters/native_cpu/threadpool.hpp index 0a224ce544406..7da229cd9fec3 100755 --- a/unified-runtime/source/adapters/native_cpu/threadpool.hpp +++ b/unified-runtime/source/adapters/native_cpu/threadpool.hpp @@ -267,8 +267,8 @@ template <> struct Scheduler : Scheduler_base { using Scheduler_base::Scheduler_base; - template inline void schedule(T &&task) { - ref.tasks.run(std::function([=]() mutable { + template inline 
void schedule(T &&task_) { + ref.tasks.run(std::function([task = std::move(task_)]() mutable { auto thread_id = tbb::this_task_arena::current_thread_index(); assert(thread_id >= 0 && thread_id < oneapi::tbb::info::default_concurrency()); From 960b1d50353f6cca12f8cccf3f5ea9f52123edf5 Mon Sep 17 00:00:00 2001 From: Uwe Dolinsky Date: Fri, 28 Feb 2025 10:00:03 +0000 Subject: [PATCH 36/95] [NATIVECPU] removed unused groups --- unified-runtime/source/adapters/native_cpu/enqueue.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/unified-runtime/source/adapters/native_cpu/enqueue.cpp b/unified-runtime/source/adapters/native_cpu/enqueue.cpp index 40232868ea65e..e3efa5e3c774d 100644 --- a/unified-runtime/source/adapters/native_cpu/enqueue.cpp +++ b/unified-runtime/source/adapters/native_cpu/enqueue.cpp @@ -130,7 +130,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( auto &tp = hQueue->getDevice()->tp; const size_t numParallelThreads = tp.num_threads(); std::vector> futures; - std::vector> groups; auto numWG0 = ndr.GlobalSize[0] / ndr.LocalSize[0]; auto numWG1 = ndr.GlobalSize[1] / ndr.LocalSize[1]; auto numWG2 = ndr.GlobalSize[2] / ndr.LocalSize[2]; From 04bd48abee0f319f6909fd118c90b8405f6e46dc Mon Sep 17 00:00:00 2001 From: Uwe Dolinsky Date: Thu, 27 Mar 2025 16:32:23 +0000 Subject: [PATCH 37/95] [NATIVECPU] added async memcpy --- .../source/adapters/native_cpu/enqueue.cpp | 58 ++++++++++++++----- 1 file changed, 43 insertions(+), 15 deletions(-) diff --git a/unified-runtime/source/adapters/native_cpu/enqueue.cpp b/unified-runtime/source/adapters/native_cpu/enqueue.cpp index e3efa5e3c774d..6f33b23f80bd6 100644 --- a/unified-runtime/source/adapters/native_cpu/enqueue.cpp +++ b/unified-runtime/source/adapters/native_cpu/enqueue.cpp @@ -299,26 +299,43 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( return UR_RESULT_SUCCESS; } -template +template static inline ur_result_t withTimingEvent(ur_command_t command_type, ur_queue_handle_t hQueue, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, - ur_event_handle_t *phEvent, T &&f) { + ur_event_handle_t *phEvent, T &&f, I &&inv) { urEventWait(numEventsInWaitList, phEventWaitList); ur_event_handle_t event = nullptr; if (phEvent) { - event = new ur_event_handle_t_(hQueue, command_type); + ur_event_handle_t event = new ur_event_handle_t_(hQueue, command_type); event->tick_start(); + ur_result_t result = inv(std::forward(f), event); + *phEvent = event; + return result; } - ur_result_t result = f(); + return result; +} - if (phEvent) { +struct BlockingWithEvent { + template + ur_result_t operator()(T &&op, ur_event_handle_t event) const { + ur_result_t result = op(); event->tick_end(); - *phEvent = event; + return result; } - return result; +}; + +template +static inline ur_result_t +withTimingEvent(ur_command_t command_type, ur_queue_handle_t hQueue, + uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent, T &&f) { + return withTimingEvent(command_type, hQueue, numEventsInWaitList, + phEventWaitList, phEvent, std::forward(f), + BlockingWithEvent()); } UR_APIEXPORT ur_result_t UR_APICALL urEnqueueEventsWait( @@ -654,18 +671,29 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMMemcpy( ur_queue_handle_t hQueue, bool blocking, void *pDst, const void *pSrc, size_t size, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { - std::ignore = blocking; + UR_ASSERT(hQueue, UR_RESULT_ERROR_INVALID_QUEUE); + UR_ASSERT(pDst, 
UR_RESULT_ERROR_INVALID_NULL_POINTER); + UR_ASSERT(pSrc, UR_RESULT_ERROR_INVALID_NULL_POINTER); + + auto Inv = [blocking, hQueue, size](auto &&f, ur_event_handle_t event) { + if (blocking || size < 100) + return BlockingWithEvent()(f, event); + auto &tp = hQueue->getDevice()->tp; + std::vector> futures; + futures.emplace_back(tp.schedule_task([f](size_t) { f(); })); + event->set_futures(futures); + event->set_callback([event]() { event->tick_end(); }); + return UR_RESULT_SUCCESS; + }; + // blocking op return withTimingEvent( UR_COMMAND_USM_MEMCPY, hQueue, numEventsInWaitList, phEventWaitList, - phEvent, [&]() { - UR_ASSERT(hQueue, UR_RESULT_ERROR_INVALID_QUEUE); - UR_ASSERT(pDst, UR_RESULT_ERROR_INVALID_NULL_POINTER); - UR_ASSERT(pSrc, UR_RESULT_ERROR_INVALID_NULL_POINTER); - + phEvent, + [pDst, pSrc, size]() { memcpy(pDst, pSrc, size); - return UR_RESULT_SUCCESS; - }); + }, + Inv); } UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMPrefetch( From 7008b8ba778ea5e79d1c25b49ac1773b61d7265b Mon Sep 17 00:00:00 2001 From: Uwe Dolinsky Date: Fri, 11 Apr 2025 09:30:55 +0100 Subject: [PATCH 38/95] [NATIVECPU] added non-blocking invoker, removed unused variable --- .../source/adapters/native_cpu/enqueue.cpp | 22 +++++++++++++------ 1 file changed, 15 insertions(+), 7 deletions(-) diff --git a/unified-runtime/source/adapters/native_cpu/enqueue.cpp b/unified-runtime/source/adapters/native_cpu/enqueue.cpp index 6f33b23f80bd6..67bb4cb00ac7d 100644 --- a/unified-runtime/source/adapters/native_cpu/enqueue.cpp +++ b/unified-runtime/source/adapters/native_cpu/enqueue.cpp @@ -306,7 +306,6 @@ withTimingEvent(ur_command_t command_type, ur_queue_handle_t hQueue, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent, T &&f, I &&inv) { urEventWait(numEventsInWaitList, phEventWaitList); - ur_event_handle_t event = nullptr; if (phEvent) { ur_event_handle_t event = new ur_event_handle_t_(hQueue, command_type); event->tick_start(); @@ -327,6 +326,20 @@ struct BlockingWithEvent { } }; +struct NonBlocking { + ur_queue_handle_t hQueue; + NonBlocking(ur_queue_handle_t hQueue) : hQueue(hQueue) {} + template + ur_result_t operator()(T &&op, ur_event_handle_t event) const { + auto &tp = hQueue->getDevice()->tp; + std::vector> futures; + futures.emplace_back(tp.schedule_task([op](size_t) { op(); })); + event->set_futures(futures); + event->set_callback([event]() { event->tick_end(); }); + return UR_RESULT_SUCCESS; + } +}; + template static inline ur_result_t withTimingEvent(ur_command_t command_type, ur_queue_handle_t hQueue, @@ -678,12 +691,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMMemcpy( auto Inv = [blocking, hQueue, size](auto &&f, ur_event_handle_t event) { if (blocking || size < 100) return BlockingWithEvent()(f, event); - auto &tp = hQueue->getDevice()->tp; - std::vector> futures; - futures.emplace_back(tp.schedule_task([f](size_t) { f(); })); - event->set_futures(futures); - event->set_callback([event]() { event->tick_end(); }); - return UR_RESULT_SUCCESS; + return NonBlocking(hQueue)(f, event); }; // blocking op return withTimingEvent( From 2f1b3fed3f33668f3f4445a8e2d43639e455511e Mon Sep 17 00:00:00 2001 From: Uwe Dolinsky Date: Wed, 16 Apr 2025 10:16:28 +0100 Subject: [PATCH 39/95] [NATIVECPU] waiting for events in threads --- .../source/adapters/native_cpu/enqueue.cpp | 48 +++++++++++++++---- 1 file changed, 38 insertions(+), 10 deletions(-) diff --git a/unified-runtime/source/adapters/native_cpu/enqueue.cpp b/unified-runtime/source/adapters/native_cpu/enqueue.cpp index 
6cd1f6af8e660..bd0181f91433b 100644 --- a/unified-runtime/source/adapters/native_cpu/enqueue.cpp +++ b/unified-runtime/source/adapters/native_cpu/enqueue.cpp @@ -50,6 +50,16 @@ struct NDRDescT { << GlobalOffset[2] << "\n"; } }; + +namespace { +struct WaitInfo { + std::vector events; + WaitInfo() = default; + WaitInfo(uint32_t numEvents, const ur_event_handle_t *WaitList) + : events(WaitList, WaitList + numEvents) {} + void wait() const { urEventWait(events.size(), events.data()); } +}; +} // namespace } // namespace native_cpu #ifdef NATIVECPU_USE_OCK @@ -69,7 +79,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( const size_t *pLocalWorkSize, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { - urEventWait(numEventsInWaitList, phEventWaitList); UR_ASSERT(hQueue, UR_RESULT_ERROR_INVALID_NULL_HANDLE); UR_ASSERT(hKernel, UR_RESULT_ERROR_INVALID_NULL_HANDLE); UR_ASSERT(pGlobalWorkOffset, UR_RESULT_ERROR_INVALID_NULL_POINTER); @@ -123,6 +132,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( kernel->updateMemPool(numParallelThreads); #ifndef NATIVECPU_USE_OCK + urEventWait(numEventsInWaitList, phEventWaitList); for (unsigned g2 = 0; g2 < numWG2; g2++) { for (unsigned g1 = 0; g1 < numWG1; g1++) { for (unsigned g0 = 0; g0 < numWG0; g0++) { @@ -138,6 +148,11 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( } } #else + native_cpu::WaitInfo *const InEvents = + (numEventsInWaitList && phEventWaitList) + ? new native_cpu::WaitInfo(numEventsInWaitList, phEventWaitList) + : nullptr; + bool isLocalSizeOne = ndr.LocalSize[0] == 1 && ndr.LocalSize[1] == 1 && ndr.LocalSize[2] == 1; if (isLocalSizeOne && ndr.GlobalSize[0] > numParallelThreads && @@ -157,15 +172,19 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( size_t new_num_work_groups_0 = numParallelThreads; size_t itemsPerThread = ndr.GlobalSize[0] / numParallelThreads; + bool doneWaiting = false; for (unsigned g2 = 0; g2 < numWG2; g2++) { for (unsigned g1 = 0; g1 < numWG1; g1++) { for (unsigned g0 = 0; g0 < new_num_work_groups_0; g0 += 1) { - futures.emplace_back(tp.schedule_task( - [ndr, itemsPerThread, &kernel = *kernel, g0, g1, g2](size_t) { + futures.emplace_back( + tp.schedule_task([ndr, itemsPerThread, &kernel = *kernel, g0, g1, + g2, InEvents](size_t) { native_cpu::state resized_state = getResizedState(ndr, itemsPerThread); resized_state.update(g0, g1, g2); + if (InEvents) + InEvents->wait(); kernel._subhandler(kernel.getArgs().data(), &resized_state); })); } @@ -174,6 +193,10 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( for (unsigned g0 = new_num_work_groups_0 * itemsPerThread; g0 < numWG0; g0++) { state.update(g0, g1, g2); + if (InEvents && !doneWaiting) { + InEvents->wait(); + doneWaiting = true; + } kernel->_subhandler(kernel->getArgs().data(), &state); } } @@ -186,11 +209,13 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( // Dimensions 1 and 2 have enough work, split them across the threadpool for (unsigned g2 = 0; g2 < numWG2; g2++) { for (unsigned g1 = 0; g1 < numWG1; g1++) { - futures.emplace_back( - tp.schedule_task([state, &kernel = *kernel, numWG0, g1, g2, - numParallelThreads](size_t threadId) mutable { + futures.emplace_back(tp.schedule_task( + [state, &kernel = *kernel, numWG0, g1, g2, numParallelThreads, + InEvents](size_t threadId) mutable { for (unsigned g0 = 0; g0 < numWG0; g0++) { state.update(g0, g1, g2); + if (InEvents) + InEvents->wait(); kernel._subhandler( 
kernel.getArgs(numParallelThreads, threadId).data(), &state); @@ -205,10 +230,12 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( for (unsigned g2 = 0; g2 < numWG2; g2++) { for (unsigned g1 = 0; g1 < numWG1; g1++) { for (unsigned g0 = 0; g0 < numWG0; g0++) { - groups.push_back([state, g0, g1, g2, numParallelThreads]( - size_t threadId, - ur_kernel_handle_t_ &kernel) mutable { + groups.push_back([state, g0, g1, g2, numParallelThreads, + InEvents](size_t threadId, + ur_kernel_handle_t_ &kernel) mutable { state.update(g0, g1, g2); + if (InEvents) + InEvents->wait(); kernel._subhandler( kernel.getArgs(numParallelThreads, threadId).data(), &state); }); @@ -252,10 +279,11 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( if (phEvent) { *phEvent = event; } - event->set_callback([kernel = std::move(kernel), hKernel, event]() { + event->set_callback([kernel = std::move(kernel), hKernel, event, InEvents]() { event->tick_end(); // TODO: avoid calling clear() here. hKernel->_localArgInfo.clear(); + delete InEvents; }); if (hQueue->isInOrder()) { From 8efb1e437658b81404112031fc097b6341ddc3c1 Mon Sep 17 00:00:00 2001 From: Uwe Dolinsky Date: Tue, 22 Apr 2025 16:51:28 +0100 Subject: [PATCH 40/95] [NATIVECPU] ndrange enqueue with less work for main thread --- .../source/adapters/native_cpu/enqueue.cpp | 87 ++++++------------- 1 file changed, 27 insertions(+), 60 deletions(-) diff --git a/unified-runtime/source/adapters/native_cpu/enqueue.cpp b/unified-runtime/source/adapters/native_cpu/enqueue.cpp index e3efa5e3c774d..bf63d53fee5b0 100644 --- a/unified-runtime/source/adapters/native_cpu/enqueue.cpp +++ b/unified-runtime/source/adapters/native_cpu/enqueue.cpp @@ -206,25 +206,25 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( } else { // We are running a parallel_for over an nd_range - const auto numWG0_per_thread = numWG0 / numParallelThreads; - if (numWG0_per_thread) { - for (size_t t = 0, WG0_start = 0; t < numParallelThreads; t++) { - IndexT first = {WG0_start, 0, 0}; - WG0_start += numWG0_per_thread; - IndexT last = {WG0_start, numWG1, numWG2}; - futures.emplace_back( - tp.schedule_task([state, numParallelThreads, &kernel = *kernel, - first, last](size_t threadId) mutable { - execute_range(state, kernel, - kernel.getArgs(numParallelThreads, threadId), first, - last); - })); - } - size_t start_wg0_remainder = numWG0_per_thread * numParallelThreads; - if (start_wg0_remainder < numWG0) { - IndexT first = {start_wg0_remainder, 0, 0}; - IndexT last = {numWG0, numWG1, numWG2}; + const IndexT numWG = {numWG0, numWG1, numWG2}; + IndexT groupsPerThread; + for (size_t t = 0; t < 3; t++) + groupsPerThread[t] = numWG[t] / numParallelThreads; + size_t dim = 0; + if (groupsPerThread[0] == 0) { + if (groupsPerThread[1]) + dim = 1; + else if (groupsPerThread[2]) + dim = 2; + } + IndexT first = {0, 0, 0}, last = numWG; + size_t wg_start = 0; + if (groupsPerThread[dim]) { + for (size_t t = 0; t < numParallelThreads; t++) { + first[dim] = wg_start; + wg_start += groupsPerThread[dim]; + last[dim] = wg_start; futures.emplace_back( tp.schedule_task([state, numParallelThreads, &kernel = *kernel, first, last](size_t threadId) mutable { @@ -233,50 +233,17 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( last); })); } - - } else { - // Here we try to create groups of workgroups in order to reduce - // synchronization overhead - - // todo: deal with overflow - auto numGroups = numWG2 * numWG1 * numWG0; - auto groupsPerThread = numGroups / numParallelThreads; - - IndexT first = 
{0, 0, 0}; - size_t counter = 0; - if (groupsPerThread) { - for (unsigned g2 = 0; g2 < numWG2; g2++) { - for (unsigned g1 = 0; g1 < numWG1; g1++) { - for (unsigned g0 = 0; g0 < numWG0; g0++) { - if (counter == 0) - first = {g0, g1, g2}; - if (++counter == groupsPerThread) { - IndexT last = {g0 + 1, g1 + 1, g2 + 1}; - futures.emplace_back(tp.schedule_task( - [state, numParallelThreads, &kernel = *kernel, first, - last](size_t threadId) mutable { - execute_range( - state, kernel, + } + if (wg_start < numWG[dim]) { + first[dim] = wg_start; + last[dim] = numWG[dim]; + futures.emplace_back( + tp.schedule_task([state, numParallelThreads, &kernel = *kernel, first, + last](size_t threadId) mutable { + execute_range(state, kernel, kernel.getArgs(numParallelThreads, threadId), first, last); - })); - counter = 0; - } - } - } - } - } - if (numGroups % numParallelThreads) { - // we have a remainder - IndexT last = {numWG0, numWG1, numWG2}; - futures.emplace_back( - tp.schedule_task([state, numParallelThreads, &kernel = *kernel, - first, last](size_t threadId) mutable { - execute_range(state, kernel, - kernel.getArgs(numParallelThreads, threadId), first, - last); - })); - } + })); } } From 2c5218664cf9d4f8be7333ca576c09e0e3818edc Mon Sep 17 00:00:00 2001 From: Uwe Dolinsky Date: Tue, 22 Apr 2025 17:02:57 +0100 Subject: [PATCH 41/95] [NATIVECPU] static_assert for pointer type --- unified-runtime/source/adapters/native_cpu/enqueue.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/unified-runtime/source/adapters/native_cpu/enqueue.cpp b/unified-runtime/source/adapters/native_cpu/enqueue.cpp index bd0181f91433b..cac2fa1dd2b89 100644 --- a/unified-runtime/source/adapters/native_cpu/enqueue.cpp +++ b/unified-runtime/source/adapters/native_cpu/enqueue.cpp @@ -55,6 +55,7 @@ namespace { struct WaitInfo { std::vector events; WaitInfo() = default; + static_assert(std::is_pointer_v); WaitInfo(uint32_t numEvents, const ur_event_handle_t *WaitList) : events(WaitList, WaitList + numEvents) {} void wait() const { urEventWait(events.size(), events.data()); } From 5348490068b7e91713b2f542550c58398ef70e91 Mon Sep 17 00:00:00 2001 From: Uwe Dolinsky Date: Tue, 22 Apr 2025 17:59:40 +0100 Subject: [PATCH 42/95] [NATIVECPU] added anonymous namespace --- unified-runtime/source/adapters/native_cpu/enqueue.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/unified-runtime/source/adapters/native_cpu/enqueue.cpp b/unified-runtime/source/adapters/native_cpu/enqueue.cpp index 7e16e1fd81e9a..439d52ef43d85 100644 --- a/unified-runtime/source/adapters/native_cpu/enqueue.cpp +++ b/unified-runtime/source/adapters/native_cpu/enqueue.cpp @@ -284,6 +284,7 @@ withTimingEvent(ur_command_t command_type, ur_queue_handle_t hQueue, return result; } +namespace { struct BlockingWithEvent { template ur_result_t operator()(T &&op, ur_event_handle_t event) const { @@ -306,6 +307,7 @@ struct NonBlocking { return UR_RESULT_SUCCESS; } }; +} // namespace template static inline ur_result_t From 1de1251079b23671624f9aa2870415b6bef1eb49 Mon Sep 17 00:00:00 2001 From: Uwe Dolinsky Date: Tue, 22 Apr 2025 19:28:44 +0100 Subject: [PATCH 43/95] [NATIVECPU] separated out Invokers for enqueues --- .../source/adapters/native_cpu/enqueue.cpp | 30 ++++++++++++------- 1 file changed, 19 insertions(+), 11 deletions(-) diff --git a/unified-runtime/source/adapters/native_cpu/enqueue.cpp b/unified-runtime/source/adapters/native_cpu/enqueue.cpp index 439d52ef43d85..23252c88be796 100644 --- a/unified-runtime/source/adapters/native_cpu/enqueue.cpp +++ 
b/unified-runtime/source/adapters/native_cpu/enqueue.cpp @@ -276,7 +276,7 @@ withTimingEvent(ur_command_t command_type, ur_queue_handle_t hQueue, if (phEvent) { ur_event_handle_t event = new ur_event_handle_t_(hQueue, command_type); event->tick_start(); - ur_result_t result = inv(std::forward(f), event); + ur_result_t result = inv(std::forward(f), event, hQueue); *phEvent = event; return result; } @@ -287,7 +287,8 @@ withTimingEvent(ur_command_t command_type, ur_queue_handle_t hQueue, namespace { struct BlockingWithEvent { template - ur_result_t operator()(T &&op, ur_event_handle_t event) const { + ur_result_t operator()(T &&op, ur_event_handle_t event, + ur_queue_handle_t) const { ur_result_t result = op(); event->tick_end(); return result; @@ -295,10 +296,9 @@ struct BlockingWithEvent { }; struct NonBlocking { - ur_queue_handle_t hQueue; - NonBlocking(ur_queue_handle_t hQueue) : hQueue(hQueue) {} template - ur_result_t operator()(T &&op, ur_event_handle_t event) const { + ur_result_t operator()(T &&op, ur_event_handle_t event, + ur_queue_handle_t hQueue) const { auto &tp = hQueue->getDevice()->tp; std::vector> futures; futures.emplace_back(tp.schedule_task([op](size_t) { op(); })); @@ -307,6 +307,19 @@ struct NonBlocking { return UR_RESULT_SUCCESS; } }; + +struct Invoker { + const bool blocking; + Invoker(bool blocking) : blocking(blocking) {} + template + ur_result_t operator()(T &&f, ur_event_handle_t event, + ur_queue_handle_t hQueue) const { + if (blocking) + return BlockingWithEvent()(std::forward(f), event, hQueue); + return NonBlocking()(std::forward(f), event, hQueue); + }; +}; + } // namespace template @@ -624,11 +637,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMMemcpy( UR_ASSERT(pDst, UR_RESULT_ERROR_INVALID_NULL_POINTER); UR_ASSERT(pSrc, UR_RESULT_ERROR_INVALID_NULL_POINTER); - auto Inv = [blocking, hQueue, size](auto &&f, ur_event_handle_t event) { - if (blocking || size < 100) - return BlockingWithEvent()(f, event); - return NonBlocking(hQueue)(f, event); - }; return withTimingEvent( UR_COMMAND_USM_MEMCPY, hQueue, numEventsInWaitList, phEventWaitList, phEvent, @@ -636,7 +644,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMMemcpy( memcpy(pDst, pSrc, size); return UR_RESULT_SUCCESS; }, - Inv); + Invoker(blocking || size < 100)); } UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMPrefetch( From 9173f5ebc27c1202cb47a46d89e33a077126a4c3 Mon Sep 17 00:00:00 2001 From: Uwe Dolinsky Date: Wed, 23 Apr 2025 09:51:53 +0100 Subject: [PATCH 44/95] [NATIVECPU] made more memops async --- .../source/adapters/native_cpu/enqueue.cpp | 39 +++++++++++-------- 1 file changed, 22 insertions(+), 17 deletions(-) diff --git a/unified-runtime/source/adapters/native_cpu/enqueue.cpp b/unified-runtime/source/adapters/native_cpu/enqueue.cpp index 23252c88be796..e016e6cc79367 100644 --- a/unified-runtime/source/adapters/native_cpu/enqueue.cpp +++ b/unified-runtime/source/adapters/native_cpu/enqueue.cpp @@ -406,39 +406,43 @@ static inline ur_result_t enqueueMemBufferReadWriteRect_impl( }); } -static inline ur_result_t doCopy_impl(ur_queue_handle_t hQueue, void *DstPtr, - const void *SrcPtr, size_t Size, - uint32_t numEventsInWaitList, - const ur_event_handle_t *phEventWaitList, - ur_event_handle_t *phEvent, - ur_command_t command_type) { - return withTimingEvent(command_type, hQueue, numEventsInWaitList, - phEventWaitList, phEvent, [&]() { - if (SrcPtr != DstPtr && Size) - memmove(DstPtr, SrcPtr, Size); - return UR_RESULT_SUCCESS; - }); +template +static inline ur_result_t 
+doCopy_impl(ur_queue_handle_t hQueue, void *DstPtr, const void *SrcPtr, + size_t Size, uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent, ur_command_t command_type, T &&Inv) { + return withTimingEvent( + command_type, hQueue, numEventsInWaitList, phEventWaitList, phEvent, + [DstPtr, SrcPtr, Size]() { + if (SrcPtr != DstPtr && Size) + memmove(DstPtr, SrcPtr, Size); + return UR_RESULT_SUCCESS; + }, + Inv); } UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferRead( - ur_queue_handle_t hQueue, ur_mem_handle_t hBuffer, bool /*blockingRead*/, + ur_queue_handle_t hQueue, ur_mem_handle_t hBuffer, bool blockingRead, size_t offset, size_t size, void *pDst, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { void *FromPtr = /*Src*/ hBuffer->_mem + offset; auto res = doCopy_impl(hQueue, pDst, FromPtr, size, numEventsInWaitList, - phEventWaitList, phEvent, UR_COMMAND_MEM_BUFFER_READ); + phEventWaitList, phEvent, UR_COMMAND_MEM_BUFFER_READ, + Invoker(blockingRead)); return res; } UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferWrite( - ur_queue_handle_t hQueue, ur_mem_handle_t hBuffer, bool /*blockingWrite*/, + ur_queue_handle_t hQueue, ur_mem_handle_t hBuffer, bool blockingWrite, size_t offset, size_t size, const void *pSrc, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { void *ToPtr = hBuffer->_mem + offset; auto res = doCopy_impl(hQueue, ToPtr, pSrc, size, numEventsInWaitList, - phEventWaitList, phEvent, UR_COMMAND_MEM_BUFFER_WRITE); + phEventWaitList, phEvent, UR_COMMAND_MEM_BUFFER_WRITE, + Invoker(blockingWrite)); return res; } @@ -477,7 +481,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferCopy( const void *SrcPtr = hBufferSrc->_mem + srcOffset; void *DstPtr = hBufferDst->_mem + dstOffset; return doCopy_impl(hQueue, DstPtr, SrcPtr, size, numEventsInWaitList, - phEventWaitList, phEvent, UR_COMMAND_MEM_BUFFER_COPY); + phEventWaitList, phEvent, UR_COMMAND_MEM_BUFFER_COPY, + BlockingWithEvent() /*TODO: check blocking*/); } UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferCopyRect( From 7cd7caa8848d25065c676bdf25fc0526d5452f7d Mon Sep 17 00:00:00 2001 From: Uwe Dolinsky Date: Wed, 23 Apr 2025 10:11:31 +0100 Subject: [PATCH 45/95] [NATIVECPU] memop pointer check outside worker lambda --- unified-runtime/source/adapters/native_cpu/enqueue.cpp | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/unified-runtime/source/adapters/native_cpu/enqueue.cpp b/unified-runtime/source/adapters/native_cpu/enqueue.cpp index e016e6cc79367..baa97cb1af841 100644 --- a/unified-runtime/source/adapters/native_cpu/enqueue.cpp +++ b/unified-runtime/source/adapters/native_cpu/enqueue.cpp @@ -412,11 +412,15 @@ doCopy_impl(ur_queue_handle_t hQueue, void *DstPtr, const void *SrcPtr, size_t Size, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent, ur_command_t command_type, T &&Inv) { + if (SrcPtr == DstPtr || Size == 0) + return withTimingEvent( + command_type, hQueue, numEventsInWaitList, phEventWaitList, phEvent, + []() { return UR_RESULT_SUCCESS; }, BlockingWithEvent()); + return withTimingEvent( command_type, hQueue, numEventsInWaitList, phEventWaitList, phEvent, [DstPtr, SrcPtr, Size]() { - if (SrcPtr != DstPtr && Size) - memmove(DstPtr, SrcPtr, Size); + memmove(DstPtr, SrcPtr, Size); return UR_RESULT_SUCCESS; }, Inv); From 32ecf0928b749a839b2b3f1a9745c5284cb67a1d Mon Sep 17 
00:00:00 2001 From: Uwe Dolinsky Date: Wed, 23 Apr 2025 15:18:07 +0100 Subject: [PATCH 46/95] [NATIVECPU] moved inEvents --- unified-runtime/source/adapters/native_cpu/enqueue.cpp | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/unified-runtime/source/adapters/native_cpu/enqueue.cpp b/unified-runtime/source/adapters/native_cpu/enqueue.cpp index 75b1e63db2fd6..158f472a4382e 100644 --- a/unified-runtime/source/adapters/native_cpu/enqueue.cpp +++ b/unified-runtime/source/adapters/native_cpu/enqueue.cpp @@ -132,6 +132,11 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( auto kernel = std::make_unique(*hKernel); kernel->updateMemPool(numParallelThreads); + native_cpu::WaitInfo *const InEvents = + (numEventsInWaitList && phEventWaitList) + ? new native_cpu::WaitInfo(numEventsInWaitList, phEventWaitList) + : nullptr; + #ifndef NATIVECPU_USE_OCK urEventWait(numEventsInWaitList, phEventWaitList); for (unsigned g2 = 0; g2 < numWG2; g2++) { @@ -149,11 +154,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( } } #else - native_cpu::WaitInfo *const InEvents = - (numEventsInWaitList && phEventWaitList) - ? new native_cpu::WaitInfo(numEventsInWaitList, phEventWaitList) - : nullptr; - bool isLocalSizeOne = ndr.LocalSize[0] == 1 && ndr.LocalSize[1] == 1 && ndr.LocalSize[2] == 1; if (isLocalSizeOne && ndr.GlobalSize[0] > numParallelThreads && From 4b05062a0f2660b4a10984026eaec09f37f3e53e Mon Sep 17 00:00:00 2001 From: Uwe Dolinsky Date: Mon, 28 Apr 2025 10:40:51 +0100 Subject: [PATCH 47/95] [NATIVECPU] use unique_ptr for WaitInfo --- .../source/adapters/native_cpu/enqueue.cpp | 29 ++++++++++++------- 1 file changed, 18 insertions(+), 11 deletions(-) diff --git a/unified-runtime/source/adapters/native_cpu/enqueue.cpp b/unified-runtime/source/adapters/native_cpu/enqueue.cpp index 158f472a4382e..94fcf47b29be0 100644 --- a/unified-runtime/source/adapters/native_cpu/enqueue.cpp +++ b/unified-runtime/source/adapters/native_cpu/enqueue.cpp @@ -54,12 +54,21 @@ struct NDRDescT { namespace { struct WaitInfo { std::vector events; - WaitInfo() = default; static_assert(std::is_pointer_v); WaitInfo(uint32_t numEvents, const ur_event_handle_t *WaitList) : events(WaitList, WaitList + numEvents) {} void wait() const { urEventWait(events.size(), events.data()); } }; + +inline static std::unique_ptr +getWaitInfo(uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList) { + return (numEventsInWaitList && phEventWaitList) + ? std::make_unique(numEventsInWaitList, + phEventWaitList) + : nullptr; +} + } // namespace } // namespace native_cpu @@ -132,10 +141,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( auto kernel = std::make_unique(*hKernel); kernel->updateMemPool(numParallelThreads); - native_cpu::WaitInfo *const InEvents = - (numEventsInWaitList && phEventWaitList) - ? 
new native_cpu::WaitInfo(numEventsInWaitList, phEventWaitList) - : nullptr; + auto InEvents = native_cpu::getWaitInfo(numEventsInWaitList, phEventWaitList); #ifndef NATIVECPU_USE_OCK urEventWait(numEventsInWaitList, phEventWaitList); @@ -180,7 +186,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( for (unsigned g0 = 0; g0 < new_num_work_groups_0; g0 += 1) { futures.emplace_back( tp.schedule_task([ndr, itemsPerThread, &kernel = *kernel, g0, g1, - g2, InEvents](size_t) { + g2, InEvents = InEvents.get()](size_t) { native_cpu::state resized_state = getResizedState(ndr, itemsPerThread); resized_state.update(g0, g1, g2); @@ -212,7 +218,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( for (unsigned g1 = 0; g1 < numWG1; g1++) { futures.emplace_back(tp.schedule_task( [state, &kernel = *kernel, numWG0, g1, g2, numParallelThreads, - InEvents](size_t threadId) mutable { + InEvents = InEvents.get()](size_t threadId) mutable { for (unsigned g0 = 0; g0 < numWG0; g0++) { state.update(g0, g1, g2); if (InEvents) @@ -232,8 +238,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( for (unsigned g1 = 0; g1 < numWG1; g1++) { for (unsigned g0 = 0; g0 < numWG0; g0++) { groups.push_back([state, g0, g1, g2, numParallelThreads, - InEvents](size_t threadId, - ur_kernel_handle_t_ &kernel) mutable { + InEvents = InEvents.get()]( + size_t threadId, + ur_kernel_handle_t_ &kernel) mutable { state.update(g0, g1, g2); if (InEvents) InEvents->wait(); @@ -280,11 +287,11 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( if (phEvent) { *phEvent = event; } - event->set_callback([kernel = std::move(kernel), hKernel, event, InEvents]() { + event->set_callback([kernel = std::move(kernel), hKernel, event, + InEvents = std::move(InEvents)]() { event->tick_end(); // TODO: avoid calling clear() here. 
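    // Note: the scheduled kernel tasks above capture only InEvents.get();
    // moving the unique_ptr into this callback keeps the WaitInfo alive until
    // the launch event completes, so no explicit delete is needed.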
hKernel->_localArgInfo.clear(); - delete InEvents; }); if (hQueue->isInOrder()) { From 2722cad08b7c0fbf2fe0925acc188f7e2c23452b Mon Sep 17 00:00:00 2001 From: Uwe Dolinsky Date: Mon, 28 Apr 2025 12:53:51 +0100 Subject: [PATCH 48/95] [NATIVECPU] async memcopy --- .../source/adapters/native_cpu/enqueue.cpp | 43 ++++++++++++++----- 1 file changed, 33 insertions(+), 10 deletions(-) diff --git a/unified-runtime/source/adapters/native_cpu/enqueue.cpp b/unified-runtime/source/adapters/native_cpu/enqueue.cpp index 055de9af4d6f8..bd57bdcea2db1 100644 --- a/unified-runtime/source/adapters/native_cpu/enqueue.cpp +++ b/unified-runtime/source/adapters/native_cpu/enqueue.cpp @@ -294,14 +294,15 @@ withTimingEvent(ur_command_t command_type, ur_queue_handle_t hQueue, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent, T &&f, I &&inv) { - urEventWait(numEventsInWaitList, phEventWaitList); if (phEvent) { ur_event_handle_t event = new ur_event_handle_t_(hQueue, command_type); event->tick_start(); - ur_result_t result = inv(std::forward(f), event, hQueue); + ur_result_t result = inv(std::forward(f), event, hQueue, + numEventsInWaitList, phEventWaitList); *phEvent = event; return result; } + urEventWait(numEventsInWaitList, phEventWaitList); ur_result_t result = f(); return result; } @@ -309,8 +310,10 @@ withTimingEvent(ur_command_t command_type, ur_queue_handle_t hQueue, namespace { struct BlockingWithEvent { template - ur_result_t operator()(T &&op, ur_event_handle_t event, - ur_queue_handle_t) const { + ur_result_t operator()(T &&op, ur_event_handle_t event, ur_queue_handle_t, + uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList) const { + urEventWait(numEventsInWaitList, phEventWaitList); ur_result_t result = op(); event->tick_end(); return result; @@ -320,12 +323,24 @@ struct BlockingWithEvent { struct NonBlocking { template ur_result_t operator()(T &&op, ur_event_handle_t event, - ur_queue_handle_t hQueue) const { + ur_queue_handle_t hQueue, uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList) const { auto &tp = hQueue->getDevice()->tp; std::vector> futures; - futures.emplace_back(tp.schedule_task([op](size_t) { op(); })); + native_cpu::WaitInfo *const InEvents = + (numEventsInWaitList && phEventWaitList) + ? 
new native_cpu::WaitInfo(numEventsInWaitList, phEventWaitList) + : nullptr; + futures.emplace_back(tp.schedule_task([op, InEvents](size_t) { + if (InEvents) + InEvents->wait(); + op(); + })); event->set_futures(futures); - event->set_callback([event]() { event->tick_end(); }); + event->set_callback([event, InEvents]() { + event->tick_end(); + delete InEvents; + }); return UR_RESULT_SUCCESS; } }; @@ -335,10 +350,13 @@ struct Invoker { Invoker(bool blocking) : blocking(blocking) {} template ur_result_t operator()(T &&f, ur_event_handle_t event, - ur_queue_handle_t hQueue) const { + ur_queue_handle_t hQueue, uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList) const { if (blocking) - return BlockingWithEvent()(std::forward(f), event, hQueue); - return NonBlocking()(std::forward(f), event, hQueue); + return BlockingWithEvent()(std::forward(f), event, hQueue, + numEventsInWaitList, phEventWaitList); + return NonBlocking()(std::forward(f), event, hQueue, numEventsInWaitList, + phEventWaitList); }; }; @@ -668,6 +686,11 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMMemcpy( UR_ASSERT(pDst, UR_RESULT_ERROR_INVALID_NULL_POINTER); UR_ASSERT(pSrc, UR_RESULT_ERROR_INVALID_NULL_POINTER); + if (pSrc == pDst || size == 0) + return withTimingEvent( + UR_COMMAND_USM_MEMCPY, hQueue, numEventsInWaitList, phEventWaitList, phEvent, + []() { return UR_RESULT_SUCCESS; }, BlockingWithEvent()); + return withTimingEvent( UR_COMMAND_USM_MEMCPY, hQueue, numEventsInWaitList, phEventWaitList, phEvent, From 22898b4a8774e6575b44c23715cf1f92349267b0 Mon Sep 17 00:00:00 2001 From: Uwe Dolinsky Date: Mon, 28 Apr 2025 15:38:53 +0100 Subject: [PATCH 49/95] [NATIVECPU] code reuse for memcopies --- .../source/adapters/native_cpu/enqueue.cpp | 21 ++++++------------- 1 file changed, 6 insertions(+), 15 deletions(-) diff --git a/unified-runtime/source/adapters/native_cpu/enqueue.cpp b/unified-runtime/source/adapters/native_cpu/enqueue.cpp index 526060075f31d..f77151f3f3200 100755 --- a/unified-runtime/source/adapters/native_cpu/enqueue.cpp +++ b/unified-runtime/source/adapters/native_cpu/enqueue.cpp @@ -452,7 +452,8 @@ static inline ur_result_t enqueueMemBufferReadWriteRect_impl( }); } -template +template static inline ur_result_t doCopy_impl(ur_queue_handle_t hQueue, void *DstPtr, const void *SrcPtr, size_t Size, uint32_t numEventsInWaitList, @@ -466,7 +467,7 @@ doCopy_impl(ur_queue_handle_t hQueue, void *DstPtr, const void *SrcPtr, return withTimingEvent( command_type, hQueue, numEventsInWaitList, phEventWaitList, phEvent, [DstPtr, SrcPtr, Size]() { - memmove(DstPtr, SrcPtr, Size); + copy_func(DstPtr, SrcPtr, Size); return UR_RESULT_SUCCESS; }, Inv); @@ -692,19 +693,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMMemcpy( UR_ASSERT(pDst, UR_RESULT_ERROR_INVALID_NULL_POINTER); UR_ASSERT(pSrc, UR_RESULT_ERROR_INVALID_NULL_POINTER); - if (pSrc == pDst || size == 0) - return withTimingEvent( - UR_COMMAND_USM_MEMCPY, hQueue, numEventsInWaitList, phEventWaitList, - phEvent, []() { return UR_RESULT_SUCCESS; }, BlockingWithEvent()); - - return withTimingEvent( - UR_COMMAND_USM_MEMCPY, hQueue, numEventsInWaitList, phEventWaitList, - phEvent, - [pDst, pSrc, size]() { - memcpy(pDst, pSrc, size); - return UR_RESULT_SUCCESS; - }, - Invoker(blocking || size < 100)); + return doCopy_impl(hQueue, pDst, pSrc, size, numEventsInWaitList, + phEventWaitList, phEvent, UR_COMMAND_USM_MEMCPY, + Invoker(blocking)); } UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMPrefetch( From 5d12b7ae3474cedef4b50b7719ed743668b7c88c Mon 
Sep 17 00:00:00 2001 From: Uwe Dolinsky Date: Tue, 29 Apr 2025 09:58:45 +0100 Subject: [PATCH 50/95] [NATIVECPU] removed invoker --- .../source/adapters/native_cpu/enqueue.cpp | 53 +++++++++---------- 1 file changed, 25 insertions(+), 28 deletions(-) diff --git a/unified-runtime/source/adapters/native_cpu/enqueue.cpp b/unified-runtime/source/adapters/native_cpu/enqueue.cpp index f77151f3f3200..91b343a2eb51e 100755 --- a/unified-runtime/source/adapters/native_cpu/enqueue.cpp +++ b/unified-runtime/source/adapters/native_cpu/enqueue.cpp @@ -350,22 +350,6 @@ struct NonBlocking { return UR_RESULT_SUCCESS; } }; - -struct Invoker { - const bool blocking; - Invoker(bool blocking) : blocking(blocking) {} - template - ur_result_t operator()(T &&f, ur_event_handle_t event, - ur_queue_handle_t hQueue, uint32_t numEventsInWaitList, - const ur_event_handle_t *phEventWaitList) const { - if (blocking) - return BlockingWithEvent()(std::forward(f), event, hQueue, - numEventsInWaitList, phEventWaitList); - return NonBlocking()(std::forward(f), event, hQueue, numEventsInWaitList, - phEventWaitList); - }; -}; - } // namespace template @@ -379,6 +363,21 @@ withTimingEvent(ur_command_t command_type, ur_queue_handle_t hQueue, BlockingWithEvent()); } +template +static inline ur_result_t +withTimingEvent(ur_command_t command_type, ur_queue_handle_t hQueue, + uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent, T &&f, bool blocking) { + if (blocking) + return withTimingEvent(command_type, hQueue, numEventsInWaitList, + phEventWaitList, phEvent, std::forward(f), + BlockingWithEvent()); + return withTimingEvent(command_type, hQueue, numEventsInWaitList, + phEventWaitList, phEvent, std::forward(f), + NonBlocking()); +} + UR_APIEXPORT ur_result_t UR_APICALL urEnqueueEventsWait( ur_queue_handle_t hQueue, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { @@ -452,13 +451,11 @@ static inline ur_result_t enqueueMemBufferReadWriteRect_impl( }); } -template -static inline ur_result_t -doCopy_impl(ur_queue_handle_t hQueue, void *DstPtr, const void *SrcPtr, - size_t Size, uint32_t numEventsInWaitList, - const ur_event_handle_t *phEventWaitList, - ur_event_handle_t *phEvent, ur_command_t command_type, T &&Inv) { +template +static inline ur_result_t doCopy_impl( + ur_queue_handle_t hQueue, void *DstPtr, const void *SrcPtr, size_t Size, + uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent, ur_command_t command_type, bool blocking) { if (SrcPtr == DstPtr || Size == 0) return withTimingEvent( command_type, hQueue, numEventsInWaitList, phEventWaitList, phEvent, @@ -470,7 +467,7 @@ doCopy_impl(ur_queue_handle_t hQueue, void *DstPtr, const void *SrcPtr, copy_func(DstPtr, SrcPtr, Size); return UR_RESULT_SUCCESS; }, - Inv); + blocking); } UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferRead( @@ -481,7 +478,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferRead( void *FromPtr = /*Src*/ hBuffer->_mem + offset; auto res = doCopy_impl(hQueue, pDst, FromPtr, size, numEventsInWaitList, phEventWaitList, phEvent, UR_COMMAND_MEM_BUFFER_READ, - Invoker(blockingRead)); + blockingRead); return res; } @@ -493,7 +490,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferWrite( void *ToPtr = hBuffer->_mem + offset; auto res = doCopy_impl(hQueue, ToPtr, pSrc, size, numEventsInWaitList, phEventWaitList, phEvent, UR_COMMAND_MEM_BUFFER_WRITE, - Invoker(blockingWrite)); + blockingWrite); 
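  // The boolean chooses only how withTimingEvent runs the copy: inline after
  // waiting on the incoming events (BlockingWithEvent), or deferred to a
  // threadpool task that waits and then copies (NonBlocking); the copy
  // callback itself is the same in both paths.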
return res; } @@ -533,7 +530,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferCopy( void *DstPtr = hBufferDst->_mem + dstOffset; return doCopy_impl(hQueue, DstPtr, SrcPtr, size, numEventsInWaitList, phEventWaitList, phEvent, UR_COMMAND_MEM_BUFFER_COPY, - BlockingWithEvent() /*TODO: check blocking*/); + true /*TODO: check blocking*/); } UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferCopyRect( @@ -695,7 +692,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMMemcpy( return doCopy_impl(hQueue, pDst, pSrc, size, numEventsInWaitList, phEventWaitList, phEvent, UR_COMMAND_USM_MEMCPY, - Invoker(blocking)); + blocking); } UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMPrefetch( From 400ba0d6da60fe5e218297524ebf516800a421ee Mon Sep 17 00:00:00 2001 From: Uwe Dolinsky Date: Tue, 29 Apr 2025 10:57:16 +0100 Subject: [PATCH 51/95] [NATIVECPU] removed unneeded function --- .../source/adapters/native_cpu/enqueue.cpp | 13 +------------ 1 file changed, 1 insertion(+), 12 deletions(-) diff --git a/unified-runtime/source/adapters/native_cpu/enqueue.cpp b/unified-runtime/source/adapters/native_cpu/enqueue.cpp index 91b343a2eb51e..6e688bca37e5b 100755 --- a/unified-runtime/source/adapters/native_cpu/enqueue.cpp +++ b/unified-runtime/source/adapters/native_cpu/enqueue.cpp @@ -357,18 +357,7 @@ static inline ur_result_t withTimingEvent(ur_command_t command_type, ur_queue_handle_t hQueue, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, - ur_event_handle_t *phEvent, T &&f) { - return withTimingEvent(command_type, hQueue, numEventsInWaitList, - phEventWaitList, phEvent, std::forward(f), - BlockingWithEvent()); -} - -template -static inline ur_result_t -withTimingEvent(ur_command_t command_type, ur_queue_handle_t hQueue, - uint32_t numEventsInWaitList, - const ur_event_handle_t *phEventWaitList, - ur_event_handle_t *phEvent, T &&f, bool blocking) { + ur_event_handle_t *phEvent, T &&f, bool blocking = true) { if (blocking) return withTimingEvent(command_type, hQueue, numEventsInWaitList, phEventWaitList, phEvent, std::forward(f), From 40f7270fff3927e3b1821667fce0337bc0b67417 Mon Sep 17 00:00:00 2001 From: Uwe Dolinsky Date: Tue, 29 Apr 2025 14:43:12 +0100 Subject: [PATCH 52/95] [NATIVECPU] async wait in noop copy --- unified-runtime/source/adapters/native_cpu/enqueue.cpp | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/unified-runtime/source/adapters/native_cpu/enqueue.cpp b/unified-runtime/source/adapters/native_cpu/enqueue.cpp index 6e688bca37e5b..3032c7e6e6381 100755 --- a/unified-runtime/source/adapters/native_cpu/enqueue.cpp +++ b/unified-runtime/source/adapters/native_cpu/enqueue.cpp @@ -445,10 +445,12 @@ static inline ur_result_t doCopy_impl( ur_queue_handle_t hQueue, void *DstPtr, const void *SrcPtr, size_t Size, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent, ur_command_t command_type, bool blocking) { - if (SrcPtr == DstPtr || Size == 0) + if (SrcPtr == DstPtr || Size == 0) { + bool hasInEvents = numEventsInWaitList && phEventWaitList; return withTimingEvent( command_type, hQueue, numEventsInWaitList, phEventWaitList, phEvent, - []() { return UR_RESULT_SUCCESS; }, BlockingWithEvent()); + []() { return UR_RESULT_SUCCESS; }, blocking || !hasInEvents); + } return withTimingEvent( command_type, hQueue, numEventsInWaitList, phEventWaitList, phEvent, From bd161bcd058cc623c7ef4629b8ac98f43b4db15b Mon Sep 17 00:00:00 2001 From: Uwe Dolinsky Date: Tue, 29 Apr 2025 16:00:49 +0100 Subject: [PATCH 53/95] 
[NATIVECPU] async membuffer ops --- .../source/adapters/native_cpu/enqueue.cpp | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/unified-runtime/source/adapters/native_cpu/enqueue.cpp b/unified-runtime/source/adapters/native_cpu/enqueue.cpp index 3032c7e6e6381..1292091649b3b 100755 --- a/unified-runtime/source/adapters/native_cpu/enqueue.cpp +++ b/unified-runtime/source/adapters/native_cpu/enqueue.cpp @@ -339,7 +339,7 @@ struct NonBlocking { auto InEvents = native_cpu::getWaitInfo(numEventsInWaitList, phEventWaitList); futures.emplace_back( - tp.schedule_task([op, InEvents = InEvents.get()](size_t) { + tp.schedule_task([op, InEvents = InEvents.get()](size_t) mutable { if (InEvents) InEvents->wait(); op(); @@ -395,7 +395,7 @@ UR_APIEXPORT ur_result_t urEnqueueEventsWaitWithBarrierExt( template static inline ur_result_t enqueueMemBufferReadWriteRect_impl( - ur_queue_handle_t hQueue, ur_mem_handle_t Buff, bool, + ur_queue_handle_t hQueue, ur_mem_handle_t Buff, bool blocking, ur_rect_offset_t BufferOffset, ur_rect_offset_t HostOffset, ur_rect_region_t region, size_t BufferRowPitch, size_t BufferSlicePitch, size_t HostRowPitch, size_t HostSlicePitch, @@ -408,7 +408,9 @@ static inline ur_result_t enqueueMemBufferReadWriteRect_impl( else command_t = UR_COMMAND_MEM_BUFFER_WRITE_RECT; return withTimingEvent( - command_t, hQueue, NumEventsInWaitList, phEventWaitList, phEvent, [&]() { + command_t, hQueue, NumEventsInWaitList, phEventWaitList, phEvent, + [BufferRowPitch, region, BufferSlicePitch, HostRowPitch, HostSlicePitch, + BufferOffset, HostOffset, Buff, DstMem]() mutable { // TODO: blocking, check other constraints, performance optimizations // More sharing with level_zero where possible @@ -437,7 +439,8 @@ static inline ur_result_t enqueueMemBufferReadWriteRect_impl( } return UR_RESULT_SUCCESS; - }); + }, + blocking); } template @@ -532,7 +535,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferCopyRect( uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { return enqueueMemBufferReadWriteRect_impl( - hQueue, hBufferSrc, false /*todo: check blocking*/, srcOrigin, + hQueue, hBufferSrc, true /*todo: check blocking*/, srcOrigin, /*HostOffset*/ dstOrigin, region, srcRowPitch, srcSlicePitch, dstRowPitch, dstSlicePitch, hBufferDst->_mem, numEventsInWaitList, phEventWaitList, phEvent); From 870754a2df5d85a89a94739d7acde9bbb83836bf Mon Sep 17 00:00:00 2001 From: Uwe Dolinsky Date: Wed, 30 Apr 2025 17:20:59 +0100 Subject: [PATCH 54/95] [NATIVECPU] quick fix for in-order queues --- unified-runtime/source/adapters/native_cpu/enqueue.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unified-runtime/source/adapters/native_cpu/enqueue.cpp b/unified-runtime/source/adapters/native_cpu/enqueue.cpp index 1292091649b3b..e4926d78c20d3 100755 --- a/unified-runtime/source/adapters/native_cpu/enqueue.cpp +++ b/unified-runtime/source/adapters/native_cpu/enqueue.cpp @@ -358,7 +358,7 @@ withTimingEvent(ur_command_t command_type, ur_queue_handle_t hQueue, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent, T &&f, bool blocking = true) { - if (blocking) + if (blocking || hQueue->isInOrder()) return withTimingEvent(command_type, hQueue, numEventsInWaitList, phEventWaitList, phEvent, std::forward(f), BlockingWithEvent()); From e11f5966df87f502f32f9c5b25adda9d3129a472 Mon Sep 17 00:00:00 2001 From: Uwe Dolinsky Date: Wed, 30 Apr 2025 19:21:54 +0100 Subject: [PATCH 55/95] 
[NATIVECPU] construct state inside thread --- .../source/adapters/native_cpu/enqueue.cpp | 26 ++++++++++--------- 1 file changed, 14 insertions(+), 12 deletions(-) diff --git a/unified-runtime/source/adapters/native_cpu/enqueue.cpp b/unified-runtime/source/adapters/native_cpu/enqueue.cpp index e4926d78c20d3..02b87c1b2ec14 100755 --- a/unified-runtime/source/adapters/native_cpu/enqueue.cpp +++ b/unified-runtime/source/adapters/native_cpu/enqueue.cpp @@ -72,16 +72,18 @@ getWaitInfo(uint32_t numEventsInWaitList, } // namespace } // namespace native_cpu -#ifdef NATIVECPU_USE_OCK -static native_cpu::state getResizedState(const native_cpu::NDRDescT &ndr, - size_t itemsPerThread) { +static inline native_cpu::state getResizedState(const native_cpu::NDRDescT &ndr, + size_t itemsPerThread) { native_cpu::state resized_state( ndr.GlobalSize[0], ndr.GlobalSize[1], ndr.GlobalSize[2], itemsPerThread, ndr.LocalSize[1], ndr.LocalSize[2], ndr.GlobalOffset[0], ndr.GlobalOffset[1], ndr.GlobalOffset[2]); return resized_state; } -#endif + +static inline native_cpu::state getState(const native_cpu::NDRDescT &ndr) { + return getResizedState(ndr, ndr.LocalSize[0]); +} using IndexT = std::array; using RangeT = native_cpu::NDRDescT::RangeT; @@ -152,10 +154,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( auto numWG0 = ndr.GlobalSize[0] / ndr.LocalSize[0]; auto numWG1 = ndr.GlobalSize[1] / ndr.LocalSize[1]; auto numWG2 = ndr.GlobalSize[2] / ndr.LocalSize[2]; - native_cpu::state state(ndr.GlobalSize[0], ndr.GlobalSize[1], - ndr.GlobalSize[2], ndr.LocalSize[0], ndr.LocalSize[1], - ndr.LocalSize[2], ndr.GlobalOffset[0], - ndr.GlobalOffset[1], ndr.GlobalOffset[2]); auto event = new ur_event_handle_t_(hQueue, UR_COMMAND_KERNEL_LAUNCH); event->tick_start(); @@ -166,6 +164,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( auto InEvents = native_cpu::getWaitInfo(numEventsInWaitList, phEventWaitList); #ifndef NATIVECPU_USE_OCK + native_cpu::state state = getState(ndr); urEventWait(numEventsInWaitList, phEventWaitList); for (unsigned g2 = 0; g2 < numWG2; g2++) { for (unsigned g1 = 0; g1 < numWG1; g1++) { @@ -221,12 +220,13 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( // Peel the remaining work items. Since the local size is 1, we iterate // over the work groups. 
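    // i.e. a single extra task handles work-groups [start_wg0_remainder,
    // numWG0) in dimension 0, across all of dimensions 1 and 2.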
futures.emplace_back(tp.schedule_task( - [state, &kernel = *kernel, start_wg0_remainder, numWG0, numWG1, - numWG2, InEvents = InEvents.get()](size_t) mutable { + [ndr, &kernel = *kernel, start_wg0_remainder, numWG0, numWG1, numWG2, + InEvents = InEvents.get()](size_t) mutable { IndexT first = {start_wg0_remainder, 0, 0}; IndexT last = {numWG0, numWG1, numWG2}; if (InEvents) InEvents->wait(); + native_cpu::state state = getState(ndr); execute_range(state, kernel, first, last); })); } @@ -252,10 +252,11 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( wg_start += groupsPerThread[dim]; last[dim] = wg_start; futures.emplace_back(tp.schedule_task( - [state, numParallelThreads, &kernel = *kernel, first, last, + [ndr, numParallelThreads, &kernel = *kernel, first, last, InEvents = InEvents.get()](size_t threadId) mutable { if (InEvents) InEvents->wait(); + native_cpu::state state = getState(ndr); execute_range(state, kernel, kernel.getArgs(numParallelThreads, threadId), first, last); @@ -266,10 +267,11 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( first[dim] = wg_start; last[dim] = numWG[dim]; futures.emplace_back(tp.schedule_task( - [state, numParallelThreads, &kernel = *kernel, first, last, + [ndr, numParallelThreads, &kernel = *kernel, first, last, InEvents = InEvents.get()](size_t threadId) mutable { if (InEvents) InEvents->wait(); + native_cpu::state state = getState(ndr); execute_range(state, kernel, kernel.getArgs(numParallelThreads, threadId), first, last); From b4069d1c8d0ff8d4c58b449dfd0572c17e657a82 Mon Sep 17 00:00:00 2001 From: Uwe Dolinsky Date: Thu, 1 May 2025 14:00:19 +0100 Subject: [PATCH 56/95] [NATIVECPU] update comments --- unified-runtime/source/adapters/native_cpu/enqueue.cpp | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/unified-runtime/source/adapters/native_cpu/enqueue.cpp b/unified-runtime/source/adapters/native_cpu/enqueue.cpp index 02b87c1b2ec14..7b36f8985a0c9 100755 --- a/unified-runtime/source/adapters/native_cpu/enqueue.cpp +++ b/unified-runtime/source/adapters/native_cpu/enqueue.cpp @@ -413,7 +413,7 @@ static inline ur_result_t enqueueMemBufferReadWriteRect_impl( command_t, hQueue, NumEventsInWaitList, phEventWaitList, phEvent, [BufferRowPitch, region, BufferSlicePitch, HostRowPitch, HostSlicePitch, BufferOffset, HostOffset, Buff, DstMem]() mutable { - // TODO: blocking, check other constraints, performance optimizations + // TODO: check other constraints, performance optimizations // More sharing with level_zero where possible if (BufferRowPitch == 0) @@ -526,7 +526,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferCopy( void *DstPtr = hBufferDst->_mem + dstOffset; return doCopy_impl(hQueue, DstPtr, SrcPtr, size, numEventsInWaitList, phEventWaitList, phEvent, UR_COMMAND_MEM_BUFFER_COPY, - true /*TODO: check blocking*/); + true /*TODO: check false for non-blocking*/); } UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferCopyRect( @@ -537,7 +537,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferCopyRect( uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { return enqueueMemBufferReadWriteRect_impl( - hQueue, hBufferSrc, true /*todo: check blocking*/, srcOrigin, + hQueue, hBufferSrc, true /*todo: check false for non-blocking*/, + srcOrigin, /*HostOffset*/ dstOrigin, region, srcRowPitch, srcSlicePitch, dstRowPitch, dstSlicePitch, hBufferDst->_mem, numEventsInWaitList, phEventWaitList, phEvent); From dfc67d8a7342f0906cf7effedeac35a96bc08e31 Mon 
Sep 17 00:00:00 2001 From: Uwe Dolinsky Date: Fri, 2 May 2025 14:39:31 +0100 Subject: [PATCH 57/95] [NATIVECPU] removed nullptr check for pHEventWaitList --- unified-runtime/source/adapters/native_cpu/enqueue.cpp | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/unified-runtime/source/adapters/native_cpu/enqueue.cpp b/unified-runtime/source/adapters/native_cpu/enqueue.cpp index 7b36f8985a0c9..c14a97954d8e8 100755 --- a/unified-runtime/source/adapters/native_cpu/enqueue.cpp +++ b/unified-runtime/source/adapters/native_cpu/enqueue.cpp @@ -63,10 +63,9 @@ struct WaitInfo { inline static std::unique_ptr getWaitInfo(uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList) { - return (numEventsInWaitList && phEventWaitList) - ? std::make_unique(numEventsInWaitList, - phEventWaitList) - : nullptr; + return (numEventsInWaitList) ? std::make_unique( + numEventsInWaitList, phEventWaitList) + : nullptr; } } // namespace From a25b2c7b3589e6c4328bda78b1b906c6d036b1c4 Mon Sep 17 00:00:00 2001 From: Uwe Dolinsky Date: Fri, 2 May 2025 15:17:37 +0100 Subject: [PATCH 58/95] [NATIVECPU] updated oneTBB tag --- .../source/adapters/native_cpu/CMakeLists.txt | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/unified-runtime/source/adapters/native_cpu/CMakeLists.txt b/unified-runtime/source/adapters/native_cpu/CMakeLists.txt index bd1136d70c623..1cff4ef22e0d2 100644 --- a/unified-runtime/source/adapters/native_cpu/CMakeLists.txt +++ b/unified-runtime/source/adapters/native_cpu/CMakeLists.txt @@ -60,11 +60,11 @@ if(NATIVECPU_WITH_ONETBB) FetchContent_Declare( tbb GIT_REPOSITORY https://github.com/uxlfoundation/oneTBB.git -#commit 9d4578723827f31defd79389819a5fbf659577f7 (HEAD -> master, origin/master, origin/HEAD) -#Author: Konstantin Boyarinov -#Date: Fri Jan 24 23:23:59 2025 +0200 -# Add explicit deduction guides for blocked_nd_range (#1525) - GIT_TAG 9d4578723827f31defd79389819a5fbf659577f7 +#commit 7dfe7e744ac583bd1c202f6e0eff4f51269bc524 (HEAD -> master, origin/master, origin/HEAD) +#Author: Olga Malysheva +#Date: Wed Apr 30 21:29:20 2025 +0200 +# Update compiler and linker options (#1719) + GIT_TAG 7dfe7e744ac583bd1c202f6e0eff4f51269bc524 CMAKE_ARGS "-DTBB_TEST:BOOL=OFF -DTBB_EXAMPLES:BOOL=OFF -DTBB_BENCH:BOOL=OFF" OVERRIDE_FIND_PACKAGE ) From 3074b16b826e738ba98e6b86e05c9aa37df0484b Mon Sep 17 00:00:00 2001 From: Uwe Dolinsky Date: Tue, 6 May 2025 14:48:06 +0100 Subject: [PATCH 59/95] [NATIVECPU] removed unneeded mutable --- .../source/adapters/native_cpu/enqueue.cpp | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/unified-runtime/source/adapters/native_cpu/enqueue.cpp b/unified-runtime/source/adapters/native_cpu/enqueue.cpp index c14a97954d8e8..73c19b86b7a1e 100755 --- a/unified-runtime/source/adapters/native_cpu/enqueue.cpp +++ b/unified-runtime/source/adapters/native_cpu/enqueue.cpp @@ -218,9 +218,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( if (start_wg0_remainder < numWG0) { // Peel the remaining work items. Since the local size is 1, we iterate // over the work groups. 
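// Background for dropping `mutable` in this patch, as a generic illustration
// (not adapter code): a lambda may modify its by-value captures only when it
// is declared `mutable`; once the enqueue lambdas stopped mutating a captured
// state copy, the qualifier became unnecessary.
#include <cstdio>

inline void mutable_capture_demo() {
  int counter = 0;
  auto mutating = [counter]() mutable { return ++counter; }; // needs mutable
  auto readOnly = [counter]() { return counter + 1; };       // does not
  std::printf("%d %d\n", mutating(), readOnly());
}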
- futures.emplace_back(tp.schedule_task( - [ndr, &kernel = *kernel, start_wg0_remainder, numWG0, numWG1, numWG2, - InEvents = InEvents.get()](size_t) mutable { + futures.emplace_back( + tp.schedule_task([ndr, &kernel = *kernel, start_wg0_remainder, numWG0, + numWG1, numWG2, InEvents = InEvents.get()](size_t) { IndexT first = {start_wg0_remainder, 0, 0}; IndexT last = {numWG0, numWG1, numWG2}; if (InEvents) @@ -252,7 +252,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( last[dim] = wg_start; futures.emplace_back(tp.schedule_task( [ndr, numParallelThreads, &kernel = *kernel, first, last, - InEvents = InEvents.get()](size_t threadId) mutable { + InEvents = InEvents.get()](size_t threadId) { if (InEvents) InEvents->wait(); native_cpu::state state = getState(ndr); @@ -265,9 +265,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( if (wg_start < numWG[dim]) { first[dim] = wg_start; last[dim] = numWG[dim]; - futures.emplace_back(tp.schedule_task( - [ndr, numParallelThreads, &kernel = *kernel, first, last, - InEvents = InEvents.get()](size_t threadId) mutable { + futures.emplace_back( + tp.schedule_task([ndr, numParallelThreads, &kernel = *kernel, first, + last, InEvents = InEvents.get()](size_t threadId) { if (InEvents) InEvents->wait(); native_cpu::state state = getState(ndr); From 070f0cfe36743c31811746a1202af60f18df59ce Mon Sep 17 00:00:00 2001 From: Uwe Dolinsky Date: Tue, 6 May 2025 15:26:39 +0100 Subject: [PATCH 60/95] [NATIVECPU] moved lambda code from enqueueMemBufferReadWriteRect_impl into seperate function to be able to remove mutable --- .../source/adapters/native_cpu/enqueue.cpp | 68 +++++++++++-------- 1 file changed, 39 insertions(+), 29 deletions(-) diff --git a/unified-runtime/source/adapters/native_cpu/enqueue.cpp b/unified-runtime/source/adapters/native_cpu/enqueue.cpp index 73c19b86b7a1e..3be5c115dc594 100755 --- a/unified-runtime/source/adapters/native_cpu/enqueue.cpp +++ b/unified-runtime/source/adapters/native_cpu/enqueue.cpp @@ -340,7 +340,7 @@ struct NonBlocking { auto InEvents = native_cpu::getWaitInfo(numEventsInWaitList, phEventWaitList); futures.emplace_back( - tp.schedule_task([op, InEvents = InEvents.get()](size_t) mutable { + tp.schedule_task([op, InEvents = InEvents.get()](size_t) { if (InEvents) InEvents->wait(); op(); @@ -394,6 +394,40 @@ UR_APIEXPORT ur_result_t urEnqueueEventsWaitWithBarrierExt( phEventWaitList, phEvent); } +template +static inline void MemBufferReadWriteRect_impl( + ur_mem_handle_t Buff, ur_rect_offset_t BufferOffset, + ur_rect_offset_t HostOffset, ur_rect_region_t region, size_t BufferRowPitch, + size_t BufferSlicePitch, size_t HostRowPitch, size_t HostSlicePitch, + typename std::conditional::type DstMem) { + // TODO: check other constraints, performance optimizations + // More sharing with level_zero where possible + + if (BufferRowPitch == 0) + BufferRowPitch = region.width; + if (BufferSlicePitch == 0) + BufferSlicePitch = BufferRowPitch * region.height; + if (HostRowPitch == 0) + HostRowPitch = region.width; + if (HostSlicePitch == 0) + HostSlicePitch = HostRowPitch * region.height; + for (size_t w = 0; w < region.width; w++) + for (size_t h = 0; h < region.height; h++) + for (size_t d = 0; d < region.depth; d++) { + size_t buff_orign = (d + BufferOffset.z) * BufferSlicePitch + + (h + BufferOffset.y) * BufferRowPitch + w + + BufferOffset.x; + size_t host_origin = (d + HostOffset.z) * HostSlicePitch + + (h + HostOffset.y) * HostRowPitch + w + + HostOffset.x; + int8_t &buff_mem = ur_cast(Buff->_mem)[buff_orign]; + 
if constexpr (IsRead) + ur_cast(DstMem)[host_origin] = buff_mem; + else + buff_mem = ur_cast(DstMem)[host_origin]; + } +} + template static inline ur_result_t enqueueMemBufferReadWriteRect_impl( ur_queue_handle_t hQueue, ur_mem_handle_t Buff, bool blocking, @@ -411,34 +445,10 @@ static inline ur_result_t enqueueMemBufferReadWriteRect_impl( return withTimingEvent( command_t, hQueue, NumEventsInWaitList, phEventWaitList, phEvent, [BufferRowPitch, region, BufferSlicePitch, HostRowPitch, HostSlicePitch, - BufferOffset, HostOffset, Buff, DstMem]() mutable { - // TODO: check other constraints, performance optimizations - // More sharing with level_zero where possible - - if (BufferRowPitch == 0) - BufferRowPitch = region.width; - if (BufferSlicePitch == 0) - BufferSlicePitch = BufferRowPitch * region.height; - if (HostRowPitch == 0) - HostRowPitch = region.width; - if (HostSlicePitch == 0) - HostSlicePitch = HostRowPitch * region.height; - for (size_t w = 0; w < region.width; w++) - for (size_t h = 0; h < region.height; h++) - for (size_t d = 0; d < region.depth; d++) { - size_t buff_orign = (d + BufferOffset.z) * BufferSlicePitch + - (h + BufferOffset.y) * BufferRowPitch + w + - BufferOffset.x; - size_t host_origin = (d + HostOffset.z) * HostSlicePitch + - (h + HostOffset.y) * HostRowPitch + w + - HostOffset.x; - int8_t &buff_mem = ur_cast(Buff->_mem)[buff_orign]; - if constexpr (IsRead) - ur_cast(DstMem)[host_origin] = buff_mem; - else - buff_mem = ur_cast(DstMem)[host_origin]; - } - + BufferOffset, HostOffset, Buff, DstMem]() { + MemBufferReadWriteRect_impl( + Buff, BufferOffset, HostOffset, region, BufferRowPitch, + BufferSlicePitch, HostRowPitch, HostSlicePitch, DstMem); return UR_RESULT_SUCCESS; }, blocking); From 3207ffacd7674ce1966bfb21ca689a522ffa83cd Mon Sep 17 00:00:00 2001 From: Uwe Dolinsky Date: Wed, 7 May 2025 17:03:47 +0100 Subject: [PATCH 61/95] [NATIVECPU] simplified event generation --- .../source/adapters/native_cpu/enqueue.cpp | 62 +++++-------------- 1 file changed, 14 insertions(+), 48 deletions(-) diff --git a/unified-runtime/source/adapters/native_cpu/enqueue.cpp b/unified-runtime/source/adapters/native_cpu/enqueue.cpp index 3be5c115dc594..7dffab6f3e681 100755 --- a/unified-runtime/source/adapters/native_cpu/enqueue.cpp +++ b/unified-runtime/source/adapters/native_cpu/enqueue.cpp @@ -298,74 +298,40 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( return UR_RESULT_SUCCESS; } -template +template static inline ur_result_t withTimingEvent(ur_command_t command_type, ur_queue_handle_t hQueue, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, - ur_event_handle_t *phEvent, T &&f, I &&inv) { + ur_event_handle_t *phEvent, T &&f, bool blocking = true) { if (phEvent) { ur_event_handle_t event = new ur_event_handle_t_(hQueue, command_type); - event->tick_start(); - ur_result_t result = inv(std::forward(f), event, hQueue, - numEventsInWaitList, phEventWaitList); *phEvent = event; - return result; - } - urEventWait(numEventsInWaitList, phEventWaitList); - ur_result_t result = f(); - return result; -} - -namespace { -struct BlockingWithEvent { - template - ur_result_t operator()(T &&op, ur_event_handle_t event, ur_queue_handle_t, - uint32_t numEventsInWaitList, - const ur_event_handle_t *phEventWaitList) const { - urEventWait(numEventsInWaitList, phEventWaitList); - ur_result_t result = op(); - event->tick_end(); - return result; - } -}; - -struct NonBlocking { - template - ur_result_t operator()(T &&op, ur_event_handle_t event, - ur_queue_handle_t 
hQueue, uint32_t numEventsInWaitList, - const ur_event_handle_t *phEventWaitList) const { + event->tick_start(); + if (blocking || hQueue->isInOrder()) { + urEventWait(numEventsInWaitList, phEventWaitList); + ur_result_t result = f(); + event->tick_end(); + return result; + } auto &tp = hQueue->getDevice()->tp; std::vector> futures; auto InEvents = native_cpu::getWaitInfo(numEventsInWaitList, phEventWaitList); futures.emplace_back( - tp.schedule_task([op, InEvents = InEvents.get()](size_t) { + tp.schedule_task([f, InEvents = InEvents.get()](size_t) { if (InEvents) InEvents->wait(); - op(); + f(); })); event->set_futures(futures); event->set_callback( [event, InEvents = std::move(InEvents)]() { event->tick_end(); }); return UR_RESULT_SUCCESS; } -}; -} // namespace - -template -static inline ur_result_t -withTimingEvent(ur_command_t command_type, ur_queue_handle_t hQueue, - uint32_t numEventsInWaitList, - const ur_event_handle_t *phEventWaitList, - ur_event_handle_t *phEvent, T &&f, bool blocking = true) { - if (blocking || hQueue->isInOrder()) - return withTimingEvent(command_type, hQueue, numEventsInWaitList, - phEventWaitList, phEvent, std::forward(f), - BlockingWithEvent()); - return withTimingEvent(command_type, hQueue, numEventsInWaitList, - phEventWaitList, phEvent, std::forward(f), - NonBlocking()); + urEventWait(numEventsInWaitList, phEventWaitList); + ur_result_t result = f(); + return result; } UR_APIEXPORT ur_result_t UR_APICALL urEnqueueEventsWait( From 6e1f722492dd8e6565cf562095b9995f0385bb99 Mon Sep 17 00:00:00 2001 From: Uwe Dolinsky Date: Wed, 7 May 2025 19:29:33 +0100 Subject: [PATCH 62/95] [NATIVECPU] added interface to disable waiting in threads (for oneTBB) --- .../source/adapters/native_cpu/enqueue.cpp | 4 ++++ .../source/adapters/native_cpu/threadpool.hpp | 12 +++++++++--- 2 files changed, 13 insertions(+), 3 deletions(-) diff --git a/unified-runtime/source/adapters/native_cpu/enqueue.cpp b/unified-runtime/source/adapters/native_cpu/enqueue.cpp index af777a29e04ce..52e30ad8022b5 100755 --- a/unified-runtime/source/adapters/native_cpu/enqueue.cpp +++ b/unified-runtime/source/adapters/native_cpu/enqueue.cpp @@ -63,6 +63,10 @@ struct WaitInfo { inline static std::unique_ptr getWaitInfo(uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList) { + if (!native_cpu::tasksinfo_t::CanWaitInThread()) { + urEventWait(numEventsInWaitList, phEventWaitList); + return nullptr; + } return (numEventsInWaitList) ? 
std::make_unique( numEventsInWaitList, phEventWaitList) : nullptr; diff --git a/unified-runtime/source/adapters/native_cpu/threadpool.hpp b/unified-runtime/source/adapters/native_cpu/threadpool.hpp index 7da229cd9fec3..b8180421260b4 100755 --- a/unified-runtime/source/adapters/native_cpu/threadpool.hpp +++ b/unified-runtime/source/adapters/native_cpu/threadpool.hpp @@ -220,6 +220,7 @@ class TasksInfo_TP { f.wait(); } TasksInfo_TP(simple_threadpool_t &) {} + static constexpr bool CanWaitInThread() { return true; } }; template struct Scheduler_base { @@ -248,8 +249,12 @@ template inline Scheduler getScheduler(TPType &tp) { #include "oneapi/tbb.h" namespace native_cpu { -struct TBB_threadpool { +class TBB_threadpool { oneapi::tbb::task_group tasks; + +public: + void wait_all() { tasks.wait(); } + oneapi::tbb::task_group &Tasks() { return tasks; } inline size_t num_threads() const noexcept { return oneapi::tbb::info::default_concurrency(); } @@ -259,8 +264,9 @@ class TBB_TasksInfo { TBB_threadpool *tp; public: - inline void wait_all() { tp->tasks.wait(); } + inline void wait_all() { tp->wait_all(); } TBB_TasksInfo(TBB_threadpool &t) : tp(&t) {} + static constexpr bool CanWaitInThread() { return false; } }; template <> @@ -268,7 +274,7 @@ struct Scheduler : Scheduler_base { using Scheduler_base::Scheduler_base; template inline void schedule(T &&task_) { - ref.tasks.run(std::function([task = std::move(task_)]() mutable { + ref.Tasks().run(std::function([task = std::move(task_)]() mutable { auto thread_id = tbb::this_task_arena::current_thread_index(); assert(thread_id >= 0 && thread_id < oneapi::tbb::info::default_concurrency()); From 2a557f930a3f277b5e3b033e16949c381c33a3ab Mon Sep 17 00:00:00 2001 From: Uwe Dolinsky Date: Wed, 7 May 2025 19:44:29 +0100 Subject: [PATCH 63/95] [NATIVECPU] removed the now unneeded std::function wrapper for oneTBB --- unified-runtime/source/adapters/native_cpu/threadpool.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) mode change 100755 => 100644 unified-runtime/source/adapters/native_cpu/threadpool.hpp diff --git a/unified-runtime/source/adapters/native_cpu/threadpool.hpp b/unified-runtime/source/adapters/native_cpu/threadpool.hpp old mode 100755 new mode 100644 index b8180421260b4..db59deed28de4 --- a/unified-runtime/source/adapters/native_cpu/threadpool.hpp +++ b/unified-runtime/source/adapters/native_cpu/threadpool.hpp @@ -274,12 +274,12 @@ struct Scheduler : Scheduler_base { using Scheduler_base::Scheduler_base; template inline void schedule(T &&task_) { - ref.Tasks().run(std::function([task = std::move(task_)]() mutable { + ref.Tasks().run([task = std::move(task_)]() { auto thread_id = tbb::this_task_arena::current_thread_index(); assert(thread_id >= 0 && thread_id < oneapi::tbb::info::default_concurrency()); task(thread_id); - })); + }); } }; From 29c201cf2fff843f4ede3bf1e237b6d0a7c45639 Mon Sep 17 00:00:00 2001 From: Uwe Dolinsky Date: Wed, 7 May 2025 19:46:13 +0100 Subject: [PATCH 64/95] [NATIVECPU] revert accidental filemode change --- unified-runtime/source/adapters/native_cpu/enqueue.cpp | 0 1 file changed, 0 insertions(+), 0 deletions(-) mode change 100755 => 100644 unified-runtime/source/adapters/native_cpu/enqueue.cpp diff --git a/unified-runtime/source/adapters/native_cpu/enqueue.cpp b/unified-runtime/source/adapters/native_cpu/enqueue.cpp old mode 100755 new mode 100644 From 941932bd307323be10c6c746cba3a389cdac0dec Mon Sep 17 00:00:00 2001 From: Uwe Dolinsky Date: Fri, 9 May 2025 09:06:45 +0100 Subject: [PATCH 65/95] [NATIVECPU] 
replaced function pointer template parameter --- .../source/adapters/native_cpu/enqueue.cpp | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) mode change 100755 => 100644 unified-runtime/source/adapters/native_cpu/enqueue.cpp diff --git a/unified-runtime/source/adapters/native_cpu/enqueue.cpp b/unified-runtime/source/adapters/native_cpu/enqueue.cpp old mode 100755 new mode 100644 index 7dffab6f3e681..bf3766d486519 --- a/unified-runtime/source/adapters/native_cpu/enqueue.cpp +++ b/unified-runtime/source/adapters/native_cpu/enqueue.cpp @@ -420,7 +420,7 @@ static inline ur_result_t enqueueMemBufferReadWriteRect_impl( blocking); } -template +template static inline ur_result_t doCopy_impl( ur_queue_handle_t hQueue, void *DstPtr, const void *SrcPtr, size_t Size, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, @@ -435,7 +435,11 @@ static inline ur_result_t doCopy_impl( return withTimingEvent( command_type, hQueue, numEventsInWaitList, phEventWaitList, phEvent, [DstPtr, SrcPtr, Size]() { - copy_func(DstPtr, SrcPtr, Size); + if constexpr (AllowPartialOverlap) { + memmove(DstPtr, SrcPtr, Size); + } else { + memcpy(DstPtr, SrcPtr, Size); + } return UR_RESULT_SUCCESS; }, blocking); @@ -662,9 +666,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMMemcpy( UR_ASSERT(pDst, UR_RESULT_ERROR_INVALID_NULL_POINTER); UR_ASSERT(pSrc, UR_RESULT_ERROR_INVALID_NULL_POINTER); - return doCopy_impl(hQueue, pDst, pSrc, size, numEventsInWaitList, - phEventWaitList, phEvent, UR_COMMAND_USM_MEMCPY, - blocking); + return doCopy_impl( + hQueue, pDst, pSrc, size, numEventsInWaitList, phEventWaitList, phEvent, + UR_COMMAND_USM_MEMCPY, blocking); } UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMPrefetch( From 153277991cce79687688a91d956764da235fae07 Mon Sep 17 00:00:00 2001 From: Uwe Dolinsky Date: Tue, 13 May 2025 16:43:01 +0100 Subject: [PATCH 66/95] [NATIVECPU] simplified WaitInfo --- .../source/adapters/native_cpu/enqueue.cpp | 76 +++++++++---------- 1 file changed, 38 insertions(+), 38 deletions(-) diff --git a/unified-runtime/source/adapters/native_cpu/enqueue.cpp b/unified-runtime/source/adapters/native_cpu/enqueue.cpp index 0146924f344ab..526a7a8ed33a0 100644 --- a/unified-runtime/source/adapters/native_cpu/enqueue.cpp +++ b/unified-runtime/source/adapters/native_cpu/enqueue.cpp @@ -52,20 +52,27 @@ struct NDRDescT { }; namespace { -struct WaitInfo { - std::vector events; +class WaitInfo { + std::vector *const events; static_assert(std::is_pointer_v); + +public: WaitInfo(uint32_t numEvents, const ur_event_handle_t *WaitList) - : events(WaitList, WaitList + numEvents) {} - void wait() const { urEventWait(events.size(), events.data()); } + : events(numEvents ? new std::vector( + WaitList, WaitList + numEvents) + : nullptr) {} + void wait() const { + if (events) + urEventWait(events->size(), events->data()); + } + std::unique_ptr> getUniquePtr() { + return std::unique_ptr>(events); + } }; -inline static std::unique_ptr -getWaitInfo(uint32_t numEventsInWaitList, - const ur_event_handle_t *phEventWaitList) { - return (numEventsInWaitList) ? 
std::make_unique( - numEventsInWaitList, phEventWaitList) - : nullptr; +inline static WaitInfo getWaitInfo(uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList) { + return native_cpu::WaitInfo(numEventsInWaitList, phEventWaitList); } } // namespace @@ -203,15 +210,13 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( for (size_t t = 0; t < numParallelThreads;) { IndexT first = {t, 0, 0}; IndexT last = {++t, numWG1, numWG2}; - futures.emplace_back( - tp.schedule_task([ndr, itemsPerThread, &kernel = *kernel, first, last, - InEvents = InEvents.get()](size_t) { - native_cpu::state resized_state = - getResizedState(ndr, itemsPerThread); - if (InEvents) - InEvents->wait(); - execute_range(resized_state, kernel, first, last); - })); + futures.emplace_back(tp.schedule_task([ndr, itemsPerThread, + &kernel = *kernel, first, last, + InEvents](size_t) { + native_cpu::state resized_state = getResizedState(ndr, itemsPerThread); + InEvents.wait(); + execute_range(resized_state, kernel, first, last); + })); } size_t start_wg0_remainder = new_num_work_groups_0 * itemsPerThread; @@ -220,11 +225,10 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( // over the work groups. futures.emplace_back( tp.schedule_task([ndr, &kernel = *kernel, start_wg0_remainder, numWG0, - numWG1, numWG2, InEvents = InEvents.get()](size_t) { + numWG1, numWG2, InEvents](size_t) { IndexT first = {start_wg0_remainder, 0, 0}; IndexT last = {numWG0, numWG1, numWG2}; - if (InEvents) - InEvents->wait(); + InEvents.wait(); native_cpu::state state = getState(ndr); execute_range(state, kernel, first, last); })); @@ -250,11 +254,10 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( first[dim] = wg_start; wg_start += groupsPerThread[dim]; last[dim] = wg_start; - futures.emplace_back(tp.schedule_task( - [ndr, numParallelThreads, &kernel = *kernel, first, last, - InEvents = InEvents.get()](size_t threadId) { - if (InEvents) - InEvents->wait(); + futures.emplace_back( + tp.schedule_task([ndr, numParallelThreads, &kernel = *kernel, first, + last, InEvents](size_t threadId) { + InEvents.wait(); native_cpu::state state = getState(ndr); execute_range(state, kernel, kernel.getArgs(numParallelThreads, threadId), first, @@ -267,9 +270,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( last[dim] = numWG[dim]; futures.emplace_back( tp.schedule_task([ndr, numParallelThreads, &kernel = *kernel, first, - last, InEvents = InEvents.get()](size_t threadId) { - if (InEvents) - InEvents->wait(); + last, InEvents](size_t threadId) { + InEvents.wait(); native_cpu::state state = getState(ndr); execute_range(state, kernel, kernel.getArgs(numParallelThreads, threadId), first, @@ -285,7 +287,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( *phEvent = event; } event->set_callback([kernel = std::move(kernel), hKernel, event, - InEvents = std::move(InEvents)]() { + InEvents = InEvents.getUniquePtr()]() { event->tick_end(); // TODO: avoid calling clear() here. 
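// A simplified sketch of the ownership scheme behind the reworked WaitInfo:
// every task captures a cheap handle holding a raw pointer to one shared heap
// allocation, and the completion callback adopts that allocation through a
// unique_ptr so it outlives all tasks. EventList and adopt() are hypothetical
// names used only for this illustration.
#include <memory>
#include <vector>

namespace sketch {
class EventList {
  std::vector<int> *const events; // nullptr when there is nothing to wait on
public:
  explicit EventList(std::vector<int> *e) : events(e) {}
  void wait() const {
    if (events) { /* block on the listed events here */ }
  }
  std::unique_ptr<std::vector<int>> adopt() const {
    // Intended to be called once, by the completion callback, which then owns
    // the allocation until every task holding a cheap EventList copy is done.
    return std::unique_ptr<std::vector<int>>(events);
  }
};
} // namespace sketch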
hKernel->_localArgInfo.clear(); @@ -318,15 +320,13 @@ withTimingEvent(ur_command_t command_type, ur_queue_handle_t hQueue, std::vector> futures; auto InEvents = native_cpu::getWaitInfo(numEventsInWaitList, phEventWaitList); - futures.emplace_back( - tp.schedule_task([f, InEvents = InEvents.get()](size_t) { - if (InEvents) - InEvents->wait(); - f(); - })); + futures.emplace_back(tp.schedule_task([f, InEvents](size_t) { + InEvents.wait(); + f(); + })); event->set_futures(futures); event->set_callback( - [event, InEvents = std::move(InEvents)]() { event->tick_end(); }); + [event, InEvents = InEvents.getUniquePtr()]() { event->tick_end(); }); return UR_RESULT_SUCCESS; } urEventWait(numEventsInWaitList, phEventWaitList); From ffe66d0614ef6c0ad7d0226eec26d52e8e8b937e Mon Sep 17 00:00:00 2001 From: Uwe Dolinsky Date: Mon, 26 May 2025 19:43:12 +0100 Subject: [PATCH 67/95] [NATIVECPU] added mutex to backend queue --- .../source/adapters/native_cpu/event.cpp | 4 ++-- .../source/adapters/native_cpu/event.hpp | 2 +- .../source/adapters/native_cpu/queue.hpp | 17 ++++++++++++++--- 3 files changed, 17 insertions(+), 6 deletions(-) diff --git a/unified-runtime/source/adapters/native_cpu/event.cpp b/unified-runtime/source/adapters/native_cpu/event.cpp index 91b8fb302eb18..a69c90cbe9655 100644 --- a/unified-runtime/source/adapters/native_cpu/event.cpp +++ b/unified-runtime/source/adapters/native_cpu/event.cpp @@ -121,7 +121,7 @@ ur_event_handle_t_::~ur_event_handle_t_() { } } -void ur_event_handle_t_::wait() { +void ur_event_handle_t_::wait(bool needQueueLock) { std::unique_lock lock(mutex); if (done) { return; @@ -129,7 +129,7 @@ void ur_event_handle_t_::wait() { for (auto &f : futures) { f.wait(); } - queue->removeEvent(this); + queue->removeEvent(this, needQueueLock); done = true; // The callback may need to acquire the lock, so we unlock it here lock.unlock(); diff --git a/unified-runtime/source/adapters/native_cpu/event.hpp b/unified-runtime/source/adapters/native_cpu/event.hpp index 479c671b38cd1..4ddeb1ae87b2a 100644 --- a/unified-runtime/source/adapters/native_cpu/event.hpp +++ b/unified-runtime/source/adapters/native_cpu/event.hpp @@ -25,7 +25,7 @@ struct ur_event_handle_t_ : RefCounted { callback = std::packaged_task(std::forward(cb)); } - void wait(); + void wait(bool needQueueLock = true); uint32_t getExecutionStatus() { // TODO: add support for UR_EVENT_STATUS_RUNNING diff --git a/unified-runtime/source/adapters/native_cpu/queue.hpp b/unified-runtime/source/adapters/native_cpu/queue.hpp index 1369b49a10984..cb655e9119e76 100644 --- a/unified-runtime/source/adapters/native_cpu/queue.hpp +++ b/unified-runtime/source/adapters/native_cpu/queue.hpp @@ -27,16 +27,26 @@ struct ur_queue_handle_t_ : RefCounted { ur_context_handle_t getContext() const { return context; } - void addEvent(ur_event_handle_t event) { events.insert(event); } + void addEvent(ur_event_handle_t event) { + std::lock_guard lock(mutex); + events.insert(event); + } - void removeEvent(ur_event_handle_t event) { events.erase(event); } + void removeEvent(ur_event_handle_t event, bool needQueueLock) { + if (needQueueLock) { + std::unique_lock lock(mutex); + events.erase(event); + } else + events.erase(event); + } void finish() { + std::lock_guard lock(mutex); while (!events.empty()) { auto ev = *events.begin(); // ur_event_handle_t_::wait removes itself from the events set in the // queue - ev->wait(); + ev->wait(false /*lock already taken*/); } events.clear(); } @@ -58,4 +68,5 @@ struct ur_queue_handle_t_ : RefCounted { std::set 
events; const bool inOrder; const bool profilingEnabled; + std::mutex mutex; }; From c95ebe720c8f2dfb4048a36336d3a0e8c041a88b Mon Sep 17 00:00:00 2001 From: Uwe Dolinsky Date: Mon, 2 Jun 2025 09:55:47 +0100 Subject: [PATCH 68/95] [NATIVECPU] renamed flag to lock mutex --- unified-runtime/source/adapters/native_cpu/event.cpp | 4 ++-- unified-runtime/source/adapters/native_cpu/event.hpp | 2 +- unified-runtime/source/adapters/native_cpu/queue.hpp | 11 ++++++----- 3 files changed, 9 insertions(+), 8 deletions(-) diff --git a/unified-runtime/source/adapters/native_cpu/event.cpp b/unified-runtime/source/adapters/native_cpu/event.cpp index a69c90cbe9655..326596215cb40 100644 --- a/unified-runtime/source/adapters/native_cpu/event.cpp +++ b/unified-runtime/source/adapters/native_cpu/event.cpp @@ -121,7 +121,7 @@ ur_event_handle_t_::~ur_event_handle_t_() { } } -void ur_event_handle_t_::wait(bool needQueueLock) { +void ur_event_handle_t_::wait(bool queue_already_locked) { std::unique_lock lock(mutex); if (done) { return; @@ -129,7 +129,7 @@ void ur_event_handle_t_::wait(bool needQueueLock) { for (auto &f : futures) { f.wait(); } - queue->removeEvent(this, needQueueLock); + queue->removeEvent(this, queue_already_locked); done = true; // The callback may need to acquire the lock, so we unlock it here lock.unlock(); diff --git a/unified-runtime/source/adapters/native_cpu/event.hpp b/unified-runtime/source/adapters/native_cpu/event.hpp index 4ddeb1ae87b2a..a5faf8f24385c 100644 --- a/unified-runtime/source/adapters/native_cpu/event.hpp +++ b/unified-runtime/source/adapters/native_cpu/event.hpp @@ -25,7 +25,7 @@ struct ur_event_handle_t_ : RefCounted { callback = std::packaged_task(std::forward(cb)); } - void wait(bool needQueueLock = true); + void wait(bool queue_already_locked = false); uint32_t getExecutionStatus() { // TODO: add support for UR_EVENT_STATUS_RUNNING diff --git a/unified-runtime/source/adapters/native_cpu/queue.hpp b/unified-runtime/source/adapters/native_cpu/queue.hpp index cb655e9119e76..da0f0ada17135 100644 --- a/unified-runtime/source/adapters/native_cpu/queue.hpp +++ b/unified-runtime/source/adapters/native_cpu/queue.hpp @@ -32,12 +32,13 @@ struct ur_queue_handle_t_ : RefCounted { events.insert(event); } - void removeEvent(ur_event_handle_t event, bool needQueueLock) { - if (needQueueLock) { - std::unique_lock lock(mutex); + void removeEvent(ur_event_handle_t event, bool queue_already_locked) { + if (queue_already_locked) { events.erase(event); - } else + } else { + std::unique_lock lock(mutex); events.erase(event); + } } void finish() { @@ -46,7 +47,7 @@ struct ur_queue_handle_t_ : RefCounted { auto ev = *events.begin(); // ur_event_handle_t_::wait removes itself from the events set in the // queue - ev->wait(false /*lock already taken*/); + ev->wait(true /*mutex is already locked*/); } events.clear(); } From 6fcea0f0d655849d4881f70e339a506eba178cee Mon Sep 17 00:00:00 2001 From: Uwe Dolinsky Date: Wed, 4 Jun 2025 17:16:39 +0100 Subject: [PATCH 69/95] [NATIVECPU] launch ranges with number of work items that is multiple of vector width --- .../source/adapters/native_cpu/enqueue.cpp | 37 +++++++++++++++---- 1 file changed, 29 insertions(+), 8 deletions(-) diff --git a/unified-runtime/source/adapters/native_cpu/enqueue.cpp b/unified-runtime/source/adapters/native_cpu/enqueue.cpp index 57df579b88086..bff27211596f8 100644 --- a/unified-runtime/source/adapters/native_cpu/enqueue.cpp +++ b/unified-runtime/source/adapters/native_cpu/enqueue.cpp @@ -197,8 +197,7 @@ UR_APIEXPORT ur_result_t 
UR_APICALL urEnqueueKernelLaunch( #else bool isLocalSizeOne = ndr.LocalSize[0] == 1 && ndr.LocalSize[1] == 1 && ndr.LocalSize[2] == 1; - if (isLocalSizeOne && ndr.GlobalSize[0] > numParallelThreads && - !kernel->hasLocalArgs()) { + if (isLocalSizeOne && !kernel->hasLocalArgs()) { // If the local size is one, we make the assumption that we are running a // parallel_for over a sycl::range. // Todo: we could add more compiler checks and @@ -212,10 +211,33 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( // divide the global range by the number of threads, set that as the local // size and peel everything else. + // The number of items per kernel invocation should ideally be at least a + // multiple of the applied vector width, which we currently assume to be 8. + // TODO: Encode this and other kernel capabilities in the binary so we can + // use actual values to efficiently enqueue kernels instead of relying on + // assumptions. + const size_t itemsPerKernelInvocation = 8; + size_t new_num_work_groups_0 = numParallelThreads; size_t itemsPerThread = ndr.GlobalSize[0] / numParallelThreads; + if (itemsPerThread < itemsPerKernelInvocation) { + if (itemsPerKernelInvocation <= numWG0) + itemsPerThread = itemsPerKernelInvocation; + else if (itemsPerThread == 0) + itemsPerThread = numWG0; + } else if (itemsPerThread > itemsPerKernelInvocation) { + // Launch kernel with number of items that is the next multiple of the + // vector width. + const size_t nextMult = (itemsPerThread + itemsPerKernelInvocation - 1) / + itemsPerKernelInvocation * + itemsPerKernelInvocation; + if (nextMult < numWG0) + itemsPerThread = nextMult; + } - for (size_t t = 0; t < numParallelThreads;) { + size_t wg0_index = 0; + for (size_t t = 0; (wg0_index + itemsPerThread) <= numWG0; + wg0_index += itemsPerThread) { IndexT first = {t, 0, 0}; IndexT last = {++t, numWG1, numWG2}; futures.emplace_back(tp.schedule_task([ndr, itemsPerThread, @@ -227,14 +249,13 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( })); } - size_t start_wg0_remainder = new_num_work_groups_0 * itemsPerThread; - if (start_wg0_remainder < numWG0) { + if (wg0_index < numWG0) { // Peel the remaining work items. Since the local size is 1, we iterate // over the work groups. 
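// The per-thread chunk size chosen above is rounded up to the next multiple of
// the assumed vector width using the usual integer idiom (x + m - 1) / m * m.
// A minimal stand-alone illustration (assumes m > 0; helper name is
// hypothetical, not adapter code):
#include <cassert>
#include <cstddef>

inline std::size_t roundUpToMultiple(std::size_t x, std::size_t m) {
  assert(m > 0);
  return (x + m - 1) / m * m; // e.g. roundUpToMultiple(13, 8) == 16
}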
futures.emplace_back( - tp.schedule_task([ndr, &kernel = *kernel, start_wg0_remainder, numWG0, - numWG1, numWG2, InEvents](size_t) { - IndexT first = {start_wg0_remainder, 0, 0}; + tp.schedule_task([ndr, &kernel = *kernel, wg0_index, numWG0, numWG1, + numWG2, InEvents](size_t) { + IndexT first = {wg0_index, 0, 0}; IndexT last = {numWG0, numWG1, numWG2}; InEvents.wait(); native_cpu::state state = getState(ndr); From d86f429f433260d66d8ec98ce724d383fa09a469 Mon Sep 17 00:00:00 2001 From: Uwe Dolinsky Date: Thu, 5 Jun 2025 12:09:18 +0100 Subject: [PATCH 70/95] [NATIVECPU] used lock_guard --- unified-runtime/source/adapters/native_cpu/queue.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unified-runtime/source/adapters/native_cpu/queue.hpp b/unified-runtime/source/adapters/native_cpu/queue.hpp index da0f0ada17135..6da396244e58b 100644 --- a/unified-runtime/source/adapters/native_cpu/queue.hpp +++ b/unified-runtime/source/adapters/native_cpu/queue.hpp @@ -36,7 +36,7 @@ struct ur_queue_handle_t_ : RefCounted { if (queue_already_locked) { events.erase(event); } else { - std::unique_lock lock(mutex); + std::lock_guard lock(mutex); events.erase(event); } } From ddb908f5e1a6e2322f6061efb7e22b08a347847a Mon Sep 17 00:00:00 2001 From: Uwe Dolinsky Date: Thu, 5 Jun 2025 13:20:46 +0100 Subject: [PATCH 71/95] [NATIVECPU] removed unused local --- unified-runtime/source/adapters/native_cpu/enqueue.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/unified-runtime/source/adapters/native_cpu/enqueue.cpp b/unified-runtime/source/adapters/native_cpu/enqueue.cpp index bff27211596f8..3b5d1b8433e34 100644 --- a/unified-runtime/source/adapters/native_cpu/enqueue.cpp +++ b/unified-runtime/source/adapters/native_cpu/enqueue.cpp @@ -218,7 +218,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( // assumptions. 
const size_t itemsPerKernelInvocation = 8; - size_t new_num_work_groups_0 = numParallelThreads; size_t itemsPerThread = ndr.GlobalSize[0] / numParallelThreads; if (itemsPerThread < itemsPerKernelInvocation) { if (itemsPerKernelInvocation <= numWG0) From 1d629032915211e8e16a08ba948eba94e27a0deb Mon Sep 17 00:00:00 2001 From: Uwe Dolinsky Date: Mon, 9 Jun 2025 18:44:44 +0100 Subject: [PATCH 72/95] [NATIVECPU] removed reference captures in enqueue lambdas --- .../source/adapters/native_cpu/enqueue.cpp | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/unified-runtime/source/adapters/native_cpu/enqueue.cpp b/unified-runtime/source/adapters/native_cpu/enqueue.cpp index 3b5d1b8433e34..8fff9391fcb0e 100644 --- a/unified-runtime/source/adapters/native_cpu/enqueue.cpp +++ b/unified-runtime/source/adapters/native_cpu/enqueue.cpp @@ -369,7 +369,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueEventsWait( // TODO: the wait here should be async return withTimingEvent(UR_COMMAND_EVENTS_WAIT, hQueue, numEventsInWaitList, phEventWaitList, phEvent, - [&]() { return UR_RESULT_SUCCESS; }); + []() { return UR_RESULT_SUCCESS; }); } UR_APIEXPORT ur_result_t UR_APICALL urEnqueueEventsWaitWithBarrier( @@ -377,7 +377,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueEventsWaitWithBarrier( const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { return withTimingEvent(UR_COMMAND_EVENTS_WAIT_WITH_BARRIER, hQueue, numEventsInWaitList, phEventWaitList, phEvent, - [&]() { return UR_RESULT_SUCCESS; }); + []() { return UR_RESULT_SUCCESS; }); } UR_APIEXPORT ur_result_t urEnqueueEventsWaitWithBarrierExt( @@ -556,12 +556,10 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferFill( size_t patternSize, size_t offset, size_t size, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { - + UR_ASSERT(hQueue, UR_RESULT_ERROR_INVALID_NULL_HANDLE); return withTimingEvent( UR_COMMAND_MEM_BUFFER_FILL, hQueue, numEventsInWaitList, phEventWaitList, - phEvent, [&]() { - UR_ASSERT(hQueue, UR_RESULT_ERROR_INVALID_NULL_HANDLE); - + phEvent, [hBuffer, offset, size, patternSize, pPattern]() { // TODO: error checking // TODO: handle async void *startingPtr = hBuffer->_mem + offset; @@ -615,7 +613,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferMap( ur_event_handle_t *phEvent, void **ppRetMap) { return withTimingEvent(UR_COMMAND_MEM_BUFFER_MAP, hQueue, numEventsInWaitList, - phEventWaitList, phEvent, [&]() { + phEventWaitList, phEvent, + [ppRetMap, hBuffer, offset]() { *ppRetMap = hBuffer->_mem + offset; return UR_RESULT_SUCCESS; }); @@ -627,7 +626,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemUnmap( ur_event_handle_t *phEvent) { return withTimingEvent(UR_COMMAND_MEM_UNMAP, hQueue, numEventsInWaitList, phEventWaitList, phEvent, - [&]() { return UR_RESULT_SUCCESS; }); + []() { return UR_RESULT_SUCCESS; }); } UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMFill( @@ -636,7 +635,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMFill( const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { return withTimingEvent( UR_COMMAND_USM_FILL, hQueue, numEventsInWaitList, phEventWaitList, - phEvent, [&]() { + phEvent, [ptr, pPattern, patternSize, size]() { UR_ASSERT(ptr, UR_RESULT_ERROR_INVALID_NULL_POINTER); UR_ASSERT(pPattern, UR_RESULT_ERROR_INVALID_NULL_POINTER); UR_ASSERT(patternSize != 0, UR_RESULT_ERROR_INVALID_SIZE) From 11ebe05a40bee5a34baf538e612b591f302e314f Mon Sep 17 00:00:00 2001 From: Uwe Dolinsky Date: Wed, 
11 Jun 2025 15:53:47 +0100 Subject: [PATCH 73/95] [NATIVECPU] bump oneTBB version --- .../source/adapters/native_cpu/CMakeLists.txt | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/unified-runtime/source/adapters/native_cpu/CMakeLists.txt b/unified-runtime/source/adapters/native_cpu/CMakeLists.txt index 1cff4ef22e0d2..05e683eb039ea 100644 --- a/unified-runtime/source/adapters/native_cpu/CMakeLists.txt +++ b/unified-runtime/source/adapters/native_cpu/CMakeLists.txt @@ -60,11 +60,11 @@ if(NATIVECPU_WITH_ONETBB) FetchContent_Declare( tbb GIT_REPOSITORY https://github.com/uxlfoundation/oneTBB.git -#commit 7dfe7e744ac583bd1c202f6e0eff4f51269bc524 (HEAD -> master, origin/master, origin/HEAD) -#Author: Olga Malysheva -#Date: Wed Apr 30 21:29:20 2025 +0200 -# Update compiler and linker options (#1719) - GIT_TAG 7dfe7e744ac583bd1c202f6e0eff4f51269bc524 +#commit 54f3611e12b77ae40bf919eca65e7ff2218fdc34 (HEAD -> master, origin/master, origin/HEAD) +#Author: Alexandra +#Date: Fri Jun 6 15:02:22 2025 +0200 +# Update README.md (#1750) + GIT_TAG 54f3611e12b77ae40bf919eca65e7ff2218fdc34 CMAKE_ARGS "-DTBB_TEST:BOOL=OFF -DTBB_EXAMPLES:BOOL=OFF -DTBB_BENCH:BOOL=OFF" OVERRIDE_FIND_PACKAGE ) From 37ccfcae14abbd43bd4d1fdbde0aba9512f1b7d4 Mon Sep 17 00:00:00 2001 From: Uwe Dolinsky Date: Thu, 12 Jun 2025 12:52:26 +0100 Subject: [PATCH 74/95] [NATIVECPU] added option to turn off waiting in threads for oneTBB --- .../source/adapters/native_cpu/enqueue.cpp | 13 ++++++++++--- .../source/adapters/native_cpu/threadpool.hpp | 3 ++- 2 files changed, 12 insertions(+), 4 deletions(-) diff --git a/unified-runtime/source/adapters/native_cpu/enqueue.cpp b/unified-runtime/source/adapters/native_cpu/enqueue.cpp index a1ad3e9e82e98..55f74419eb8d7 100644 --- a/unified-runtime/source/adapters/native_cpu/enqueue.cpp +++ b/unified-runtime/source/adapters/native_cpu/enqueue.cpp @@ -70,8 +70,14 @@ class WaitInfo { } }; +template inline static WaitInfo getWaitInfo(uint32_t numEventsInWaitList, - const ur_event_handle_t *phEventWaitList) { + const ur_event_handle_t *phEventWaitList, + const T &scheduler) { + if (numEventsInWaitList && !scheduler.CanWaitInThread()) { + urEventWait(numEventsInWaitList, phEventWaitList); + numEventsInWaitList = 0; + } return native_cpu::WaitInfo(numEventsInWaitList, phEventWaitList); } @@ -175,7 +181,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( auto kernel = std::make_unique(*hKernel); kernel->updateMemPool(numParallelThreads); - auto InEvents = native_cpu::getWaitInfo(numEventsInWaitList, phEventWaitList); + auto InEvents = + native_cpu::getWaitInfo(numEventsInWaitList, phEventWaitList, Tasks); #ifndef NATIVECPU_USE_OCK native_cpu::state state = getState(ndr); @@ -343,7 +350,7 @@ withTimingEvent(ur_command_t command_type, ur_queue_handle_t hQueue, auto &tp = hQueue->getDevice()->tp; auto Tasks = native_cpu::getScheduler(tp); auto InEvents = - native_cpu::getWaitInfo(numEventsInWaitList, phEventWaitList); + native_cpu::getWaitInfo(numEventsInWaitList, phEventWaitList, Tasks); Tasks.schedule([f, InEvents](size_t) { InEvents.wait(); f(); diff --git a/unified-runtime/source/adapters/native_cpu/threadpool.hpp b/unified-runtime/source/adapters/native_cpu/threadpool.hpp index db59deed28de4..78f2ba19ed293 100644 --- a/unified-runtime/source/adapters/native_cpu/threadpool.hpp +++ b/unified-runtime/source/adapters/native_cpu/threadpool.hpp @@ -220,7 +220,6 @@ class TasksInfo_TP { f.wait(); } TasksInfo_TP(simple_threadpool_t &) {} - static constexpr bool 
CanWaitInThread() { return true; } }; template struct Scheduler_base { @@ -228,6 +227,7 @@ template struct Scheduler_base { TaskInfo ti; Scheduler_base(TP &ref_) : ref(ref_), ti(ref_) {} TaskInfo getTaskInfo() { return std::move(ti); } + static constexpr bool CanWaitInThread() { return true; } }; template struct Scheduler : Scheduler_base { @@ -281,6 +281,7 @@ struct Scheduler task(thread_id); }); } + static constexpr bool CanWaitInThread() { return false; } }; using tasksinfo_t = TBB_TasksInfo; From dadfcd314989a4f876d055f3ecba6ecbf1749a28 Mon Sep 17 00:00:00 2001 From: Uwe Dolinsky Date: Fri, 13 Jun 2025 09:27:59 +0100 Subject: [PATCH 75/95] [NATIVECPU] added tbb::parallel_for for ranges --- .../source/adapters/native_cpu/enqueue.cpp | 42 ++++++++++++++++++- 1 file changed, 40 insertions(+), 2 deletions(-) diff --git a/unified-runtime/source/adapters/native_cpu/enqueue.cpp b/unified-runtime/source/adapters/native_cpu/enqueue.cpp index 55f74419eb8d7..8f316a00c819f 100644 --- a/unified-runtime/source/adapters/native_cpu/enqueue.cpp +++ b/unified-runtime/source/adapters/native_cpu/enqueue.cpp @@ -120,6 +120,23 @@ static inline void execute_range(native_cpu::state &state, execute_range(state, hKernel, hKernel.getArgs(), first, lastPlusOne); } +#ifdef NATIVECPU_WITH_ONETBB +class nativecpu_tbb_executor { + const native_cpu::NDRDescT ndr; + const ur_kernel_handle_t_ &hKernel; + const size_t itemsPerThread; +public: + void operator()(const tbb::blocked_range3d &r) const { + auto state = getResizedState(ndr, itemsPerThread); + const IndexT first = {r.pages().begin(), r.rows().begin() , r.cols().begin()}; + const IndexT last_plus_one = {r.pages().end() , r.rows().end() , r.cols().end() }; + execute_range(state, hKernel, first, last_plus_one); + } + nativecpu_tbb_executor(const native_cpu::NDRDescT &n, const ur_kernel_handle_t_ &k, size_t itemsPerThreadP) : ndr(n), hKernel(k), itemsPerThread(itemsPerThreadP) {} +}; +#define NATIVECPU_WITH_ONETBB_PARALLELFOR +#endif + UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( ur_queue_handle_t hQueue, ur_kernel_handle_t hKernel, uint32_t workDim, const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize, @@ -240,7 +257,15 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( if (nextMult < numWG0) itemsPerThread = nextMult; } - +#ifdef NATIVECPU_WITH_ONETBB_PARALLELFOR + const size_t wg0_num = ndr.GlobalSize[0] / itemsPerThread; + if (wg0_num) { + tbb::blocked_range3d range(0, wg0_num, 0, numWG1, 0, numWG2); + nativecpu_tbb_executor tbb_ex(ndr, *kernel, itemsPerThread); + tbb::parallel_for(range, tbb_ex); + } + size_t wg0_index = wg0_num * itemsPerThread; +#else size_t wg0_index = 0; for (size_t t = 0; (wg0_index + itemsPerThread) <= numWG0; wg0_index += itemsPerThread) { @@ -253,10 +278,15 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( execute_range(resized_state, kernel, first, last); }); } - +#endif if (wg0_index < numWG0) { // Peel the remaining work items. Since the local size is 1, we iterate // over the work groups. 
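// How the oneTBB path divides work: tbb::parallel_for splits a
// blocked_range3d over the work-group space into sub-ranges and invokes the
// body once per sub-range on worker threads. A minimal stand-alone sketch
// (requires oneTBB; countItems is a hypothetical example, not adapter code):
#include <atomic>
#include <cstddef>
#include <oneapi/tbb.h>

inline std::size_t countItems(std::size_t d0, std::size_t d1, std::size_t d2) {
  std::atomic<std::size_t> total{0};
  oneapi::tbb::parallel_for(
      oneapi::tbb::blocked_range3d<std::size_t>(0, d0, 0, d1, 0, d2),
      [&total](const oneapi::tbb::blocked_range3d<std::size_t> &r) {
        // Each sub-range carries half-open [begin, end) bounds per dimension.
        total += r.pages().size() * r.rows().size() * r.cols().size();
      });
  return total.load();
}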
+#ifdef NATIVECPU_WITH_ONETBB_PARALLELFOR + tbb::blocked_range3d range(wg0_index, numWG0, 0, numWG1, 0, numWG2); + nativecpu_tbb_executor tbb_ex(ndr, *kernel, 1); + tbb::parallel_for(range, tbb_ex); +#else Tasks.schedule([ndr, &kernel = *kernel, wg0_index, numWG0, numWG1, numWG2, InEvents](size_t) { IndexT first = {wg0_index, 0, 0}; @@ -265,6 +295,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( native_cpu::state state = getState(ndr); execute_range(state, kernel, first, last); }); +#endif } } else { // We are running a parallel_for over an nd_range @@ -283,6 +314,12 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( IndexT first = {0, 0, 0}, last = numWG; size_t wg_start = 0; if (groupsPerThread[dim]) { +#ifdef TTTTTT + NATIVECPU_WITH_ONETBB_PARALLELFOR + tbb::blocked_range3d range(wg0_index, numWG0, 0, numWG1, 0, numWG2); + nativecpu_tbb_executor tbb_ex(ndr, *kernel, 1); + tbb::parallel_for(range, tbb_ex); +#else for (size_t t = 0; t < numParallelThreads; t++) { first[dim] = wg_start; wg_start += groupsPerThread[dim]; @@ -296,6 +333,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( last); }); } +#endif } if (wg_start < numWG[dim]) { first[dim] = wg_start; From 3e978db557cd0dd0636974b5cc6f82ab98c302b0 Mon Sep 17 00:00:00 2001 From: Uwe Dolinsky Date: Fri, 13 Jun 2025 18:47:42 +0100 Subject: [PATCH 76/95] [NATIVECPU] all kernel launches now use tbb::parallel_for --- .../source/adapters/native_cpu/enqueue.cpp | 76 +++++++++++++++---- 1 file changed, 60 insertions(+), 16 deletions(-) diff --git a/unified-runtime/source/adapters/native_cpu/enqueue.cpp b/unified-runtime/source/adapters/native_cpu/enqueue.cpp index 8f316a00c819f..22d63fb51cfd0 100644 --- a/unified-runtime/source/adapters/native_cpu/enqueue.cpp +++ b/unified-runtime/source/adapters/native_cpu/enqueue.cpp @@ -85,16 +85,19 @@ inline static WaitInfo getWaitInfo(uint32_t numEventsInWaitList, } // namespace native_cpu static inline native_cpu::state getResizedState(const native_cpu::NDRDescT &ndr, - size_t itemsPerThread) { + size_t itemsPerThread, + size_t dim) { + auto local_size = ndr.LocalSize; + local_size[dim] = itemsPerThread; native_cpu::state resized_state( - ndr.GlobalSize[0], ndr.GlobalSize[1], ndr.GlobalSize[2], itemsPerThread, - ndr.LocalSize[1], ndr.LocalSize[2], ndr.GlobalOffset[0], - ndr.GlobalOffset[1], ndr.GlobalOffset[2]); + ndr.GlobalSize[0], ndr.GlobalSize[1], ndr.GlobalSize[2], local_size[0], + local_size[1], local_size[2], ndr.GlobalOffset[0], ndr.GlobalOffset[1], + ndr.GlobalOffset[2]); return resized_state; } static inline native_cpu::state getState(const native_cpu::NDRDescT &ndr) { - return getResizedState(ndr, ndr.LocalSize[0]); + return getResizedState(ndr, ndr.LocalSize[0], 0); } using IndexT = std::array; @@ -123,17 +126,50 @@ static inline void execute_range(native_cpu::state &state, #ifdef NATIVECPU_WITH_ONETBB class nativecpu_tbb_executor { const native_cpu::NDRDescT ndr; - const ur_kernel_handle_t_ &hKernel; const size_t itemsPerThread; + +protected: + const ur_kernel_handle_t_ &hKernel; + + void execute(const tbb::blocked_range3d &r, + const std::vector &args, size_t dim) const { + auto state = getResizedState(ndr, itemsPerThread, dim); + const IndexT first = {r.pages().begin(), r.rows().begin(), + r.cols().begin()}; + const IndexT last_plus_one = {r.pages().end(), r.rows().end(), + r.cols().end()}; + execute_range(state, hKernel, args, first, last_plus_one); + } + public: void operator()(const tbb::blocked_range3d &r) const { - auto state = 
getResizedState(ndr, itemsPerThread); - const IndexT first = {r.pages().begin(), r.rows().begin() , r.cols().begin()}; - const IndexT last_plus_one = {r.pages().end() , r.rows().end() , r.cols().end() }; - execute_range(state, hKernel, first, last_plus_one); + execute(r, hKernel.getArgs(), 0); } - nativecpu_tbb_executor(const native_cpu::NDRDescT &n, const ur_kernel_handle_t_ &k, size_t itemsPerThreadP) : ndr(n), hKernel(k), itemsPerThread(itemsPerThreadP) {} + nativecpu_tbb_executor(const native_cpu::NDRDescT &n, + const ur_kernel_handle_t_ &k, size_t itemsPerThreadP) + : ndr(n), hKernel(k), itemsPerThread(itemsPerThreadP) {} }; + +class nativecpu_tbb_nd_executor : nativecpu_tbb_executor { + const size_t numParallelThreads, dimension; + +public: + nativecpu_tbb_nd_executor(const native_cpu::NDRDescT &n, + const ur_kernel_handle_t_ &k, + size_t numParallelThreads, size_t dim) + : nativecpu_tbb_executor(n, k, n.LocalSize[0]), + numParallelThreads(numParallelThreads), dimension(dim) {} + + void operator()(const tbb::blocked_range3d &r) const { + auto thread_id = tbb::this_task_arena::current_thread_index(); + assert(thread_id >= 0 && + thread_id < oneapi::tbb::info::default_concurrency()); + + auto args = this->hKernel.getArgs(numParallelThreads, thread_id); + execute(r, args, dimension); + } +}; + #define NATIVECPU_WITH_ONETBB_PARALLELFOR #endif @@ -283,7 +319,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( // Peel the remaining work items. Since the local size is 1, we iterate // over the work groups. #ifdef NATIVECPU_WITH_ONETBB_PARALLELFOR - tbb::blocked_range3d range(wg0_index, numWG0, 0, numWG1, 0, numWG2); + tbb::blocked_range3d range(wg0_index, numWG0, 0, numWG1, 0, + numWG2); nativecpu_tbb_executor tbb_ex(ndr, *kernel, 1); tbb::parallel_for(range, tbb_ex); #else @@ -314,10 +351,10 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( IndexT first = {0, 0, 0}, last = numWG; size_t wg_start = 0; if (groupsPerThread[dim]) { -#ifdef TTTTTT - NATIVECPU_WITH_ONETBB_PARALLELFOR - tbb::blocked_range3d range(wg0_index, numWG0, 0, numWG1, 0, numWG2); - nativecpu_tbb_executor tbb_ex(ndr, *kernel, 1); +#ifdef NATIVECPU_WITH_ONETBB_PARALLELFOR + tbb::blocked_range3d range(first[0], last[0], first[1], last[1], + first[2], last[2]); + nativecpu_tbb_nd_executor tbb_ex(ndr, *kernel, numParallelThreads, dim); tbb::parallel_for(range, tbb_ex); #else for (size_t t = 0; t < numParallelThreads; t++) { @@ -338,6 +375,12 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( if (wg_start < numWG[dim]) { first[dim] = wg_start; last[dim] = numWG[dim]; +#ifdef NATIVECPU_WITH_ONETBB_PARALLELFOR + tbb::blocked_range3d range(first[0], last[0], first[1], last[1], + first[2], last[2]); + nativecpu_tbb_nd_executor tbb_ex(ndr, *kernel, numParallelThreads, dim); + tbb::parallel_for(range, tbb_ex); +#else Tasks.schedule([ndr, numParallelThreads, &kernel = *kernel, first, last, InEvents](size_t threadId) { InEvents.wait(); @@ -346,6 +389,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( kernel.getArgs(numParallelThreads, threadId), first, last); }); +#endif } } From 0e28a6e14809344634f848309de4a0d4e2be0c10 Mon Sep 17 00:00:00 2001 From: Uwe Dolinsky Date: Fri, 13 Jun 2025 19:07:29 +0100 Subject: [PATCH 77/95] [NATIVECPU] added function to get thread id from tbb --- .../source/adapters/native_cpu/enqueue.cpp | 5 +---- .../source/adapters/native_cpu/threadpool.hpp | 11 ++++++++--- 2 files changed, 9 insertions(+), 7 deletions(-) diff --git 
a/unified-runtime/source/adapters/native_cpu/enqueue.cpp b/unified-runtime/source/adapters/native_cpu/enqueue.cpp index 22d63fb51cfd0..d33a903bb5a1c 100644 --- a/unified-runtime/source/adapters/native_cpu/enqueue.cpp +++ b/unified-runtime/source/adapters/native_cpu/enqueue.cpp @@ -161,10 +161,7 @@ class nativecpu_tbb_nd_executor : nativecpu_tbb_executor { numParallelThreads(numParallelThreads), dimension(dim) {} void operator()(const tbb::blocked_range3d &r) const { - auto thread_id = tbb::this_task_arena::current_thread_index(); - assert(thread_id >= 0 && - thread_id < oneapi::tbb::info::default_concurrency()); - + auto thread_id = native_cpu::getTBBThreadID(); auto args = this->hKernel.getArgs(numParallelThreads, thread_id); execute(r, args, dimension); } diff --git a/unified-runtime/source/adapters/native_cpu/threadpool.hpp b/unified-runtime/source/adapters/native_cpu/threadpool.hpp index 78f2ba19ed293..f60c7b6d7cbc4 100644 --- a/unified-runtime/source/adapters/native_cpu/threadpool.hpp +++ b/unified-runtime/source/adapters/native_cpu/threadpool.hpp @@ -269,15 +269,20 @@ class TBB_TasksInfo { static constexpr bool CanWaitInThread() { return false; } }; +inline auto getTBBThreadID() { + auto thread_id = tbb::this_task_arena::current_thread_index(); + assert(thread_id >= 0 && + thread_id < oneapi::tbb::info::default_concurrency()); + return thread_id; +} + template <> struct Scheduler : Scheduler_base { using Scheduler_base::Scheduler_base; template inline void schedule(T &&task_) { ref.Tasks().run([task = std::move(task_)]() { - auto thread_id = tbb::this_task_arena::current_thread_index(); - assert(thread_id >= 0 && - thread_id < oneapi::tbb::info::default_concurrency()); + auto thread_id = getTBBThreadID(); task(thread_id); }); } From 5e0b99db40a6b28dea8ac68449a06735a8b1b605 Mon Sep 17 00:00:00 2001 From: Uwe Dolinsky Date: Thu, 24 Jul 2025 15:06:34 +0100 Subject: [PATCH 78/95] [NATIVECPU] removed unneeded capture --- unified-runtime/source/adapters/native_cpu/enqueue.cpp | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/unified-runtime/source/adapters/native_cpu/enqueue.cpp b/unified-runtime/source/adapters/native_cpu/enqueue.cpp index f524b6c3f85ce..156993f89f3a0 100644 --- a/unified-runtime/source/adapters/native_cpu/enqueue.cpp +++ b/unified-runtime/source/adapters/native_cpu/enqueue.cpp @@ -180,9 +180,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( futures.emplace_back( tp.schedule_task([ndr, InEvents, &kernel = *kernel, rangeStart, rangeEnd = rangeEnd[3], numWG0, numWG1, -#ifndef NATIVECPU_USE_OCK - localSize = ndr.LocalSize, -#endif numParallelThreads](size_t threadId) mutable { auto state = getState(ndr); InEvents.wait(); @@ -194,9 +191,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( kernel._subhandler( kernel.getArgs(numParallelThreads, threadId).data(), &state); #else - for (size_t local2 = 0; local2 < localSize[2]; ++local2) { - for (size_t local1 = 0; local1 < localSize[1]; ++local1) { - for (size_t local0 = 0; local0 < localSize[0]; ++local0) { + for (size_t local2 = 0; local2 < ndr.LocalSize[2]; ++local2) { + for (size_t local1 = 0; local1 < ndr.LocalSize[1]; ++local1) { + for (size_t local0 = 0; local0 < ndr.LocalSize[0]; ++local0) { state.update(g0, g1, g2, local0, local1, local2); kernel._subhandler( kernel.getArgs(numParallelThreads, threadId).data(), From f05bba10e28ef28ef443e85cb598e62d9f929b2c Mon Sep 17 00:00:00 2001 From: Uwe Dolinsky Date: Thu, 24 Jul 2025 15:15:39 +0100 Subject: [PATCH 79/95] [NATIVECPU] 
removed mutable from task lambda --- unified-runtime/source/adapters/native_cpu/enqueue.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unified-runtime/source/adapters/native_cpu/enqueue.cpp b/unified-runtime/source/adapters/native_cpu/enqueue.cpp index 156993f89f3a0..f98ab905c47c4 100644 --- a/unified-runtime/source/adapters/native_cpu/enqueue.cpp +++ b/unified-runtime/source/adapters/native_cpu/enqueue.cpp @@ -180,7 +180,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( futures.emplace_back( tp.schedule_task([ndr, InEvents, &kernel = *kernel, rangeStart, rangeEnd = rangeEnd[3], numWG0, numWG1, - numParallelThreads](size_t threadId) mutable { + numParallelThreads](size_t threadId) { auto state = getState(ndr); InEvents.wait(); for (size_t g0 = rangeStart[0], g1 = rangeStart[1], From 58ffb890266e71a4636a76c23552af12a93287c1 Mon Sep 17 00:00:00 2001 From: Uwe Dolinsky Date: Thu, 24 Jul 2025 16:31:59 +0100 Subject: [PATCH 80/95] [NATIVECPU] clang-format --- unified-runtime/source/adapters/native_cpu/enqueue.cpp | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/unified-runtime/source/adapters/native_cpu/enqueue.cpp b/unified-runtime/source/adapters/native_cpu/enqueue.cpp index f98ab905c47c4..86da10bbffef7 100644 --- a/unified-runtime/source/adapters/native_cpu/enqueue.cpp +++ b/unified-runtime/source/adapters/native_cpu/enqueue.cpp @@ -177,10 +177,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( rangeEnd[0] = rangeEnd[3] % numWG0; rangeEnd[1] = (rangeEnd[3] / numWG0) % numWG1; rangeEnd[2] = rangeEnd[3] / (numWG0 * numWG1); - futures.emplace_back( - tp.schedule_task([ndr, InEvents, &kernel = *kernel, rangeStart, - rangeEnd = rangeEnd[3], numWG0, numWG1, - numParallelThreads](size_t threadId) { + futures.emplace_back(tp.schedule_task( + [ndr, InEvents, &kernel = *kernel, rangeStart, rangeEnd = rangeEnd[3], + numWG0, numWG1, numParallelThreads](size_t threadId) { auto state = getState(ndr); InEvents.wait(); for (size_t g0 = rangeStart[0], g1 = rangeStart[1], From f6b68dc38c28838199c2209eea171ac54d0a1ea0 Mon Sep 17 00:00:00 2001 From: Uwe Dolinsky Date: Thu, 24 Jul 2025 16:45:39 +0100 Subject: [PATCH 81/95] [NATIVECPU] clang-format --- .../source/adapters/native_cpu/enqueue.cpp | 53 +++++++++---------- 1 file changed, 26 insertions(+), 27 deletions(-) diff --git a/unified-runtime/source/adapters/native_cpu/enqueue.cpp b/unified-runtime/source/adapters/native_cpu/enqueue.cpp index f28d3dc324de3..472f933104c9d 100644 --- a/unified-runtime/source/adapters/native_cpu/enqueue.cpp +++ b/unified-runtime/source/adapters/native_cpu/enqueue.cpp @@ -185,38 +185,37 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( rangeEnd[1] = (rangeEnd[3] / numWG0) % numWG1; rangeEnd[2] = rangeEnd[3] / (numWG0 * numWG1); Tasks.schedule([ndr, InEvents, &kernel = *kernel, rangeStart, - rangeEnd = rangeEnd[3], numWG0, numWG1, - numParallelThreads](size_t threadId) { - auto state = getState(ndr); - InEvents.wait(); - for (size_t g0 = rangeStart[0], g1 = rangeStart[1], - g2 = rangeStart[2], g3 = rangeStart[3]; - g3 < rangeEnd; ++g3) { + rangeEnd = rangeEnd[3], numWG0, numWG1, + numParallelThreads](size_t threadId) { + auto state = getState(ndr); + InEvents.wait(); + for (size_t g0 = rangeStart[0], g1 = rangeStart[1], g2 = rangeStart[2], + g3 = rangeStart[3]; + g3 < rangeEnd; ++g3) { #ifdef NATIVECPU_USE_OCK - state.update(g0, g1, g2); - kernel._subhandler( - kernel.getArgs(numParallelThreads, threadId).data(), &state); + state.update(g0, g1, 
g2); + kernel._subhandler(kernel.getArgs(numParallelThreads, threadId).data(), + &state); #else - for (size_t local2 = 0; local2 < ndr.LocalSize[2]; ++local2) { - for (size_t local1 = 0; local1 < ndr.LocalSize[1]; ++local1) { - for (size_t local0 = 0; local0 < ndr.LocalSize[0]; ++local0) { - state.update(g0, g1, g2, local0, local1, local2); - kernel._subhandler( - kernel.getArgs(numParallelThreads, threadId).data(), - &state); - } - } + for (size_t local2 = 0; local2 < ndr.LocalSize[2]; ++local2) { + for (size_t local1 = 0; local1 < ndr.LocalSize[1]; ++local1) { + for (size_t local0 = 0; local0 < ndr.LocalSize[0]; ++local0) { + state.update(g0, g1, g2, local0, local1, local2); + kernel._subhandler( + kernel.getArgs(numParallelThreads, threadId).data(), &state); } + } + } #endif - if (++g0 == numWG0) { - g0 = 0; - if (++g1 == numWG1) { - g1 = 0; - ++g2; - } - } + if (++g0 == numWG0) { + g0 = 0; + if (++g1 == numWG1) { + g1 = 0; + ++g2; } - }); + } + } + }); rangeStart = rangeEnd; } event->set_futures(Tasks.getTaskInfo()); From faa03d2defa66a389c945c7e427166025c621e96 Mon Sep 17 00:00:00 2001 From: Uwe Dolinsky Date: Tue, 5 Nov 2024 10:54:19 +0000 Subject: [PATCH 82/95] [SYCL][NATIVECPU] update docs for oneTBB integration --- sycl/doc/design/SYCLNativeCPU.md | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/sycl/doc/design/SYCLNativeCPU.md b/sycl/doc/design/SYCLNativeCPU.md index b7fbb47d1064c..8a7e17d2ea8e4 100644 --- a/sycl/doc/design/SYCLNativeCPU.md +++ b/sycl/doc/design/SYCLNativeCPU.md @@ -62,6 +62,20 @@ in order to use a local checkout of the oneAPI Construction Kit. The CMake varia The SYCL Native CPU device needs to be selected at runtime by setting the environment variable `ONEAPI_DEVICE_SELECTOR=native_cpu:cpu`. +# oneTBB integration + +SYCL Native CPU can use oneTBB as an optional backend for task scheduling. oneTBB with SYCL Native CPU is enabled by setting `NATIVECPU_WITH_ONETBB=On` at configure time: + +``` +python3 buildbot/configure.py \ + --native_cpu \ + --cmake-opt=-DNATIVECPU_WITH_ONETBB=On +``` + +This will pull oneTBB into SYCL Native CPU via CMake `FetchContent` and DPC++ can be built as usual. + +By default SYCL Native CPU implements its own scheduler whose only dependency is standard C++. + # Supported features and current limitations The SYCL Native CPU flow is still WIP, not optimized and several core SYCL features are currently unsupported. Currently `barriers` are supported only when the oneAPI Construction Kit integration is enabled, several math builtins are not supported and attempting to use those will most likely fail with an `undefined reference` error at link time. Examples of supported applications can be found in the [runtime tests](https://github.com/intel/llvm/blob/sycl/sycl/test/native_cpu). From cfcc3254a37a731e284f6926d080dbaa5543df20 Mon Sep 17 00:00:00 2001 From: Uwe Dolinsky Date: Tue, 5 Nov 2024 10:57:33 +0000 Subject: [PATCH 83/95] [SYCL][NATIVECPU] fixed heading for oneTBB integration --- sycl/doc/design/SYCLNativeCPU.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sycl/doc/design/SYCLNativeCPU.md b/sycl/doc/design/SYCLNativeCPU.md index 8a7e17d2ea8e4..2ace9543328b4 100644 --- a/sycl/doc/design/SYCLNativeCPU.md +++ b/sycl/doc/design/SYCLNativeCPU.md @@ -62,7 +62,7 @@ in order to use a local checkout of the oneAPI Construction Kit. The CMake varia The SYCL Native CPU device needs to be selected at runtime by setting the environment variable `ONEAPI_DEVICE_SELECTOR=native_cpu:cpu`. 
-# oneTBB integration +### oneTBB integration SYCL Native CPU can use oneTBB as an optional backend for task scheduling. oneTBB with SYCL Native CPU is enabled by setting `NATIVECPU_WITH_ONETBB=On` at configure time: From a26eb5832de2850f1e72d0535a102d62cc56e494 Mon Sep 17 00:00:00 2001 From: Uwe Dolinsky Date: Mon, 28 Jul 2025 10:51:04 +0100 Subject: [PATCH 84/95] [NATIVECPU] removed unused code --- .../source/adapters/native_cpu/common.cpp | 1 - unified-runtime/source/adapters/native_cpu/event.cpp | 2 +- unified-runtime/source/adapters/native_cpu/event.hpp | 2 +- .../source/adapters/native_cpu/kernel.hpp | 2 +- .../source/adapters/native_cpu/nativecpu_state.hpp | 12 ++++++------ .../source/adapters/native_cpu/threadpool.hpp | 6 +++--- 6 files changed, 12 insertions(+), 13 deletions(-) diff --git a/unified-runtime/source/adapters/native_cpu/common.cpp b/unified-runtime/source/adapters/native_cpu/common.cpp index 768a0f6f9a143..328393b6898f1 100644 --- a/unified-runtime/source/adapters/native_cpu/common.cpp +++ b/unified-runtime/source/adapters/native_cpu/common.cpp @@ -9,7 +9,6 @@ //===----------------------------------------------------------------------===// #include "common.hpp" -#include // Global variables for UR_RESULT_ADAPTER_SPECIFIC_ERROR thread_local int32_t ErrorMessageCode = 0; diff --git a/unified-runtime/source/adapters/native_cpu/event.cpp b/unified-runtime/source/adapters/native_cpu/event.cpp index 3c6e36c477279..7af1e3cd65860 100644 --- a/unified-runtime/source/adapters/native_cpu/event.cpp +++ b/unified-runtime/source/adapters/native_cpu/event.cpp @@ -122,7 +122,7 @@ ur_event_handle_t_::~ur_event_handle_t_() { } } -void ur_event_handle_t_::wait(bool queue_already_locked) { +void ur_event_handle_t_::wait() { std::unique_lock lock(mutex); if (done) { return; diff --git a/unified-runtime/source/adapters/native_cpu/event.hpp b/unified-runtime/source/adapters/native_cpu/event.hpp index c5b6c83cc7c47..876936284ffe5 100644 --- a/unified-runtime/source/adapters/native_cpu/event.hpp +++ b/unified-runtime/source/adapters/native_cpu/event.hpp @@ -26,7 +26,7 @@ struct ur_event_handle_t_ : RefCounted { callback = std::packaged_task(std::forward(cb)); } - void wait(bool queue_already_locked = false); + void wait(); uint32_t getExecutionStatus() { // TODO: add support for UR_EVENT_STATUS_RUNNING diff --git a/unified-runtime/source/adapters/native_cpu/kernel.hpp b/unified-runtime/source/adapters/native_cpu/kernel.hpp index 517d04e9e9d17..8daf23feb65f5 100644 --- a/unified-runtime/source/adapters/native_cpu/kernel.hpp +++ b/unified-runtime/source/adapters/native_cpu/kernel.hpp @@ -23,7 +23,7 @@ using nativecpu_task_t = std::function; struct local_arg_info_t { uint32_t argIndex; size_t argSize; - inline local_arg_info_t(uint32_t argIndex, size_t argSize) + local_arg_info_t(uint32_t argIndex, size_t argSize) : argIndex(argIndex), argSize(argSize) {} }; diff --git a/unified-runtime/source/adapters/native_cpu/nativecpu_state.hpp b/unified-runtime/source/adapters/native_cpu/nativecpu_state.hpp index 68743c33cf65a..9d6b4f4f06674 100644 --- a/unified-runtime/source/adapters/native_cpu/nativecpu_state.hpp +++ b/unified-runtime/source/adapters/native_cpu/nativecpu_state.hpp @@ -20,9 +20,9 @@ struct state { size_t MNumGroups[3]; size_t MGlobalOffset[3]; uint32_t NumSubGroups, SubGroup_id, SubGroup_local_id, SubGroup_size; - inline state(size_t globalR0, size_t globalR1, size_t globalR2, - size_t localR0, size_t localR1, size_t localR2, size_t globalO0, - size_t globalO1, size_t globalO2) + 
state(size_t globalR0, size_t globalR1, size_t globalR2, size_t localR0, + size_t localR1, size_t localR2, size_t globalO0, size_t globalO1, + size_t globalO2) : MGlobal_range{globalR0, globalR1, globalR2}, MWorkGroup_size{localR0, localR1, localR2}, MNumGroups{globalR0 / localR0, globalR1 / localR1, globalR2 / localR2}, @@ -42,8 +42,8 @@ struct state { SubGroup_size = 1; } - inline void update(size_t group0, size_t group1, size_t group2, size_t local0, - size_t local1, size_t local2) { + void update(size_t group0, size_t group1, size_t group2, size_t local0, + size_t local1, size_t local2) { MWorkGroup_id[0] = group0; MWorkGroup_id[1] = group1; MWorkGroup_id[2] = group2; @@ -58,7 +58,7 @@ struct state { MWorkGroup_size[2] * MWorkGroup_id[2] + MLocal_id[2] + MGlobalOffset[2]; } - inline void update(size_t group0, size_t group1, size_t group2) { + void update(size_t group0, size_t group1, size_t group2) { MWorkGroup_id[0] = group0; MWorkGroup_id[1] = group1; MWorkGroup_id[2] = group2; diff --git a/unified-runtime/source/adapters/native_cpu/threadpool.hpp b/unified-runtime/source/adapters/native_cpu/threadpool.hpp index 78f2ba19ed293..6b6105a0ce9f2 100644 --- a/unified-runtime/source/adapters/native_cpu/threadpool.hpp +++ b/unified-runtime/source/adapters/native_cpu/threadpool.hpp @@ -214,8 +214,8 @@ class TasksInfo_TP { std::vector futures; public: - inline void schedule(FType &&f) { futures.emplace_back(std::move(f)); } - inline void wait_all() { + void schedule(FType &&f) { futures.emplace_back(std::move(f)); } + void wait_all() { for (auto &f : futures) f.wait(); } @@ -264,7 +264,7 @@ class TBB_TasksInfo { TBB_threadpool *tp; public: - inline void wait_all() { tp->wait_all(); } + void wait_all() { tp->wait_all(); } TBB_TasksInfo(TBB_threadpool &t) : tp(&t) {} static constexpr bool CanWaitInThread() { return false; } }; From 271cf93bd30ea10e1ecc241509809945fe10779a Mon Sep 17 00:00:00 2001 From: Uwe Dolinsky Date: Mon, 28 Jul 2025 16:37:35 +0100 Subject: [PATCH 85/95] [NATIVECPU] revert to size_t --- unified-runtime/source/adapters/native_cpu/context.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unified-runtime/source/adapters/native_cpu/context.hpp b/unified-runtime/source/adapters/native_cpu/context.hpp index 8168e0d10eaab..b9d2d22dd1565 100644 --- a/unified-runtime/source/adapters/native_cpu/context.hpp +++ b/unified-runtime/source/adapters/native_cpu/context.hpp @@ -116,7 +116,7 @@ struct ur_context_handle_t_ : RefCounted { // We need to ensure that we align to at least alignof(usm_alloc_info), // otherwise its start address may be unaligned. 
alignment = - std::max(alignment, alignof(native_cpu::usm_alloc_info)); + std::max(alignment, alignof(native_cpu::usm_alloc_info)); void *alloc = native_cpu::malloc_impl(alignment, size); if (!alloc) return nullptr; From 5784a93f2f78c2ff00f505460882bd30de96b7bd Mon Sep 17 00:00:00 2001 From: Uwe Dolinsky Date: Mon, 28 Jul 2025 16:38:07 +0100 Subject: [PATCH 86/95] [NATIVECPU] remove inline --- unified-runtime/source/adapters/native_cpu/threadpool.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/unified-runtime/source/adapters/native_cpu/threadpool.hpp b/unified-runtime/source/adapters/native_cpu/threadpool.hpp index 6b6105a0ce9f2..39bca9d2c488b 100644 --- a/unified-runtime/source/adapters/native_cpu/threadpool.hpp +++ b/unified-runtime/source/adapters/native_cpu/threadpool.hpp @@ -255,7 +255,7 @@ class TBB_threadpool { public: void wait_all() { tasks.wait(); } oneapi::tbb::task_group &Tasks() { return tasks; } - inline size_t num_threads() const noexcept { + size_t num_threads() const noexcept { return oneapi::tbb::info::default_concurrency(); } }; @@ -273,7 +273,7 @@ template <> struct Scheduler : Scheduler_base { using Scheduler_base::Scheduler_base; - template inline void schedule(T &&task_) { + template void schedule(T &&task_) { ref.Tasks().run([task = std::move(task_)]() { auto thread_id = tbb::this_task_arena::current_thread_index(); assert(thread_id >= 0 && From 0e0b45464bfe3f4a030b13dec704aef4c2331268 Mon Sep 17 00:00:00 2001 From: Uwe Dolinsky Date: Mon, 18 Aug 2025 17:47:58 +0100 Subject: [PATCH 87/95] [NATIVECPU] remove comment --- unified-runtime/source/adapters/native_cpu/event.hpp | 1 - 1 file changed, 1 deletion(-) diff --git a/unified-runtime/source/adapters/native_cpu/event.hpp b/unified-runtime/source/adapters/native_cpu/event.hpp index 876936284ffe5..c71e7593686cd 100644 --- a/unified-runtime/source/adapters/native_cpu/event.hpp +++ b/unified-runtime/source/adapters/native_cpu/event.hpp @@ -43,7 +43,6 @@ struct ur_event_handle_t_ : RefCounted { ur_command_t getCommandType() const { return command_type; } - // todo: get rid of this function void set_futures(native_cpu::tasksinfo_t &&fs) { std::lock_guard lock(mutex); futures = std::move(fs); From a009bd23d4625c3704432e7941e67a5195dbd110 Mon Sep 17 00:00:00 2001 From: Uwe Dolinsky Date: Mon, 18 Aug 2025 18:38:47 +0100 Subject: [PATCH 88/95] [NATIVECPU] removed unused function --- unified-runtime/source/adapters/native_cpu/threadpool.hpp | 1 - 1 file changed, 1 deletion(-) diff --git a/unified-runtime/source/adapters/native_cpu/threadpool.hpp b/unified-runtime/source/adapters/native_cpu/threadpool.hpp index 39bca9d2c488b..3010b60238092 100644 --- a/unified-runtime/source/adapters/native_cpu/threadpool.hpp +++ b/unified-runtime/source/adapters/native_cpu/threadpool.hpp @@ -266,7 +266,6 @@ class TBB_TasksInfo { public: void wait_all() { tp->wait_all(); } TBB_TasksInfo(TBB_threadpool &t) : tp(&t) {} - static constexpr bool CanWaitInThread() { return false; } }; template <> From a44fc998600e6487c79e07ff36ba65d98f2e3311 Mon Sep 17 00:00:00 2001 From: Uwe Dolinsky Date: Tue, 19 Aug 2025 16:04:59 +0100 Subject: [PATCH 89/95] [NATIVECPU] update oneTBB --- .../source/adapters/native_cpu/CMakeLists.txt | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/unified-runtime/source/adapters/native_cpu/CMakeLists.txt b/unified-runtime/source/adapters/native_cpu/CMakeLists.txt index 00d67baed092a..d0f87ea914ebf 100644 --- a/unified-runtime/source/adapters/native_cpu/CMakeLists.txt +++ 
b/unified-runtime/source/adapters/native_cpu/CMakeLists.txt @@ -61,11 +61,11 @@ if(NATIVECPU_WITH_ONETBB) FetchContent_Declare( tbb GIT_REPOSITORY https://github.com/uxlfoundation/oneTBB.git -#commit 54f3611e12b77ae40bf919eca65e7ff2218fdc34 (HEAD -> master, origin/master, origin/HEAD) -#Author: Alexandra -#Date: Fri Jun 6 15:02:22 2025 +0200 -# Update README.md (#1750) - GIT_TAG 54f3611e12b77ae40bf919eca65e7ff2218fdc34 +# commit 4e4fffed4fb86ae0960a3364700f549b539c777e (HEAD -> master, origin/master, origin/HEAD) +# Author: Ilya Isaev +# Date: Mon Aug 18 10:35:26 2025 +0200 +# Improve task_arena interoperability with task_groups (#1784) + GIT_TAG 4e4fffed4fb86ae0960a3364700f549b539c777e CMAKE_ARGS "-DTBB_TEST:BOOL=OFF -DTBB_EXAMPLES:BOOL=OFF -DTBB_BENCH:BOOL=OFF" OVERRIDE_FIND_PACKAGE ) From 6a5f9d1b9841568cd7bf8b0ab0047272ddea6d8f Mon Sep 17 00:00:00 2001 From: Uwe Dolinsky Date: Tue, 19 Aug 2025 17:02:55 +0100 Subject: [PATCH 90/95] [NATIVECPU] add -Wno-stringop-overflow for oneTBB --- unified-runtime/source/adapters/native_cpu/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unified-runtime/source/adapters/native_cpu/CMakeLists.txt b/unified-runtime/source/adapters/native_cpu/CMakeLists.txt index d0f87ea914ebf..fef062787c0e5 100644 --- a/unified-runtime/source/adapters/native_cpu/CMakeLists.txt +++ b/unified-runtime/source/adapters/native_cpu/CMakeLists.txt @@ -100,7 +100,7 @@ if(NATIVECPU_WITH_ONETBB) if (NOT MSVC) # oneTBB currently casts away some const qualifiers # todo: check if compiler actually supports these options - target_compile_options(tbb PRIVATE -Wno-cast-qual) + target_compile_options(tbb PRIVATE -Wno-cast-qual -Wno-stringop-overflow) target_compile_options(tbbmalloc PRIVATE -Wno-cast-qual) endif() From 87f3e17f141823ad70612e8f1bf7275fd1ea33f2 Mon Sep 17 00:00:00 2001 From: Uwe Dolinsky Date: Tue, 19 Aug 2025 17:22:13 +0100 Subject: [PATCH 91/95] [NATIVECPU] add -Wno-unknown-warning-option for oneTBB --- unified-runtime/source/adapters/native_cpu/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unified-runtime/source/adapters/native_cpu/CMakeLists.txt b/unified-runtime/source/adapters/native_cpu/CMakeLists.txt index fef062787c0e5..d6854bd9c9e99 100644 --- a/unified-runtime/source/adapters/native_cpu/CMakeLists.txt +++ b/unified-runtime/source/adapters/native_cpu/CMakeLists.txt @@ -100,7 +100,7 @@ if(NATIVECPU_WITH_ONETBB) if (NOT MSVC) # oneTBB currently casts away some const qualifiers # todo: check if compiler actually supports these options - target_compile_options(tbb PRIVATE -Wno-cast-qual -Wno-stringop-overflow) + target_compile_options(tbb PRIVATE -Wno-cast-qual -Wno-stringop-overflow -Wno-unknown-warning-option) target_compile_options(tbbmalloc PRIVATE -Wno-cast-qual) endif() From 1b294bb80960105961b537515132e2e511f94364 Mon Sep 17 00:00:00 2001 From: Uwe Dolinsky Date: Thu, 21 Aug 2025 12:29:00 +0100 Subject: [PATCH 92/95] [NATIVECPU] remove unneeded code --- unified-runtime/source/adapters/native_cpu/enqueue.cpp | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/unified-runtime/source/adapters/native_cpu/enqueue.cpp b/unified-runtime/source/adapters/native_cpu/enqueue.cpp index f4a584b19b614..2c28b8cebc812 100644 --- a/unified-runtime/source/adapters/native_cpu/enqueue.cpp +++ b/unified-runtime/source/adapters/native_cpu/enqueue.cpp @@ -312,20 +312,10 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( if (wg_start < numWG[dim]) { first[dim] = wg_start; last[dim] = numWG[dim]; 
-#ifdef NATIVECPU_WITH_ONETBB_PARALLELFOR tbb::blocked_range3d range(first[0], last[0], first[1], last[1], first[2], last[2]); nativecpu_tbb_nd_executor tbb_ex(ndr, *kernel, numParallelThreads); tbb::parallel_for(range, tbb_ex); -#else - Tasks.schedule([ndr, numParallelThreads, &kernel = *kernel, first, last, - InEvents](size_t threadId) { - InEvents.wait(); - native_cpu::state state = getState(ndr); - execute_range(state, kernel, kernel.getArgs(numParallelThreads, threadId), - first, last); - }); -#endif } #endif // NATIVECPU_WITH_ONETBB_PARALLELFOR From 53d5b87f6cb464f986cd9164c4620ee8d8d09326 Mon Sep 17 00:00:00 2001 From: Uwe Dolinsky Date: Thu, 21 Aug 2025 13:34:08 +0100 Subject: [PATCH 93/95] [NATIEVCPU] sharing kernel invoke --- .../source/adapters/native_cpu/enqueue.cpp | 84 +++++++++---------- 1 file changed, 39 insertions(+), 45 deletions(-) diff --git a/unified-runtime/source/adapters/native_cpu/enqueue.cpp b/unified-runtime/source/adapters/native_cpu/enqueue.cpp index 2c28b8cebc812..d85d33d91e012 100644 --- a/unified-runtime/source/adapters/native_cpu/enqueue.cpp +++ b/unified-runtime/source/adapters/native_cpu/enqueue.cpp @@ -99,66 +99,73 @@ static inline native_cpu::state getState(const native_cpu::NDRDescT &ndr) { using IndexT = std::array; using RangeT = native_cpu::NDRDescT::RangeT; +static inline void invoke_kernel(native_cpu::state &state, + const ur_kernel_handle_t_ &kernel, size_t g0, + size_t g1, size_t g2, + size_t numParallelThreads, size_t threadId, + const native_cpu::NDRDescT &ndr) { +#ifdef NATIVECPU_USE_OCK + state.update(g0, g1, g2); + kernel._subhandler(kernel.getArgs(numParallelThreads, threadId).data(), + &state); +#else + for (size_t local2 = 0; local2 < ndr.LocalSize[2]; ++local2) { + for (size_t local1 = 0; local1 < ndr.LocalSize[1]; ++local1) { + for (size_t local0 = 0; local0 < ndr.LocalSize[0]; ++local0) { + state.update(g0, g1, g2, local0, local1, local2); + kernel._subhandler(kernel.getArgs(numParallelThreads, threadId).data(), + &state); + } + } + } +#endif +} + static inline void execute_range(native_cpu::state &state, const ur_kernel_handle_t_ &hKernel, - const std::vector &args, IndexT first, - IndexT lastPlusOne) { + IndexT first, IndexT lastPlusOne, + size_t numParallelThreads, size_t threadId, + const native_cpu::NDRDescT &ndr) { for (size_t g2 = first[2]; g2 < lastPlusOne[2]; g2++) { for (size_t g1 = first[1]; g1 < lastPlusOne[1]; g1++) { for (size_t g0 = first[0]; g0 < lastPlusOne[0]; g0 += 1) { - state.update(g0, g1, g2); - hKernel._subhandler(args.data(), &state); + invoke_kernel(state, hKernel, g0, g1, g2, numParallelThreads, threadId, + ndr); } } } } -static inline void execute_range(native_cpu::state &state, - const ur_kernel_handle_t_ &hKernel, - IndexT first, IndexT lastPlusOne) { - execute_range(state, hKernel, hKernel.getArgs(), first, lastPlusOne); -} - class nativecpu_tbb_executor { const native_cpu::NDRDescT ndr; protected: const ur_kernel_handle_t_ &hKernel; + const size_t numParallelThreads; - void execute(const tbb::blocked_range3d &r, - const std::vector &args) const { + void execute(const tbb::blocked_range3d &r, size_t threadId) const { auto state = getState(ndr); const IndexT first = {r.pages().begin(), r.rows().begin(), r.cols().begin()}; const IndexT last_plus_one = {r.pages().end(), r.rows().end(), r.cols().end()}; - execute_range(state, hKernel, args, first, last_plus_one); + execute_range(state, hKernel, first, last_plus_one, numParallelThreads, + threadId, ndr); } public: void operator()(const tbb::blocked_range3d 
&r) const { - execute(r, hKernel.getArgs()); + auto thread_id = native_cpu::getTBBThreadID(); + execute(r, thread_id); } nativecpu_tbb_executor(const native_cpu::NDRDescT &n, - const ur_kernel_handle_t_ &k) - : ndr(n), hKernel(k) {} + const ur_kernel_handle_t_ &k, + const size_t numParallelThreads) + : ndr(n), hKernel(k), numParallelThreads(numParallelThreads) {} }; -class nativecpu_tbb_nd_executor : nativecpu_tbb_executor { - const size_t numParallelThreads; - -public: - nativecpu_tbb_nd_executor(const native_cpu::NDRDescT &n, - const ur_kernel_handle_t_ &k, - size_t numParallelThreads) - : nativecpu_tbb_executor(n, k), numParallelThreads(numParallelThreads) {} +using nativecpu_tbb_nd_executor = nativecpu_tbb_executor; - void operator()(const tbb::blocked_range3d &r) const { - auto thread_id = native_cpu::getTBBThreadID(); - auto args = this->hKernel.getArgs(numParallelThreads, thread_id); - execute(r, args); - } -}; #endif UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( @@ -262,21 +269,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( for (size_t g0 = rangeStart[0], g1 = rangeStart[1], g2 = rangeStart[2], g3 = rangeStart[3]; g3 < rangeEnd; ++g3) { -#ifdef NATIVECPU_USE_OCK - state.update(g0, g1, g2); - kernel._subhandler(kernel.getArgs(numParallelThreads, threadId).data(), - &state); -#else - for (size_t local2 = 0; local2 < ndr.LocalSize[2]; ++local2) { - for (size_t local1 = 0; local1 < ndr.LocalSize[1]; ++local1) { - for (size_t local0 = 0; local0 < ndr.LocalSize[0]; ++local0) { - state.update(g0, g1, g2, local0, local1, local2); - kernel._subhandler( - kernel.getArgs(numParallelThreads, threadId).data(), &state); - } - } - } -#endif + invoke_kernel(state, kernel, g0, g1, g2, numParallelThreads, threadId, + ndr); if (++g0 == numWG0) { g0 = 0; if (++g1 == numWG1) { From ea19145424986523500193aef460d59a548ff9a4 Mon Sep 17 00:00:00 2001 From: Uwe Dolinsky Date: Fri, 22 Aug 2025 12:58:54 +0100 Subject: [PATCH 94/95] [NATIVECPU] supporting tbb:parallel_for for 1D/2D/3D --- .../source/adapters/native_cpu/enqueue.cpp | 76 ++++++++++++++----- 1 file changed, 57 insertions(+), 19 deletions(-) diff --git a/unified-runtime/source/adapters/native_cpu/enqueue.cpp b/unified-runtime/source/adapters/native_cpu/enqueue.cpp index d85d33d91e012..ce22cabfd6db7 100644 --- a/unified-runtime/source/adapters/native_cpu/enqueue.cpp +++ b/unified-runtime/source/adapters/native_cpu/enqueue.cpp @@ -136,6 +136,8 @@ static inline void execute_range(native_cpu::state &state, } } +namespace native_cpu { + class nativecpu_tbb_executor { const native_cpu::NDRDescT ndr; @@ -143,20 +145,24 @@ class nativecpu_tbb_executor { const ur_kernel_handle_t_ &hKernel; const size_t numParallelThreads; - void execute(const tbb::blocked_range3d &r, size_t threadId) const { + void execute(IndexT first, IndexT last_plus_one) const { auto state = getState(ndr); - const IndexT first = {r.pages().begin(), r.rows().begin(), - r.cols().begin()}; - const IndexT last_plus_one = {r.pages().end(), r.rows().end(), - r.cols().end()}; + auto threadId = native_cpu::getTBBThreadID(); execute_range(state, hKernel, first, last_plus_one, numParallelThreads, threadId, ndr); } public: void operator()(const tbb::blocked_range3d &r) const { - auto thread_id = native_cpu::getTBBThreadID(); - execute(r, thread_id); + execute({r.pages().begin(), r.rows().begin(), r.cols().begin()}, + {r.pages().end(), r.rows().end(), r.cols().end()}); + } + void operator()(const tbb::blocked_range2d &r) const { + execute({r.rows().begin(), 
r.cols().begin(), 0}, + {r.rows().end(), r.cols().end(), 1}); + } + void operator()(const tbb::blocked_range &r) const { + execute({r.begin(), 0, 0}, {r.end(), 1, 1}); } nativecpu_tbb_executor(const native_cpu::NDRDescT &n, const ur_kernel_handle_t_ &k, @@ -164,9 +170,33 @@ class nativecpu_tbb_executor { : ndr(n), hKernel(k), numParallelThreads(numParallelThreads) {} }; -using nativecpu_tbb_nd_executor = nativecpu_tbb_executor; +using tbb_nd_executor = nativecpu_tbb_executor; + +template