Skip to content

Commit 2ae009e

Browse files
authored
[SYCL][NATIVECPU] NativeCPU with optional oneTBB backend (#16803)
Initial PR to add a new CMake option for Native CPU to use oneTBB as its backend. The default is the original NativeCPU flow without oneTBB. The PR that tests the new option with oneTBB enabled: #15979
1 parent 57fb0b8 commit 2ae009e

File tree

6 files changed

+203
-47
lines changed

6 files changed

+203
-47
lines changed

sycl/doc/design/SYCLNativeCPU.md

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -62,6 +62,20 @@ in order to use a local checkout of the oneAPI Construction Kit. The CMake varia
6262

6363
The SYCL Native CPU device needs to be selected at runtime by setting the environment variable `ONEAPI_DEVICE_SELECTOR=native_cpu:cpu`.
6464

65+
### oneTBB integration
66+
67+
SYCL Native CPU can use oneTBB as an optional backend for task scheduling. oneTBB with SYCL Native CPU is enabled by setting `NATIVECPU_WITH_ONETBB=On` at configure time:
68+
69+
```
70+
python3 buildbot/configure.py \
71+
--native_cpu \
72+
--cmake-opt=-DNATIVECPU_WITH_ONETBB=On
73+
```
74+
75+
This will pull oneTBB into SYCL Native CPU via CMake `FetchContent`, and DPC++ can then be built as usual.
76+
77+
By default SYCL Native CPU implements its own scheduler whose only dependency is standard C++.
78+
6579
# Supported features and current limitations
6680

6781
The SYCL Native CPU flow is still WIP, not optimized and several core SYCL features are currently unsupported. Currently `barriers` are supported only when the oneAPI Construction Kit integration is enabled, several math builtins are not supported and attempting to use those will most likely fail with an `undefined reference` error at link time. Examples of supported applications can be found in the [runtime tests](https://github.com/intel/llvm/blob/sycl/sycl/test/native_cpu).

unified-runtime/source/adapters/native_cpu/CMakeLists.txt

Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@ add_ur_adapter(${TARGET_NAME}
3737
${CMAKE_CURRENT_SOURCE_DIR}/queue.cpp
3838
${CMAKE_CURRENT_SOURCE_DIR}/queue.hpp
3939
${CMAKE_CURRENT_SOURCE_DIR}/sampler.cpp
40+
${CMAKE_CURRENT_SOURCE_DIR}/threadpool.hpp
4041
${CMAKE_CURRENT_SOURCE_DIR}/ur_interface_loader.cpp
4142
${CMAKE_CURRENT_SOURCE_DIR}/usm_p2p.cpp
4243
${CMAKE_CURRENT_SOURCE_DIR}/virtual_mem.cpp
@@ -51,6 +52,34 @@ set_target_properties(${TARGET_NAME} PROPERTIES
5152
SOVERSION "${PROJECT_VERSION_MAJOR}"
5253
)
5354

55+
# oneTBB is used as an optional NativeCPU backend and disabled by default.
56+
option(NATIVECPU_WITH_ONETBB "Use oneTBB as backend for Native CPU" OFF)
57+
if(NATIVECPU_WITH_ONETBB)
58+
message(STATUS "Configuring Native CPU adapter with oneTBB backend.")
59+
60+
include(FetchContent)
61+
FetchContent_Declare(
62+
tbb
63+
GIT_REPOSITORY https://github.com/uxlfoundation/oneTBB.git
64+
# commit 4e4fffed4fb86ae0960a3364700f549b539c777e (HEAD -> master, origin/master, origin/HEAD)
65+
# Author: Ilya Isaev <[email protected]>
66+
# Date: Mon Aug 18 10:35:26 2025 +0200
67+
# Improve task_arena interoperability with task_groups (#1784)
68+
GIT_TAG 4e4fffed4fb86ae0960a3364700f549b539c777e
69+
CMAKE_ARGS "-DTBB_TEST:BOOL=OFF -DTBB_EXAMPLES:BOOL=OFF -DTBB_BENCH:BOOL=OFF"
70+
OVERRIDE_FIND_PACKAGE
71+
)
72+
set(TBB_TEST OFF CACHE INTERNAL "" FORCE)
73+
set(TBB_EXAMPLES OFF CACHE INTERNAL "" FORCE)
74+
set(TBB_BENCH OFF CACHE INTERNAL "" FORCE)
75+
set(TBB_BUILD ON CACHE INTERNAL "" FORCE)
76+
set(TBB_FIND_PACKAGE OFF CACHE INTERNAL "" FORCE)
77+
set(TBB_FUZZ_TESTING OFF CACHE INTERNAL "" FORCE)
78+
set(TBB_INSTALL ON CACHE INTERNAL "" FORCE)
79+
set (CMAKE_INCLUDE_CURRENT_DIR OFF)
80+
FetchContent_MakeAvailable(tbb)
81+
endif()
82+
5483
find_package(Threads REQUIRED)
5584

5685
target_link_libraries(${TARGET_NAME} PRIVATE
@@ -63,3 +92,23 @@ target_link_libraries(${TARGET_NAME} PRIVATE
6392
target_include_directories(${TARGET_NAME} PRIVATE
6493
"${CMAKE_CURRENT_SOURCE_DIR}/../../"
6594
)
95+
96+
if(NATIVECPU_WITH_ONETBB)
97+
target_link_libraries(${TARGET_NAME} PRIVATE
98+
TBB::tbb
99+
)
100+
if (NOT MSVC)
101+
# oneTBB currently casts away some const qualifiers
102+
# todo: check if compiler actually supports these options
103+
target_compile_options(tbb PRIVATE -Wno-cast-qual -Wno-stringop-overflow -Wno-unknown-warning-option)
104+
target_compile_options(tbbmalloc PRIVATE -Wno-cast-qual)
105+
endif()
106+
107+
# Undefine _DEBUG option in release builds to find
108+
# release tbbbind
109+
if (NOT uppercase_CMAKE_BUILD_TYPE STREQUAL "DEBUG")
110+
target_compile_options(tbb PRIVATE -U_DEBUG)
111+
endif()
112+
113+
target_compile_definitions(${TARGET_NAME} PRIVATE NATIVECPU_WITH_ONETBB)
114+
endif()

unified-runtime/source/adapters/native_cpu/enqueue.cpp

100644100755
Lines changed: 49 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -70,8 +70,18 @@ class WaitInfo {
7070
}
7171
};
7272

73+
template <class T>
7374
inline static WaitInfo getWaitInfo(uint32_t numEventsInWaitList,
74-
const ur_event_handle_t *phEventWaitList) {
75+
const ur_event_handle_t *phEventWaitList,
76+
const T &scheduler) {
77+
if (numEventsInWaitList && !scheduler.CanWaitInThread()) {
78+
// Waiting for dependent events in threads launched by the enqueue may
79+
// not work correctly for some backend/schedulers, so we have the safe
80+
// option here to wait in the main thread instead (potentially at the
81+
// expense of performance).
82+
urEventWait(numEventsInWaitList, phEventWaitList);
83+
numEventsInWaitList = 0;
84+
}
7585
return native_cpu::WaitInfo(numEventsInWaitList, phEventWaitList);
7686
}
7787

@@ -151,7 +161,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch(
151161

152162
auto &tp = hQueue->getDevice()->tp;
153163
const size_t numParallelThreads = tp.num_threads();
154-
std::vector<std::future<void>> futures;
164+
auto Tasks = native_cpu::getScheduler(tp);
155165
auto numWG0 = ndr.GlobalSize[0] / ndr.LocalSize[0];
156166
auto numWG1 = ndr.GlobalSize[1] / ndr.LocalSize[1];
157167
auto numWG2 = ndr.GlobalSize[2] / ndr.LocalSize[2];
@@ -162,7 +172,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch(
162172
auto kernel = std::make_unique<ur_kernel_handle_t_>(*hKernel);
163173
kernel->updateMemPool(numParallelThreads);
164174

165-
auto InEvents = native_cpu::getWaitInfo(numEventsInWaitList, phEventWaitList);
175+
auto InEvents =
176+
native_cpu::getWaitInfo(numEventsInWaitList, phEventWaitList, Tasks);
166177

167178
const size_t numWG = numWG0 * numWG1 * numWG2;
168179
const size_t numWGPerThread = numWG / numParallelThreads;
@@ -177,42 +188,41 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch(
177188
rangeEnd[0] = rangeEnd[3] % numWG0;
178189
rangeEnd[1] = (rangeEnd[3] / numWG0) % numWG1;
179190
rangeEnd[2] = rangeEnd[3] / (numWG0 * numWG1);
180-
futures.emplace_back(tp.schedule_task(
181-
[ndr, InEvents, &kernel = *kernel, rangeStart, rangeEnd = rangeEnd[3],
182-
numWG0, numWG1, numParallelThreads](size_t threadId) {
183-
auto state = getState(ndr);
184-
InEvents.wait();
185-
for (size_t g0 = rangeStart[0], g1 = rangeStart[1],
186-
g2 = rangeStart[2], g3 = rangeStart[3];
187-
g3 < rangeEnd; ++g3) {
191+
Tasks.schedule([ndr, InEvents, &kernel = *kernel, rangeStart,
192+
rangeEnd = rangeEnd[3], numWG0, numWG1,
193+
numParallelThreads](size_t threadId) {
194+
auto state = getState(ndr);
195+
InEvents.wait();
196+
for (size_t g0 = rangeStart[0], g1 = rangeStart[1], g2 = rangeStart[2],
197+
g3 = rangeStart[3];
198+
g3 < rangeEnd; ++g3) {
188199
#ifdef NATIVECPU_USE_OCK
189-
state.update(g0, g1, g2);
190-
kernel._subhandler(
191-
kernel.getArgs(numParallelThreads, threadId).data(), &state);
200+
state.update(g0, g1, g2);
201+
kernel._subhandler(kernel.getArgs(numParallelThreads, threadId).data(),
202+
&state);
192203
#else
193-
for (size_t local2 = 0; local2 < ndr.LocalSize[2]; ++local2) {
194-
for (size_t local1 = 0; local1 < ndr.LocalSize[1]; ++local1) {
195-
for (size_t local0 = 0; local0 < ndr.LocalSize[0]; ++local0) {
196-
state.update(g0, g1, g2, local0, local1, local2);
197-
kernel._subhandler(
198-
kernel.getArgs(numParallelThreads, threadId).data(),
199-
&state);
200-
}
201-
}
204+
for (size_t local2 = 0; local2 < ndr.LocalSize[2]; ++local2) {
205+
for (size_t local1 = 0; local1 < ndr.LocalSize[1]; ++local1) {
206+
for (size_t local0 = 0; local0 < ndr.LocalSize[0]; ++local0) {
207+
state.update(g0, g1, g2, local0, local1, local2);
208+
kernel._subhandler(
209+
kernel.getArgs(numParallelThreads, threadId).data(), &state);
202210
}
211+
}
212+
}
203213
#endif
204-
if (++g0 == numWG0) {
205-
g0 = 0;
206-
if (++g1 == numWG1) {
207-
g1 = 0;
208-
++g2;
209-
}
210-
}
214+
if (++g0 == numWG0) {
215+
g0 = 0;
216+
if (++g1 == numWG1) {
217+
g1 = 0;
218+
++g2;
211219
}
212-
}));
220+
}
221+
}
222+
});
213223
rangeStart = rangeEnd;
214224
}
215-
event->set_futures(futures);
225+
event->set_tasksinfo(Tasks.getMovedTaskInfo());
216226

217227
if (phEvent) {
218228
*phEvent = event;
@@ -248,14 +258,14 @@ withTimingEvent(ur_command_t command_type, ur_queue_handle_t hQueue,
248258
return result;
249259
}
250260
auto &tp = hQueue->getDevice()->tp;
251-
std::vector<std::future<void>> futures;
261+
auto Tasks = native_cpu::getScheduler(tp);
252262
auto InEvents =
253-
native_cpu::getWaitInfo(numEventsInWaitList, phEventWaitList);
254-
futures.emplace_back(tp.schedule_task([f, InEvents](size_t) {
263+
native_cpu::getWaitInfo(numEventsInWaitList, phEventWaitList, Tasks);
264+
Tasks.schedule([f, InEvents](size_t) {
255265
InEvents.wait();
256266
f();
257-
}));
258-
event->set_futures(futures);
267+
});
268+
event->set_tasksinfo(Tasks.getMovedTaskInfo());
259269
event->set_callback(
260270
[event, InEvents = InEvents.getUniquePtr()]() { event->tick_end(); });
261271
return UR_RESULT_SUCCESS;
@@ -465,7 +475,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferFill(
465475
// TODO: error checking
466476
// TODO: handle async
467477
void *startingPtr = hBuffer->_mem + offset;
468-
unsigned steps = size / patternSize;
478+
size_t steps = size / patternSize;
469479
for (unsigned i = 0; i < steps; i++) {
470480
memcpy(static_cast<int8_t *>(startingPtr) + i * patternSize, pPattern,
471481
patternSize);
@@ -575,7 +585,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMFill(
575585
break;
576586
}
577587
default: {
578-
for (unsigned int step{0}; step < size; step += patternSize) {
588+
for (size_t step{0}; step < size; step += patternSize) {
579589
auto *dest = reinterpret_cast<void *>(
580590
reinterpret_cast<uint8_t *>(ptr) + step);
581591
memcpy(dest, pPattern, patternSize);

unified-runtime/source/adapters/native_cpu/event.cpp

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
#include "ur_api.h"
1212

1313
#include "common.hpp"
14+
#include "device.hpp"
1415
#include "event.hpp"
1516
#include "queue.hpp"
1617
#include <cstdint>
@@ -111,7 +112,7 @@ urEnqueueTimestampRecordingExp(ur_queue_handle_t /*hQueue*/, bool /*blocking*/,
111112
ur_event_handle_t_::ur_event_handle_t_(ur_queue_handle_t queue,
112113
ur_command_t command_type)
113114
: queue(queue), context(queue->getContext()), command_type(command_type),
114-
done(false) {
115+
done(false), tasksinfo(queue->getDevice()->tp) {
115116
this->queue->addEvent(this);
116117
}
117118

@@ -126,9 +127,7 @@ void ur_event_handle_t_::wait() {
126127
if (done) {
127128
return;
128129
}
129-
for (auto &f : futures) {
130-
f.wait();
131-
}
130+
this->tasksinfo.wait_all();
132131
queue->removeEvent(this);
133132
done = true;
134133
// The callback may need to acquire the lock, so we unlock it here

unified-runtime/source/adapters/native_cpu/event.hpp

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
//===----------------------------------------------------------------------===//
1010
#pragma once
1111
#include "common.hpp"
12+
#include "threadpool.hpp"
1213
#include "ur_api.h"
1314
#include <cstdint>
1415
#include <future>
@@ -42,9 +43,9 @@ struct ur_event_handle_t_ : RefCounted {
4243

4344
ur_command_t getCommandType() const { return command_type; }
4445

45-
void set_futures(std::vector<std::future<void>> &fs) {
46+
void set_tasksinfo(native_cpu::tasksinfo_t &&fs) {
4647
std::lock_guard<std::mutex> lock(mutex);
47-
futures = std::move(fs);
48+
tasksinfo = std::move(fs);
4849
}
4950

5051
void tick_start();
@@ -61,7 +62,7 @@ struct ur_event_handle_t_ : RefCounted {
6162
ur_command_t command_type;
6263
bool done;
6364
std::mutex mutex;
64-
std::vector<std::future<void>> futures;
65+
native_cpu::tasksinfo_t tasksinfo;
6566
std::packaged_task<void()> callback;
6667
uint64_t timestamp_start = 0;
6768
uint64_t timestamp_end = 0;

unified-runtime/source/adapters/native_cpu/threadpool.hpp

Lines changed: 84 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -207,7 +207,90 @@ template <typename ThreadPoolT> class threadpool_interface {
207207
return ret;
208208
}
209209
};
210+
using simple_threadpool_t = threadpool_interface<detail::simple_thread_pool>;
210211

211-
using threadpool_t = threadpool_interface<detail::simple_thread_pool>;
212+
class TasksInfo_TP {
213+
using FType = std::future<void>;
214+
std::vector<FType> futures;
212215

216+
public:
217+
void schedule(FType &&f) { futures.emplace_back(std::move(f)); }
218+
void wait_all() {
219+
for (auto &f : futures)
220+
f.wait();
221+
}
222+
TasksInfo_TP(simple_threadpool_t &) {}
223+
};
224+
225+
template <class TP, class TaskInfo> struct Scheduler_base {
226+
TP &ref;
227+
TaskInfo ti;
228+
Scheduler_base(TP &ref_) : ref(ref_), ti(ref_) {}
229+
TaskInfo getMovedTaskInfo() { return std::move(ti); }
230+
static constexpr bool CanWaitInThread() { return true; }
231+
};
232+
233+
// Scheduler for the default thread-pool backend: each task is forwarded
// to the pool and the resulting completion future is recorded in the
// task info.
template <class TP> struct Scheduler : Scheduler_base<TP, TasksInfo_TP> {
  using Scheduler_base<TP, TasksInfo_TP>::Scheduler_base;

  // Enqueues `fn` on the pool and tracks the future it returns.
  template <class T> void schedule(T &&fn) {
    auto future = this->ref.schedule_task(std::forward<T>(fn));
    this->ti.schedule(std::move(future));
  }
};
240+
241+
// Convenience factory: builds a Scheduler bound to the given pool. The
// Scheduler specialization is selected by the pool's concrete type.
template <class TPType> inline Scheduler<TPType> getScheduler(TPType &tp) {
  return Scheduler<TPType>{tp};
}
244+
245+
} // namespace native_cpu
246+
247+
#ifdef NATIVECPU_WITH_ONETBB
248+
// Simple TBB backend
249+
#include "oneapi/tbb.h"
250+
namespace native_cpu {
251+
252+
class TBB_threadpool {
253+
oneapi::tbb::task_group tasks;
254+
255+
public:
256+
void wait_all() { tasks.wait(); }
257+
oneapi::tbb::task_group &Tasks() { return tasks; }
258+
size_t num_threads() const noexcept {
259+
return oneapi::tbb::info::default_concurrency();
260+
}
261+
};
262+
263+
class TBB_TasksInfo {
264+
TBB_threadpool *tp;
265+
266+
public:
267+
void wait_all() { tp->wait_all(); }
268+
TBB_TasksInfo(TBB_threadpool &t) : tp(&t) {}
269+
};
270+
271+
template <>
272+
struct Scheduler<TBB_threadpool>
273+
: Scheduler_base<TBB_threadpool, TBB_TasksInfo> {
274+
using Scheduler_base<TBB_threadpool, TBB_TasksInfo>::Scheduler_base;
275+
template <class T> void schedule(T &&task_) {
276+
ref.Tasks().run([task = std::move(task_)]() {
277+
auto thread_id = tbb::this_task_arena::current_thread_index();
278+
assert(thread_id >= 0 &&
279+
thread_id < oneapi::tbb::info::default_concurrency());
280+
task(thread_id);
281+
});
282+
}
283+
static constexpr bool CanWaitInThread() { return false; }
284+
};
285+
286+
using tasksinfo_t = TBB_TasksInfo;
287+
using threadpool_t = TBB_threadpool;
288+
} // namespace native_cpu
289+
290+
#else
291+
// The default backend
292+
namespace native_cpu {
293+
using tasksinfo_t = TasksInfo_TP;
294+
using threadpool_t = simple_threadpool_t;
213295
} // namespace native_cpu
296+
#endif

0 commit comments

Comments
 (0)