Skip to content

Commit 2ae009e

Browse files
authored
[SYCL][NATIVECPU] NativeCPU with optional oneTBB backend (#16803)
Initial PR to add a new CMake option for Native CPU to use oneTBB as its backend. The default is the original NativeCPU flow without oneTBB. The PR that tests the new option with oneTBB enabled: #15979
1 parent 57fb0b8 commit 2ae009e

File tree

6 files changed

+203
-47
lines changed

6 files changed

+203
-47
lines changed

sycl/doc/design/SYCLNativeCPU.md

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -62,6 +62,20 @@ in order to use a local checkout of the oneAPI Construction Kit. The CMake varia
6262

6363
The SYCL Native CPU device needs to be selected at runtime by setting the environment variable `ONEAPI_DEVICE_SELECTOR=native_cpu:cpu`.
6464

65+
### oneTBB integration
66+
67+
SYCL Native CPU can use oneTBB as an optional backend for task scheduling. oneTBB with SYCL Native CPU is enabled by setting `NATIVECPU_WITH_ONETBB=On` at configure time:
68+
69+
```
70+
python3 buildbot/configure.py \
71+
--native_cpu \
72+
--cmake-opt=-DNATIVECPU_WITH_ONETBB=On
73+
```
74+
75+
This will pull oneTBB into SYCL Native CPU via CMake `FetchContent`, and DPC++ can then be built as usual.
76+
77+
By default SYCL Native CPU implements its own scheduler whose only dependency is standard C++.
78+
6579
# Supported features and current limitations
6680

6781
The SYCL Native CPU flow is still WIP, not optimized and several core SYCL features are currently unsupported. Currently `barriers` are supported only when the oneAPI Construction Kit integration is enabled, several math builtins are not supported and attempting to use those will most likely fail with an `undefined reference` error at link time. Examples of supported applications can be found in the [runtime tests](https://github.com/intel/llvm/blob/sycl/sycl/test/native_cpu).

unified-runtime/source/adapters/native_cpu/CMakeLists.txt

Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@ add_ur_adapter(${TARGET_NAME}
3737
${CMAKE_CURRENT_SOURCE_DIR}/queue.cpp
3838
${CMAKE_CURRENT_SOURCE_DIR}/queue.hpp
3939
${CMAKE_CURRENT_SOURCE_DIR}/sampler.cpp
40+
${CMAKE_CURRENT_SOURCE_DIR}/threadpool.hpp
4041
${CMAKE_CURRENT_SOURCE_DIR}/ur_interface_loader.cpp
4142
${CMAKE_CURRENT_SOURCE_DIR}/usm_p2p.cpp
4243
${CMAKE_CURRENT_SOURCE_DIR}/virtual_mem.cpp
@@ -51,6 +52,34 @@ set_target_properties(${TARGET_NAME} PROPERTIES
5152
SOVERSION "${PROJECT_VERSION_MAJOR}"
5253
)
5354

55+
# oneTBB is used as an optional NativeCPU backend and disabled by default.
56+
option(NATIVECPU_WITH_ONETBB "Use oneTBB as backend for Native CPU" OFF)
57+
if(NATIVECPU_WITH_ONETBB)
58+
message(STATUS "Configuring Native CPU adapter with oneTBB backend.")
59+
60+
include(FetchContent)
61+
FetchContent_Declare(
62+
tbb
63+
GIT_REPOSITORY https://github.com/uxlfoundation/oneTBB.git
64+
# commit 4e4fffed4fb86ae0960a3364700f549b539c777e (HEAD -> master, origin/master, origin/HEAD)
65+
# Author: Ilya Isaev <[email protected]>
66+
# Date: Mon Aug 18 10:35:26 2025 +0200
67+
# Improve task_arena interoperability with task_groups (#1784)
68+
GIT_TAG 4e4fffed4fb86ae0960a3364700f549b539c777e
69+
CMAKE_ARGS "-DTBB_TEST:BOOL=OFF -DTBB_EXAMPLES:BOOL=OFF -DTBB_BENCH:BOOL=OFF"
70+
OVERRIDE_FIND_PACKAGE
71+
)
72+
set(TBB_TEST OFF CACHE INTERNAL "" FORCE)
73+
set(TBB_EXAMPLES OFF CACHE INTERNAL "" FORCE)
74+
set(TBB_BENCH OFF CACHE INTERNAL "" FORCE)
75+
set(TBB_BUILD ON CACHE INTERNAL "" FORCE)
76+
set(TBB_FIND_PACKAGE OFF CACHE INTERNAL "" FORCE)
77+
set(TBB_FUZZ_TESTING OFF CACHE INTERNAL "" FORCE)
78+
set(TBB_INSTALL ON CACHE INTERNAL "" FORCE)
79+
set (CMAKE_INCLUDE_CURRENT_DIR OFF)
80+
FetchContent_MakeAvailable(tbb)
81+
endif()
82+
5483
find_package(Threads REQUIRED)
5584

5685
target_link_libraries(${TARGET_NAME} PRIVATE
@@ -63,3 +92,23 @@ target_link_libraries(${TARGET_NAME} PRIVATE
6392
target_include_directories(${TARGET_NAME} PRIVATE
6493
"${CMAKE_CURRENT_SOURCE_DIR}/../../"
6594
)
95+
96+
if(NATIVECPU_WITH_ONETBB)
97+
target_link_libraries(${TARGET_NAME} PRIVATE
98+
TBB::tbb
99+
)
100+
if (NOT MSVC)
101+
# oneTBB currently casts away some const qualifiers
102+
# todo: check if compiler actually supports these options
103+
target_compile_options(tbb PRIVATE -Wno-cast-qual -Wno-stringop-overflow -Wno-unknown-warning-option)
104+
target_compile_options(tbbmalloc PRIVATE -Wno-cast-qual)
105+
endif()
106+
107+
# Undefine _DEBUG option in release builds to find
108+
# release tbbbind
109+
if (NOT uppercase_CMAKE_BUILD_TYPE STREQUAL "DEBUG")
110+
target_compile_options(tbb PRIVATE -U_DEBUG)
111+
endif()
112+
113+
target_compile_definitions(${TARGET_NAME} PRIVATE NATIVECPU_WITH_ONETBB)
114+
endif()

unified-runtime/source/adapters/native_cpu/enqueue.cpp

100644100755
Lines changed: 49 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -70,8 +70,18 @@ class WaitInfo {
7070
}
7171
};
7272

73+
template <class T>
7374
inline static WaitInfo getWaitInfo(uint32_t numEventsInWaitList,
74-
const ur_event_handle_t *phEventWaitList) {
75+
const ur_event_handle_t *phEventWaitList,
76+
const T &scheduler) {
77+
if (numEventsInWaitList && !scheduler.CanWaitInThread()) {
78+
// Waiting for dependent events in threads launched by the enqueue may
79+
// not work correctly for some backend/schedulers, so we have the safe
80+
// option here to wait in the main thread instead (potentially at the
81+
// expense of performance).
82+
urEventWait(numEventsInWaitList, phEventWaitList);
83+
numEventsInWaitList = 0;
84+
}
7585
return native_cpu::WaitInfo(numEventsInWaitList, phEventWaitList);
7686
}
7787

@@ -151,7 +161,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch(
151161

152162
auto &tp = hQueue->getDevice()->tp;
153163
const size_t numParallelThreads = tp.num_threads();
154-
std::vector<std::future<void>> futures;
164+
auto Tasks = native_cpu::getScheduler(tp);
155165
auto numWG0 = ndr.GlobalSize[0] / ndr.LocalSize[0];
156166
auto numWG1 = ndr.GlobalSize[1] / ndr.LocalSize[1];
157167
auto numWG2 = ndr.GlobalSize[2] / ndr.LocalSize[2];
@@ -162,7 +172,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch(
162172
auto kernel = std::make_unique<ur_kernel_handle_t_>(*hKernel);
163173
kernel->updateMemPool(numParallelThreads);
164174

165-
auto InEvents = native_cpu::getWaitInfo(numEventsInWaitList, phEventWaitList);
175+
auto InEvents =
176+
native_cpu::getWaitInfo(numEventsInWaitList, phEventWaitList, Tasks);
166177

167178
const size_t numWG = numWG0 * numWG1 * numWG2;
168179
const size_t numWGPerThread = numWG / numParallelThreads;
@@ -177,42 +188,41 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch(
177188
rangeEnd[0] = rangeEnd[3] % numWG0;
178189
rangeEnd[1] = (rangeEnd[3] / numWG0) % numWG1;
179190
rangeEnd[2] = rangeEnd[3] / (numWG0 * numWG1);
180-
futures.emplace_back(tp.schedule_task(
181-
[ndr, InEvents, &kernel = *kernel, rangeStart, rangeEnd = rangeEnd[3],
182-
numWG0, numWG1, numParallelThreads](size_t threadId) {
183-
auto state = getState(ndr);
184-
InEvents.wait();
185-
for (size_t g0 = rangeStart[0], g1 = rangeStart[1],
186-
g2 = rangeStart[2], g3 = rangeStart[3];
187-
g3 < rangeEnd; ++g3) {
191+
Tasks.schedule([ndr, InEvents, &kernel = *kernel, rangeStart,
192+
rangeEnd = rangeEnd[3], numWG0, numWG1,
193+
numParallelThreads](size_t threadId) {
194+
auto state = getState(ndr);
195+
InEvents.wait();
196+
for (size_t g0 = rangeStart[0], g1 = rangeStart[1], g2 = rangeStart[2],
197+
g3 = rangeStart[3];
198+
g3 < rangeEnd; ++g3) {
188199
#ifdef NATIVECPU_USE_OCK
189-
state.update(g0, g1, g2);
190-
kernel._subhandler(
191-
kernel.getArgs(numParallelThreads, threadId).data(), &state);
200+
state.update(g0, g1, g2);
201+
kernel._subhandler(kernel.getArgs(numParallelThreads, threadId).data(),
202+
&state);
192203
#else
193-
for (size_t local2 = 0; local2 < ndr.LocalSize[2]; ++local2) {
194-
for (size_t local1 = 0; local1 < ndr.LocalSize[1]; ++local1) {
195-
for (size_t local0 = 0; local0 < ndr.LocalSize[0]; ++local0) {
196-
state.update(g0, g1, g2, local0, local1, local2);
197-
kernel._subhandler(
198-
kernel.getArgs(numParallelThreads, threadId).data(),
199-
&state);
200-
}
201-
}
204+
for (size_t local2 = 0; local2 < ndr.LocalSize[2]; ++local2) {
205+
for (size_t local1 = 0; local1 < ndr.LocalSize[1]; ++local1) {
206+
for (size_t local0 = 0; local0 < ndr.LocalSize[0]; ++local0) {
207+
state.update(g0, g1, g2, local0, local1, local2);
208+
kernel._subhandler(
209+
kernel.getArgs(numParallelThreads, threadId).data(), &state);
202210
}
211+
}
212+
}
203213
#endif
204-
if (++g0 == numWG0) {
205-
g0 = 0;
206-
if (++g1 == numWG1) {
207-
g1 = 0;
208-
++g2;
209-
}
210-
}
214+
if (++g0 == numWG0) {
215+
g0 = 0;
216+
if (++g1 == numWG1) {
217+
g1 = 0;
218+
++g2;
211219
}
212-
}));
220+
}
221+
}
222+
});
213223
rangeStart = rangeEnd;
214224
}
215-
event->set_futures(futures);
225+
event->set_tasksinfo(Tasks.getMovedTaskInfo());
216226

217227
if (phEvent) {
218228
*phEvent = event;
@@ -248,14 +258,14 @@ withTimingEvent(ur_command_t command_type, ur_queue_handle_t hQueue,
248258
return result;
249259
}
250260
auto &tp = hQueue->getDevice()->tp;
251-
std::vector<std::future<void>> futures;
261+
auto Tasks = native_cpu::getScheduler(tp);
252262
auto InEvents =
253-
native_cpu::getWaitInfo(numEventsInWaitList, phEventWaitList);
254-
futures.emplace_back(tp.schedule_task([f, InEvents](size_t) {
263+
native_cpu::getWaitInfo(numEventsInWaitList, phEventWaitList, Tasks);
264+
Tasks.schedule([f, InEvents](size_t) {
255265
InEvents.wait();
256266
f();
257-
}));
258-
event->set_futures(futures);
267+
});
268+
event->set_tasksinfo(Tasks.getMovedTaskInfo());
259269
event->set_callback(
260270
[event, InEvents = InEvents.getUniquePtr()]() { event->tick_end(); });
261271
return UR_RESULT_SUCCESS;
@@ -465,7 +475,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferFill(
465475
// TODO: error checking
466476
// TODO: handle async
467477
void *startingPtr = hBuffer->_mem + offset;
468-
unsigned steps = size / patternSize;
478+
size_t steps = size / patternSize;
469479
for (unsigned i = 0; i < steps; i++) {
470480
memcpy(static_cast<int8_t *>(startingPtr) + i * patternSize, pPattern,
471481
patternSize);
@@ -575,7 +585,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMFill(
575585
break;
576586
}
577587
default: {
578-
for (unsigned int step{0}; step < size; step += patternSize) {
588+
for (size_t step{0}; step < size; step += patternSize) {
579589
auto *dest = reinterpret_cast<void *>(
580590
reinterpret_cast<uint8_t *>(ptr) + step);
581591
memcpy(dest, pPattern, patternSize);

unified-runtime/source/adapters/native_cpu/event.cpp

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
#include "ur_api.h"
1212

1313
#include "common.hpp"
14+
#include "device.hpp"
1415
#include "event.hpp"
1516
#include "queue.hpp"
1617
#include <cstdint>
@@ -111,7 +112,7 @@ urEnqueueTimestampRecordingExp(ur_queue_handle_t /*hQueue*/, bool /*blocking*/,
111112
ur_event_handle_t_::ur_event_handle_t_(ur_queue_handle_t queue,
112113
ur_command_t command_type)
113114
: queue(queue), context(queue->getContext()), command_type(command_type),
114-
done(false) {
115+
done(false), tasksinfo(queue->getDevice()->tp) {
115116
this->queue->addEvent(this);
116117
}
117118

@@ -126,9 +127,7 @@ void ur_event_handle_t_::wait() {
126127
if (done) {
127128
return;
128129
}
129-
for (auto &f : futures) {
130-
f.wait();
131-
}
130+
this->tasksinfo.wait_all();
132131
queue->removeEvent(this);
133132
done = true;
134133
// The callback may need to acquire the lock, so we unlock it here

unified-runtime/source/adapters/native_cpu/event.hpp

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
//===----------------------------------------------------------------------===//
1010
#pragma once
1111
#include "common.hpp"
12+
#include "threadpool.hpp"
1213
#include "ur_api.h"
1314
#include <cstdint>
1415
#include <future>
@@ -42,9 +43,9 @@ struct ur_event_handle_t_ : RefCounted {
4243

4344
ur_command_t getCommandType() const { return command_type; }
4445

45-
void set_futures(std::vector<std::future<void>> &fs) {
46+
void set_tasksinfo(native_cpu::tasksinfo_t &&fs) {
4647
std::lock_guard<std::mutex> lock(mutex);
47-
futures = std::move(fs);
48+
tasksinfo = std::move(fs);
4849
}
4950

5051
void tick_start();
@@ -61,7 +62,7 @@ struct ur_event_handle_t_ : RefCounted {
6162
ur_command_t command_type;
6263
bool done;
6364
std::mutex mutex;
64-
std::vector<std::future<void>> futures;
65+
native_cpu::tasksinfo_t tasksinfo;
6566
std::packaged_task<void()> callback;
6667
uint64_t timestamp_start = 0;
6768
uint64_t timestamp_end = 0;

unified-runtime/source/adapters/native_cpu/threadpool.hpp

Lines changed: 84 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -207,7 +207,90 @@ template <typename ThreadPoolT> class threadpool_interface {
207207
return ret;
208208
}
209209
};
210+
using simple_threadpool_t = threadpool_interface<detail::simple_thread_pool>;
210211

211-
using threadpool_t = threadpool_interface<detail::simple_thread_pool>;
212+
class TasksInfo_TP {
213+
using FType = std::future<void>;
214+
std::vector<FType> futures;
212215

216+
public:
217+
void schedule(FType &&f) { futures.emplace_back(std::move(f)); }
218+
void wait_all() {
219+
for (auto &f : futures)
220+
f.wait();
221+
}
222+
TasksInfo_TP(simple_threadpool_t &) {}
223+
};
224+
225+
template <class TP, class TaskInfo> struct Scheduler_base {
226+
TP &ref;
227+
TaskInfo ti;
228+
Scheduler_base(TP &ref_) : ref(ref_), ti(ref_) {}
229+
TaskInfo getMovedTaskInfo() { return std::move(ti); }
230+
static constexpr bool CanWaitInThread() { return true; }
231+
};
232+
233+
// Scheduler for the default thread-pool backend: each task is forwarded
// to the pool and the resulting completion future is recorded in the
// task info.
template <class TP> struct Scheduler : Scheduler_base<TP, TasksInfo_TP> {
  using Scheduler_base<TP, TasksInfo_TP>::Scheduler_base;

  // Enqueues `fn` on the pool and tracks the future it returns.
  template <class T> void schedule(T &&fn) {
    auto future = this->ref.schedule_task(std::forward<T>(fn));
    this->ti.schedule(std::move(future));
  }
};
240+
241+
// Convenience factory: builds a Scheduler bound to the given pool. The
// Scheduler specialization is selected by the pool's concrete type.
template <class TPType> inline Scheduler<TPType> getScheduler(TPType &tp) {
  return Scheduler<TPType>{tp};
}
244+
245+
} // namespace native_cpu
246+
247+
#ifdef NATIVECPU_WITH_ONETBB
248+
// Simple TBB backend
249+
#include "oneapi/tbb.h"
250+
namespace native_cpu {
251+
252+
class TBB_threadpool {
253+
oneapi::tbb::task_group tasks;
254+
255+
public:
256+
void wait_all() { tasks.wait(); }
257+
oneapi::tbb::task_group &Tasks() { return tasks; }
258+
size_t num_threads() const noexcept {
259+
return oneapi::tbb::info::default_concurrency();
260+
}
261+
};
262+
263+
class TBB_TasksInfo {
264+
TBB_threadpool *tp;
265+
266+
public:
267+
void wait_all() { tp->wait_all(); }
268+
TBB_TasksInfo(TBB_threadpool &t) : tp(&t) {}
269+
};
270+
271+
template <>
272+
struct Scheduler<TBB_threadpool>
273+
: Scheduler_base<TBB_threadpool, TBB_TasksInfo> {
274+
using Scheduler_base<TBB_threadpool, TBB_TasksInfo>::Scheduler_base;
275+
template <class T> void schedule(T &&task_) {
276+
ref.Tasks().run([task = std::move(task_)]() {
277+
auto thread_id = tbb::this_task_arena::current_thread_index();
278+
assert(thread_id >= 0 &&
279+
thread_id < oneapi::tbb::info::default_concurrency());
280+
task(thread_id);
281+
});
282+
}
283+
static constexpr bool CanWaitInThread() { return false; }
284+
};
285+
286+
using tasksinfo_t = TBB_TasksInfo;
287+
using threadpool_t = TBB_threadpool;
288+
} // namespace native_cpu
289+
290+
#else
291+
// The default backend
292+
namespace native_cpu {
293+
using tasksinfo_t = TasksInfo_TP;
294+
using threadpool_t = simple_threadpool_t;
213295
} // namespace native_cpu
296+
#endif

0 commit comments

Comments
 (0)