4 changes: 2 additions & 2 deletions intel_extension_for_deepspeed/op_builder/cpu_adagrad.py
@@ -16,8 +16,8 @@ def absolute_name(self):

def sources(self):
return [
- sycl_kernel_path('csrc/adagrad/cpu_adagrad.dp.cpp'),
- sycl_kernel_path('csrc/adam/custom_sycl_kernel.dp.cpp'),
+ sycl_kernel_path('csrc/adagrad/cpu_adagrad.cpp'),
+ sycl_kernel_path('csrc/adam/custom_sycl_kernel.cpp'),
]

def include_paths(self):
4 changes: 2 additions & 2 deletions intel_extension_for_deepspeed/op_builder/cpu_adam.py
@@ -16,8 +16,8 @@ def absolute_name(self):

def sources(self):
return [
- sycl_kernel_path('csrc/adam/cpu_adam.dp.cpp'),
- sycl_kernel_path('csrc/adam/custom_sycl_kernel.dp.cpp'),
+ sycl_kernel_path('csrc/adam/cpu_adam.cpp'),
+ sycl_kernel_path('csrc/adam/custom_sycl_kernel.cpp'),
]

def libraries_args(self):
@@ -43,7 +43,7 @@ void Adagrad_Optimizer::Step_1(float* _params,
size_t copy_size = TILE;
if ((t + TILE) > _param_size) copy_size = _param_size - t;
size_t offset = copy_size + t;
- if ((t / TILE) >= 2) { _streams[_buf_index]->wait(); }
+ if ((t / TILE) >= 2) { _streams[_buf_index].wait(); }
#pragma omp parallel for
for (size_t k = t; k < offset; k++) {
float grad = half_precision ? (float)grads_cast_h[k] : grads[k];
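
Note: sycl::queue follows the SYCL 2020 common reference semantics, so a queue stored or passed by value is a cheap handle to the same underlying queue; only the member access changes from '->' to '.'. A minimal, self-contained sketch of that behavior (illustrative code, not part of this PR):

#include <sycl/sycl.hpp>
#include <cassert>

int main()
{
    sycl::queue q;          // default-constructed queue on the default device
    sycl::queue copy = q;   // copies the handle, not the device-side queue
    assert(copy == q);      // operator== compares the underlying queue object
    copy.submit([&](sycl::handler& cgh) {
        cgh.single_task([=]() {});  // trivial kernel submitted through the copy
    });
    q.wait();               // waiting on either handle drains the same queue
    return 0;
}
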
@@ -75,7 +75,7 @@ void Adam_Optimizer::Step(float* _params,
size_t copy_size = TILE;
if ((t + TILE) > rounded_size) copy_size = rounded_size - t;
size_t offset = copy_size + t;
- if ((t / TILE) >= 2) { _streams[_buf_index]->wait(); }
+ if ((t / TILE) >= 2) { _streams[_buf_index].wait(); }

#pragma omp parallel for
for (size_t i = t; i < offset; i += SIMD_WIDTH) {
@@ -129,7 +129,7 @@ void Adam_Optimizer::Step(float* _params,
size_t copy_size = TILE;
if ((t + TILE) > _param_size) copy_size = _param_size - t;
size_t offset = copy_size + t;
- if ((t / TILE) >= 2) { _streams[_buf_index]->wait(); }
+ if ((t / TILE) >= 2) { _streams[_buf_index].wait(); }
#pragma omp parallel for
for (size_t k = t; k < offset; k++) {
float grad = half_precision ? (float)grads_cast_h[k] : grads[k];
@@ -213,7 +213,7 @@ void Adam_Optimizer::Step_4(float* _params,
size_t copy_size = TILE;
if ((t + TILE) > rounded_size) copy_size = rounded_size - t;
size_t offset = copy_size + t;
- if ((t / TILE) >= 2) { _streams[_buf_index]->wait(); }
+ if ((t / TILE) >= 2) { _streams[_buf_index].wait(); }
#pragma omp parallel for
for (size_t i = t; i < offset; i += (SIMD_WIDTH << 2)) {
AVX_Data grad_4[4];
@@ -420,7 +420,7 @@ void Adam_Optimizer::Step_8(float* _params,
size_t copy_size = TILE;
if ((t + TILE) > rounded_size) copy_size = rounded_size - t;
size_t offset = copy_size + t;
- if ((t / TILE) >= 2) { _streams[_buf_index]->wait(); }
+ if ((t / TILE) >= 2) { _streams[_buf_index].wait(); }
#pragma omp parallel for
for (size_t i = t; i < offset; i += (SIMD_WIDTH << 3)) {
AVX_Data grad_4[8];
@@ -17,14 +17,14 @@ void param_update_kernel(const float* input,
if (id < size) { output[id] = (sycl::half)input[id]; }
}

- void launch_param_update(const float* input, sycl::half* output, int size, sycl::queue* stream)
+ void launch_param_update(const float* input, sycl::half* output, int size, sycl::queue stream)
{
int threads = 1024;

sycl::range<3> grid_dim(1, 1, (size - 1) / threads + 1);
sycl::range<3> block_dim(1, 1, threads);

- stream->submit([&](sycl::handler& cgh) {
+ stream.submit([&](sycl::handler& cgh) {
cgh.parallel_for(
sycl::nd_range<3>(grid_dim * block_dim, block_dim),
[=](sycl::nd_item<3> item_ct1) { param_update_kernel(input, output, size, item_ct1); });
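
For context, a self-contained sketch of the by-value calling convention that launch_param_update now uses (the helper, buffer sizes, and work-group size below are illustrative, not part of this PR):

#include <sycl/sycl.hpp>

// Illustrative float -> half conversion launcher taking the queue by value.
void convert_to_half(const float* input, sycl::half* output, int size, sycl::queue stream)
{
    int threads = 256;
    sycl::range<1> global(((size + threads - 1) / threads) * threads);
    sycl::range<1> local(threads);
    stream.submit([&](sycl::handler& cgh) {
        cgh.parallel_for(sycl::nd_range<1>(global, local), [=](sycl::nd_item<1> item) {
            int id = item.get_global_id(0);
            if (id < size) output[id] = (sycl::half)input[id];
        });
    });
}

int main()
{
    sycl::queue q;
    const int size = 1000;
    float* input = sycl::malloc_shared<float>(size, q);
    sycl::half* output = sycl::malloc_shared<sycl::half>(size, q);
    for (int i = 0; i < size; i++) input[i] = 0.5f * i;
    convert_to_half(input, output, size, q);  // q is copied; same underlying queue
    q.wait();                                 // waiting on the caller's handle suffices
    sycl::free(input, q);
    sycl::free(output, q);
    return 0;
}
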
@@ -11,7 +11,7 @@
#error "Unsupported compiler"
#endif

#include "multi_tensor_apply.dp.hpp"
#include "multi_tensor_apply.hpp"
#include "type_shim.hpp"

#define BLOCK_SIZE 512
@@ -117,7 +117,7 @@ void test_queue_with_accessor(void)
c10::impl::VirtualGuardImpl impl(type_);
auto device_ = c10::Device(type_);
c10::Stream dpcpp_stream = impl.getStream(device_);
- sycl::queue* stream = &(xpu::get_queue_from_stream(dpcpp_stream));
+ sycl::queue stream = (xpu::get_queue_from_stream(dpcpp_stream));
sycl::default_selector d_selector;
static auto exception_handler = [](sycl::exception_list e_list) {
for (std::exception_ptr const& e : e_list) {
@@ -159,20 +159,20 @@ void test_queue_with_accessor(void)
dq.wait();
printf("done\n");
printf("submit xpu::stream without accessor ");
- stream->submit([&](sycl::handler& cgh) {
+ stream.submit([&](sycl::handler& cgh) {
cgh.parallel_for(sycl::nd_range<1>(320 * 512, 512), [=](sycl::nd_item<1> item_ct1) {});
});
- stream->wait();
+ stream.wait();
printf("done\n");
printf("submit xpu::stream with accessor ");
- stream->submit([&](sycl::handler& cgh) {
+ stream.submit([&](sycl::handler& cgh) {
sycl::accessor tl_block_to_tensor(block_to_tensor_buf, cgh, sycl::read_only);
sycl::accessor tl_block_to_chunk(block_to_chunk_buf, cgh, sycl::read_only);
sycl::accessor tl_addresses(addresses_buf, cgh, sycl::read_only);
sycl::accessor tl_sizes(sizes_buf, cgh, sycl::read_only);
cgh.parallel_for(sycl::nd_range<1>(320 * 512, 512), [=](sycl::nd_item<1> item_ct1) {});
});
- stream->wait();
+ stream.wait();
printf("done\n");
}

@@ -8,13 +8,13 @@
CHECK_CONTIGUOUS(x)

template <typename T>
- inline void print_nan(sycl::queue* stream, int bsz, const T* buf, char* name)
+ inline void print_nan(sycl::queue stream, int bsz, const T* buf, char* name)
{
T temp_tensor[10000];
bool has_nan = false;
- stream->wait();
- stream->memcpy(temp_tensor, buf, bsz * sizeof(T));
- stream->wait();
+ stream.wait();
+ stream.memcpy(temp_tensor, buf, bsz * sizeof(T));
+ stream.wait();
for (int i = 0; i < bsz; i++) {
if (isnan(float(temp_tensor[i]))) { has_nan = true; }
}
@@ -51,7 +51,7 @@ class SyclContext {
auto device_ = c10::Device(type_);
c10::Stream dpcpp_stream = impl.getStream(device_);
_gen = new oneapi::mkl::rng::philox4x32x10(xpu::get_queue_from_stream(dpcpp_stream), 123);
- if ((_onemklQ = &xpu::get_queue_from_stream(dpcpp_stream), 0) != 0) {
+ if ((_onemklQ = xpu::get_queue_from_stream(dpcpp_stream), 0) != 0) {
auto message = std::string("Fail to create onemkl queue.");
std::cerr << message << std::endl;
throw std::runtime_error(message);
@@ -88,30 +88,30 @@ class SyclContext {

void* GetWorkSpace() { return _workspace; }

- sycl::queue* GetCurrentStream()
+ sycl::queue GetCurrentStream()
{
// get current pytorch stream.
- // return &xpu::dpcpp::getCurrentDPCPPStream().dpcpp_queue();
+ // return xpu::dpcpp::getCurrentDPCPPStream().dpcpp_queue();

auto type_ = c10::DeviceType::XPU;
c10::impl::VirtualGuardImpl impl(type_);
auto device_ = c10::Device(type_);
c10::Stream dpcpp_stream = impl.getStream(device_);
- return &xpu::get_queue_from_stream(dpcpp_stream);
+ return xpu::get_queue_from_stream(dpcpp_stream);
}

- sycl::queue* GetNewStream()
+ sycl::queue GetNewStream()
{
auto type_ = c10::DeviceType::XPU;
c10::impl::VirtualGuardImpl impl(type_);
auto device_ = c10::Device(type_);
c10::Stream dpcpp_stream = impl.getStream(device_);
c10::Stream stream = impl.getStreamFromGlobalPool(device_, /*isHighPriority=*/false);

- return &xpu::get_queue_from_stream(dpcpp_stream);
+ return xpu::get_queue_from_stream(dpcpp_stream);
}

- sycl::queue* GetOneMKLQ() { return _onemklQ; }
+ sycl::queue GetOneMKLQ() { return _onemklQ; }

std::pair<uint64_t, uint64_t> IncrementOffset(uint64_t offset_inc)
{
@@ -144,7 +144,7 @@ class SyclContext {

private:
oneapi::mkl::rng::philox4x32x10* _gen;
- sycl::queue* _onemklQ;
+ sycl::queue _onemklQ;
void* _workspace;
uint64_t _seed;
uint64_t _curr_offset;
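
The same idea in isolation: a queue member held by value and returned as a copy from a getter, mirroring GetCurrentStream/GetOneMKLQ above (the class below is illustrative, not the PR's SyclContext):

#include <sycl/sycl.hpp>

class QueueHolder {
public:
    sycl::queue GetCurrentStream() { return _q; }  // returns a copy of the handle
private:
    sycl::queue _q;  // stored by value (default device), like _onemklQ above
};

int main()
{
    QueueHolder holder;
    sycl::queue q = holder.GetCurrentStream();
    q.submit([&](sycl::handler& cgh) { cgh.single_task([=]() {}); });
    holder.GetCurrentStream().wait();  // same underlying queue as q
    return 0;
}
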
@@ -38,14 +38,14 @@ class Adagrad_Optimizer {
{
_streams[0] = ::SyclContext::Instance().GetCurrentStream();
_streams[1] = ::SyclContext::Instance().GetNewStream();
- sycl::queue& q_ct1 = *_streams[0];
+ sycl::queue& q_ct1 = _streams[0];

*_doubled_buffer = sycl::malloc_host<float>(TILE, q_ct1);
*(_doubled_buffer + 1) = sycl::malloc_host<float>(TILE, q_ct1);
}
~Adagrad_Optimizer()
{
- sycl::queue& q_ct1 = *_streams[0];
+ sycl::queue& q_ct1 = _streams[0];
sycl::free(_doubled_buffer[0], q_ct1);
sycl::free(_doubled_buffer[1], q_ct1);
}
@@ -55,7 +55,7 @@
STEP(8)
inline void SynchronizeStreams()
{
- for (int i = 0; i < 2; i++) _streams[i]->wait();
+ for (int i = 0; i < 2; i++) _streams[i].wait();
}
inline void IncrementStep(size_t step)
{
@@ -80,5 +80,5 @@
float* _doubled_buffer[2];
bool _buf_index;

- sycl::queue* _streams[2];
+ sycl::queue _streams[2];
};
@@ -67,14 +67,14 @@ class Adam_Optimizer {
{
_streams[0] = ::SyclContext::Instance().GetCurrentStream();
_streams[1] = ::SyclContext::Instance().GetNewStream();
- sycl::queue& q_ct1 = *_streams[0];
+ sycl::queue& q_ct1 = _streams[0];

*_doubled_buffer = sycl::malloc_host<float>(TILE, q_ct1);
*(_doubled_buffer + 1) = sycl::malloc_host<float>(TILE, q_ct1);
}
~Adam_Optimizer()
{
- sycl::queue& q_ct1 = *_streams[0];
+ sycl::queue& q_ct1 = _streams[0];
sycl::free(_doubled_buffer[0], q_ct1);
sycl::free(_doubled_buffer[1], q_ct1);
}
@@ -101,7 +101,7 @@
bool half_precision = false);
inline void SynchronizeStreams()
{
- for (int i = 0; i < 2; i++) _streams[i]->wait();
+ for (int i = 0; i < 2; i++) _streams[i].wait();
}
inline void IncrementStep(size_t step, float beta1, float beta2)
{
@@ -166,5 +166,5 @@
bool _buf_index;
bool _adamw_mode;

- sycl::queue* _streams[2];
+ sycl::queue _streams[2];
};
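
Putting the header changes together: a minimal, self-contained sketch of the double-buffered copy pattern these optimizers use, with both streams held by value (the tile size, loop body, and buffer contents are illustrative, not the PR's Step kernels):

#include <sycl/sycl.hpp>

constexpr size_t TILE_SZ = 1024;  // illustrative tile size, not the PR's TILE

int main()
{
    sycl::queue q0;                                     // stand-in for GetCurrentStream()
    sycl::queue q1(q0.get_context(), q0.get_device());  // stand-in for GetNewStream(), same context
    sycl::queue streams[2] = {q0, q1};                  // stored by value, as in the headers above

    float* doubled_buffer[2];
    doubled_buffer[0] = sycl::malloc_host<float>(TILE_SZ, streams[0]);
    doubled_buffer[1] = sycl::malloc_host<float>(TILE_SZ, streams[0]);
    float* device_params = sycl::malloc_device<float>(4 * TILE_SZ, streams[0]);

    bool buf_index = false;
    for (size_t t = 0; t < 4 * TILE_SZ; t += TILE_SZ) {
        // Reuse a pinned host buffer only after its previous async copy has finished.
        if ((t / TILE_SZ) >= 2) streams[buf_index].wait();
        for (size_t k = 0; k < TILE_SZ; k++) doubled_buffer[buf_index][k] = float(t + k);
        streams[buf_index].memcpy(device_params + t, doubled_buffer[buf_index], TILE_SZ * sizeof(float));
        buf_index = !buf_index;
    }
    for (int i = 0; i < 2; i++) streams[i].wait();      // SynchronizeStreams() equivalent

    sycl::free(device_params, streams[0]);
    sycl::free(doubled_buffer[0], streams[0]);
    sycl::free(doubled_buffer[1], streams[0]);
    return 0;
}
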