diff --git a/intel_extension_for_deepspeed/op_builder/cpu_adagrad.py b/intel_extension_for_deepspeed/op_builder/cpu_adagrad.py index 796fb93..bd2ff89 100644 --- a/intel_extension_for_deepspeed/op_builder/cpu_adagrad.py +++ b/intel_extension_for_deepspeed/op_builder/cpu_adagrad.py @@ -16,8 +16,8 @@ def absolute_name(self): def sources(self): return [ - sycl_kernel_path('csrc/adagrad/cpu_adagrad.dp.cpp'), - sycl_kernel_path('csrc/adam/custom_sycl_kernel.dp.cpp'), + sycl_kernel_path('csrc/adagrad/cpu_adagrad.cpp'), + sycl_kernel_path('csrc/adam/custom_sycl_kernel.cpp'), ] def include_paths(self): diff --git a/intel_extension_for_deepspeed/op_builder/cpu_adam.py b/intel_extension_for_deepspeed/op_builder/cpu_adam.py index f0bd5ca..faaa90c 100644 --- a/intel_extension_for_deepspeed/op_builder/cpu_adam.py +++ b/intel_extension_for_deepspeed/op_builder/cpu_adam.py @@ -16,8 +16,8 @@ def absolute_name(self): def sources(self): return [ - sycl_kernel_path('csrc/adam/cpu_adam.dp.cpp'), - sycl_kernel_path('csrc/adam/custom_sycl_kernel.dp.cpp'), + sycl_kernel_path('csrc/adam/cpu_adam.cpp'), + sycl_kernel_path('csrc/adam/custom_sycl_kernel.cpp'), ] def libraries_args(self): diff --git a/intel_extension_for_deepspeed/op_builder/csrc/adagrad/cpu_adagrad.dp.cpp b/intel_extension_for_deepspeed/op_builder/csrc/adagrad/cpu_adagrad.cpp similarity index 99% rename from intel_extension_for_deepspeed/op_builder/csrc/adagrad/cpu_adagrad.dp.cpp rename to intel_extension_for_deepspeed/op_builder/csrc/adagrad/cpu_adagrad.cpp index a872c3e..e603f7e 100644 --- a/intel_extension_for_deepspeed/op_builder/csrc/adagrad/cpu_adagrad.dp.cpp +++ b/intel_extension_for_deepspeed/op_builder/csrc/adagrad/cpu_adagrad.cpp @@ -43,7 +43,7 @@ void Adagrad_Optimizer::Step_1(float* _params, size_t copy_size = TILE; if ((t + TILE) > _param_size) copy_size = _param_size - t; size_t offset = copy_size + t; - if ((t / TILE) >= 2) { _streams[_buf_index]->wait(); } + if ((t / TILE) >= 2) { _streams[_buf_index].wait(); } #pragma omp parallel for for (size_t k = t; k < offset; k++) { float grad = half_precision ? (float)grads_cast_h[k] : grads[k]; diff --git a/intel_extension_for_deepspeed/op_builder/csrc/adam/cpu_adam.dp.cpp b/intel_extension_for_deepspeed/op_builder/csrc/adam/cpu_adam.cpp similarity index 99% rename from intel_extension_for_deepspeed/op_builder/csrc/adam/cpu_adam.dp.cpp rename to intel_extension_for_deepspeed/op_builder/csrc/adam/cpu_adam.cpp index 5d8a72b..e3e466f 100644 --- a/intel_extension_for_deepspeed/op_builder/csrc/adam/cpu_adam.dp.cpp +++ b/intel_extension_for_deepspeed/op_builder/csrc/adam/cpu_adam.cpp @@ -75,7 +75,7 @@ void Adam_Optimizer::Step(float* _params, size_t copy_size = TILE; if ((t + TILE) > rounded_size) copy_size = rounded_size - t; size_t offset = copy_size + t; - if ((t / TILE) >= 2) { _streams[_buf_index]->wait(); } + if ((t / TILE) >= 2) { _streams[_buf_index].wait(); } #pragma omp parallel for for (size_t i = t; i < offset; i += SIMD_WIDTH) { @@ -129,7 +129,7 @@ void Adam_Optimizer::Step(float* _params, size_t copy_size = TILE; if ((t + TILE) > _param_size) copy_size = _param_size - t; size_t offset = copy_size + t; - if ((t / TILE) >= 2) { _streams[_buf_index]->wait(); } + if ((t / TILE) >= 2) { _streams[_buf_index].wait(); } #pragma omp parallel for for (size_t k = t; k < offset; k++) { float grad = half_precision ? 
(float)grads_cast_h[k] : grads[k]; @@ -213,7 +213,7 @@ void Adam_Optimizer::Step_4(float* _params, size_t copy_size = TILE; if ((t + TILE) > rounded_size) copy_size = rounded_size - t; size_t offset = copy_size + t; - if ((t / TILE) >= 2) { _streams[_buf_index]->wait(); } + if ((t / TILE) >= 2) { _streams[_buf_index].wait(); } #pragma omp parallel for for (size_t i = t; i < offset; i += (SIMD_WIDTH << 2)) { AVX_Data grad_4[4]; @@ -420,7 +420,7 @@ void Adam_Optimizer::Step_8(float* _params, size_t copy_size = TILE; if ((t + TILE) > rounded_size) copy_size = rounded_size - t; size_t offset = copy_size + t; - if ((t / TILE) >= 2) { _streams[_buf_index]->wait(); } + if ((t / TILE) >= 2) { _streams[_buf_index].wait(); } #pragma omp parallel for for (size_t i = t; i < offset; i += (SIMD_WIDTH << 3)) { AVX_Data grad_4[8]; diff --git a/intel_extension_for_deepspeed/op_builder/csrc/adam/custom_sycl_kernel.dp.cpp b/intel_extension_for_deepspeed/op_builder/csrc/adam/custom_sycl_kernel.cpp similarity index 92% rename from intel_extension_for_deepspeed/op_builder/csrc/adam/custom_sycl_kernel.dp.cpp rename to intel_extension_for_deepspeed/op_builder/csrc/adam/custom_sycl_kernel.cpp index 2c073ee..cb97a4c 100644 --- a/intel_extension_for_deepspeed/op_builder/csrc/adam/custom_sycl_kernel.dp.cpp +++ b/intel_extension_for_deepspeed/op_builder/csrc/adam/custom_sycl_kernel.cpp @@ -17,14 +17,14 @@ void param_update_kernel(const float* input, if (id < size) { output[id] = (sycl::half)input[id]; } } -void launch_param_update(const float* input, sycl::half* output, int size, sycl::queue* stream) +void launch_param_update(const float* input, sycl::half* output, int size, sycl::queue stream) { int threads = 1024; sycl::range<3> grid_dim(1, 1, (size - 1) / threads + 1); sycl::range<3> block_dim(1, 1, threads); - stream->submit([&](sycl::handler& cgh) { + stream.submit([&](sycl::handler& cgh) { cgh.parallel_for( sycl::nd_range<3>(grid_dim * block_dim, block_dim), [=](sycl::nd_item<3> item_ct1) { param_update_kernel(input, output, size, item_ct1); }); diff --git a/intel_extension_for_deepspeed/op_builder/csrc/adam/multi_tensor_adam.dp.cpp b/intel_extension_for_deepspeed/op_builder/csrc/adam/multi_tensor_adam.cpp similarity index 97% rename from intel_extension_for_deepspeed/op_builder/csrc/adam/multi_tensor_adam.dp.cpp rename to intel_extension_for_deepspeed/op_builder/csrc/adam/multi_tensor_adam.cpp index 2e1e674..fd7021f 100644 --- a/intel_extension_for_deepspeed/op_builder/csrc/adam/multi_tensor_adam.dp.cpp +++ b/intel_extension_for_deepspeed/op_builder/csrc/adam/multi_tensor_adam.cpp @@ -11,7 +11,7 @@ #error "Unsupported compiler" #endif -#include "multi_tensor_apply.dp.hpp" +#include "multi_tensor_apply.hpp" #include "type_shim.hpp" #define BLOCK_SIZE 512 @@ -117,7 +117,7 @@ void test_queue_with_accessor(void) c10::impl::VirtualGuardImpl impl(type_); auto device_ = c10::Device(type_); c10::Stream dpcpp_stream = impl.getStream(device_); - sycl::queue* stream = &(xpu::get_queue_from_stream(dpcpp_stream)); + sycl::queue stream = (xpu::get_queue_from_stream(dpcpp_stream)); sycl::default_selector d_selector; static auto exception_handler = [](sycl::exception_list e_list) { for (std::exception_ptr const& e : e_list) { @@ -159,20 +159,20 @@ void test_queue_with_accessor(void) dq.wait(); printf("done\n"); printf("submit xpu::stream without accessor "); - stream->submit([&](sycl::handler& cgh) { + stream.submit([&](sycl::handler& cgh) { cgh.parallel_for(sycl::nd_range<1>(320 * 512, 512), [=](sycl::nd_item<1> item_ct1) 
{}); }); - stream->wait(); + stream.wait(); printf("done\n"); printf("submit xpu::stream with accessor "); - stream->submit([&](sycl::handler& cgh) { + stream.submit([&](sycl::handler& cgh) { sycl::accessor tl_block_to_tensor(block_to_tensor_buf, cgh, sycl::read_only); sycl::accessor tl_block_to_chunk(block_to_chunk_buf, cgh, sycl::read_only); sycl::accessor tl_addresses(addresses_buf, cgh, sycl::read_only); sycl::accessor tl_sizes(sizes_buf, cgh, sycl::read_only); cgh.parallel_for(sycl::nd_range<1>(320 * 512, 512), [=](sycl::nd_item<1> item_ct1) {}); }); - stream->wait(); + stream.wait(); printf("done\n"); } diff --git a/intel_extension_for_deepspeed/op_builder/csrc/includes/common.hpp b/intel_extension_for_deepspeed/op_builder/csrc/includes/common.hpp index 953bd3c..cb88740 100644 --- a/intel_extension_for_deepspeed/op_builder/csrc/includes/common.hpp +++ b/intel_extension_for_deepspeed/op_builder/csrc/includes/common.hpp @@ -8,13 +8,13 @@ CHECK_CONTIGUOUS(x) template -inline void print_nan(sycl::queue* stream, int bsz, const T* buf, char* name) +inline void print_nan(sycl::queue stream, int bsz, const T* buf, char* name) { T temp_tensor[10000]; bool has_nan = false; - stream->wait(); - stream->memcpy(temp_tensor, buf, bsz * sizeof(T)); - stream->wait(); + stream.wait(); + stream.memcpy(temp_tensor, buf, bsz * sizeof(T)); + stream.wait(); for (int i = 0; i < bsz; i++) { if (isnan(float(temp_tensor[i]))) { has_nan = true; } } diff --git a/intel_extension_for_deepspeed/op_builder/csrc/includes/context.hpp b/intel_extension_for_deepspeed/op_builder/csrc/includes/context.hpp index 4e90bd5..4851fd8 100644 --- a/intel_extension_for_deepspeed/op_builder/csrc/includes/context.hpp +++ b/intel_extension_for_deepspeed/op_builder/csrc/includes/context.hpp @@ -51,7 +51,7 @@ class SyclContext { auto device_ = c10::Device(type_); c10::Stream dpcpp_stream = impl.getStream(device_); _gen = new oneapi::mkl::rng::philox4x32x10(xpu::get_queue_from_stream(dpcpp_stream), 123); - if ((_onemklQ = &xpu::get_queue_from_stream(dpcpp_stream), 0) != 0) { + if ((_onemklQ = xpu::get_queue_from_stream(dpcpp_stream), 0) != 0) { auto message = std::string("Fail to create onemkl queue."); std::cerr << message << std::endl; throw std::runtime_error(message); @@ -88,19 +88,19 @@ class SyclContext { void* GetWorkSpace() { return _workspace; } - sycl::queue* GetCurrentStream() + sycl::queue GetCurrentStream() { // get current pytorch stream. 
- // return &xpu::dpcpp::getCurrentDPCPPStream().dpcpp_queue(); + // return xpu::dpcpp::getCurrentDPCPPStream().dpcpp_queue(); auto type_ = c10::DeviceType::XPU; c10::impl::VirtualGuardImpl impl(type_); auto device_ = c10::Device(type_); c10::Stream dpcpp_stream = impl.getStream(device_); - return &xpu::get_queue_from_stream(dpcpp_stream); + return xpu::get_queue_from_stream(dpcpp_stream); } - sycl::queue* GetNewStream() + sycl::queue GetNewStream() { auto type_ = c10::DeviceType::XPU; c10::impl::VirtualGuardImpl impl(type_); @@ -108,10 +108,10 @@ class SyclContext { c10::Stream dpcpp_stream = impl.getStream(device_); c10::Stream stream = impl.getStreamFromGlobalPool(device_, /*isHighPriority=*/false); - return &xpu::get_queue_from_stream(dpcpp_stream); + return xpu::get_queue_from_stream(dpcpp_stream); } - sycl::queue* GetOneMKLQ() { return _onemklQ; } + sycl::queue GetOneMKLQ() { return _onemklQ; } std::pair IncrementOffset(uint64_t offset_inc) { @@ -144,7 +144,7 @@ class SyclContext { private: oneapi::mkl::rng::philox4x32x10* _gen; - sycl::queue* _onemklQ; + sycl::queue _onemklQ; void* _workspace; uint64_t _seed; uint64_t _curr_offset; diff --git a/intel_extension_for_deepspeed/op_builder/csrc/includes/cpu_adagrad.hpp b/intel_extension_for_deepspeed/op_builder/csrc/includes/cpu_adagrad.hpp index 09dcf21..9354b57 100644 --- a/intel_extension_for_deepspeed/op_builder/csrc/includes/cpu_adagrad.hpp +++ b/intel_extension_for_deepspeed/op_builder/csrc/includes/cpu_adagrad.hpp @@ -38,14 +38,14 @@ class Adagrad_Optimizer { { _streams[0] = ::SyclContext::Instance().GetCurrentStream(); _streams[1] = ::SyclContext::Instance().GetNewStream(); - sycl::queue& q_ct1 = *_streams[0]; + sycl::queue& q_ct1 = _streams[0]; *_doubled_buffer = sycl::malloc_host(TILE, q_ct1); *(_doubled_buffer + 1) = sycl::malloc_host(TILE, q_ct1); } ~Adagrad_Optimizer() { - sycl::queue& q_ct1 = *_streams[0]; + sycl::queue& q_ct1 = _streams[0]; sycl::free(_doubled_buffer[0], q_ct1); sycl::free(_doubled_buffer[1], q_ct1); } @@ -55,7 +55,7 @@ class Adagrad_Optimizer { STEP(8) inline void SynchronizeStreams() { - for (int i = 0; i < 2; i++) _streams[i]->wait(); + for (int i = 0; i < 2; i++) _streams[i].wait(); } inline void IncrementStep(size_t step) { @@ -80,5 +80,5 @@ class Adagrad_Optimizer { float* _doubled_buffer[2]; bool _buf_index; - sycl::queue* _streams[2]; + sycl::queue _streams[2]; }; diff --git a/intel_extension_for_deepspeed/op_builder/csrc/includes/cpu_adam.hpp b/intel_extension_for_deepspeed/op_builder/csrc/includes/cpu_adam.hpp index 715c161..e90c470 100644 --- a/intel_extension_for_deepspeed/op_builder/csrc/includes/cpu_adam.hpp +++ b/intel_extension_for_deepspeed/op_builder/csrc/includes/cpu_adam.hpp @@ -67,14 +67,14 @@ class Adam_Optimizer { { _streams[0] = ::SyclContext::Instance().GetCurrentStream(); _streams[1] = ::SyclContext::Instance().GetNewStream(); - sycl::queue& q_ct1 = *_streams[0]; + sycl::queue& q_ct1 = _streams[0]; *_doubled_buffer = sycl::malloc_host(TILE, q_ct1); *(_doubled_buffer + 1) = sycl::malloc_host(TILE, q_ct1); } ~Adam_Optimizer() { - sycl::queue& q_ct1 = *_streams[0]; + sycl::queue& q_ct1 = _streams[0]; sycl::free(_doubled_buffer[0], q_ct1); sycl::free(_doubled_buffer[1], q_ct1); } @@ -101,7 +101,7 @@ class Adam_Optimizer { bool half_precision = false); inline void SynchronizeStreams() { - for (int i = 0; i < 2; i++) _streams[i]->wait(); + for (int i = 0; i < 2; i++) _streams[i].wait(); } inline void IncrementStep(size_t step, float beta1, float beta2) { @@ -166,5 +166,5 @@ class 
Adam_Optimizer { bool _buf_index; bool _adamw_mode; - sycl::queue* _streams[2]; + sycl::queue _streams[2]; }; diff --git a/intel_extension_for_deepspeed/op_builder/csrc/includes/custom_sycl_layers.hpp b/intel_extension_for_deepspeed/op_builder/csrc/includes/custom_sycl_layers.hpp index c5a1a89..1313100 100644 --- a/intel_extension_for_deepspeed/op_builder/csrc/includes/custom_sycl_layers.hpp +++ b/intel_extension_for_deepspeed/op_builder/csrc/includes/custom_sycl_layers.hpp @@ -37,25 +37,25 @@ void launch_qunatize_kernel(T* vals, int total_count, int group_num, int num_bits, - sycl::queue* stream); + sycl::queue stream); template void launch_sr_qunatize_kernel(T* vals, int total_count, int group_num, int num_bits, - sycl::queue* stream); + sycl::queue stream); template void launch_qunatize_kernel_asym(T* vals, int total_count, int group_num, int num_bits, - sycl::queue* stream); + sycl::queue stream); template void launch_sr_qunatize_kernel_asym(T* vals, int total_count, int group_num, int num_bits, - sycl::queue* stream); + sycl::queue stream); // Fused bias add with gelu activation template void launch_bias_gelu(const T* input, @@ -63,14 +63,14 @@ void launch_bias_gelu(const T* input, T* output, int intermediate_size, int batch_size, - sycl::queue* stream); + sycl::queue stream); template void launch_gelu(const T* input, T* output, int intermediate_size, int batch_size, - sycl::queue* stream); + sycl::queue stream); template void launch_d_gelu(T* d_output, @@ -78,7 +78,7 @@ void launch_d_gelu(T* d_output, const T* bias, int intermediate_size, int batch_size, - sycl::queue* stream); + sycl::queue stream); // Custom fused bias add with layer normalization template @@ -89,7 +89,7 @@ void launch_bias_residual_layer_norm(T* vals, float epsilon, int batch_size, int hidden_dim, - sycl::queue* stream, + sycl::queue stream, bool preLayerNorm, bool training, T* vars, @@ -103,7 +103,7 @@ void launch_bias_residual_layer_norm(T* vals, float epsilon, int batch_size, int hidden_dim, - sycl::queue* stream, + sycl::queue stream, bool preLayerNorm, bool training, T* vars); @@ -120,7 +120,7 @@ void launch_layerNorm_backward_fused_add(const T* out_grad1, T* inp_grad, int batch_size, int hidden_dim, - sycl::queue* stream[2]); + sycl::queue stream[2]); template void launch_layerNorm_backward_fused_add(const T* out_grad1, const T* out_grad2, @@ -132,7 +132,7 @@ void launch_layerNorm_backward_fused_add(const T* out_grad1, T* inp_grad, int batch_size, int hidden_dim, - sycl::queue* stream[2], + sycl::queue stream[2], bool invertible = false, const T* betta = nullptr); @@ -147,7 +147,7 @@ void launch_layerNorm_backward(const T* out_grad, T* inp_grad, int batch_size, int hidden_dim, - sycl::queue* stream[2]); + sycl::queue stream[2]); template void launch_layerNorm_backward(const T* out_grad, @@ -159,7 +159,7 @@ void launch_layerNorm_backward(const T* out_grad, T* inp_grad, int batch_size, int hidden_dim, - sycl::queue* stream[2], + sycl::queue stream[2], bool invertible = false, const T* betta = nullptr); @@ -176,10 +176,10 @@ void launch_layerNorm_backward_nreversible(const T* out_grad, T* inp_grad, int batch_size, int hidden_dim, - sycl::queue* stream[2]); + sycl::queue stream[2]); template -void Transpose(const T* inp_mat, T* out_mat, int rows, int cols, sycl::queue* stream); +void Transpose(const T* inp_mat, T* out_mat, int rows, int cols, sycl::queue stream); template void launch_attn_softmax_backward(T* out_grad, @@ -187,7 +187,7 @@ void launch_attn_softmax_backward(T* out_grad, int batch_size, int heads, 
int seq_length, - sycl::queue* stream); + sycl::queue stream); template void launch_attn_softmax_backward_v2(T* out_grad, @@ -195,7 +195,7 @@ void launch_attn_softmax_backward_v2(T* out_grad, int batch_size, int heads, int seq_length, - sycl::queue* stream); + sycl::queue stream); // Custom softmax with scaling and attention mask addition template @@ -204,7 +204,7 @@ void launch_attn_softmax(T* vals, int batch_size, int heads, int sequence_length, - sycl::queue* stream); + sycl::queue stream); template void launch_transform_0213(T* output, @@ -213,7 +213,7 @@ void launch_transform_0213(T* output, int seq_length, int hidden_dim, int heads, - sycl::queue* stream); + sycl::queue stream); // Custom bias add template @@ -224,7 +224,7 @@ void launch_bias_add_transform_0213(T* outputs, int seq_length, int hidden_dim, int heads, - sycl::queue* stream, + sycl::queue stream, int trans_count); // 4D transform [0, 1, 2, 3] -> [0, 2, 1, 3] @@ -235,7 +235,7 @@ void launch_transform4d_0213(T* out, int heads, int seq_length, int hidden_dim, - sycl::queue* stream, + sycl::queue stream, int trans_count); template @@ -245,7 +245,7 @@ void launch_dropout(T* vals, int batch, int dim, float ratio, - sycl::queue* stream); + sycl::queue stream); template void launch_dropout(T* vals_out, @@ -254,7 +254,7 @@ void launch_dropout(T* vals_out, int total_count, int dim, float ratio, - sycl::queue* stream, + sycl::queue stream, bool bwd = false); template @@ -266,10 +266,10 @@ void launch_dropout(T* out, int batch, int dim, float ratio, - sycl::queue* stream); + sycl::queue stream); template -void launch_dropout_grad(T* vals, uint8_t* mask, int total_count, float ratio, sycl::queue* stream); +void launch_dropout_grad(T* vals, uint8_t* mask, int total_count, float ratio, sycl::queue stream); template void launch_dropout_grad(T* vals_out, @@ -277,13 +277,13 @@ void launch_dropout_grad(T* vals_out, uint8_t* mask, int total_count, float ratio, - sycl::queue* stream); + sycl::queue stream); template void launch_fuse_transpose_bias_kernel(const T* inp, T* out, int rows, int cols, - sycl::queue* stream); + sycl::queue stream); -void launch_param_update(const float* input, sycl::half* output, int size, sycl::queue* stream); +void launch_param_update(const float* input, sycl::half* output, int size, sycl::queue stream); diff --git a/intel_extension_for_deepspeed/op_builder/csrc/includes/dropout.hpp b/intel_extension_for_deepspeed/op_builder/csrc/includes/dropout.hpp index 69cb6c0..c1cd0bc 100644 --- a/intel_extension_for_deepspeed/op_builder/csrc/includes/dropout.hpp +++ b/intel_extension_for_deepspeed/op_builder/csrc/includes/dropout.hpp @@ -28,13 +28,13 @@ class Dropout { virtual ~Dropout() {} - void Forward(int bsz, T* out, const T* vals, sycl::queue* stream, bool bwd = false) + void Forward(int bsz, T* out, const T* vals, sycl::queue stream, bool bwd = false) { launch_dropout( out, vals, _mask, bsz * _config.dim, _config.dim, _config.RATIO(), stream, bwd); } - void ForwardWithBias(int bsz, T* vals, const T* bias, sycl::queue* stream) + void ForwardWithBias(int bsz, T* vals, const T* bias, sycl::queue stream) { launch_dropout(vals, bias, _mask, bsz, _config.dim, _config.RATIO(), stream); } @@ -44,18 +44,18 @@ class Dropout { const T* vals, const T* residual, const T* bias, - sycl::queue* stream) + sycl::queue stream) { launch_dropout( out, vals, residual, bias, _mask, bsz, _config.dim, _config.RATIO(), stream); } - void Backward(int bsz, T* d_vals, sycl::queue* stream) + void Backward(int bsz, T* d_vals, sycl::queue stream) { 
launch_dropout_grad(d_vals, _mask, bsz * _config.dim, _config.RATIO(), stream); } - void Backward(int bsz, T* d_vals_out, const T* d_vals, sycl::queue* stream) + void Backward(int bsz, T* d_vals_out, const T* d_vals, sycl::queue stream) { launch_dropout_grad( d_vals_out, d_vals, _mask, bsz * _config.dim, _config.RATIO(), stream); diff --git a/intel_extension_for_deepspeed/op_builder/csrc/includes/ds_transformer_sycl.hpp b/intel_extension_for_deepspeed/op_builder/csrc/includes/ds_transformer_sycl.hpp index cfee1fc..f996cd8 100644 --- a/intel_extension_for_deepspeed/op_builder/csrc/includes/ds_transformer_sycl.hpp +++ b/intel_extension_for_deepspeed/op_builder/csrc/includes/ds_transformer_sycl.hpp @@ -160,8 +160,8 @@ class BertTransformerLayer { bool _pre_or_postLayerNorm; - sycl::queue* _onemklQ; - sycl::queue* _stream; + sycl::queue _onemklQ; + sycl::queue _stream; // layers FeedForward _qkv_linear; diff --git a/intel_extension_for_deepspeed/op_builder/csrc/includes/feed_forward.hpp b/intel_extension_for_deepspeed/op_builder/csrc/includes/feed_forward.hpp index 6a24aae..7d994f7 100644 --- a/intel_extension_for_deepspeed/op_builder/csrc/includes/feed_forward.hpp +++ b/intel_extension_for_deepspeed/op_builder/csrc/includes/feed_forward.hpp @@ -26,7 +26,7 @@ class FeedForward { ~FeedForward() {} - void Forward(int bsz, const T* input_ptr, const T* weights, T* out, sycl::queue* _Q) + void Forward(int bsz, const T* input_ptr, const T* weights, T* out, sycl::queue _Q) { if constexpr (std::is_same_v) { float alpha = 1.0f; @@ -64,8 +64,8 @@ class FeedForward { const T* weights, T* weights_grad, T* bias_grad, - sycl::queue* _Q, - sycl::queue* stream, + sycl::queue _Q, + sycl::queue stream, T* inp_grad_out = nullptr, T* out_grad_trans_out = nullptr) { diff --git a/intel_extension_for_deepspeed/op_builder/csrc/includes/gelu.hpp b/intel_extension_for_deepspeed/op_builder/csrc/includes/gelu.hpp index f773eee..3e43f50 100644 --- a/intel_extension_for_deepspeed/op_builder/csrc/includes/gelu.hpp +++ b/intel_extension_for_deepspeed/op_builder/csrc/includes/gelu.hpp @@ -26,12 +26,12 @@ class Gelu { const T* input_buf, const T* bias, T* output, - sycl::queue* stream) + sycl::queue stream) { launch_bias_gelu(input_buf, bias, output, _config.intermediate_size, bsz, stream); } - void Backward(int bsz, T* d_output, const T* input_buf, const T* bias, sycl::queue* stream) + void Backward(int bsz, T* d_output, const T* input_buf, const T* bias, sycl::queue stream) { launch_d_gelu(d_output, input_buf, bias, _config.intermediate_size, bsz, stream); } diff --git a/intel_extension_for_deepspeed/op_builder/csrc/includes/gemm_test.hpp b/intel_extension_for_deepspeed/op_builder/csrc/includes/gemm_test.hpp index 8a7f44d..493865a 100644 --- a/intel_extension_for_deepspeed/op_builder/csrc/includes/gemm_test.hpp +++ b/intel_extension_for_deepspeed/op_builder/csrc/includes/gemm_test.hpp @@ -24,7 +24,7 @@ class GemmTest { int k, oneapi::mkl::transpose ta, oneapi::mkl::transpose tb, - sycl::queue* h) + sycl::queue h) : M(m), N(n), K(k), transa(ta), transb(tb), handle(h) { dpct::device_ext& dev_ct1 = dpct::get_current_device(); @@ -134,7 +134,7 @@ class GemmTest { private: int M, N, K; - sycl::queue* handle; + sycl::queue handle; oneapi::mkl::transpose transa, transb; T *A, *B, *C; }; @@ -148,7 +148,7 @@ class StridedGemmTest { int k, oneapi::mkl::transpose ta, oneapi::mkl::transpose tb, - sycl::queue* h) + sycl::queue h) : bsz(b), M(m), N(n), K(k), transa(ta), transb(tb), handle(h) { dpct::device_ext& dev_ct1 = 
dpct::get_current_device(); @@ -297,7 +297,7 @@ class StridedGemmTest { private: int bsz, M, N, K; - sycl::queue* handle; + sycl::queue handle; oneapi::mkl::transpose transa, transb; T *A, *B, *C; }; diff --git a/intel_extension_for_deepspeed/op_builder/csrc/includes/general_kernels.hpp b/intel_extension_for_deepspeed/op_builder/csrc/includes/general_kernels.hpp index ea2135f..90095dc 100644 --- a/intel_extension_for_deepspeed/op_builder/csrc/includes/general_kernels.hpp +++ b/intel_extension_for_deepspeed/op_builder/csrc/includes/general_kernels.hpp @@ -19,7 +19,7 @@ void launch_fused_add2(T* out, int batch_size, int seq_length, int hidden_size, - sycl::queue* stream); + sycl::queue stream); template void launch_fused_add4(T* out, @@ -30,7 +30,7 @@ void launch_fused_add4(T* out, int batch_size, int seq_length, int hidden_size, - sycl::queue* stream); + sycl::queue stream); template void launch_fused_add3(T* out, @@ -40,4 +40,4 @@ void launch_fused_add3(T* out, int batch_size, int seq_length, int hidden_size, - sycl::queue* stream); + sycl::queue stream); diff --git a/intel_extension_for_deepspeed/op_builder/csrc/includes/multi_tensor_apply.dp.hpp b/intel_extension_for_deepspeed/op_builder/csrc/includes/multi_tensor_apply.hpp similarity index 98% rename from intel_extension_for_deepspeed/op_builder/csrc/includes/multi_tensor_apply.dp.hpp rename to intel_extension_for_deepspeed/op_builder/csrc/includes/multi_tensor_apply.hpp index 0c22494..e098989 100644 --- a/intel_extension_for_deepspeed/op_builder/csrc/includes/multi_tensor_apply.dp.hpp +++ b/intel_extension_for_deepspeed/op_builder/csrc/includes/multi_tensor_apply.hpp @@ -95,7 +95,7 @@ void multi_tensor_apply(int block_size, TensorListMetadata tl; - sycl::queue* stream = SyclContext::Instance().GetCurrentStream(); + sycl::queue stream = SyclContext::Instance().GetCurrentStream(); tl.start_tensor_this_launch = 0; int loc_block_info = 0; @@ -125,7 +125,7 @@ void multi_tensor_apply(int block_size, sycl::buffer addresses_buf(&(tl.addresses[0][0]), {4, 36}); sycl::buffer sizes_buf(&(tl.sizes[0]), {36}); sycl::buffer data_buf(data_ptr, noop_flag.numel()); - stream->submit([&](sycl::handler& cgh) { + stream.submit([&](sycl::handler& cgh) { sycl::accessor tl_block_to_tensor(block_to_tensor_buf, cgh, sycl::read_only); sycl::accessor tl_block_to_chunk(block_to_chunk_buf, cgh, sycl::read_only); sycl::accessor tl_addresses(addresses_buf, cgh, sycl::read_only); diff --git a/intel_extension_for_deepspeed/op_builder/csrc/includes/normalize_layer.hpp b/intel_extension_for_deepspeed/op_builder/csrc/includes/normalize_layer.hpp index 58f6284..c9db752 100644 --- a/intel_extension_for_deepspeed/op_builder/csrc/includes/normalize_layer.hpp +++ b/intel_extension_for_deepspeed/op_builder/csrc/includes/normalize_layer.hpp @@ -47,7 +47,7 @@ class Normalize_Layer { const T* residual, const T* gamma, const T* betta, - sycl::queue* stream, + sycl::queue stream, bool preLayerNorm = false) { launch_bias_residual_layer_norm(vals, @@ -69,7 +69,7 @@ class Normalize_Layer { const T* residual, const T* gamma, const T* betta, - sycl::queue* stream, + sycl::queue stream, bool preLayerNorm = false) { launch_bias_residual_layer_norm(vals, @@ -90,7 +90,7 @@ class Normalize_Layer { const T* gamma, T* gamma_grad, T* betta_grad, - sycl::queue* stream[2], + sycl::queue stream[2], T* inp_grad_out, const T* norm_in = nullptr) { @@ -113,7 +113,7 @@ class Normalize_Layer { const T* betta, T* gamma_grad, T* betta_grad, - sycl::queue* stream[2], + sycl::queue stream[2], T* 
inp_grad_out, const T* norm_out) { @@ -137,7 +137,7 @@ class Normalize_Layer { const T* gamma, T* gamma_grad, T* betta_grad, - sycl::queue* stream[2], + sycl::queue stream[2], T* inp_grad_out, const T* norm_in = nullptr) { @@ -162,7 +162,7 @@ class Normalize_Layer { const T* betta, T* gamma_grad, T* betta_grad, - sycl::queue* stream[2], + sycl::queue stream[2], T* inp_grad_out, const T* norm_out) { diff --git a/intel_extension_for_deepspeed/op_builder/csrc/includes/onemkl_wrappers.hpp b/intel_extension_for_deepspeed/op_builder/csrc/includes/onemkl_wrappers.hpp index e69abee..3d52378 100644 --- a/intel_extension_for_deepspeed/op_builder/csrc/includes/onemkl_wrappers.hpp +++ b/intel_extension_for_deepspeed/op_builder/csrc/includes/onemkl_wrappers.hpp @@ -12,7 +12,7 @@ #include -int onemkl_gemm_ex(sycl::queue* handle, +int onemkl_gemm_ex(sycl::queue handle, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, int m, @@ -24,7 +24,7 @@ int onemkl_gemm_ex(sycl::queue* handle, const float* B, float* C); -int onemkl_gemm_ex(sycl::queue* handle, +int onemkl_gemm_ex(sycl::queue handle, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, int m, @@ -36,7 +36,7 @@ int onemkl_gemm_ex(sycl::queue* handle, const sycl::half* B, sycl::half* C); -int onemkl_strided_batched_gemm(sycl::queue* handle, +int onemkl_strided_batched_gemm(sycl::queue handle, int m, int n, int k, @@ -53,7 +53,7 @@ int onemkl_strided_batched_gemm(sycl::queue* handle, int batch, int algo = -1); -int onemkl_strided_batched_gemm(sycl::queue* handle, +int onemkl_strided_batched_gemm(sycl::queue handle, int m, int n, int k, diff --git a/intel_extension_for_deepspeed/op_builder/csrc/includes/softmax.hpp b/intel_extension_for_deepspeed/op_builder/csrc/includes/softmax.hpp index 9ca9810..ea336a1 100644 --- a/intel_extension_for_deepspeed/op_builder/csrc/includes/softmax.hpp +++ b/intel_extension_for_deepspeed/op_builder/csrc/includes/softmax.hpp @@ -37,12 +37,12 @@ class Softmax { ~Softmax() {} - void Forward(int bsz, T* vals, const T* attn_mask, sycl::queue* stream) + void Forward(int bsz, T* vals, const T* attn_mask, sycl::queue stream) { launch_attn_softmax(vals, attn_mask, bsz, config_.heads, config_.seq_length, stream); } - void Backward(int bsz, T* out_grad, const T* soft_out, sycl::queue* stream) + void Backward(int bsz, T* out_grad, const T* soft_out, sycl::queue stream) { launch_attn_softmax_backward_v2( out_grad, soft_out, bsz, config_.heads, config_.seq_length, stream); diff --git a/intel_extension_for_deepspeed/op_builder/csrc/includes/strided_batch_gemm.hpp b/intel_extension_for_deepspeed/op_builder/csrc/includes/strided_batch_gemm.hpp index 2e0ce64..1c2da43 100644 --- a/intel_extension_for_deepspeed/op_builder/csrc/includes/strided_batch_gemm.hpp +++ b/intel_extension_for_deepspeed/op_builder/csrc/includes/strided_batch_gemm.hpp @@ -62,7 +62,7 @@ class StridedBatchGemm { virtual ~StridedBatchGemm() {} - void Forward(int bsz, T* output, const T* _buffer_a, const T* _buffer_b, sycl::queue* handle) + void Forward(int bsz, T* output, const T* _buffer_a, const T* _buffer_b, sycl::queue handle) { int stride_a = _config.m * _config.k; int stride_b = _config.n * _config.k; @@ -101,7 +101,7 @@ class StridedBatchGemm { } } - void ForwardPlusSave(T* output, const T* _buffer_a, const T* _buffer_b, sycl::queue* handle) + void ForwardPlusSave(T* output, const T* _buffer_a, const T* _buffer_b, sycl::queue handle) { int stride_a = _config.m * _config.k; int stride_b = _config.n * _config.k; @@ -136,7 +136,7 @@ class 
StridedBatchGemm { const T* d_output, const T* _buffer_a, const T* _buffer_b, - sycl::queue* handle, + sycl::queue handle, T* inpGradA = nullptr, T* inpGradB = nullptr) { diff --git a/intel_extension_for_deepspeed/op_builder/csrc/transformer/dropout_kernels.dp.cpp b/intel_extension_for_deepspeed/op_builder/csrc/transformer/dropout_kernels.cpp similarity index 97% rename from intel_extension_for_deepspeed/op_builder/csrc/transformer/dropout_kernels.dp.cpp rename to intel_extension_for_deepspeed/op_builder/csrc/transformer/dropout_kernels.cpp index 25e5e0f..0db6000 100644 --- a/intel_extension_for_deepspeed/op_builder/csrc/transformer/dropout_kernels.dp.cpp +++ b/intel_extension_for_deepspeed/op_builder/csrc/transformer/dropout_kernels.cpp @@ -373,7 +373,7 @@ void launch_dropout(T* out, int total_count, int dim, float ratio, - queue* stream, + queue stream, bool bwd) { /* @@ -391,14 +391,14 @@ void launch_dropout(T* out, uint64_t inc = total_count / grid_dim[2] / block_dim[2]; std::pair seed = SyclContext::Instance().IncrementOffset(inc); if (bwd) - stream->submit([&](handler& cgh) { + stream.submit([&](handler& cgh) { cgh.parallel_for( nd_range<3>(grid_dim * block_dim, block_dim), [=](nd_item<3> item_ct1) { dropout_kernel_bwd(total_count, ratio, vals, out, mask, seed, item_ct1); }); }); else - stream->submit([&](handler& cgh) { + stream.submit([&](handler& cgh) { cgh.parallel_for( nd_range<3>(grid_dim * block_dim, block_dim), [=](nd_item<3> item_ct1) { dropout_kernel(total_count, ratio, out, vals, mask, seed, item_ct1); @@ -412,7 +412,7 @@ template void launch_dropout(float* out, int total_count, int dim, float ratio, - queue* stream, + queue stream, bool); template void launch_dropout(bf16* out, const bf16* vals, @@ -420,7 +420,7 @@ template void launch_dropout(bf16* out, int total_count, int dim, float ratio, - queue* stream, + queue stream, bool); template void launch_dropout(half* out, const half* vals, @@ -428,7 +428,7 @@ template void launch_dropout(half* out, int total_count, int dim, float ratio, - queue* stream, + queue stream, bool); void dropout_grad_kernel(const int N, @@ -516,7 +516,7 @@ void dropout_grad_kernel(const int N, } template -void launch_dropout_grad(T* vals, uint8_t* mask, int total_count, float ratio, queue* stream) +void launch_dropout_grad(T* vals, uint8_t* mask, int total_count, float ratio, queue stream) { /* * Dropout.Backward0 @@ -526,7 +526,7 @@ void launch_dropout_grad(T* vals, uint8_t* mask, int total_count, float ratio, q const float scale = 1. / (1. 
- ratio); range<3> grid_dim = range<3>(1, 1, DS_GET_BLOCKS(total_count / unroll_factor)); range<3> block_dim = range<3>(1, 1, DS_CUDA_NUM_THREADS); - stream->submit([&](handler& cgh) { + stream.submit([&](handler& cgh) { cgh.parallel_for(nd_range<3>(grid_dim * block_dim, block_dim), [=](nd_item<3> item_ct1) { dropout_grad_kernel(total_count, scale, vals, mask, item_ct1); }); @@ -537,17 +537,17 @@ template void launch_dropout_grad(float* vals, uint8_t* mask, int total_count, float ratio, - queue* stream); + queue stream); template void launch_dropout_grad(bf16* vals, uint8_t* mask, int total_count, float ratio, - queue* stream); + queue stream); template void launch_dropout_grad(half* vals, uint8_t* mask, int total_count, float ratio, - queue* stream); + queue stream); void dropout_grad_kernel(const int N, const float scale, @@ -624,7 +624,7 @@ void launch_dropout_grad(T* vals_out, uint8_t* mask, int total_count, float ratio, - queue* stream) + queue stream) { /* * Dropout.Backward1 @@ -634,7 +634,7 @@ void launch_dropout_grad(T* vals_out, const float scale = 1. / (1. - ratio); range<3> grid_dim = range<3>(1, 1, DS_GET_BLOCKS(total_count / unroll_factor)); range<3> block_dim = range<3>(1, 1, DS_CUDA_NUM_THREADS); - stream->submit([&](handler& cgh) { + stream.submit([&](handler& cgh) { cgh.parallel_for(nd_range<3>(grid_dim * block_dim, block_dim), [=](nd_item<3> item_ct1) { dropout_grad_kernel(total_count, scale, vals, vals_out, mask, item_ct1); }); @@ -645,19 +645,19 @@ template void launch_dropout_grad(float* vals_out, uint8_t* mask, int total_count, float ratio, - queue* stream); + queue stream); template void launch_dropout_grad(bf16* vals_out, const bf16* vals, uint8_t* mask, int total_count, float ratio, - queue* stream); + queue stream); template void launch_dropout_grad(half* vals_out, const half* vals, uint8_t* mask, int total_count, float ratio, - queue* stream); + queue stream); /* * not called in transformer kernel Shi Yuankun 2021/10/21 @@ -821,7 +821,7 @@ void launch_dropout(T* out, int batch, int dim, float ratio, - queue* stream) + queue stream) { assert(unroll_factor == 4); @@ -832,7 +832,7 @@ void launch_dropout(T* out, uint64_t inc = (batch * dim) / grid_dim[2] / block_dim[2]; std::pair seed = SyclContext::Instance().IncrementOffset(inc); - stream->submit([&](handler& cgh) { + stream.submit([&](handler& cgh) { cgh.parallel_for(nd_range<3>(grid_dim * block_dim, block_dim), [=](nd_item<3> item_ct1) { dropout_kernel(total_count, dim, ratio, bias, out, mask, seed, item_ct1); }); @@ -845,14 +845,14 @@ template void launch_dropout(float*, int batch, int dim, float ratio, - queue* stream); + queue stream); template void launch_dropout(half*, const half* bias, uint8_t* mask, int batch, int dim, float ratio, - queue* stream); + queue stream); void dropout_kernel(const int N, const int dim, @@ -1153,7 +1153,7 @@ void launch_dropout(T* out, int batch, int dim, float ratio, - queue* stream) + queue stream) { assert(unroll_factor == 4); @@ -1164,7 +1164,7 @@ void launch_dropout(T* out, uint64_t inc = (batch * dim) / grid_dim[2] / block_dim[2]; std::pair seed = SyclContext::Instance().IncrementOffset(inc); - stream->submit([&](handler& cgh) { + stream.submit([&](handler& cgh) { cgh.parallel_for(nd_range<3>(grid_dim * block_dim, block_dim), [=](nd_item<3> item_ct1) { dropout_kernel( total_count, dim, ratio, input, residual, bias, out, mask, seed, item_ct1); @@ -1180,7 +1180,7 @@ template void launch_dropout(float*, int batch, int dim, float ratio, - queue* stream); + queue stream); 
template void launch_dropout(bf16*, const bf16*, const bf16* residual, @@ -1189,7 +1189,7 @@ template void launch_dropout(bf16*, int batch, int dim, float ratio, - queue* stream); + queue stream); template void launch_dropout(half*, const half*, const half* residual, @@ -1198,4 +1198,4 @@ template void launch_dropout(half*, int batch, int dim, float ratio, - queue* stream); + queue stream); diff --git a/intel_extension_for_deepspeed/op_builder/csrc/transformer/ds_dropout_sycl.dp.cpp b/intel_extension_for_deepspeed/op_builder/csrc/transformer/ds_dropout_sycl.cpp similarity index 94% rename from intel_extension_for_deepspeed/op_builder/csrc/transformer/ds_dropout_sycl.dp.cpp rename to intel_extension_for_deepspeed/op_builder/csrc/transformer/ds_dropout_sycl.cpp index 9a91502..18d3585 100644 --- a/intel_extension_for_deepspeed/op_builder/csrc/transformer/ds_dropout_sycl.dp.cpp +++ b/intel_extension_for_deepspeed/op_builder/csrc/transformer/ds_dropout_sycl.cpp @@ -23,7 +23,7 @@ std::vector dropout_forward(float ratio, T* output_ptr = (T*)output.data_ptr(); uint8_t* mask_ptr = (uint8_t*)mask.data_ptr(); - sycl::queue* q = ::SyclContext::Instance().GetCurrentStream(); + sycl::queue q = ::SyclContext::Instance().GetCurrentStream(); Dropout _dropout = Dropout(typename Dropout::Config(ratio, dim)); _dropout.SetMask(mask_ptr); _dropout.Forward(bsz, output_ptr, input_ptr, q); @@ -57,7 +57,7 @@ std::vector dropout_forward_with_bias(float ratio, T* output_ptr = (T*)output.data_ptr(); uint8_t* mask_ptr = (uint8_t*)mask.data_ptr(); - sycl::queue* q = ::SyclContext::Instance().GetCurrentStream(); + sycl::queue q = ::SyclContext::Instance().GetCurrentStream(); Dropout _dropout = Dropout(typename Dropout::Config(ratio, dim)); _dropout.SetMask(mask_ptr); _dropout.ForwardWithBias(bsz, output_ptr, input_ptr, residual_ptr, bias_ptr, q); @@ -74,7 +74,7 @@ std::vector dropout_backward(float ratio, { CHECK_INPUT(vals); CHECK_INPUT(mask); - sycl::queue* q = ::SyclContext::Instance().GetCurrentStream(); + sycl::queue q = ::SyclContext::Instance().GetCurrentStream(); Dropout _dropout = Dropout(typename Dropout::Config(ratio, dim)); uint8_t* mask_ptr = (uint8_t*)mask.data_ptr(); _dropout.SetMask(mask_ptr); diff --git a/intel_extension_for_deepspeed/op_builder/csrc/transformer/ds_feedforward_sycl.dp.cpp b/intel_extension_for_deepspeed/op_builder/csrc/transformer/ds_feedforward_sycl.cpp similarity index 95% rename from intel_extension_for_deepspeed/op_builder/csrc/transformer/ds_feedforward_sycl.dp.cpp rename to intel_extension_for_deepspeed/op_builder/csrc/transformer/ds_feedforward_sycl.cpp index 5232bb5..c428e20 100644 --- a/intel_extension_for_deepspeed/op_builder/csrc/transformer/ds_feedforward_sycl.dp.cpp +++ b/intel_extension_for_deepspeed/op_builder/csrc/transformer/ds_feedforward_sycl.cpp @@ -28,7 +28,7 @@ std::vector feedforward_forward(int bsz, T* output_ptr = (T*)output.data_ptr(); - sycl::queue* q = ::SyclContext::Instance().GetCurrentStream(); + sycl::queue q = ::SyclContext::Instance().GetCurrentStream(); FeedForward _ff = FeedForward(typename FeedForward::Config(batchSize, outputSize, inputSize)); @@ -70,7 +70,7 @@ std::vector feedforward_backward(int bsz, T* grad_w_ptr = (T*)grad_weights.data_ptr(); T* grad_b_ptr = (T*)grad_bias.data_ptr(); T* grad_i_ptr = (T*)grad_input.data_ptr(); - sycl::queue* q = ::SyclContext::Instance().GetCurrentStream(); + sycl::queue q = ::SyclContext::Instance().GetCurrentStream(); FeedForward _ff = FeedForward(typename FeedForward::Config(batchSize, outputSize, inputSize)); 
diff --git a/intel_extension_for_deepspeed/op_builder/csrc/transformer/ds_gelu_sycl.dp.cpp b/intel_extension_for_deepspeed/op_builder/csrc/transformer/ds_gelu_sycl.cpp similarity index 91% rename from intel_extension_for_deepspeed/op_builder/csrc/transformer/ds_gelu_sycl.dp.cpp rename to intel_extension_for_deepspeed/op_builder/csrc/transformer/ds_gelu_sycl.cpp index 0a05c41..3c59603 100644 --- a/intel_extension_for_deepspeed/op_builder/csrc/transformer/ds_gelu_sycl.dp.cpp +++ b/intel_extension_for_deepspeed/op_builder/csrc/transformer/ds_gelu_sycl.cpp @@ -14,7 +14,7 @@ std::vector gelu_forward(int intermediate_size, const T* bias_ptr = (const T*)bias.data_ptr(); auto output = torch::empty_like(input); T* output_ptr = (T*)output.data_ptr(); - sycl::queue* q = ::SyclContext::Instance().GetCurrentStream(); + sycl::queue q = ::SyclContext::Instance().GetCurrentStream(); Gelu _gelu = Gelu(typename Gelu::Config(intermediate_size)); _gelu.ForwardWithBiasAdd(bsz_seq, input_ptr, bias_ptr, output_ptr, q); return {output}; @@ -32,7 +32,7 @@ std::vector gelu_backward(torch::Tensor& d_output, const T* input_ptr = (const T*)input.data_ptr(); const T* bias_ptr = (const T*)bias.data_ptr(); T* d_output_ptr = (T*)d_output.data_ptr(); - sycl::queue* q = ::SyclContext::Instance().GetCurrentStream(); + sycl::queue q = ::SyclContext::Instance().GetCurrentStream(); Gelu _gelu = Gelu(typename Gelu::Config(intermediate_size)); _gelu.Backward(bsz_seq, d_output_ptr, input_ptr, bias_ptr, q); return {d_output}; diff --git a/intel_extension_for_deepspeed/op_builder/csrc/transformer/ds_layer_reorder_sycl.dp.cpp b/intel_extension_for_deepspeed/op_builder/csrc/transformer/ds_layer_reorder_sycl.cpp similarity index 94% rename from intel_extension_for_deepspeed/op_builder/csrc/transformer/ds_layer_reorder_sycl.dp.cpp rename to intel_extension_for_deepspeed/op_builder/csrc/transformer/ds_layer_reorder_sycl.cpp index c9fd531..4e95fe5 100644 --- a/intel_extension_for_deepspeed/op_builder/csrc/transformer/ds_layer_reorder_sycl.dp.cpp +++ b/intel_extension_for_deepspeed/op_builder/csrc/transformer/ds_layer_reorder_sycl.cpp @@ -26,7 +26,7 @@ std::vector transform4d_0213(const torch::Tensor& input, // for 1 attn_o_inp, trans_count=1 output = torch::empty({batch, seq_len, num_heads, hidden_size / num_heads}, options); - sycl::queue* q = ::SyclContext::Instance().GetCurrentStream(); + sycl::queue q = ::SyclContext::Instance().GetCurrentStream(); const T* input_ptr = (const T*)input.data_ptr(); T* output_ptr = (T*)output.data_ptr(); @@ -57,7 +57,7 @@ std::vector bias_add_transform_0213(const torch::Tensor& input, auto output = torch::empty({3, batch, num_heads, seq_len, hidden_size / num_heads}, options); - sycl::queue* q = ::SyclContext::Instance().GetCurrentStream(); + sycl::queue q = ::SyclContext::Instance().GetCurrentStream(); const T* input_ptr = (const T*)input.data_ptr(); const T* bias_ptr = (const T*)bias.data_ptr(); @@ -84,7 +84,7 @@ std::vector transform_0213(const torch::Tensor& input, auto output = torch::empty({batch, num_heads, seq_len, hidden_size / num_heads}, options); - sycl::queue* q = ::SyclContext::Instance().GetCurrentStream(); + sycl::queue q = ::SyclContext::Instance().GetCurrentStream(); const T* input_ptr = (const T*)input.data_ptr(); T* output_ptr = (T*)output.data_ptr(); @@ -111,7 +111,7 @@ std::vector fused_add2(const torch::Tensor& input1, auto output = torch::empty({batch, seq_len, hidden_size}, options); - sycl::queue* q = ::SyclContext::Instance().GetCurrentStream(); + sycl::queue q = 
::SyclContext::Instance().GetCurrentStream(); const T* input_ptr1 = (const T*)input1.data_ptr(); const T* input_ptr2 = (const T*)input2.data_ptr(); diff --git a/intel_extension_for_deepspeed/op_builder/csrc/transformer/ds_normalize_sycl.dp.cpp b/intel_extension_for_deepspeed/op_builder/csrc/transformer/ds_normalize_sycl.cpp similarity index 97% rename from intel_extension_for_deepspeed/op_builder/csrc/transformer/ds_normalize_sycl.dp.cpp rename to intel_extension_for_deepspeed/op_builder/csrc/transformer/ds_normalize_sycl.cpp index d779a12..083a92d 100644 --- a/intel_extension_for_deepspeed/op_builder/csrc/transformer/ds_normalize_sycl.dp.cpp +++ b/intel_extension_for_deepspeed/op_builder/csrc/transformer/ds_normalize_sycl.cpp @@ -36,7 +36,7 @@ std::vector normalize_forward(const int batch, const T* gamma_ptr = (const T*)gamma.data_ptr(); const T* betta_ptr = (const T*)betta.data_ptr(); - sycl::queue* q = ::SyclContext::Instance().GetCurrentStream(); + sycl::queue q = ::SyclContext::Instance().GetCurrentStream(); Normalize_Layer _norm( typename Normalize_Layer::Config(batch, seq_len, hidden_size, epsilon, true, wmean)); _norm.SetMean(mean_ptr); @@ -94,11 +94,11 @@ std::vector normalize_backward(const int batch, T* inp_grad_ptr = (T*)input_grad.data_ptr(); T* mean_ptr = (T*)mean.data_ptr(); T* var_ptr = (T*)var.data_ptr(); - sycl::queue* q = ::SyclContext::Instance().GetCurrentStream(); + sycl::queue q = ::SyclContext::Instance().GetCurrentStream(); Normalize_Layer _norm( typename Normalize_Layer::Config(batch, seq_len, hidden_size, epsilon, true, wmean)); - sycl::queue* qs[2] = {q, q}; + sycl::queue qs[2] = {q, q}; _norm.SetMean(mean_ptr); _norm.SetVar(var_ptr); diff --git a/intel_extension_for_deepspeed/op_builder/csrc/transformer/ds_softmax_sycl.dp.cpp b/intel_extension_for_deepspeed/op_builder/csrc/transformer/ds_softmax_sycl.cpp similarity index 91% rename from intel_extension_for_deepspeed/op_builder/csrc/transformer/ds_softmax_sycl.dp.cpp rename to intel_extension_for_deepspeed/op_builder/csrc/transformer/ds_softmax_sycl.cpp index 557fcb4..75c4b6d 100644 --- a/intel_extension_for_deepspeed/op_builder/csrc/transformer/ds_softmax_sycl.dp.cpp +++ b/intel_extension_for_deepspeed/op_builder/csrc/transformer/ds_softmax_sycl.cpp @@ -15,7 +15,7 @@ std::vector softmax_forward(int bsz, T* inout_ptr = (T*)inout.data_ptr(); const T* mask_ptr = (const T*)mask.data_ptr(); - sycl::queue* q = ::SyclContext::Instance().GetCurrentStream(); + sycl::queue q = ::SyclContext::Instance().GetCurrentStream(); Softmax _softmax = Softmax(typename Softmax::Config(bsz, num_heads, seq_len)); _softmax.SetSeqLength(seq_len); _softmax.Forward(bsz, inout_ptr, mask_ptr, q); @@ -35,7 +35,7 @@ std::vector softmax_backward(int bsz, T* out_grad_ptr = (T*)out_grad.data_ptr(); const T* input_ptr = (const T*)input.data_ptr(); - sycl::queue* q = ::SyclContext::Instance().GetCurrentStream(); + sycl::queue q = ::SyclContext::Instance().GetCurrentStream(); Softmax _softmax = Softmax(typename Softmax::Config(bsz, num_heads, seq_len)); _softmax.SetSeqLength(seq_len); diff --git a/intel_extension_for_deepspeed/op_builder/csrc/transformer/ds_stridedbatchgemm_sycl.dp.cpp b/intel_extension_for_deepspeed/op_builder/csrc/transformer/ds_stridedbatchgemm_sycl.cpp similarity index 96% rename from intel_extension_for_deepspeed/op_builder/csrc/transformer/ds_stridedbatchgemm_sycl.dp.cpp rename to intel_extension_for_deepspeed/op_builder/csrc/transformer/ds_stridedbatchgemm_sycl.cpp index 3f67b33..feee850 100644 --- 
a/intel_extension_for_deepspeed/op_builder/csrc/transformer/ds_stridedbatchgemm_sycl.dp.cpp +++ b/intel_extension_for_deepspeed/op_builder/csrc/transformer/ds_stridedbatchgemm_sycl.cpp @@ -39,7 +39,7 @@ std::vector stridedbatchgemm_forward(const int batchSize, T* matC_ptr = (T*)matC.data_ptr(); - sycl::queue* q = ::SyclContext::Instance().GetCurrentStream(); + sycl::queue q = ::SyclContext::Instance().GetCurrentStream(); _sbgemm.Forward(batchSize, matC_ptr, matA_ptr, matB_ptr, q); return {matC}; @@ -88,7 +88,7 @@ std::vector stridedbatchgemm_backward(const int batchSize, T* grad_a_ptr = (T*)grad_matA.data_ptr(); T* grad_b_ptr = (T*)grad_matB.data_ptr(); - sycl::queue* q = ::SyclContext::Instance().GetCurrentStream(); + sycl::queue q = ::SyclContext::Instance().GetCurrentStream(); _sbgemm.Backward(batchSize, grad_c_ptr, matA_ptr, matB_ptr, q, grad_a_ptr, grad_b_ptr); return {grad_matA, grad_matB}; diff --git a/intel_extension_for_deepspeed/op_builder/csrc/transformer/ds_transformer_sycl.dp.cpp b/intel_extension_for_deepspeed/op_builder/csrc/transformer/ds_transformer_sycl.cpp similarity index 98% rename from intel_extension_for_deepspeed/op_builder/csrc/transformer/ds_transformer_sycl.dp.cpp rename to intel_extension_for_deepspeed/op_builder/csrc/transformer/ds_transformer_sycl.cpp index 8668270..42c4b90 100644 --- a/intel_extension_for_deepspeed/op_builder/csrc/transformer/ds_transformer_sycl.dp.cpp +++ b/intel_extension_for_deepspeed/op_builder/csrc/transformer/ds_transformer_sycl.cpp @@ -172,7 +172,7 @@ void BertTransformerLayer::Forward(int bsz, T* gelu_inp_ptr, T* ff2_inp_ptr) { - if (!_stochastic_mode) _stream->wait(); + if (!_stochastic_mode) _stream.wait(); T* workspace = static_cast(::SyclContext::Instance().GetWorkSpace()); size_t small_buf_size = bsz * _seq_length * _hidden_size; @@ -340,7 +340,7 @@ void BertTransformerLayer::Backward(int bsz, T* grad_norm_w_ptr, T* grad_norm_b_ptr) { - if (!_stochastic_mode) _stream->wait(); + if (!_stochastic_mode) _stream.wait(); T* workspace = static_cast(::SyclContext::Instance().GetWorkSpace()); size_t small_buf_size = bsz * _seq_length * _hidden_size; @@ -353,7 +353,7 @@ void BertTransformerLayer::Backward(int bsz, : buf_3 + small_buf_size); T* ctx_bufB_ptr_recomp = ff2_buf + (_seq_length * _seq_length * bsz * _heads); - sycl::queue* streams[2] = {_stream, _stream}; + sycl::queue streams[2] = {_stream, _stream}; int bsz_seq = bsz * _seq_length; int bsz_heads = bsz * _heads; @@ -507,11 +507,11 @@ void BertTransformerLayer::Backward(int bsz, // this case, buf_1 connected with buf_2 and buf_3 are all inputs launch_transform4d_0213(ff2_buf, buf_1, bsz, _heads, _seq_length, _hidden_size, _stream, 3); - T* grad_out_buffer = (T*)malloc_shared(10 * sizeof(T), *_stream); - T* input_buffer = (T*)malloc_shared(10 * sizeof(T), *_stream); - T* weight_buffer = (T*)malloc_shared(10 * sizeof(T), *_stream); - T* grad_weight_buffer = (T*)malloc_shared(10 * sizeof(T), *_stream); - T* grad_bias_buffer = (T*)malloc_shared(10 * sizeof(T), *_stream); + T* grad_out_buffer = (T*)malloc_shared(10 * sizeof(T), _stream); + T* input_buffer = (T*)malloc_shared(10 * sizeof(T), _stream); + T* weight_buffer = (T*)malloc_shared(10 * sizeof(T), _stream); + T* grad_weight_buffer = (T*)malloc_shared(10 * sizeof(T), _stream); + T* grad_bias_buffer = (T*)malloc_shared(10 * sizeof(T), _stream); if (_pre_or_postLayerNorm) { _qkv_linear.Backward(bsz_seq, ff2_buf, @@ -561,7 +561,7 @@ void BertTransformerLayer::Backward(int bsz, } } else { launch_fused_add2(grad_input_ptr, buf_2, 
buf_0, bsz, _seq_length, _hidden_size, _stream); - _stream->submit([&](sycl::handler& cgh) { + _stream.submit([&](sycl::handler& cgh) { cgh.single_task([=]() { for (int i = 0; i < 10; ++i) { grad_out_buffer[i] = ff2_buf[i]; @@ -574,7 +574,7 @@ void BertTransformerLayer::Backward(int bsz, }); } - _stream->wait(); + _stream.wait(); } template diff --git a/intel_extension_for_deepspeed/op_builder/csrc/transformer/gelu_kernels.dp.cpp b/intel_extension_for_deepspeed/op_builder/csrc/transformer/gelu_kernels.cpp similarity index 97% rename from intel_extension_for_deepspeed/op_builder/csrc/transformer/gelu_kernels.dp.cpp rename to intel_extension_for_deepspeed/op_builder/csrc/transformer/gelu_kernels.cpp index 6991dcb..7b4f81d 100644 --- a/intel_extension_for_deepspeed/op_builder/csrc/transformer/gelu_kernels.dp.cpp +++ b/intel_extension_for_deepspeed/op_builder/csrc/transformer/gelu_kernels.cpp @@ -393,14 +393,14 @@ void launch_bias_gelu(const T* input, T* output, int intermediate_size, int batch_size, - queue* stream) + queue stream) { int iterations = (intermediate_size + 1023) / 1024; int threads = (intermediate_size - 1) / (iterations * 4) + 1; range<3> block_dims(1, 1, threads); range<3> grid_dims(1, 1, batch_size); - stream->submit([&](handler& cgh) { + stream.submit([&](handler& cgh) { cgh.parallel_for(nd_range<3>(grid_dims * block_dims, block_dims), [=](nd_item<3> item_ct1) { fused_bias_gelu(input, bias, output, intermediate_size / 4, iterations, item_ct1); }); @@ -408,26 +408,26 @@ void launch_bias_gelu(const T* input, } template -void launch_gelu(const T* input, T* output, int intermediate_size, int batch_size, queue* stream) +void launch_gelu(const T* input, T* output, int intermediate_size, int batch_size, queue stream) { int iterations = (intermediate_size + 1023) / 1024; int threads = (intermediate_size - 1) / (iterations * 4) + 1; range<3> block_dims(1, 1, threads); range<3> grid_dims(1, 1, batch_size); - stream->submit([&](handler& cgh) { + stream.submit([&](handler& cgh) { cgh.parallel_for(nd_range<3>(grid_dims * block_dims, block_dims), [=](nd_item<3> item_ct1) { gelu_kernel(input, output, intermediate_size / 4, iterations, item_ct1); }); }); } -template void launch_bias_gelu(const float*, const float*, float*, int, int, queue*); -template void launch_bias_gelu(const half*, const half*, half*, int, int, queue*); -template void launch_bias_gelu(const bf16*, const bf16*, bf16*, int, int, queue*); +template void launch_bias_gelu(const float*, const float*, float*, int, int, queue); +template void launch_bias_gelu(const half*, const half*, half*, int, int, queue); +template void launch_bias_gelu(const bf16*, const bf16*, bf16*, int, int, queue); -template void launch_gelu(const float*, float*, int, int, queue*); -template void launch_gelu(const half*, half*, int, int, queue*); +template void launch_gelu(const float*, float*, int, int, queue); +template void launch_gelu(const half*, half*, int, int, queue); template void launch_d_gelu(T* d_output, @@ -435,20 +435,20 @@ void launch_d_gelu(T* d_output, const T* bias, int intermediate_size, int batch_size, - queue* stream) + queue stream) { int iterations = (intermediate_size + 1023) / 1024; int threads = (intermediate_size - 1) / (iterations * 4) + 1; range<3> block_dims(1, 1, threads); range<3> grid_dims(1, 1, batch_size); - stream->submit([&](handler& cgh) { + stream.submit([&](handler& cgh) { cgh.parallel_for(nd_range<3>(grid_dims * block_dims, block_dims), [=](nd_item<3> item_ct1) { d_gelu_func(d_output, input, bias, 
intermediate_size / 4, iterations, item_ct1); }); }); } -template void launch_d_gelu(float*, const float*, const float*, int, int, queue*); -template void launch_d_gelu(half*, const half*, const half*, int, int, queue*); -template void launch_d_gelu(bf16*, const bf16*, const bf16*, int, int, queue*); +template void launch_d_gelu(float*, const float*, const float*, int, int, queue); +template void launch_d_gelu(half*, const half*, const half*, int, int, queue); +template void launch_d_gelu(bf16*, const bf16*, const bf16*, int, int, queue); diff --git a/intel_extension_for_deepspeed/op_builder/csrc/transformer/general_kernels.dp.cpp b/intel_extension_for_deepspeed/op_builder/csrc/transformer/general_kernels.cpp similarity index 95% rename from intel_extension_for_deepspeed/op_builder/csrc/transformer/general_kernels.dp.cpp rename to intel_extension_for_deepspeed/op_builder/csrc/transformer/general_kernels.cpp index ce64746..850da66 100644 --- a/intel_extension_for_deepspeed/op_builder/csrc/transformer/general_kernels.dp.cpp +++ b/intel_extension_for_deepspeed/op_builder/csrc/transformer/general_kernels.cpp @@ -98,12 +98,12 @@ void column_sum_reduce(const bf16* inp, } template -void launch_fuse_transpose_bias_kernel(const T* inp, T* out, int rows, int cols, queue* stream) +void launch_fuse_transpose_bias_kernel(const T* inp, T* out, int rows, int cols, queue stream) { range<3> grid_dim(1, 1, (cols - 1) / MAX_SG_NUM + 1); range<3> block_dim(1, MAX_SG_NUM, MAX_SG_NUM); - stream->submit([&](handler& cgh) { + stream.submit([&](handler& cgh) { accessor tile( range<2>(MAX_SG_NUM, MAX_SG_NUM1), cgh); cgh.parallel_for(nd_range<3>(grid_dim * block_dim, block_dim), @@ -118,17 +118,17 @@ template void launch_fuse_transpose_bias_kernel(const float* inp, float* out, int rows, int cols, - queue* stream); + queue stream); template void launch_fuse_transpose_bias_kernel(const bf16* inp, bf16* out, int rows, int cols, - queue* stream); + queue stream); template void launch_fuse_transpose_bias_kernel(const half* inp, half* out, int rows, int cols, - queue* stream); + queue stream); void fused_add2_kernel(const int N, float* out, @@ -238,13 +238,13 @@ void launch_fused_add2(T* out, int batch_size, int seq_length, int hidden_dim, - queue* stream) + queue stream) { int total_count = batch_size * seq_length * hidden_dim / 4; range<3> grid_dim = range<3>(1, 1, DS_GET_BLOCKS(total_count)); //(batch_size * seq_length); range<3> block_dim = range<3>(1, 1, DS_CUDA_NUM_THREADS); //(hidden_dim / 4); - stream->submit([&](handler& cgh) { + stream.submit([&](handler& cgh) { cgh.parallel_for(nd_range<3>(grid_dim * block_dim, block_dim), [=](nd_item<3> item_ct1) { fused_add2_kernel(total_count, out, inp1, inp2, item_ct1); }); @@ -257,21 +257,21 @@ template void launch_fused_add2(float* out, int batch_size, int seq_length, int hidden_dim, - queue* stream); + queue stream); template void launch_fused_add2(bf16* out, const bf16* inp1, const bf16* inp2, int batch_size, int seq_length, int hidden_dim, - queue* stream); + queue stream); template void launch_fused_add2(half* out, const half* inp1, const half* inp2, int batch_size, int seq_length, int hidden_dim, - queue* stream); + queue stream); void fused_add3_kernel(float* out, const float* inp1, @@ -357,12 +357,12 @@ void launch_fused_add3(float* out, int batch_size, int seq_length, int hidden_size, - queue* stream) + queue stream) { range<3> grid_dim(1, 1, batch_size * seq_length); range<3> block_dim(1, 1, hidden_size / 4); - stream->submit([&](handler& cgh) { + 
stream.submit([&](handler& cgh) { cgh.parallel_for(nd_range<3>(grid_dim * block_dim, block_dim), [=](nd_item<3> item_ct1) { fused_add3_kernel(out, inp1, @@ -383,13 +383,13 @@ void launch_fused_add3(half* out, int batch_size, int seq_length, int hidden_size, - queue* stream) + queue stream) { range<3> grid_dim(1, 1, batch_size * seq_length); range<3> block_dim(1, 1, hidden_size / 4); - stream->submit([&](handler& cgh) { + stream.submit([&](handler& cgh) { cgh.parallel_for(nd_range<3>(grid_dim * block_dim, block_dim), [=](nd_item<3> item_ct1) { fused_add3_kernel(out, inp1, @@ -496,13 +496,13 @@ void launch_fused_add4(float* out, int batch_size, int seq_length, int hidden_size, - queue* stream) + queue stream) { range<3> grid_dim(1, 1, batch_size * seq_length); range<3> block_dim(1, 1, hidden_size / 4); - stream->submit([&](handler& cgh) { + stream.submit([&](handler& cgh) { cgh.parallel_for(nd_range<3>(grid_dim * block_dim, block_dim), [=](nd_item<3> item_ct1) { fused_add4_kernel(out, inp1, @@ -525,13 +525,13 @@ void launch_fused_add4(half* out, int batch_size, int seq_length, int hidden_size, - queue* stream) + queue stream) { range<3> grid_dim(1, 1, batch_size * seq_length); range<3> block_dim(1, 1, hidden_size / 4); - stream->submit([&](handler& cgh) { + stream.submit([&](handler& cgh) { cgh.parallel_for(nd_range<3>(grid_dim * block_dim, block_dim), [=](nd_item<3> item_ct1) { fused_add4_kernel(out, inp1, diff --git a/intel_extension_for_deepspeed/op_builder/csrc/transformer/normalize_kernels.dp.cpp b/intel_extension_for_deepspeed/op_builder/csrc/transformer/normalize_kernels.cpp similarity index 97% rename from intel_extension_for_deepspeed/op_builder/csrc/transformer/normalize_kernels.dp.cpp rename to intel_extension_for_deepspeed/op_builder/csrc/transformer/normalize_kernels.cpp index 008dade..49aba94 100644 --- a/intel_extension_for_deepspeed/op_builder/csrc/transformer/normalize_kernels.dp.cpp +++ b/intel_extension_for_deepspeed/op_builder/csrc/transformer/normalize_kernels.cpp @@ -442,7 +442,7 @@ void launch_bias_residual_layer_norm(T* vals, float epsilon, int batch_size, int hidden_dim, - sycl::queue* stream, + sycl::queue stream, bool preLayerNorm, bool training, T* vars, @@ -461,7 +461,7 @@ void launch_bias_residual_layer_norm(T* vals, sycl::range<3> block_dim(1, 1, threads); - stream->submit([&](sycl::handler& cgh) { + stream.submit([&](sycl::handler& cgh) { sycl::accessor shr_acc_ct1(sycl::range<1>(MAX_SG_NUM /*MAX_WARP_NUM*/), cgh); cgh.parallel_for(sycl::nd_range<3>(grid_dim * block_dim, block_dim), @@ -489,7 +489,7 @@ template void launch_bias_residual_layer_norm(float* vals, float epsilon, int batch_size, int hidden_dim, - sycl::queue* stream, + sycl::queue stream, bool preLayerNorm, bool training, float* vars, @@ -501,7 +501,7 @@ template void launch_bias_residual_layer_norm(bf16* vals, float epsilon, int batch_size, int hidden_dim, - sycl::queue* stream, + sycl::queue stream, bool preLayerNorm, bool training, bf16* vars, @@ -514,7 +514,7 @@ void launch_bias_residual_layer_norm(half* vals, float epsilon, int batch_size, int hidden_dim, - queue* stream, + queue stream, bool preLayerNorm, bool training, half* vars, @@ -535,7 +535,7 @@ void launch_bias_residual_layer_norm(half* vals, range<3> block_dim(1, 1, threads); - stream->submit([&](handler& cgh) { + stream.submit([&](handler& cgh) { sycl::accessor shr_acc_ct1(sycl::range<1>(MAX_SG_NUM /*MAX_WARP_NUM*/), cgh); cgh.parallel_for(nd_range<3>(grid_dim * block_dim, block_dim), @@ -579,7 +579,7 @@ void 
launch_bias_residual_layer_norm(T* vals, float epsilon, int batch_size, int hidden_dim, - queue* stream, + queue stream, bool preLayerNorm, bool training, T* vars) @@ -600,7 +600,7 @@ void launch_bias_residual_layer_norm(T* vals, range<3> block_dim(1, 1, threads); - stream->submit([&](handler& cgh) { + stream.submit([&](handler& cgh) { accessor shr_acc_ct1( range<1>(MAX_SG_NUM /*MAX_WARP_NUM*/), cgh); @@ -629,7 +629,7 @@ template void launch_bias_residual_layer_norm(float* vals, float epsilon, int batch_size, int hidden_dim, - queue* stream, + queue stream, bool preLayerNorm, bool training, float* vars); @@ -640,7 +640,7 @@ template void launch_bias_residual_layer_norm(bf16* vals, float epsilon, int batch_size, int hidden_dim, - queue* stream, + queue stream, bool preLayerNorm, bool training, bf16* vars); @@ -652,7 +652,7 @@ void launch_bias_residual_layer_norm(half* vals, float epsilon, int batch_size, int hidden_dim, - queue* stream, + queue stream, bool preLayerNorm, bool training, half* vars) @@ -675,7 +675,7 @@ void launch_bias_residual_layer_norm(half* vals, range<3> block_dim(1, 1, threads); - stream->submit([&](handler& cgh) { + stream.submit([&](handler& cgh) { sycl::accessor shr_acc_ct1(sycl::range<1>(MAX_SG_NUM /*MAX_WARP_NUM*/), cgh); cgh.parallel_for(nd_range<3>(grid_dim * block_dim, block_dim), @@ -1443,7 +1443,7 @@ void launch_layerNorm_backward(const T* out_grad, T* inp_grad, int batch, int hidden_dim, - queue* stream[2], + queue stream[2], bool invertible, const T* betta) { @@ -1452,7 +1452,7 @@ void launch_layerNorm_backward(const T* out_grad, range<3> grid_dim(1, 1, hidden_dim / TILE_DIM); range<3> block_dim(1, TILE_DIM, TILE_DIM); - stream[0]->submit([&](handler& cgh) { + stream[0].submit([&](handler& cgh) { accessor betta_buffer( range<2>(MAX_SG_NUM /*MAX_WARP_NUM*/, MAX_SG_NUM1), cgh); accessor gamma_buffer( @@ -1488,7 +1488,7 @@ void launch_layerNorm_backward(const T* out_grad, range<3> block_dim2(1, 1, threads); - stream[1]->submit([&](handler& cgh) { + stream[1].submit([&](handler& cgh) { accessor partialSum_acc_ct1( range<1>(MAX_SG_NUM /*MAX_WARP_NUM*/), cgh); @@ -1518,7 +1518,7 @@ template void launch_layerNorm_backward(const float* out_grad, float* inp_grad, int batch, int hidden_dim, - queue* stream[2], + queue stream[2], bool invertible, const float* betta); @@ -1531,7 +1531,7 @@ template void launch_layerNorm_backward(const bf16* out_grad, bf16* inp_grad, int batch, int hidden_dim, - queue* stream[2], + queue stream[2], bool invertible, const bf16* betta); @@ -1545,7 +1545,7 @@ void launch_layerNorm_backward(const half* out_grad, half* inp_grad, int batch, int hidden_dim, - queue* stream[2], + queue stream[2], bool invertible, const half* betta) { @@ -1557,7 +1557,7 @@ void launch_layerNorm_backward(const half* out_grad, // LayerNormBackward1<<>>( // out_grad, vals_hat, gamma, betta, gamma_grad, betta_grad, batch, // hidden_dim, invertible); - stream[0]->submit([&](handler& cgh) { + stream[0].submit([&](handler& cgh) { accessor betta_buffer( range<2>(MAX_SG_NUM /*MAX_WARP_NUM*/, MAX_SG_NUM1), cgh); accessor gamma_buffer( @@ -1592,7 +1592,7 @@ void launch_layerNorm_backward(const half* out_grad, range<3> block_dim2(1, 1, threads / 2); - stream[1]->submit([&](handler& cgh) { + stream[1].submit([&](handler& cgh) { accessor partialSum_acc_ct1( range<1>(MAX_SG_NUM /*MAX_WARP_NUM*/), cgh); @@ -2021,7 +2021,7 @@ void launch_layerNorm_backward(const T* out_grad, T* inp_grad, int batch, int hidden_dim, - queue* stream[2]) + queue stream[2]) { int threads = THREADS; @@ 
-2031,7 +2031,7 @@ void launch_layerNorm_backward(const T* out_grad, // LayerNormBackward1<<>>( // out_grad, X_data, vars, means, gamma_grad, betta_grad, batch, // hidden_dim); - stream[0]->submit([&](handler& cgh) { + stream[0].submit([&](handler& cgh) { accessor betta_buffer( range<2>(MAX_SG_NUM /*MAX_WARP_NUM*/, MAX_SG_NUM1), cgh); accessor gamma_buffer( @@ -2063,7 +2063,7 @@ void launch_layerNorm_backward(const T* out_grad, throw std::runtime_error("Unsupport hidden_dim."); range<3> block_dim2(1, 1, threads); - stream[1]->submit([&](handler& cgh) { + stream[1].submit([&](handler& cgh) { accessor partialSum_acc_ct1( range<1>(MAX_SG_NUM /*MAX_WARP_NUM*/), cgh); @@ -2093,7 +2093,7 @@ template void launch_layerNorm_backward(const float* out_grad, float* inp_grad, int batch, int hidden_dim, - queue* stream[2]); + queue stream[2]); template void launch_layerNorm_backward(const bf16* out_grad, const bf16* X_data, const bf16* vars, @@ -2104,7 +2104,7 @@ template void launch_layerNorm_backward(const bf16* out_grad, bf16* inp_grad, int batch, int hidden_dim, - queue* stream[2]); + queue stream[2]); template <> void launch_layerNorm_backward(const half* out_grad, const half* X_data, @@ -2116,7 +2116,7 @@ void launch_layerNorm_backward(const half* out_grad, half* inp_grad, int batch, int hidden_dim, - queue* stream[2]) + queue stream[2]) { int threads = THREADS; @@ -2126,7 +2126,7 @@ void launch_layerNorm_backward(const half* out_grad, // LayerNormBackward1<<>>( // out_grad, X_data, vars, means, gamma_grad, betta_grad, batch, // hidden_dim); - stream[0]->submit([&](handler& cgh) { + stream[0].submit([&](handler& cgh) { accessor betta_buffer( range<2>(MAX_SG_NUM /*MAX_WARP_NUM*/, MAX_SG_NUM1), cgh); accessor gamma_buffer( @@ -2160,7 +2160,7 @@ void launch_layerNorm_backward(const half* out_grad, throw std::runtime_error("Unsupport hidden_dim."); range<3> block_dim2(1, 1, threads / 2); - stream[1]->submit([&](handler& cgh) { + stream[1].submit([&](handler& cgh) { accessor partialSum_acc_ct1( range<1>(MAX_SG_NUM /*MAX_WARP_NUM*/), cgh); @@ -2191,7 +2191,7 @@ void launch_layerNorm_backward_fused_add(const T* out_grad1, T* inp_grad, int batch, int hidden_dim, - queue* stream[2], + queue stream[2], bool invertible, const T* betta) { @@ -2202,7 +2202,7 @@ void launch_layerNorm_backward_fused_add(const T* out_grad1, // LayerNormBackward1<<>>( // out_grad1, vals_hat, gamma, betta, gamma_grad, betta_grad, batch, // hidden_dim, invertible); - stream[0]->submit([&](handler& cgh) { + stream[0].submit([&](handler& cgh) { accessor betta_buffer( range<2>(MAX_SG_NUM /*MAX_WARP_NUM*/, MAX_SG_NUM1), cgh); accessor gamma_buffer( @@ -2235,7 +2235,7 @@ void launch_layerNorm_backward_fused_add(const T* out_grad1, range<3> block_dim2(1, 1, threads); - stream[1]->submit([&](handler& cgh) { + stream[1].submit([&](handler& cgh) { accessor partialSum_acc_ct1( range<1>(MAX_SG_NUM /*MAX_WARP_NUM*/), cgh); @@ -2266,7 +2266,7 @@ template void launch_layerNorm_backward_fused_add(const float* out_grad1, float* inp_grad, int batch, int hidden_dim, - queue* stream[2], + queue stream[2], bool invertible, const float* betta); @@ -2280,7 +2280,7 @@ template void launch_layerNorm_backward_fused_add(const bf16* out_grad1, bf16* inp_grad, int batch, int hidden_dim, - queue* stream[2], + queue stream[2], bool invertible, const bf16* betta); @@ -2295,7 +2295,7 @@ void launch_layerNorm_backward_fused_add(const half* out_grad1, half* inp_grad, int batch, int hidden_dim, - queue* stream[2], + queue stream[2], bool invertible, const half* 
betta) { @@ -2307,7 +2307,7 @@ void launch_layerNorm_backward_fused_add(const half* out_grad1, // LayerNormBackward1<<>>( // out_grad1, vals_hat, gamma, betta, gamma_grad, betta_grad, batch, // hidden_dim, invertible); - stream[0]->submit([&](handler& cgh) { + stream[0].submit([&](handler& cgh) { accessor betta_buffer( range<2>(MAX_SG_NUM /*MAX_WARP_NUM*/, MAX_SG_NUM1), cgh); accessor gamma_buffer( @@ -2341,7 +2341,7 @@ void launch_layerNorm_backward_fused_add(const half* out_grad1, throw std::runtime_error("Unsupport hidden_dim."); range<3> block_dim2(1, 1, threads / 2); - stream[1]->submit([&](handler& cgh) { + stream[1].submit([&](handler& cgh) { accessor partialSum_acc_ct1( range<1>(MAX_SG_NUM /*MAX_WARP_NUM*/), cgh); @@ -2374,7 +2374,7 @@ void launch_layerNorm_backward_fused_add(const T* out_grad1, T* inp_grad, int batch, int hidden_dim, - queue* stream[2]) + queue stream[2]) { int threads = THREADS; @@ -2384,7 +2384,7 @@ void launch_layerNorm_backward_fused_add(const T* out_grad1, // LayerNormBackward1<<>>( // out_grad1, X_data, vars, means, gamma_grad, betta_grad, batch, // hidden_dim); - stream[0]->submit([&](handler& cgh) { + stream[0].submit([&](handler& cgh) { accessor betta_buffer( range<2>(MAX_SG_NUM /*MAX_WARP_NUM*/, MAX_SG_NUM1), cgh); accessor gamma_buffer( @@ -2416,7 +2416,7 @@ void launch_layerNorm_backward_fused_add(const T* out_grad1, throw std::runtime_error("Unsupport hidden_dim."); range<3> block_dim2(1, 1, threads); - stream[1]->submit([&](handler& cgh) { + stream[1].submit([&](handler& cgh) { accessor partialSum_acc_ct1( range<1>(MAX_SG_NUM /*MAX_WARP_NUM*/), cgh); @@ -2447,7 +2447,7 @@ template void launch_layerNorm_backward_fused_add(const float* out_grad1, float* inp_grad, int batch, int hidden_dim, - queue* stream[2]); + queue stream[2]); template void launch_layerNorm_backward_fused_add(const bf16* out_grad1, const bf16* out_grad2, const bf16* X_data, @@ -2459,7 +2459,7 @@ template void launch_layerNorm_backward_fused_add(const bf16* out_grad1, bf16* inp_grad, int batch, int hidden_dim, - queue* stream[2]); + queue stream[2]); template <> void launch_layerNorm_backward_fused_add(const half* out_grad1, const half* out_grad2, @@ -2472,7 +2472,7 @@ void launch_layerNorm_backward_fused_add(const half* out_grad1, half* inp_grad, int batch, int hidden_dim, - queue* stream[2]) + queue stream[2]) { int threads = THREADS; @@ -2482,7 +2482,7 @@ void launch_layerNorm_backward_fused_add(const half* out_grad1, // LayerNormBackward1<<>>( // out_grad1, X_data, vars, means, gamma_grad, betta_grad, batch, // hidden_dim); - stream[0]->submit([&](handler& cgh) { + stream[0].submit([&](handler& cgh) { accessor betta_buffer( range<2>(MAX_SG_NUM /*MAX_WARP_NUM*/, MAX_SG_NUM1), cgh); accessor gamma_buffer( @@ -2515,7 +2515,7 @@ void launch_layerNorm_backward_fused_add(const half* out_grad1, throw std::runtime_error("Unsupport hidden_dim."); range<3> block_dim2(1, 1, threads / 2); - stream[1]->submit([&](handler& cgh) { + stream[1].submit([&](handler& cgh) { accessor partialSum_acc_ct1( range<1>(MAX_SG_NUM /*MAX_WARP_NUM*/), cgh); diff --git a/intel_extension_for_deepspeed/op_builder/csrc/transformer/onednn_wrappers.dp.cpp b/intel_extension_for_deepspeed/op_builder/csrc/transformer/onednn_wrappers.cpp similarity index 93% rename from intel_extension_for_deepspeed/op_builder/csrc/transformer/onednn_wrappers.dp.cpp rename to intel_extension_for_deepspeed/op_builder/csrc/transformer/onednn_wrappers.cpp index 073d628..3ea90b3 100644 --- 
a/intel_extension_for_deepspeed/op_builder/csrc/transformer/onednn_wrappers.dp.cpp +++ b/intel_extension_for_deepspeed/op_builder/csrc/transformer/onednn_wrappers.cpp @@ -1,8 +1,9 @@ #include "onednn_wrappers.hpp" +#include "inference_sycl_layers.h" #include template -inline int onednn_matmul(sycl::queue* handle, +inline int onednn_matmul(sycl::queue handle, bool trans_src, bool trans_wgt, int m, @@ -20,10 +21,10 @@ inline int onednn_matmul(sycl::queue* handle, * wgt, [k, n], n: k: in_features, out_feature * dst, [m, n], m: batch, n: out_features */ - device dev = handle->get_device(); - context ctx = handle->get_context(); + device dev = handle.get_device(); + context ctx = handle.get_context(); dnnl::engine engine = dnnl::sycl_interop::make_engine(dev, ctx); - dnnl::stream stream = dnnl::sycl_interop::make_stream(engine, *handle); + dnnl::stream stream = dnnl::sycl_interop::make_stream(engine, handle); dnnl::memory::dims src_dims, wgt_dims, dst_dims; @@ -90,10 +91,10 @@ inline int onednn_matmul(sycl::queue* handle, matmul_args.insert({DNNL_ARG_DST, dst_mem}); matmul_prim.execute(stream, matmul_args); - stream.wait(); + /* stream.wait(); */ } -int onednn_matmul_ex(sycl::queue* handle, +int onednn_matmul_ex(sycl::queue handle, bool trans_src, bool trans_wgt, int m, @@ -109,7 +110,7 @@ int onednn_matmul_ex(sycl::queue* handle, handle, trans_src, trans_wgt, m, n, k, alpha, beta, src_ptr, wgt_ptr, dst_ptr, 1); } -int onednn_batchgemm(sycl::queue* handle, +int onednn_batchgemm(sycl::queue handle, int m, int n, int k, diff --git a/intel_extension_for_deepspeed/op_builder/csrc/transformer/onemkl_wrappers.dp.cpp b/intel_extension_for_deepspeed/op_builder/csrc/transformer/onemkl_wrappers.cpp similarity index 92% rename from intel_extension_for_deepspeed/op_builder/csrc/transformer/onemkl_wrappers.dp.cpp rename to intel_extension_for_deepspeed/op_builder/csrc/transformer/onemkl_wrappers.cpp index e003b99..6b9e1a9 100644 --- a/intel_extension_for_deepspeed/op_builder/csrc/transformer/onemkl_wrappers.dp.cpp +++ b/intel_extension_for_deepspeed/op_builder/csrc/transformer/onemkl_wrappers.cpp @@ -7,7 +7,7 @@ #error "Unsupported compiler" #endif -int onemkl_gemm_ex(sycl::queue* handle, +int onemkl_gemm_ex(sycl::queue handle, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, int m, @@ -24,7 +24,7 @@ int onemkl_gemm_ex(sycl::queue* handle, int ldb = (transb == oneapi::mkl::transpose::nontrans) ? k : n; int ldc = m; oneapi::mkl::blas::gemm( - *handle, transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc); + handle, transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc); } catch (sycl::exception const& exc) { std::cerr << exc.what() << "Exception caught at file:" << __FILE__ << ", line:" << __LINE__ << std::endl; @@ -32,7 +32,7 @@ int onemkl_gemm_ex(sycl::queue* handle, } } -int onemkl_gemm_ex(sycl::queue* handle, +int onemkl_gemm_ex(sycl::queue handle, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, int m, @@ -49,7 +49,7 @@ int onemkl_gemm_ex(sycl::queue* handle, int ldb = (transb == oneapi::mkl::transpose::nontrans) ? 
k : n; int ldc = m; oneapi::mkl::blas::gemm( - *handle, transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc); + handle, transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc); } catch (sycl::exception const& exc) { std::cerr << exc.what() << "Exception caught at file:" << __FILE__ << ", line:" << __LINE__ << std::endl; @@ -57,7 +57,7 @@ int onemkl_gemm_ex(sycl::queue* handle, } } -int onemkl_strided_batched_gemm(sycl::queue* handle, +int onemkl_strided_batched_gemm(sycl::queue handle, int m, int n, int k, @@ -78,7 +78,7 @@ int onemkl_strided_batched_gemm(sycl::queue* handle, int lda = (transa == oneapi::mkl::transpose::nontrans) ? m : k; int ldb = (transb == oneapi::mkl::transpose::nontrans) ? k : n; int ldc = m; - oneapi::mkl::blas::gemm_batch(*handle, + oneapi::mkl::blas::gemm_batch(handle, transa, transb, m, @@ -103,7 +103,7 @@ int onemkl_strided_batched_gemm(sycl::queue* handle, } } -int onemkl_strided_batched_gemm(sycl::queue* handle, +int onemkl_strided_batched_gemm(sycl::queue handle, int m, int n, int k, @@ -124,7 +124,7 @@ int onemkl_strided_batched_gemm(sycl::queue* handle, int lda = (transa == oneapi::mkl::transpose::nontrans) ? m : k; int ldb = (transb == oneapi::mkl::transpose::nontrans) ? k : n; int ldc = m; - oneapi::mkl::blas::gemm_batch(*handle, + oneapi::mkl::blas::gemm_batch(handle, transa, transb, m, diff --git a/intel_extension_for_deepspeed/op_builder/csrc/transformer/softmax_kernels.dp.cpp b/intel_extension_for_deepspeed/op_builder/csrc/transformer/softmax_kernels.cpp similarity index 95% rename from intel_extension_for_deepspeed/op_builder/csrc/transformer/softmax_kernels.dp.cpp rename to intel_extension_for_deepspeed/op_builder/csrc/transformer/softmax_kernels.cpp index 52f9755..f939d76 100644 --- a/intel_extension_for_deepspeed/op_builder/csrc/transformer/softmax_kernels.dp.cpp +++ b/intel_extension_for_deepspeed/op_builder/csrc/transformer/softmax_kernels.cpp @@ -444,7 +444,7 @@ void launch_attn_softmax(T* vals, int batch_size, int heads, int sequence_length, - queue* stream) + queue stream) { const int threads = 128; int seq_length4 = sequence_length / 4; @@ -465,7 +465,7 @@ void launch_attn_softmax(T* vals, : MAX_THREAD_ITERATIONS); if (sequence_length <= 8) - stream->submit([&](handler& cgh) { + stream.submit([&](handler& cgh) { accessor data_block_acc_ct1( range<1>(MAX_SG_NUM), cgh); cgh.parallel_for(nd_range<3>(grid_dim * block_dim, block_dim), @@ -481,7 +481,7 @@ void launch_attn_softmax(T* vals, }); }); else if (sequence_length <= 16) - stream->submit([&](handler& cgh) { + stream.submit([&](handler& cgh) { accessor data_block_acc_ct1( range<1>(MAX_SG_NUM), cgh); cgh.parallel_for(nd_range<3>(grid_dim * block_dim, block_dim), @@ -497,7 +497,7 @@ void launch_attn_softmax(T* vals, }); }); else if (sequence_length <= 32) - stream->submit([&](handler& cgh) { + stream.submit([&](handler& cgh) { accessor data_block_acc_ct1( range<1>(MAX_SG_NUM), cgh); cgh.parallel_for(nd_range<3>(grid_dim * block_dim, block_dim), @@ -513,7 +513,7 @@ void launch_attn_softmax(T* vals, }); }); else if (sequence_length <= 64) - stream->submit([&](handler& cgh) { + stream.submit([&](handler& cgh) { accessor data_block_acc_ct1( range<1>(MAX_SG_NUM), cgh); cgh.parallel_for(nd_range<3>(grid_dim * block_dim, block_dim), @@ -529,7 +529,7 @@ void launch_attn_softmax(T* vals, }); }); else if (sequence_length <= 128) - stream->submit([&](handler& cgh) { + stream.submit([&](handler& cgh) { accessor data_block_acc_ct1( range<1>(MAX_SG_NUM), cgh); 
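// Editorial sketch (not part of the patch): the launchers in these hunks are all migrated the
// same way -- each now takes sycl::queue by value (queues are cheap, reference-counted handles)
// and calls stream.submit(...) rather than stream->submit(...). Below is a minimal, self-contained
// illustration of that shape under stated assumptions: launch_copy_example and LOCAL_SCRATCH are
// hypothetical names, not code from this repo, and in/out are assumed to be USM device pointers,
// as in the surrounding kernels.
#include <sycl/sycl.hpp>

template <typename T>
void launch_copy_example(const T* in, T* out, int n, sycl::queue stream)  // queue passed by value
{
    constexpr int threads = 128;
    constexpr int LOCAL_SCRATCH = 32;  // stands in for the MAX_SG_NUM-sized accessors in the patch
    sycl::range<3> block_dim(1, 1, threads);
    sycl::range<3> grid_dim(1, 1, (n + threads - 1) / threads);

    stream.submit([&](sycl::handler& cgh) {
        // work-group local memory, analogous to the data_block_acc_ct1 accessors above
        sycl::local_accessor<float, 1> scratch(sycl::range<1>(LOCAL_SCRATCH), cgh);
        cgh.parallel_for(sycl::nd_range<3>(grid_dim * block_dim, block_dim),
                         [=](sycl::nd_item<3> item) {
                             size_t lid = item.get_local_id(2);
                             // touch the scratch only to show the accessor is usable in the kernel
                             if (lid < LOCAL_SCRATCH) scratch[lid] = 0.0f;
                             size_t gid = item.get_global_linear_id();
                             if (gid < static_cast<size_t>(n)) out[gid] = in[gid];
                         });
    });
}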
cgh.parallel_for(nd_range<3>(grid_dim * block_dim, block_dim), @@ -545,7 +545,7 @@ void launch_attn_softmax(T* vals, }); }); else if (sequence_length <= 256) - stream->submit([&](handler& cgh) { + stream.submit([&](handler& cgh) { accessor data_block_acc_ct1( range<1>(MAX_SG_NUM), cgh); cgh.parallel_for(nd_range<3>(grid_dim * block_dim, block_dim), @@ -578,7 +578,7 @@ void launch_attn_softmax(T* vals, (sequence_length < subblock_max_workload ? (seq_length4 + threads - 1) / threads : MAX_THREAD_ITERATIONS); if (sequence_length <= 512) { - stream->submit([&](handler& cgh) { + stream.submit([&](handler& cgh) { accessor data_block_acc_ct1(range<1>(MAX_SG_NUM), cgh); cgh.parallel_for( @@ -594,7 +594,7 @@ void launch_attn_softmax(T* vals, }); }); } else if (sequence_length < (MAX_THREADS * MAX_THREAD_ITERATIONS * 4)) - stream->submit([&](handler& cgh) { + stream.submit([&](handler& cgh) { accessor data_block_acc_ct1(range<1>(MAX_SG_NUM), cgh); cgh.parallel_for(nd_range<3>(grid_dim * block_dim, block_dim), @@ -621,21 +621,21 @@ template void launch_attn_softmax(float* vals, int batch_size, int heads, int sequence_length, - queue* stream); + queue stream); template void launch_attn_softmax(bf16* vals, const bf16* attn_mask, int batch_size, int heads, int sequence_length, - queue* stream); + queue stream); template void launch_attn_softmax(half* vals, const half* attn_mask, int batch_size, int heads, int sequence_length, - queue* stream); + queue stream); template void softmax_backward_kernel(T* out_grad, @@ -775,62 +775,62 @@ void launch_attn_softmax_backward_v2(T* out_grad, int batch_size, int heads, int seq_length, - queue* stream) + queue stream) { const int sgs_per_block = 4; range<3> grid_dim(1, 1, batch_size * heads * seq_length / sgs_per_block); range<3> block_dim(1, sgs_per_block, MAX_SG_NUM); if (seq_length <= 32) - stream->parallel_for(nd_range<3>(grid_dim * block_dim, block_dim), + stream.parallel_for(nd_range<3>(grid_dim * block_dim, block_dim), [=](nd_item<3> item_ct1) [[intel::reqd_sub_group_size(MAX_SG_NUM)]] { softmax_backward_kernel_v2( out_grad, soft_inp, seq_length, item_ct1); }); else if (seq_length <= 64) - stream->parallel_for(nd_range<3>(grid_dim * block_dim, block_dim), + stream.parallel_for(nd_range<3>(grid_dim * block_dim, block_dim), [=](nd_item<3> item_ct1) [[intel::reqd_sub_group_size(MAX_SG_NUM)]] { softmax_backward_kernel_v2( out_grad, soft_inp, seq_length, item_ct1); }); else if (seq_length <= 128) - stream->parallel_for(nd_range<3>(grid_dim * block_dim, block_dim), + stream.parallel_for(nd_range<3>(grid_dim * block_dim, block_dim), [=](nd_item<3> item_ct1) [[intel::reqd_sub_group_size(MAX_SG_NUM)]] { softmax_backward_kernel_v2( out_grad, soft_inp, seq_length, item_ct1); }); else if (seq_length <= 256) - stream->parallel_for(nd_range<3>(grid_dim * block_dim, block_dim), + stream.parallel_for(nd_range<3>(grid_dim * block_dim, block_dim), [=](nd_item<3> item_ct1) [[intel::reqd_sub_group_size(MAX_SG_NUM)]] { softmax_backward_kernel_v2( out_grad, soft_inp, seq_length, item_ct1); }); else if (seq_length <= 384) - stream->parallel_for(nd_range<3>(grid_dim * block_dim, block_dim), + stream.parallel_for(nd_range<3>(grid_dim * block_dim, block_dim), [=](nd_item<3> item_ct1) [[intel::reqd_sub_group_size(MAX_SG_NUM)]] { softmax_backward_kernel_v2( out_grad, soft_inp, seq_length, item_ct1); }); else if (seq_length <= 512) - stream->parallel_for(nd_range<3>(grid_dim * block_dim, block_dim), + stream.parallel_for(nd_range<3>(grid_dim * block_dim, block_dim), [=](nd_item<3> 
item_ct1) [[intel::reqd_sub_group_size(MAX_SG_NUM)]] { softmax_backward_kernel_v2( out_grad, soft_inp, seq_length, item_ct1); }); else if (seq_length <= 768) - stream->parallel_for(nd_range<3>(grid_dim * block_dim, block_dim), + stream.parallel_for(nd_range<3>(grid_dim * block_dim, block_dim), [=](nd_item<3> item_ct1) [[intel::reqd_sub_group_size(MAX_SG_NUM)]] { softmax_backward_kernel_v2( out_grad, soft_inp, seq_length, item_ct1); }); else if (seq_length <= 1024) - stream->parallel_for(nd_range<3>(grid_dim * block_dim, block_dim), + stream.parallel_for(nd_range<3>(grid_dim * block_dim, block_dim), [=](nd_item<3> item_ct1) [[intel::reqd_sub_group_size(MAX_SG_NUM)]] { softmax_backward_kernel_v2( out_grad, soft_inp, seq_length, item_ct1); }); else if (seq_length <= 2048) - stream->parallel_for(nd_range<3>(grid_dim * block_dim, block_dim), + stream.parallel_for(nd_range<3>(grid_dim * block_dim, block_dim), [=](nd_item<3> item_ct1) [[intel::reqd_sub_group_size(MAX_SG_NUM)]] { softmax_backward_kernel_v2( out_grad, soft_inp, seq_length, item_ct1); @@ -846,16 +846,16 @@ template void launch_attn_softmax_backward_v2(float* out_grad, int batch_size, int heads, int seq_length, - queue* stream); + queue stream); template void launch_attn_softmax_backward_v2(bf16* out_grad, const bf16* soft_inp, int batch_size, int heads, int seq_length, - queue* stream); + queue stream); template void launch_attn_softmax_backward_v2(half* out_grad, const half* soft_inp, int batch_size, int heads, int seq_length, - queue* stream); + queue stream); diff --git a/intel_extension_for_deepspeed/op_builder/csrc/transformer/transform_kernels.dp.cpp b/intel_extension_for_deepspeed/op_builder/csrc/transformer/transform_kernels.cpp similarity index 96% rename from intel_extension_for_deepspeed/op_builder/csrc/transformer/transform_kernels.dp.cpp rename to intel_extension_for_deepspeed/op_builder/csrc/transformer/transform_kernels.cpp index 209fc68..bf02a70 100644 --- a/intel_extension_for_deepspeed/op_builder/csrc/transformer/transform_kernels.dp.cpp +++ b/intel_extension_for_deepspeed/op_builder/csrc/transformer/transform_kernels.cpp @@ -45,13 +45,13 @@ void Transpose(const sycl::half* inp_mat, sycl::half* out_mat, int rows, int cols, - sycl::queue* stream) + sycl::queue stream) { int threads = THREADS; sycl::range<3> grid_dim(1, 1, (rows * cols + threads - 1) / threads); sycl::range<3> block_dim(1, 1, threads); - stream->submit([&](sycl::handler& cgh) { + stream.submit([&](sycl::handler& cgh) { sycl::accessor data_block_acc_ct1(sycl::range<1>(rows_trans * (cols_trans + 1)), cgh); cgh.parallel_for(sycl::nd_range<3>(grid_dim, block_dim), [=](sycl::nd_item<3> item_ct1) { @@ -62,13 +62,13 @@ void Transpose(const sycl::half* inp_mat, } template <> -void Transpose(const float* inp_mat, float* out_mat, int rows, int cols, sycl::queue* stream) +void Transpose(const float* inp_mat, float* out_mat, int rows, int cols, sycl::queue stream) { int threads = THREADS; sycl::range<3> grid_dim(1, 1, (rows * cols + threads - 1) / threads); sycl::range<3> block_dim(1, 1, threads); - stream->submit([&](sycl::handler& cgh) { + stream.submit([&](sycl::handler& cgh) { sycl::accessor data_block_acc_ct1(sycl::range<1>(rows_trans * (cols_trans + 1)), cgh); cgh.parallel_for( @@ -203,14 +203,14 @@ void launch_transform_0213(float* output, int seq_length, int hidden_dim, int heads, - sycl::queue* stream) + sycl::queue stream) { hidden_dim >>= 2; int head_ext = (hidden_dim - 1) / MAX_THREADS + 1; sycl::range<3> block_dim(1, (heads / head_ext), hidden_dim / 
heads); sycl::range<3> grid_dim(1, (seq_length * head_ext), batch_size); - stream->submit([&](sycl::handler& cgh) { + stream.submit([&](sycl::handler& cgh) { cgh.parallel_for(sycl::nd_range<3>(grid_dim * block_dim, block_dim), [=](sycl::nd_item<3> item_ct1) { transform_0213( @@ -226,14 +226,14 @@ void launch_transform_0213(bf16* output, int seq_length, int hidden_dim, int heads, - sycl::queue* stream) + sycl::queue stream) { hidden_dim >>= 2; int head_ext = (hidden_dim - 1) / MAX_THREADS + 1; sycl::range<3> block_dim(1, (heads / head_ext), hidden_dim / heads); sycl::range<3> grid_dim(1, (seq_length * head_ext), batch_size); - stream->submit([&](sycl::handler& cgh) { + stream.submit([&](sycl::handler& cgh) { cgh.parallel_for(sycl::nd_range<3>(grid_dim * block_dim, block_dim), [=](sycl::nd_item<3> item_ct1) { transform_0213( @@ -249,14 +249,14 @@ void launch_transform_0213(sycl::half* output, int seq_length, int hidden_dim, int heads, - sycl::queue* stream) + sycl::queue stream) { hidden_dim >>= 3; int head_ext = (hidden_dim - 1) / MAX_THREADS + 1; sycl::range<3> block_dim(1, (heads / head_ext), hidden_dim / heads); sycl::range<3> grid_dim(1, (seq_length * head_ext), batch_size); - stream->submit([&](sycl::handler& cgh) { + stream.submit([&](sycl::handler& cgh) { cgh.parallel_for(sycl::nd_range<3>(grid_dim * block_dim, block_dim), [=](sycl::nd_item<3> item_ct1) { transform_0213( @@ -532,7 +532,7 @@ void launch_bias_add_transform_0213(float* output, int seq_length, int hidden_dim, int heads, - sycl::queue* stream, + sycl::queue stream, int trans_count) { hidden_dim >>= 2; @@ -541,7 +541,7 @@ void launch_bias_add_transform_0213(float* output, sycl::range<3> block_dim(1, (heads / head_ext), hidden_dim / heads); sycl::range<3> grid_dim((trans_count * head_ext), seq_length, batch_size); - stream->submit([&](sycl::handler& cgh) { + stream.submit([&](sycl::handler& cgh) { cgh.parallel_for( sycl::nd_range<3>(grid_dim * block_dim, block_dim), [=](sycl::nd_item<3> item_ct1) { bias_add_transform_0213( @@ -560,7 +560,7 @@ void launch_bias_add_transform_0213(bf16* output, int seq_length, int hidden_dim, int heads, - sycl::queue* stream, + sycl::queue stream, int trans_count) { hidden_dim >>= 2; @@ -569,7 +569,7 @@ void launch_bias_add_transform_0213(bf16* output, sycl::range<3> block_dim(1, (heads / head_ext), hidden_dim / heads); sycl::range<3> grid_dim((trans_count * head_ext), seq_length, batch_size); - stream->submit([&](sycl::handler& cgh) { + stream.submit([&](sycl::handler& cgh) { cgh.parallel_for( sycl::nd_range<3>(grid_dim * block_dim, block_dim), [=](sycl::nd_item<3> item_ct1) { bias_add_transform_0213( @@ -588,7 +588,7 @@ void launch_bias_add_transform_0213(sycl::half* output, int seq_length, int hidden_dim, int heads, - sycl::queue* stream, + sycl::queue stream, int trans_count) { hidden_dim >>= 3; @@ -596,7 +596,7 @@ void launch_bias_add_transform_0213(sycl::half* output, int head_ext = (hidden_dim - 1) / MAX_THREADS + 1; sycl::range<3> block_dim(1, (heads / head_ext), hidden_dim / heads); sycl::range<3> grid_dim((trans_count * head_ext), seq_length, batch_size); - stream->submit([&](sycl::handler& cgh) { + stream.submit([&](sycl::handler& cgh) { cgh.parallel_for( sycl::nd_range<3>(grid_dim * block_dim, block_dim), [=](sycl::nd_item<3> item_ct1) { bias_add_transform_0213( @@ -608,7 +608,7 @@ void launch_bias_add_transform_0213(sycl::half* output, } else { sycl::range<3> block_dim(trans_count, heads, hidden_dim / heads); sycl::range<3> grid_dim(1, seq_length / 2, batch_size); - 
stream->submit([&](sycl::handler& cgh) { + stream.submit([&](sycl::handler& cgh) { sycl::accessor(float* out, int heads, int seq_length, int hidden_dim, - sycl::queue* stream, + sycl::queue stream, int trans_count) { hidden_dim >>= 2; sycl::range<3> grid_dims(trans_count, heads * ((seq_length - 1) / 8 + 1), batch_size); sycl::range<3> block_dims(1, 8, hidden_dim / heads); - stream->submit([&](sycl::handler& cgh) { + stream.submit([&](sycl::handler& cgh) { cgh.parallel_for( sycl::nd_range<3>(grid_dims * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) { transform4d_0213(out, in, heads, seq_length, hidden_dim, 1, item_ct1); @@ -832,13 +832,13 @@ void launch_transform4d_0213(bf16* out, int heads, int seq_length, int hidden_dim, - sycl::queue* stream, + sycl::queue stream, int trans_count) { hidden_dim >>= 2; sycl::range<3> grid_dims(trans_count, heads * ((seq_length - 1) / 8 + 1), batch_size); sycl::range<3> block_dims(1, 8, hidden_dim / heads); - stream->submit([&](sycl::handler& cgh) { + stream.submit([&](sycl::handler& cgh) { cgh.parallel_for( sycl::nd_range<3>(grid_dims * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) { transform4d_0213(out, in, heads, seq_length, hidden_dim, 1, item_ct1); @@ -856,7 +856,7 @@ void launch_transform4d_0213(sycl::half* out, int heads, int seq_length, int hidden_dim, - sycl::queue* stream, + sycl::queue stream, int trans_count) { hidden_dim >>= 3; @@ -864,7 +864,7 @@ void launch_transform4d_0213(sycl::half* out, int head_ext = (hidden_dim - 1) / MAX_THREADS + 1; sycl::range<3> grid_dims((seq_length * head_ext), trans_count, batch_size); sycl::range<3> block_dims(1, (heads / head_ext), hidden_dim / heads); - stream->submit([&](sycl::handler& cgh) { + stream.submit([&](sycl::handler& cgh) { cgh.parallel_for(sycl::nd_range<3>(grid_dims * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) { transform4d_0213( @@ -874,7 +874,7 @@ void launch_transform4d_0213(sycl::half* out, } else { sycl::range<3> grid_dims(1, seq_length / 2, batch_size); sycl::range<3> block_dims(trans_count, heads, hidden_dim / heads); - stream->submit([&](sycl::handler& cgh) { + stream.submit([&](sycl::handler& cgh) { sycl::accessor
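// Editorial sketch (not part of the patch): the layerNorm backward launchers above keep their
// "queue stream[2]" parameter; an array parameter decays to sycl::queue*, so the two queues are
// not copied -- only the element accesses change from stream[i]->submit(...) to stream[i].submit(...).
// A minimal illustration with hypothetical names (launch_two_stage_example); gamma_grad and
// inp_grad are assumed to be USM device pointers.
#include <sycl/sycl.hpp>

void launch_two_stage_example(float* gamma_grad, float* inp_grad, int n, sycl::queue streams[2])
{
    // first stage on streams[0], playing the role of the gamma/betta reduction in the patch
    streams[0].submit([&](sycl::handler& cgh) {
        cgh.parallel_for(sycl::range<1>(n), [=](sycl::id<1> i) { gamma_grad[i] += 1.0f; });
    });
    // second, independent stage on streams[1], playing the role of the input-gradient kernel
    streams[1].submit([&](sycl::handler& cgh) {
        cgh.parallel_for(sycl::range<1>(n), [=](sycl::id<1> i) { inp_grad[i] *= 0.5f; });
    });
    // the caller is expected to synchronize both queues before reusing the buffers
    streams[0].wait();
    streams[1].wait();
}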