Add EP API Stream support

chilo-ms · chilo-ms · commit 832a7f46e8b7 · 2025-08-11T13:54:52.000-07:00
diff --git a/plugin_execution_providers/tensorrt/cuda_allocator.h b/plugin_execution_providers/tensorrt/cuda_allocator.h
@@ -10,11 +10,12 @@ using DeviceId = int16_t;
 struct CUDAAllocator : OrtAllocator {
   CUDAAllocator(const OrtMemoryInfo* mem_info, DeviceId device_id) : mem_info_(mem_info), device_id_(device_id) {
     OrtAllocator::version = ORT_API_VERSION;
-    OrtAllocator::Alloc = [](OrtAllocator* this_, size_t size) {
-      return static_cast<CUDAAllocator*>(this_)->Alloc(size);
-    };
+    OrtAllocator::Alloc = [](OrtAllocator* this_, size_t size) { return static_cast<CUDAAllocator*>(this_)->Alloc(size); };
     OrtAllocator::Free = [](OrtAllocator* this_, void* p) { static_cast<CUDAAllocator*>(this_)->Free(p); };
     OrtAllocator::Info = [](const OrtAllocator* this_) { return static_cast<const CUDAAllocator*>(this_)->Info(); };
+    OrtAllocator::Reserve = nullptr;
+    OrtAllocator::GetStats = nullptr;
+    OrtAllocator::AllocOnStream = nullptr; // Allocate memory, handling usage across different Streams. Not used for TRT EP.
   }
   // TODO: Handle destructor
   //~CUDAAllocator();
@@ -41,6 +42,9 @@ struct CUDAPinnedAllocator : OrtAllocator {
     OrtAllocator::Alloc = [](OrtAllocator* this_, size_t size) { return static_cast<CUDAPinnedAllocator*>(this_)->Alloc(size); };
     OrtAllocator::Free = [](OrtAllocator* this_, void* p) { static_cast<CUDAPinnedAllocator*>(this_)->Free(p); };
     OrtAllocator::Info = [](const OrtAllocator* this_) { return static_cast<const CUDAPinnedAllocator*>(this_)->Info(); };
+    OrtAllocator::Reserve = nullptr;
+    OrtAllocator::GetStats = nullptr;
+    OrtAllocator::AllocOnStream = nullptr;
   }
   // TODO: Handle destructor
   //~CUDAPinnedAllocator();
diff --git a/plugin_execution_providers/tensorrt/tensorrt_execution_provider.cc b/plugin_execution_providers/tensorrt/tensorrt_execution_provider.cc
@@ -14,6 +14,7 @@
 #include "tensorrt_execution_provider.h"
 #include "cuda_allocator.h"
 #include "onnx_ctx_model_helper.h"
+#include "tensorrt_execution_provider_stream_support.h"
 #include "onnx/onnx_pb.h"
 #include "cuda/unary_elementwise_ops_impl.h"
 #include "ep_utils.h"
@@ -1960,6 +1961,30 @@ const char* ORT_API_CALL TensorrtExecutionProvider::GetNameImpl(const OrtEp* thi
   return ep->name_.c_str();
 }
 
+// OrtEp::CreateSyncStreamForDevice callback.
+//
+// Creates a TrtSyncStreamImpl for `memory_device` and hands ownership of it to the caller through `stream`.
+//
+// this_ptr:      the TensorrtExecutionProvider instance that registered this callback.
+// memory_device: device the stream is requested for; must report OrtDeviceMemoryType_DEFAULT.
+// stream:        receives the new stream object on success; set to nullptr on failure.
+//
+// Returns nullptr on success, otherwise an OrtStatus describing the failure.
+OrtStatus* ORT_API_CALL TensorrtExecutionProvider::CreateSyncStreamForDeviceImpl(_In_ OrtEp* this_ptr,
+                                                                                 _In_ const OrtMemoryDevice* memory_device,
+                                                                                 _Outptr_ OrtSyncStreamImpl** stream) noexcept {
+  // A per-session OrtSyncStreamImpl can be created here if the session options affect the implementation.
+  // Logging of any issues should use logger_ which is the session logger.
+
+  TensorrtExecutionProvider* ep = static_cast<TensorrtExecutionProvider*>(this_ptr);
+
+  // Do not leave the out parameter indeterminate on an error path.
+  *stream = nullptr;
+
+  // This callback is noexcept, but the TrtSyncStreamImpl constructor reports CUDA failures with
+  // CUDA_CALL_THROW and the std::string / std::make_unique calls below can throw as well.
+  // An exception escaping a noexcept function calls std::terminate, so convert everything to an OrtStatus.
+  try {
+    // we only create streams for the default device memory.
+    if (auto mem_type = ep->factory_.ep_api.MemoryDevice_GetMemoryType(memory_device);
+        mem_type != OrtDeviceMemoryType_DEFAULT) {
+      std::string error = "Invalid OrtMemoryDevice. Expected OrtDeviceMemoryType_DEFAULT(0). Got ";
+      error += std::to_string(mem_type);
+      return ep->ort_api.CreateStatus(ORT_INVALID_ARGUMENT, error.c_str());
+    }
+
+    auto device_id = ep->factory_.ep_api.MemoryDevice_GetDeviceId(memory_device);
+
+    auto sync_stream = std::make_unique<TrtSyncStreamImpl>(ep->factory_, ep, device_id, nullptr);
+    *stream = sync_stream.release();
+  } catch (const std::exception& ex) {
+    return ep->ort_api.CreateStatus(ORT_EP_FAIL, ex.what());
+  } catch (...) {
+    return ep->ort_api.CreateStatus(ORT_EP_FAIL, "Unknown exception while creating the TensorRT EP sync stream.");
+  }
+
+  return nullptr;
+}
+
+
 /**
  * Refit the weight-stripped engine
  */
@@ -2070,6 +2095,7 @@ TensorrtExecutionProvider::TensorrtExecutionProvider(TensorrtExecutionProviderFa
   GetCapability = GetCapabilityImpl;
   Compile = CompileImpl;
   ReleaseNodeComputeInfos = ReleaseNodeComputeInfosImpl;
+  CreateSyncStreamForDevice = CreateSyncStreamForDeviceImpl;
 
   // Initialize the execution provider.
   auto status = ort_api.Logger_LogMessage(&logger_,
@@ -2158,7 +2184,7 @@ TensorrtExecutionProvider::TensorrtExecutionProvider(TensorrtExecutionProviderFa
     force_timing_cache_match_ = info_.force_timing_cache;
     detailed_build_log_ = info_.detailed_build_log;
     dump_ep_context_model_ = info_.dump_ep_context_model;
-    dump_ep_context_model_ = true;
+    //dump_ep_context_model_ = true;
     ep_context_file_path_ = info_.ep_context_file_path;
     ep_context_embed_mode_ = info_.ep_context_embed_mode;
     enable_engine_cache_for_ep_context_model();
@@ -2378,7 +2404,6 @@ void ORT_API_CALL TensorrtExecutionProvider::ReleaseNodeComputeInfosImpl(OrtEp*
   }
 }
 
-
 //
 // Implementation of TRTEpNodeComputeInfo
 //
@@ -2487,7 +2512,7 @@ OrtStatus* TRTEpNodeComputeInfo::ComputeImpl(OrtNodeComputeInfo* this_ptr, void*
   cudaStream_t stream = static_cast<cudaStream_t>(cuda_stream);
 
   //cudaStream_t stream;
-  cudaStreamCreate(&stream);
+  //cudaStreamCreate(&stream);
 
   // Name the engine cache based on GPU compute capacity and reduce the chance of loading an incompatible cache
   // Note: Engine cache generated on a GPU with large memory might not be loadable on a GPU with smaller memory, even
@@ -3053,6 +3078,9 @@ void TRTEpNodeComputeInfo::ReleaseStateImpl(OrtNodeComputeInfo* this_ptr, void*
   // Do nothing for here.
 }
 
+//
+// Implementation of TRTEpEpContextNodeComputeInfo
+//
 TRTEpEpContextNodeComputeInfo::TRTEpEpContextNodeComputeInfo(TensorrtExecutionProvider& ep) : ep(ep) {
   ort_version_supported = ORT_API_VERSION;
   CreateState = CreateStateImpl;
diff --git a/plugin_execution_providers/tensorrt/tensorrt_execution_provider.h b/plugin_execution_providers/tensorrt/tensorrt_execution_provider.h
@@ -227,7 +227,7 @@ static const std::string k_ep_ctx_onnx_model_filename = "onnx_model_filename";
 
 /// <summary>
 /// 
-/// Plugin TensorRT EP
+/// Plugin TensorRT EP implementing OrtEp.
 ///
 /// </summary>
 struct TensorrtExecutionProvider : public OrtEp, public ApiPtrs {
@@ -311,6 +311,8 @@ struct TensorrtExecutionProvider : public OrtEp, public ApiPtrs {
   std::unordered_map<std::string, std::string> trt_node_name_with_precision_;
   std::unordered_map<std::string, std::unordered_map<std::string, float>> dynamic_range_map_;
   std::unordered_map<std::string, std::string> cache_suffix_;
+  bool external_stream_ = false;
+  cudaStream_t stream_ = nullptr;
 
  private:
   static const char* ORT_API_CALL GetNameImpl(const OrtEp* this_ptr) noexcept;
@@ -323,12 +325,11 @@ struct TensorrtExecutionProvider : public OrtEp, public ApiPtrs {
   static void ORT_API_CALL ReleaseNodeComputeInfosImpl(OrtEp* this_ptr, OrtNodeComputeInfo** node_compute_infos,
                                                        size_t num_node_compute_infos) noexcept;
 
-  OrtStatus* CreateEpContextNodes(gsl::span<const OrtNode*> fused_nodes,
-                                  /*out*/ gsl::span<OrtNode*> ep_context_nodes);
+  static OrtStatus* ORT_API_CALL CreateSyncStreamForDeviceImpl(_In_ OrtEp* this_ptr,
+                                                               _In_ const OrtMemoryDevice* memory_device,
+                                                               _Outptr_ OrtSyncStreamImpl** stream) noexcept;
 
   mutable TensorrtExecutionProviderInfo info_;
-  bool external_stream_ = false;
-  cudaStream_t stream_ = nullptr;
   int max_partition_iterations_ = 1000;
   size_t min_subgraph_size_ = 1;
   size_t max_workspace_size_ = 1 << 30;  // 1GB
diff --git a/plugin_execution_providers/tensorrt/tensorrt_execution_provider_stream_support.cc b/plugin_execution_providers/tensorrt/tensorrt_execution_provider_stream_support.cc
@@ -0,0 +1,119 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+#include "tensorrt_execution_provider_stream_support.h"
+#include "tensorrt_provider_factory.h"
+#include "tensorrt_execution_provider.h"
+
+#include "cuda/cuda_common.h"
+#include "cuda/cuda_call.h"
+
+//
+// TrtSyncStreamImpl implementation
+//
+
+// Builds the stream object and wires up the OrtSyncStreamImpl callbacks.
+//
+// factory:        provides the API pointers copied into the ApiPtrs base; its address is kept in factory_.
+// ep:             owning EP instance, or nullptr when the stream is created outside of an inference session.
+// device_id:      CUDA device on which a new stream is created when no external stream is used.
+// stream_options: currently unused.
+//
+// When the EP was configured with an external stream, that stream is borrowed (own_stream_ == false).
+// Otherwise a new non-blocking CUDA stream is created and owned by this object.
+// CUDA failures are reported by CUDA_CALL_THROW, so callers in noexcept code must catch.
+TrtSyncStreamImpl::TrtSyncStreamImpl(TensorrtExecutionProviderFactory& factory, const OrtEp* ep, uint32_t device_id, const OrtKeyValuePairs* /*stream_options*/)
+    : ApiPtrs(factory), ep_{ep}, factory_{&factory} {
+  ort_version_supported = ORT_API_VERSION;
+  CreateNotification = CreateNotificationImpl;
+  GetHandle = GetHandleImpl;
+  Flush = FlushImpl;
+  OnSessionRunEnd = OnSessionRunEndImpl;
+  Release = ReleaseImpl;
+
+  // ep_ is documented as possibly nullptr (stream created for data copies outside a session),
+  // so it must be checked before reading the EP's external stream settings.
+  const TensorrtExecutionProvider* trt_ep = static_cast<const TensorrtExecutionProvider*>(ep_);
+  if (trt_ep != nullptr && trt_ep->external_stream_) {
+    stream_ = trt_ep->stream_;
+    own_stream_ = false;
+  } else {
+    // NOTE(review): cudaSetDevice changes the calling thread's current device and it is not restored here.
+    CUDA_CALL_THROW(cudaSetDevice(static_cast<int>(device_id)));
+    cudaStream_t stream = nullptr;
+    CUDA_CALL_THROW(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));
+    stream_ = stream;
+    own_stream_ = true;
+  }
+}
+
+// OrtSyncStreamImpl::CreateNotification callback.
+// Creates a TrtSyncNotificationImpl tied to this stream and transfers ownership of it to the caller
+// through `notification`. Returns nullptr on success.
+/*static*/
+OrtStatus* ORT_API_CALL TrtSyncStreamImpl::CreateNotificationImpl(_In_ OrtSyncStreamImpl* this_ptr,
+                                                                  _Outptr_ OrtSyncNotificationImpl** notification) noexcept {
+  // Recover the concrete object behind the C-style callback pointer.
+  TrtSyncStreamImpl& impl = *static_cast<TrtSyncStreamImpl*>(this_ptr);
+
+  // `impl` is also passed as the ApiPtrs source for the new notification.
+  // RETURN_IF_ERROR is expected to return early with the failing status (confirm against its definition).
+  std::unique_ptr<TrtSyncNotificationImpl> trt_sync_notification{};
+  RETURN_IF_ERROR(TrtSyncNotificationImpl::Create(impl.stream_, impl, trt_sync_notification));
+
+  // Give up ownership; the notification's Release callback deletes the object.
+  OrtSyncNotificationImpl* raw_notification = trt_sync_notification.release();
+  *notification = raw_notification;
+  return nullptr;
+}
+
+// OrtSyncStreamImpl::GetHandle callback.
+// Returns the cudaStream_t held by this object as an opaque void* handle.
+/*static*/
+void* ORT_API_CALL TrtSyncStreamImpl::GetHandleImpl(_In_ OrtSyncStreamImpl* this_ptr) noexcept {
+  TrtSyncStreamImpl* self = static_cast<TrtSyncStreamImpl*>(this_ptr);
+  cudaStream_t cuda_stream = self->stream_;
+  return static_cast<void*>(cuda_stream);
+}
+
+// OrtSyncStreamImpl::Flush callback.
+// Waits on the host until the work queued on this stream has completed. Externally supplied streams
+// (own_stream_ == false) are not synchronized here.
+// Returns nullptr on success, otherwise an OrtStatus for the CUDA failure.
+/*static*/
+OrtStatus* ORT_API_CALL TrtSyncStreamImpl::FlushImpl(_In_ OrtSyncStreamImpl* this_ptr) noexcept {
+  auto& impl = *static_cast<TrtSyncStreamImpl*>(this_ptr);
+
+  // only flush when we own the stream, not external
+  if (impl.own_stream_) {
+    // This callback is noexcept: a throwing check would call std::terminate on any CUDA error.
+    // Report the failure through the returned OrtStatus instead, like the notification callbacks do.
+    CUDA_RETURN_IF_ERROR(cudaStreamSynchronize(static_cast<cudaStream_t>(impl.stream_)));
+  }
+  return nullptr;
+}
+
+// OrtSyncStreamImpl::OnSessionRunEnd callback.
+// This implementation does no per-run work; it always returns nullptr.
+/*static*/
+OrtStatus* ORT_API_CALL TrtSyncStreamImpl::OnSessionRunEndImpl(_In_ OrtSyncStreamImpl* this_ptr) noexcept {
+  static_cast<void>(this_ptr);  // intentionally unused
+  return nullptr;
+}
+
+// OrtSyncStreamImpl::Release callback: lets the EP library release any internal state.
+// Deletes the TrtSyncStreamImpl that was handed out by CreateSyncStreamForDeviceImpl.
+/*static*/
+void ORT_API_CALL TrtSyncStreamImpl::ReleaseImpl(_In_ OrtSyncStreamImpl* this_ptr) noexcept {
+  // Cast back to the concrete type first so the object is deleted through its real type.
+  TrtSyncStreamImpl* self = static_cast<TrtSyncStreamImpl*>(this_ptr);
+  delete self;
+}
+
+//
+// Notification support
+//
+
+// Factory for TrtSyncNotificationImpl.
+//
+// stream:       CUDA stream the notification will record its event on.
+// apis:         API pointers copied into the new object.
+// notification: receives the new object; only assigned after the CUDA event was created.
+//
+// Returns nullptr on success. CUDA_RETURN_IF_ERROR is expected to return the failure status early,
+// in which case the partially built object is destroyed by its unique_ptr.
+/*static*/
+OrtStatus* TrtSyncNotificationImpl::Create(cudaStream_t stream, const ApiPtrs& apis,
+                                           std::unique_ptr<TrtSyncNotificationImpl>& notification) {
+  std::unique_ptr<TrtSyncNotificationImpl> trt_sync_notification =
+      std::make_unique<TrtSyncNotificationImpl>(stream, apis);
+
+  // Timing is disabled: the event is only used for ordering, not for measuring elapsed time.
+  CUDA_RETURN_IF_ERROR(cudaEventCreateWithFlags(&trt_sync_notification->event_, cudaEventDisableTiming));
+
+  notification = std::move(trt_sync_notification);
+  return nullptr;
+}
+
+// OrtSyncNotificationImpl::Activate callback.
+// Records this notification's CUDA event on the stream it was created for.
+// Returns nullptr on success.
+/*static*/
+OrtStatus* ORT_API_CALL TrtSyncNotificationImpl::ActivateImpl(_In_ OrtSyncNotificationImpl* this_ptr) noexcept {
+  TrtSyncNotificationImpl& impl = *static_cast<TrtSyncNotificationImpl*>(this_ptr);
+
+  CUDA_RETURN_IF_ERROR(cudaEventRecord(impl.event_, impl.stream_));
+  return nullptr;
+}
+
+// OrtSyncNotificationImpl::WaitOnDevice callback.
+// Makes `stream` wait for this notification's event without blocking the host.
+// Returns nullptr on success.
+/*static*/
+OrtStatus* ORT_API_CALL TrtSyncNotificationImpl::WaitOnDeviceImpl(_In_ OrtSyncNotificationImpl* this_ptr,
+                                                                  _In_ OrtSyncStream* stream) noexcept {
+  TrtSyncNotificationImpl& impl = *static_cast<TrtSyncNotificationImpl*>(this_ptr);
+
+  // Native handle of the waiting stream, obtained through the ORT API.
+  // NOTE(review): the cast below assumes the handle is a cudaStream_t - confirm for cross-EP waits.
+  void* handle = impl.ort_api.SyncStream_GetHandle(stream);
+
+  CUDA_RETURN_IF_ERROR(cudaStreamWaitEvent(static_cast<cudaStream_t>(handle), impl.event_));
+  return nullptr;
+}
+
+// OrtSyncNotificationImpl::WaitOnHost callback.
+// Blocks the calling host thread until this notification's event has completed.
+// Returns nullptr on success.
+/*static*/
+OrtStatus* ORT_API_CALL TrtSyncNotificationImpl::WaitOnHostImpl(_In_ OrtSyncNotificationImpl* this_ptr) noexcept {
+  TrtSyncNotificationImpl& impl = *static_cast<TrtSyncNotificationImpl*>(this_ptr);
+
+  CUDA_RETURN_IF_ERROR(cudaEventSynchronize(impl.event_));
+  return nullptr;
+}
+
+// OrtSyncNotificationImpl::Release callback.
+// Deletes the notification object that Create() allocated.
+/*static*/
+void ORT_API_CALL TrtSyncNotificationImpl::ReleaseImpl(_In_ OrtSyncNotificationImpl* this_ptr) noexcept {
+  TrtSyncNotificationImpl* self = static_cast<TrtSyncNotificationImpl*>(this_ptr);
+  delete self;
+}
diff --git a/plugin_execution_providers/tensorrt/tensorrt_execution_provider_stream_support.h b/plugin_execution_providers/tensorrt/tensorrt_execution_provider_stream_support.h
@@ -0,0 +1,62 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+#pragma once
+
+#include "onnxruntime_c_api.h"
+#include "tensorrt_provider_factory.h"
+#include "ep_utils.h"
+
+#include <cuda_runtime_api.h>
+
+//
+// Class implementing Stream support for synchronization.
+//
+struct TrtSyncStreamImpl : public OrtSyncStreamImpl, public ApiPtrs {
+  // Creates the stream object. Borrows the EP's external stream when one is configured, otherwise
+  // creates a new CUDA stream on `device_id`. May throw on CUDA failure (see the definition).
+  TrtSyncStreamImpl(TensorrtExecutionProviderFactory& factory,
+                    const OrtEp* ep,
+                    uint32_t device_id,
+                    const OrtKeyValuePairs* /*stream_options*/);
+
+  // Destroys the CUDA stream when this object created it; borrowed (external) streams are left alone.
+  // The result of cudaStreamDestroy is deliberately ignored: a destructor has no way to report it.
+  ~TrtSyncStreamImpl() {
+    if (own_stream_ && stream_ != nullptr) {
+      static_cast<void>(cudaStreamDestroy(stream_));
+    }
+  }
+
+  // The object may own a CUDA stream, so copying it would destroy the same stream twice.
+  TrtSyncStreamImpl(const TrtSyncStreamImpl&) = delete;
+  TrtSyncStreamImpl& operator=(const TrtSyncStreamImpl&) = delete;
+
+ private:
+  // Callbacks installed into the OrtSyncStreamImpl function table by the constructor.
+  static OrtStatus* ORT_API_CALL CreateNotificationImpl(_In_ OrtSyncStreamImpl* this_ptr,
+                                                        _Outptr_ OrtSyncNotificationImpl** sync_notification) noexcept;
+  static void* ORT_API_CALL GetHandleImpl(_In_ OrtSyncStreamImpl* this_ptr) noexcept;
+  static OrtStatus* ORT_API_CALL FlushImpl(_In_ OrtSyncStreamImpl* this_ptr) noexcept;
+  static OrtStatus* ORT_API_CALL OnSessionRunEndImpl(_In_ OrtSyncStreamImpl* this_ptr) noexcept;
+  static void ORT_API_CALL ReleaseImpl(_In_ OrtSyncStreamImpl* this_ptr) noexcept;
+
+  // EP instance if the stream is being created internally for inferencing.
+  // nullptr when the stream is created outside of an inference session for data copies.
+  const OrtEp* ep_;
+  TensorrtExecutionProviderFactory* factory_{nullptr};
+
+  // CUDA stream exposed through GetHandle.
+  cudaStream_t stream_{nullptr};
+  // true when stream_ was created by this object and must be destroyed by it.
+  bool own_stream_{true};
+};
+
+//
+// Class implementing synchronization notification support.
+//
+struct TrtSyncNotificationImpl : public OrtSyncNotificationImpl, public ApiPtrs {
+  // Allocates a notification for `stream` and creates its CUDA event.
+  // `notification` is only assigned on success; returns nullptr on success.
+  static OrtStatus* Create(cudaStream_t stream, const ApiPtrs& apis,
+                           std::unique_ptr<TrtSyncNotificationImpl>& notification);
+
+  // Stores the stream handle and installs the OrtSyncNotificationImpl callbacks.
+  // The CUDA event is NOT created here; use Create() to obtain a usable notification.
+  // The base class is initialized before the member, matching the actual initialization order.
+  TrtSyncNotificationImpl(cudaStream_t stream, const ApiPtrs& apis) : ApiPtrs(apis), stream_(stream) {
+    ort_version_supported = ORT_API_VERSION;
+    Activate = ActivateImpl;
+    Release = ReleaseImpl;
+    WaitOnDevice = WaitOnDeviceImpl;
+    WaitOnHost = WaitOnHostImpl;
+  }
+
+  // Destroys the CUDA event if one was created. The result of cudaEventDestroy is deliberately
+  // ignored: a destructor has no way to report it.
+  ~TrtSyncNotificationImpl() {
+    if (event_ != nullptr) {
+      static_cast<void>(cudaEventDestroy(event_));
+    }
+  }
+
+  // The object owns a CUDA event, so copying it would destroy the same event twice.
+  TrtSyncNotificationImpl(const TrtSyncNotificationImpl&) = delete;
+  TrtSyncNotificationImpl& operator=(const TrtSyncNotificationImpl&) = delete;
+
+ private:
+  // Callbacks installed into the OrtSyncNotificationImpl function table by the constructor.
+  static OrtStatus* ORT_API_CALL ActivateImpl(_In_ OrtSyncNotificationImpl* this_ptr) noexcept;
+  static OrtStatus* ORT_API_CALL WaitOnDeviceImpl(_In_ OrtSyncNotificationImpl* this_ptr,
+                                                  _In_ OrtSyncStream* stream) noexcept;
+  static OrtStatus* ORT_API_CALL WaitOnHostImpl(_In_ OrtSyncNotificationImpl* this_ptr) noexcept;
+  static void ORT_API_CALL ReleaseImpl(_In_ OrtSyncNotificationImpl* this_ptr) noexcept;
+
+  // Stream handle stored by value. A reference member would bind to the constructor's by-value
+  // parameter and dangle as soon as the constructor returns.
+  cudaStream_t stream_{nullptr};
+  // Event recorded by Activate and waited on by WaitOnDevice / WaitOnHost; nullptr until Create() succeeds.
+  cudaEvent_t event_{nullptr};
+};
diff --git a/plugin_execution_providers/tensorrt/tensorrt_provider_factory.cc b/plugin_execution_providers/tensorrt/tensorrt_provider_factory.cc
@@ -249,7 +249,7 @@ OrtStatus* ORT_API_CALL TensorrtExecutionProviderFactory::CreateDataTransferImpl
 }
 
 bool ORT_API_CALL TensorrtExecutionProviderFactory::IsStreamAwareImpl(const OrtEpFactory* /*this_ptr*/) noexcept {
-  return false;
+  return true;
 }
 
 // To make symbols visible on macOS/iOS
diff --git a/plugin_execution_providers/tensorrt/tensorrt_provider_factory.h b/plugin_execution_providers/tensorrt/tensorrt_provider_factory.h
@@ -55,8 +55,6 @@ struct TensorrtExecutionProviderFactory : public OrtEpFactory, public ApiPtrs {
 
   static bool ORT_API_CALL IsStreamAwareImpl(const OrtEpFactory* /*this_ptr*/) noexcept;
 
-  void SetGPUDataTransfer(std::unique_ptr<TRTEpDataTransfer> gpu_data_transfer);
-
   const std::string ep_name_;           // EP name
   const std::string vendor_{"Nvidia"};  // EP vendor name
   const std::string ep_version_{"0.1.0"};  // EP version
diff --git a/plugin_execution_providers/tensorrt/utils/cuda/cuda_call.h b/plugin_execution_providers/tensorrt/utils/cuda/cuda_call.h
@@ -60,3 +60,4 @@ std::conditional_t<THRW, void, OrtStatus*> CudaCall(
     //ERRTYPE retCode, const char* exprString, const char* libName, ERRTYPE successCode, const char* msg, const char* file, const int line);
 
 #define CUDA_CALL(expr) (CudaCall<cudaError, false>((expr), #expr, "CUDA", cudaSuccess, "", __FILE__, __LINE__))
+#define CUDA_CALL_THROW(expr) (CudaCall<cudaError, true>((expr), #expr, "CUDA", cudaSuccess, "", __FILE__, __LINE__))
diff --git a/plugin_execution_providers/tensorrt/utils/helper.ccc b/plugin_execution_providers/tensorrt/utils/helper.ccc
diff --git a/plugin_execution_providers/tensorrt/utils/status.ccc b/plugin_execution_providers/tensorrt/utils/status.ccc

Original file line number	Diff line number	Diff line change
`@@ -249,7 +249,7 @@ OrtStatus* ORT_API_CALL TensorrtExecutionProviderFactory::CreateDataTransferImpl`
`249`	`249`	`}`
`250`	`250`
`251`	`251`	`bool ORT_API_CALL TensorrtExecutionProviderFactory::IsStreamAwareImpl(const OrtEpFactory* /*this_ptr*/) noexcept {`
`252`		`- return false;`
	`252`	`+ return true;`
`253`	`253`	`}`
`254`	`254`
`255`	`255`	`// To make symbols visible on macOS/iOS`
Original file line number	Diff line number	Diff line change
`@@ -60,3 +60,4 @@ std::conditional_t<THRW, void, OrtStatus*> CudaCall(`
`60`	`60`	`//ERRTYPE retCode, const char* exprString, const char* libName, ERRTYPE successCode, const char* msg, const char* file, const int line);`
`61`	`61`
`62`	`62`	`#define CUDA_CALL(expr) (CudaCall<cudaError, false>((expr), #expr, "CUDA", cudaSuccess, "", __FILE__, __LINE__))`
	`63`	`+#define CUDA_CALL_THROW(expr) (CudaCall<cudaError, true>((expr), #expr, "CUDA", cudaSuccess, "", __FILE__, __LINE__))`