@@ -104,8 +104,8 @@ void setup_input_tensors(
   for (size_t i = 0; i < inputs.size(); i++) {
     std::string name = compiled_engine->in_binding_names[i];

-    TORCHTRT_CHECK(
-        inputs[i].is_cuda(), "Expected input tensors to have device cuda, found device " << inputs[i].device());
+    // TORCHTRT_CHECK(
+    //     inputs[i].is_cuda(), "Expected input tensors to have device cuda, found device " << inputs[i].device());

     auto expected_type =
         util::TRTDataTypeToScalarType(compiled_engine->exec_ctx->getEngine().getTensorDataType(name.c_str()));
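
For context, the check disabled above is the runtime's guard that every input already resides on the GPU. A minimal standalone sketch of the same validation, using a hypothetical helper in place of the TORCHTRT_CHECK macro (which additionally prepends file/line context):

    #include <sstream>
    #include <stdexcept>
    #include <ATen/ATen.h>

    // Hypothetical stand-in for the disabled TORCHTRT_CHECK: reject any input
    // tensor that is not resident on a CUDA device.
    inline void check_input_is_cuda(const at::Tensor& t) {
      if (!t.is_cuda()) {
        std::ostringstream ss;
        ss << "Expected input tensors to have device cuda, found device " << t.device();
        throw std::runtime_error(ss.str());
      }
    }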
@@ -202,30 +202,30 @@ void create_output_allocator(c10::intrusive_ptr<TRTEngine> compiled_engine) {

 std::vector<at::Tensor> execute_engine(std::vector<at::Tensor> inputs, c10::intrusive_ptr<TRTEngine> compiled_engine) {
   auto run_standard_execution = [&]() {
-    bool cudagraphs_enabled = (CUDAGRAPHS_MODE == SUBGRAPH_CUDAGRAPHS);
-    bool shape_changed = _validate_shapes(inputs, compiled_engine);
+    bool cudagraphs_enabled = false; // (CUDAGRAPHS_MODE == SUBGRAPH_CUDAGRAPHS);
+    bool shape_changed = false; // _validate_shapes(inputs, compiled_engine);

     // Whether cudagraphs needs to record the graph on this pass
     auto result = compiled_engine->runtime_states.set_runtime_states(
         cudagraphs_enabled, compiled_engine->use_pre_allocated_outputs, shape_changed);

-    bool need_cudagraphs_record = std::get<0>(result);
+    bool need_cudagraphs_record = false; // std::get<0>(result);
     bool can_use_pre_allocated_outputs = std::get<1>(result);
     bool need_cudagraphs_reset = std::get<2>(result);

-    if (need_cudagraphs_reset) {
-      compiled_engine->cudagraph.reset();
-    }
+    // if (need_cudagraphs_reset) {
+    //   compiled_engine->cudagraph.reset();
+    // }

-    std::vector<at::Tensor> outputs(compiled_engine->num_io.second);
+    std::vector<at::Tensor> outputs;

     // Intialize inputs and outputs to be available throughout the succeeding scopes
     { // Input Setup
-      std::unique_ptr<torch::autograd::profiler::RecordProfile> input_profiler_guard;
-      if (compiled_engine->profile_execution) {
-        input_profiler_guard =
-            std::make_unique<torch::autograd::profiler::RecordProfile>(compiled_engine->input_profile_path);
-      }
+      // std::unique_ptr<torch::autograd::profiler::RecordProfile> input_profiler_guard;
+      // if (compiled_engine->profile_execution) {
+      //   input_profiler_guard =
+      //       std::make_unique<torch::autograd::profiler::RecordProfile>(compiled_engine->input_profile_path);
+      // }

       setup_input_tensors(inputs, compiled_engine, cudagraphs_enabled, need_cudagraphs_record);
       // Check if input shapes can be inferred.
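
The blocks commented out throughout this patch are RAII profiling guards: while a guard is alive, profiling events for the scope are collected, and the trace is flushed to the engine's profile path when it is destroyed. A minimal sketch of the pattern, using the same torch::autograd::profiler::RecordProfile type as the original code (header path assumed):

    #include <torch/csrc/autograd/profiler.h>
    #include <memory>
    #include <string>

    // Profiling is scoped by the guard's lifetime: it starts at construction
    // and the trace is written to `trace_path` when the guard is destroyed.
    void run_profiled(bool profile_execution, const std::string& trace_path) {
      std::unique_ptr<torch::autograd::profiler::RecordProfile> guard;
      if (profile_execution) {
        guard = std::make_unique<torch::autograd::profiler::RecordProfile>(trace_path);
      }
      // ... the work to be profiled goes here ...
    } // guard destroyed here: profiling stops and the trace file is flushed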
@@ -240,72 +240,71 @@ std::vector<at::Tensor> execute_engine(std::vector<at::Tensor> inputs, c10::intr
     }

     { // Output Setup
-      std::unique_ptr<torch::autograd::profiler::RecordProfile> output_profiler_guard;
-      if (compiled_engine->profile_execution) {
-        output_profiler_guard =
-            std::make_unique<torch::autograd::profiler::RecordProfile>(compiled_engine->output_profile_path);
-      }
+      bool new_outputs = false;
+      // std::unique_ptr<torch::autograd::profiler::RecordProfile> output_profiler_guard;
+      // if (compiled_engine->profile_execution) {
+      //   output_profiler_guard =
+      //       std::make_unique<torch::autograd::profiler::RecordProfile>(compiled_engine->output_profile_path);
+      // }
       if (can_use_pre_allocated_outputs) {
         outputs = compiled_engine->pre_allocated_outputs;
       } else {
-        outputs = create_output_tensors(compiled_engine);
+        if (compiled_engine->allocated_outputs.size() == 0) {
+          compiled_engine->allocated_outputs = create_output_tensors(compiled_engine);
+          std::cout << "new_outputs";
+          new_outputs = true;
+        }
+        outputs = compiled_engine->allocated_outputs;
       }

-      for (auto output_indices : compiled_engine->out_binding_map) {
-        auto pyt_idx = output_indices.second;
-        std::string name = compiled_engine->out_binding_names[pyt_idx];
-        if (need_cudagraphs_record) {
-          // If we are recording the cuda graph then we need to update the persistent output buffer
-          compiled_engine->output_buffers[pyt_idx] = std::move(outputs[pyt_idx].clone());
-        }
+      if (new_outputs) {
+        for (auto output_indices : compiled_engine->out_binding_map) {
+          auto pyt_idx = output_indices.second;
+          std::string name = compiled_engine->out_binding_names[pyt_idx];
+          if (need_cudagraphs_record) {
+            // If we are recording the cuda graph then we need to update the persistent output buffer
+            compiled_engine->output_buffers[pyt_idx] = std::move(outputs[pyt_idx].clone());
+          }

-        if (cudagraphs_enabled) {
-          TORCHTRT_CHECK(
-              compiled_engine->exec_ctx->setTensorAddress(
-                  name.c_str(), compiled_engine->output_buffers[pyt_idx].data_ptr()),
-              "Error while setting the output tensor address");
-        } else {
-          TORCHTRT_CHECK(
-              compiled_engine->exec_ctx->setTensorAddress(name.c_str(), outputs[pyt_idx].data_ptr()),
-              "Error while setting the output tensor address");
+          if (cudagraphs_enabled) {
+            TORCHTRT_CHECK(
+                compiled_engine->exec_ctx->setTensorAddress(
+                    name.c_str(), compiled_engine->output_buffers[pyt_idx].data_ptr()),
+                "Error while setting the output tensor address");
+          } else {
+            TORCHTRT_CHECK(
+                compiled_engine->exec_ctx->setTensorAddress(name.c_str(), outputs[pyt_idx].data_ptr()),
+                "Error while setting the output tensor address");
+          }
         }
       }
     }

-    auto current_device_id = -1;
-    if (inputs.size() > 0) {
-      current_device_id = inputs[0].device().index(); // Done this way to avoid a call to cudart
-    } else if (outputs.size() > 0) {
-      current_device_id = outputs[0].device().index(); // Done this way to avoid a call to cudart
-    }
-
-    compiled_engine->caller_stream = c10::cuda::getCurrentCUDAStream(current_device_id);
-    if (compiled_engine->engine_stream == c10::cuda::getDefaultCUDAStream(current_device_id)) {
-      // Create a new stream if the engine stream is the default stream
-      compiled_engine->engine_stream = c10::cuda::getStreamFromPool(false, current_device_id);
-    }
+    // auto current_device_id = -1;
+    // if (inputs.size() > 0) {
+    //   current_device_id = inputs[0].device().index(); // Done this way to avoid a call to cudart
+    //   if (current_device_id != compiled_engine->current_device_id) {
+    //     compiled_engine->stream = c10::cuda::getCurrentCUDAStream(current_device_id);
+    //   }
+    // }

     { // Engine Execution (execute on engine stream)
-      c10::cuda::CUDAStreamGuard stream_guard(compiled_engine->engine_stream);

-      std::unique_ptr<torch::autograd::profiler::RecordProfile> enqueue_profiler_guard;
-      if (compiled_engine->profile_execution) {
-        enqueue_profiler_guard =
-            std::make_unique<torch::autograd::profiler::RecordProfile>(compiled_engine->enqueue_profile_path);
-      }
+      // std::unique_ptr<torch::autograd::profiler::RecordProfile> enqueue_profiler_guard;
+      // if (compiled_engine->profile_execution) {
+      //   enqueue_profiler_guard =
+      //       std::make_unique<torch::autograd::profiler::RecordProfile>(compiled_engine->enqueue_profile_path);
+      // }
+

-      // Block engine stream until results are available on caller stream
-      at::cuda::CUDAEvent caller_exec_complete;
-      caller_exec_complete.record(compiled_engine->caller_stream);
-      caller_exec_complete.block(compiled_engine->engine_stream);

       if (!cudagraphs_enabled) {
         // Direct execution uses the caller buffers directly
-        compiled_engine->exec_ctx->enqueueV3(compiled_engine->engine_stream);
+        compiled_engine->exec_ctx->enqueueV3(compiled_engine->stream);
       } else {
         if (need_cudagraphs_record) {
           // If cudagraphs needs to record a graph, capture the enqueueV3 call in a graph
-          c10::cuda::CUDAStream recording_stream = compiled_engine->engine_stream;
+          c10::cuda::CUDAStream recording_stream = compiled_engine->stream;
           compiled_engine->cudagraph.capture_begin();
           compiled_engine->exec_ctx->enqueueV3(recording_stream);
           compiled_engine->cudagraph.capture_end();
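
For readers unfamiliar with the cudagraphs branch above: a CUDA graph is recorded by bracketing the enqueue between capture_begin() and capture_end() on a non-default stream, and later invocations replay the recording without relaunch overhead. A minimal standalone sketch with ATen's CUDA graph API, using an in-place add as a stand-in for enqueueV3:

    #include <ATen/ATen.h>
    #include <ATen/cuda/CUDAGraph.h>
    #include <c10/cuda/CUDAStream.h>
    #include <c10/cuda/CUDAGuard.h>

    void record_and_replay() {
      at::Tensor buf = at::zeros({1024}, at::kCUDA);
      at::cuda::CUDAGraph graph;
      // Graph capture requires a side stream, not the default stream
      c10::cuda::CUDAStream capture_stream = c10::cuda::getStreamFromPool();
      {
        c10::cuda::CUDAStreamGuard guard(capture_stream);
        graph.capture_begin();
        buf.add_(1); // work recorded into the graph (the runtime records enqueueV3 here)
        graph.capture_end();
      }
      graph.replay(); // re-executes the captured kernels on demand
    }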
@@ -321,27 +320,22 @@ std::vector<at::Tensor> execute_engine(std::vector<at::Tensor> inputs, c10::intr
     } // End engine exeuction (resets to caller stream)

     // Create output buffer for next execution of graph or trt context.
-    if (compiled_engine->use_pre_allocated_outputs) {
-      compiled_engine->pre_allocated_outputs = create_output_tensors(compiled_engine);
-    }
-
-    // Block caller stream until engine execution is complete
-    at::cuda::CUDAEvent trt_exec_complete;
-    trt_exec_complete.record(compiled_engine->engine_stream);
-    trt_exec_complete.block(compiled_engine->caller_stream);
-
-    if (cudagraphs_enabled) {
-      // If in CUDAGraph mode, results need to be copied to the result buffers (on caller stream)
-      for (size_t o = 0; o < compiled_engine->output_buffers.size(); o++) {
-        outputs[o].copy_(compiled_engine->output_buffers[o], false);
-      }
-    }
-
-    if (compiled_engine->profile_execution) {
-      LOG_INFO(std::endl << *compiled_engine->trt_engine_profiler);
-      dump_trace(compiled_engine->trt_engine_profile_path, *compiled_engine->trt_engine_profiler);
-      compiled_engine->dump_engine_layer_info();
-    }
+    // if (compiled_engine->use_pre_allocated_outputs) {
+    //   compiled_engine->pre_allocated_outputs = create_output_tensors(compiled_engine);
+    // }
+
+    // if (cudagraphs_enabled) {
+    //   // If in CUDAGraph mode, results need to be copied to the result buffers (on caller stream)
+    //   for (size_t o = 0; o < compiled_engine->output_buffers.size(); o++) {
+    //     outputs[o].copy_(compiled_engine->output_buffers[o], false);
+    //   }
+    // }
+
+    // if (compiled_engine->profile_execution) {
+    //   LOG_INFO(std::endl << *compiled_engine->trt_engine_profiler);
+    //   dump_trace(compiled_engine->trt_engine_profile_path, *compiled_engine->trt_engine_profiler);
+    //   compiled_engine->dump_engine_layer_info();
+    // }

     return outputs;
   };
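
The net effect of this lambda after the patch is an allocate-once output cache: create_output_tensors runs only on the first call, later calls reuse the same tensors, and the new_outputs flag gates the setTensorAddress rebinding so TensorRT bindings are touched only when fresh tensors were created. This is sound only while shape_changed is genuinely false. A generic sketch of the pattern, with hypothetical names:

    #include <functional>
    #include <vector>

    // Hypothetical helper: run `factory` on the first call only, then serve the
    // cached buffers. Reuse is valid only while the shapes `factory` saw persist.
    template <typename T>
    const std::vector<T>& get_or_allocate(
        std::vector<T>& cache, const std::function<std::vector<T>()>& factory) {
      if (cache.empty()) {
        cache = factory(); // first call: allocate (and rebind addresses if needed)
      }
      return cache; // subsequent calls: no allocation, no rebinding
    }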
@@ -378,45 +372,31 @@ std::vector<at::Tensor> execute_engine(std::vector<at::Tensor> inputs, c10::intr
     auto current_device_id = -1;
     if (inputs.size() > 0) {
       current_device_id = inputs[0].device().index(); // Done this way to avoid a call to cudart
-    } else {
-      current_device_id = at::cuda::current_device();
-    }
+      if (current_device_id != compiled_engine->current_device_id) {
+        compiled_engine->stream = c10::cuda::getCurrentCUDAStream(current_device_id);
+      }
+    }

-    compiled_engine->caller_stream = c10::cuda::getCurrentCUDAStream(current_device_id);
-    if (compiled_engine->engine_stream == c10::cuda::getDefaultCUDAStream(current_device_id)) {
-      // Create a new stream if the engine stream is the default stream
-      compiled_engine->engine_stream = c10::cuda::getStreamFromPool(false, current_device_id);
-    }

     { // Engine Execution (execute on engine stream)
-      c10::cuda::CUDAStreamGuard stream_guard(compiled_engine->engine_stream);
-
-      std::unique_ptr<torch::autograd::profiler::RecordProfile> enqueue_profiler_guard;
-      if (compiled_engine->profile_execution) {
-        enqueue_profiler_guard =
-            std::make_unique<torch::autograd::profiler::RecordProfile>(compiled_engine->enqueue_profile_path);
-      }

-      // Block engine stream until results are available on caller stream
-      at::cuda::CUDAEvent caller_exec_complete;
-      caller_exec_complete.record(compiled_engine->caller_stream);
-      caller_exec_complete.block(compiled_engine->engine_stream);
+      // std::unique_ptr<torch::autograd::profiler::RecordProfile> enqueue_profiler_guard;
+      // if (compiled_engine->profile_execution) {
+      //   enqueue_profiler_guard =
+      //       std::make_unique<torch::autograd::profiler::RecordProfile>(compiled_engine->enqueue_profile_path);
+      // }

       // Direct execution uses the caller buffers directly
-      compiled_engine->exec_ctx->enqueueV3(compiled_engine->engine_stream);
+      compiled_engine->exec_ctx->enqueueV3(compiled_engine->stream);

     } // End engine exeuction (resets to caller stream)

-    // Block caller stream until engine execution is complete
-    at::cuda::CUDAEvent trt_exec_complete;
-    trt_exec_complete.record(compiled_engine->engine_stream);
-    trt_exec_complete.block(compiled_engine->caller_stream);
-
-    std::unique_ptr<torch::autograd::profiler::RecordProfile> output_profiler_guard;
-    if (compiled_engine->profile_execution) {
-      output_profiler_guard =
-          std::make_unique<torch::autograd::profiler::RecordProfile>(compiled_engine->output_profile_path);
-    }
+    // std::unique_ptr<torch::autograd::profiler::RecordProfile> output_profiler_guard;
+    // if (compiled_engine->profile_execution) {
+    //   output_profiler_guard =
+    //       std::make_unique<torch::autograd::profiler::RecordProfile>(compiled_engine->output_profile_path);
+    // }
     std::vector<at::Tensor> outputs;
     for (size_t i = 0; i < compiled_engine->out_binding_names.size(); i++) {
       auto name = compiled_engine->out_binding_names[i];
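
Both execution paths now enqueue on a single cached stream rather than a dedicated engine stream synchronized against the caller's stream with CUDAEvents. The sketch below isolates that caching step; cached_stream and cached_device mirror the new TRTEngine members (stream, current_device_id) by assumption:

    #include <ATen/ATen.h>
    #include <c10/cuda/CUDAStream.h>

    // Refresh the cached stream only when the inputs' device changes; reading
    // the device index off a tensor avoids a call into cudart.
    c10::cuda::CUDAStream get_engine_stream(
        const at::Tensor& first_input,
        c10::cuda::CUDAStream& cached_stream,
        c10::DeviceIndex& cached_device) {
      c10::DeviceIndex device_id = first_input.device().index();
      if (device_id != cached_device) {
        cached_stream = c10::cuda::getCurrentCUDAStream(device_id);
        cached_device = device_id;
      }
      return cached_stream;
    }

Note that dropping the event synchronization assumes the caller and the engine now share one stream; if any producer of the inputs ran on a different stream, ordering would no longer be guaranteed.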
@@ -476,45 +456,45 @@ std::vector<at::Tensor> execute_engine(std::vector<at::Tensor> inputs, c10::intr
           std::make_unique<torch::autograd::profiler::RecordProfile>(compiled_engine->device_profile_path);
     }

-    RTDevice curr_device = get_current_device();
-    LOG_DEBUG("Current Device: " << curr_device);
-
-    // Generic Target Device Prefix
-    std::string target_device = "cuda:";
-
-    if (is_switch_required(curr_device, compiled_engine->device_info)) {
-      // Scan through available CUDA devices and set the CUDA device context correctly
-      RTDevice device =
-          select_rt_device(compiled_engine->device_info, curr_device, compiled_engine->hardware_compatible);
-      set_rt_device(device);
-
-      // Target device is new device
-      target_device += std::to_string(device.id);
-
-      for (auto& in : inputs) {
-        in = in.to(torch::Device(target_device));
-      }
-    } else {
-      // Target device is current device
-      target_device += std::to_string(curr_device.id);
-    }
-
-    // For each input, ensure its current device is the desired target device
-    for (size_t i = 0; i < inputs.size(); i++) {
-      at::Tensor* in = &inputs[i];
-      std::string current_tensor_device = in->device().str();
-
-      // If current device string does not match target device, display warning and move tensor accordingly
-      if (current_tensor_device != target_device) {
-        LOG_WARNING(
-            "Input " << i << " of engine " << compiled_engine->name << " was found to be on " << current_tensor_device
-                     << " but should be on " << target_device << ". This tensor is being moved by the runtime but "
-                     << "for performance considerations, ensure your inputs are all on GPU "
-                     << "and open an issue here (https://github.com/pytorch/TensorRT/issues) if this "
-                     << "warning persists.");
-        *in = in->to(torch::Device(target_device));
-      }
-    }
+  //     RTDevice curr_device = get_current_device();
+  //     LOG_DEBUG("Current Device: " << curr_device);
+
+  //     // Generic Target Device Prefix
+  //     std::string target_device = "cuda:";
+
+  //     if (is_switch_required(curr_device, compiled_engine->device_info)) {
+  //       // Scan through available CUDA devices and set the CUDA device context correctly
+  //       RTDevice device =
+  //           select_rt_device(compiled_engine->device_info, curr_device, compiled_engine->hardware_compatible);
+  //       set_rt_device(device);
+
+  //       // Target device is new device
+  //       target_device += std::to_string(device.id);
+
+  //       for (auto& in : inputs) {
+  //         in = in.to(torch::Device(target_device));
+  //       }
+  //     } else {
+  //       // Target device is current device
+  //       target_device += std::to_string(curr_device.id);
+  //     }
+
+  //     // For each input, ensure its current device is the desired target device
+  //     for (size_t i = 0; i < inputs.size(); i++) {
+  //       at::Tensor* in = &inputs[i];
+  //       std::string current_tensor_device = in->device().str();
+
+  //       // If current device string does not match target device, display warning and move tensor accordingly
+  //       if (current_tensor_device != target_device) {
+  //         LOG_WARNING(
+  //             "Input " << i << " of engine " << compiled_engine->name << " was found to be on " << current_tensor_device
+  //                      << " but should be on " << target_device << ". This tensor is being moved by the runtime but "
+  //                      << "for performance considerations, ensure your inputs are all on GPU "
+  //                      << "and open an issue here (https://github.com/pytorch/TensorRT/issues) if this "
+  //                      << "warning persists.");
+  //         *in = in->to(torch::Device(target_device));
+  //       }
+  //     }
   }

   if (compiled_engine->requires_output_allocator) { // engine requires OA