From 798c20bb74f53d621543711164c01f607d391399 Mon Sep 17 00:00:00 2001 From: Phillip Kuznetsov Date: Thu, 2 Oct 2025 23:26:47 +0000 Subject: [PATCH] Remove triton_backend to make code-scanner happy Signed-off-by: Phillip Kuznetsov --- .../disaggregated_serving/README.md | 123 -- .../disaggregated_serving.md | 123 -- .../disaggregated_serving_bls/1/model.py | 138 -- .../disaggregated_serving_bls/config.pbtxt | 465 ----- triton_backend/all_models/gpt/ensemble/1/.tmp | 0 .../all_models/gpt/ensemble/config.pbtxt | 230 --- .../all_models/gpt/postprocessing/1/model.py | 119 -- .../gpt/postprocessing/config.pbtxt | 32 - .../all_models/gpt/preprocessing/1/model.py | 196 -- .../all_models/gpt/preprocessing/config.pbtxt | 71 - .../all_models/gpt/tensorrt_llm/1/model.py | 203 --- .../all_models/gpt/tensorrt_llm/config.pbtxt | 146 -- .../inflight_batcher_llm/ensemble/1/.tmp | 0 .../ensemble/config.pbtxt | 712 -------- .../postprocessing/1/model.py | 177 -- .../postprocessing/config.pbtxt | 70 - .../preprocessing/1/model.py | 1003 ---------- .../preprocessing/config.pbtxt | 261 --- .../tensorrt_llm/1/model.py | 1623 ----------------- .../tensorrt_llm/config.pbtxt | 887 --------- .../tensorrt_llm_bls/1/lib/decode.py | 459 ----- .../tensorrt_llm_bls/1/lib/triton_decoder.py | 597 ------ .../tensorrt_llm_bls/1/model.py | 253 --- .../tensorrt_llm_bls/config.pbtxt | 447 ----- .../llmapi/tensorrt_llm/1/helpers.py | 159 -- .../all_models/llmapi/tensorrt_llm/1/model.py | 692 ------- .../llmapi/tensorrt_llm/1/model.yaml | 19 - .../llmapi/tensorrt_llm/config.pbtxt | 237 --- .../multimodal/ensemble/config.pbtxt | 638 ------- .../multimodal/multimodal_encoders/1/model.py | 707 ------- .../multimodal_encoders/1/multimodal_utils.py | 350 ---- .../multimodal_encoders/config.pbtxt | 182 -- .../requirements-llava-onevision.txt | 1 - .../multimodal/requirements-mllama.txt | 2 - .../multimodal/requirements-qwen2vl.txt | 2 - .../multimodal/requirements-vila.txt | 5 - .../all_models/tests/test_decode.py | 376 ---- .../tests/test_llmapi_python_backend.py | 249 --- .../tests/test_multi_image_preprocess.py | 256 --- .../tests/test_multimodal_encoders.py | 155 -- .../all_models/tests/test_python_backend.py | 872 --------- .../all_models/tests/test_triton_decoder.py | 456 ----- .../all_models/whisper/whisper_bls/1/fbank.py | 99 - .../all_models/whisper/whisper_bls/1/model.py | 242 --- .../whisper/whisper_bls/1/tokenizer.py | 183 -- .../whisper/whisper_bls/config.pbtxt | 73 - .../base_metrics_verification_tests.py | 400 ---- .../custom_metrics_verification_tests.py | 169 -- .../ci/L0_backend_trtllm/generate_engines.sh | 82 - .../ci/L0_backend_trtllm/simple_data.json | 67 - triton_backend/ci/L0_backend_trtllm/test.sh | 662 ------- triton_backend/ci/README.md | 107 -- .../inflight_batcher_llm/CMakeLists.txt | 318 ---- .../inflight_batcher_llm/client/README.md | 31 - .../inflight_batcher_llm/client/__init__.py | 0 .../e2e_grpc_speculative_decoding_client.py | 562 ------ .../client/end_to_end_grpc_client.py | 736 -------- .../client/inflight_batcher_llm_client.py | 1250 ------------- .../TritonTensorRTLLMBackendConfig.cmake.in | 39 - .../cmake/modules/set_ifndef.cmake | 24 - .../inflight_batcher_llm/scripts/build.sh | 76 - .../custom_metrics_reporter.cc | 283 --- .../custom_metrics_reporter.h | 196 -- .../src/libtensorrtllm.cc | 225 --- .../src/libtriton_tensorrtllm.ldscript | 33 - .../src/model_instance_state.cc | 1569 ---------------- .../src/model_instance_state.h | 295 --- .../inflight_batcher_llm/src/model_state.cc | 336 ---- 
.../inflight_batcher_llm/src/model_state.h | 158 -- .../inflight_batcher_llm/src/namedTensor.cpp | 121 -- .../inflight_batcher_llm/src/namedTensor.h | 93 - .../inflight_batcher_llm/src/utils.cc | 1139 ------------ .../inflight_batcher_llm/src/utils.h | 443 ----- .../inflight_batcher_llm/tests/CMakeLists.txt | 66 - .../inflight_batcher_llm/tests/first.json | 25 - .../tests/modelInstanceStateTest.cpp | 94 - .../inflight_batcher_llm/tests/modelState.cpp | 105 -- .../inflight_batcher_llm/tests/second.json | 25 - .../inflight_batcher_llm/tests/third.json | 25 - .../inflight_batcher_llm/tests/utilsTest.cpp | 781 -------- triton_backend/requirements.txt | 7 - .../scripts/launch_triton_server.py | 249 --- triton_backend/tools/__init__.py | 0 triton_backend/tools/dataset/long_input.csv | 1 - triton_backend/tools/dataset/long_output.csv | 1 - .../tools/dataset/mini_cnn_eval.json | 502 ----- .../dataset/mini_cnn_eval_spec_decoding.json | 77 - .../tools/dataset/short_input_end_id.csv | 1 - .../dataset/short_input_end_id_medusa.csv | 1 - .../tools/dataset/short_output_end_id.csv | 1 - .../dataset/short_output_end_id_eagle.csv | 1 - .../dataset/short_output_end_id_medusa.csv | 1 - triton_backend/tools/fill_template.py | 70 - .../tools/gpt/benchmark_core_model.py | 177 -- triton_backend/tools/gpt/client.py | 120 -- triton_backend/tools/gpt/client_async.py | 133 -- triton_backend/tools/gpt/end_to_end_test.py | 249 --- triton_backend/tools/gpt/gen_input_data.py | 108 -- triton_backend/tools/gpt/input_data.json | 3 - .../benchmark_core_model.py | 518 ------ .../inflight_batcher_llm/end_to_end_test.py | 638 ------- .../speculative_decoding_test.py | 375 ---- .../test_max_queue_size.py | 638 ------- triton_backend/tools/llmapi_client.py | 275 --- triton_backend/tools/multimodal/client.py | 420 ----- triton_backend/tools/tests/__init__.py | 0 .../tools/tests/test_fill_template.py | 33 - triton_backend/tools/utils.sh | 35 - triton_backend/tools/utils/__init__.py | 0 triton_backend/tools/utils/utils.py | 483 ----- triton_backend/tools/whisper/client.py | 291 --- 111 files changed, 30233 deletions(-) delete mode 100644 triton_backend/all_models/disaggregated_serving/README.md delete mode 100644 triton_backend/all_models/disaggregated_serving/disaggregated_serving.md delete mode 100644 triton_backend/all_models/disaggregated_serving/disaggregated_serving_bls/1/model.py delete mode 100644 triton_backend/all_models/disaggregated_serving/disaggregated_serving_bls/config.pbtxt delete mode 100644 triton_backend/all_models/gpt/ensemble/1/.tmp delete mode 100755 triton_backend/all_models/gpt/ensemble/config.pbtxt delete mode 100644 triton_backend/all_models/gpt/postprocessing/1/model.py delete mode 100755 triton_backend/all_models/gpt/postprocessing/config.pbtxt delete mode 100644 triton_backend/all_models/gpt/preprocessing/1/model.py delete mode 100644 triton_backend/all_models/gpt/preprocessing/config.pbtxt delete mode 100644 triton_backend/all_models/gpt/tensorrt_llm/1/model.py delete mode 100644 triton_backend/all_models/gpt/tensorrt_llm/config.pbtxt delete mode 100644 triton_backend/all_models/inflight_batcher_llm/ensemble/1/.tmp delete mode 100644 triton_backend/all_models/inflight_batcher_llm/ensemble/config.pbtxt delete mode 100644 triton_backend/all_models/inflight_batcher_llm/postprocessing/1/model.py delete mode 100644 triton_backend/all_models/inflight_batcher_llm/postprocessing/config.pbtxt delete mode 100755 triton_backend/all_models/inflight_batcher_llm/preprocessing/1/model.py delete mode 100755 
triton_backend/all_models/inflight_batcher_llm/preprocessing/config.pbtxt delete mode 100755 triton_backend/all_models/inflight_batcher_llm/tensorrt_llm/1/model.py delete mode 100644 triton_backend/all_models/inflight_batcher_llm/tensorrt_llm/config.pbtxt delete mode 100644 triton_backend/all_models/inflight_batcher_llm/tensorrt_llm_bls/1/lib/decode.py delete mode 100644 triton_backend/all_models/inflight_batcher_llm/tensorrt_llm_bls/1/lib/triton_decoder.py delete mode 100644 triton_backend/all_models/inflight_batcher_llm/tensorrt_llm_bls/1/model.py delete mode 100644 triton_backend/all_models/inflight_batcher_llm/tensorrt_llm_bls/config.pbtxt delete mode 100644 triton_backend/all_models/llmapi/tensorrt_llm/1/helpers.py delete mode 100755 triton_backend/all_models/llmapi/tensorrt_llm/1/model.py delete mode 100644 triton_backend/all_models/llmapi/tensorrt_llm/1/model.yaml delete mode 100644 triton_backend/all_models/llmapi/tensorrt_llm/config.pbtxt delete mode 100755 triton_backend/all_models/multimodal/ensemble/config.pbtxt delete mode 100755 triton_backend/all_models/multimodal/multimodal_encoders/1/model.py delete mode 100644 triton_backend/all_models/multimodal/multimodal_encoders/1/multimodal_utils.py delete mode 100755 triton_backend/all_models/multimodal/multimodal_encoders/config.pbtxt delete mode 100644 triton_backend/all_models/multimodal/requirements-llava-onevision.txt delete mode 100644 triton_backend/all_models/multimodal/requirements-mllama.txt delete mode 100755 triton_backend/all_models/multimodal/requirements-qwen2vl.txt delete mode 100755 triton_backend/all_models/multimodal/requirements-vila.txt delete mode 100644 triton_backend/all_models/tests/test_decode.py delete mode 100644 triton_backend/all_models/tests/test_llmapi_python_backend.py delete mode 100755 triton_backend/all_models/tests/test_multi_image_preprocess.py delete mode 100644 triton_backend/all_models/tests/test_multimodal_encoders.py delete mode 100644 triton_backend/all_models/tests/test_python_backend.py delete mode 100644 triton_backend/all_models/tests/test_triton_decoder.py delete mode 100644 triton_backend/all_models/whisper/whisper_bls/1/fbank.py delete mode 100644 triton_backend/all_models/whisper/whisper_bls/1/model.py delete mode 100644 triton_backend/all_models/whisper/whisper_bls/1/tokenizer.py delete mode 100644 triton_backend/all_models/whisper/whisper_bls/config.pbtxt delete mode 100644 triton_backend/ci/L0_backend_trtllm/base_metrics_verification_tests.py delete mode 100644 triton_backend/ci/L0_backend_trtllm/custom_metrics_verification_tests.py delete mode 100644 triton_backend/ci/L0_backend_trtllm/generate_engines.sh delete mode 100644 triton_backend/ci/L0_backend_trtllm/simple_data.json delete mode 100644 triton_backend/ci/L0_backend_trtllm/test.sh delete mode 100644 triton_backend/ci/README.md delete mode 100644 triton_backend/inflight_batcher_llm/CMakeLists.txt delete mode 100644 triton_backend/inflight_batcher_llm/client/README.md delete mode 100644 triton_backend/inflight_batcher_llm/client/__init__.py delete mode 100644 triton_backend/inflight_batcher_llm/client/e2e_grpc_speculative_decoding_client.py delete mode 100644 triton_backend/inflight_batcher_llm/client/end_to_end_grpc_client.py delete mode 100755 triton_backend/inflight_batcher_llm/client/inflight_batcher_llm_client.py delete mode 100644 triton_backend/inflight_batcher_llm/cmake/TritonTensorRTLLMBackendConfig.cmake.in delete mode 100644 triton_backend/inflight_batcher_llm/cmake/modules/set_ifndef.cmake delete mode 100644 
triton_backend/inflight_batcher_llm/scripts/build.sh delete mode 100644 triton_backend/inflight_batcher_llm/src/custom_metrics_reporter/custom_metrics_reporter.cc delete mode 100644 triton_backend/inflight_batcher_llm/src/custom_metrics_reporter/custom_metrics_reporter.h delete mode 100644 triton_backend/inflight_batcher_llm/src/libtensorrtllm.cc delete mode 100644 triton_backend/inflight_batcher_llm/src/libtriton_tensorrtllm.ldscript delete mode 100644 triton_backend/inflight_batcher_llm/src/model_instance_state.cc delete mode 100644 triton_backend/inflight_batcher_llm/src/model_instance_state.h delete mode 100644 triton_backend/inflight_batcher_llm/src/model_state.cc delete mode 100644 triton_backend/inflight_batcher_llm/src/model_state.h delete mode 100644 triton_backend/inflight_batcher_llm/src/namedTensor.cpp delete mode 100644 triton_backend/inflight_batcher_llm/src/namedTensor.h delete mode 100644 triton_backend/inflight_batcher_llm/src/utils.cc delete mode 100644 triton_backend/inflight_batcher_llm/src/utils.h delete mode 100644 triton_backend/inflight_batcher_llm/tests/CMakeLists.txt delete mode 100644 triton_backend/inflight_batcher_llm/tests/first.json delete mode 100644 triton_backend/inflight_batcher_llm/tests/modelInstanceStateTest.cpp delete mode 100644 triton_backend/inflight_batcher_llm/tests/modelState.cpp delete mode 100644 triton_backend/inflight_batcher_llm/tests/second.json delete mode 100644 triton_backend/inflight_batcher_llm/tests/third.json delete mode 100644 triton_backend/inflight_batcher_llm/tests/utilsTest.cpp delete mode 100644 triton_backend/requirements.txt delete mode 100644 triton_backend/scripts/launch_triton_server.py delete mode 100644 triton_backend/tools/__init__.py delete mode 100644 triton_backend/tools/dataset/long_input.csv delete mode 100644 triton_backend/tools/dataset/long_output.csv delete mode 100644 triton_backend/tools/dataset/mini_cnn_eval.json delete mode 100644 triton_backend/tools/dataset/mini_cnn_eval_spec_decoding.json delete mode 100755 triton_backend/tools/dataset/short_input_end_id.csv delete mode 100644 triton_backend/tools/dataset/short_input_end_id_medusa.csv delete mode 100755 triton_backend/tools/dataset/short_output_end_id.csv delete mode 100644 triton_backend/tools/dataset/short_output_end_id_eagle.csv delete mode 100644 triton_backend/tools/dataset/short_output_end_id_medusa.csv delete mode 100644 triton_backend/tools/fill_template.py delete mode 100644 triton_backend/tools/gpt/benchmark_core_model.py delete mode 100644 triton_backend/tools/gpt/client.py delete mode 100644 triton_backend/tools/gpt/client_async.py delete mode 100644 triton_backend/tools/gpt/end_to_end_test.py delete mode 100644 triton_backend/tools/gpt/gen_input_data.py delete mode 100644 triton_backend/tools/gpt/input_data.json delete mode 100644 triton_backend/tools/inflight_batcher_llm/benchmark_core_model.py delete mode 100644 triton_backend/tools/inflight_batcher_llm/end_to_end_test.py delete mode 100644 triton_backend/tools/inflight_batcher_llm/speculative_decoding_test.py delete mode 100644 triton_backend/tools/inflight_batcher_llm/test_max_queue_size.py delete mode 100755 triton_backend/tools/llmapi_client.py delete mode 100755 triton_backend/tools/multimodal/client.py delete mode 100644 triton_backend/tools/tests/__init__.py delete mode 100644 triton_backend/tools/tests/test_fill_template.py delete mode 100644 triton_backend/tools/utils.sh delete mode 100644 triton_backend/tools/utils/__init__.py delete mode 100644 
triton_backend/tools/utils/utils.py
delete mode 100644 triton_backend/tools/whisper/client.py
diff --git a/triton_backend/all_models/disaggregated_serving/README.md b/triton_backend/all_models/disaggregated_serving/README.md
deleted file mode 100644
index 9ebb917c752..00000000000
--- a/triton_backend/all_models/disaggregated_serving/README.md
+++ /dev/null
@@ -1,123 +0,0 @@
-
-
-# Running Disaggregated Serving with Triton TensorRT LLM Backend
-
-## Overview
-
-Disaggregated serving refers to a technique that uses separate GPUs for
-running the context and generation phases of LLM inference.
-
-For Triton integration, a BLS model named
-[_disaggregated\_serving\_bls_](./disaggregated_serving_bls/1/model.py)
-has been created that orchestrates the disaggregated serving pipeline. This
-BLS model requires the TRT-LLM model names that are going to be used for
-context and generation phases.
-
-This example assumes access to a system with two GPUs and CUDA_VISIBLE_DEVICES
-set to `0,1`.
-
-## Model Repository Setup and Start Server
-
-1. Set up the model repository as instructed in the [LLaMa](../docs/llama.md)
-guide.
-
-2. Create context and generation models with the desired tensor-parallel
-configuration. We will use the `context` and `generation` model names for the
-context and generation models, respectively. The context and generation models
-should copy the config of the
-[tensorrt_llm](../inflight_batcher_llm/tensorrt_llm/) model.
-
-3. Set the `participant_ids` for context and generation models to `1` and `2` respectively.
-
-4. Set the `gpu_device_ids` for context and generation models to `0` and `1` respectively.
-
-5. Set the `context_model_name` and `generation_model_name` to `context` and `generation` in the
-[disaggregated_serving_bls](./disaggregated_serving_bls/config.pbtxt) model configuration.
-
-Your model repository should look like the following:
-
-```
-disaggregated_serving/
-|-- context
-| |-- 1
-| `-- config.pbtxt
-|-- disaggregated_serving_bls
-| |-- 1
-| | `-- model.py
-| `-- config.pbtxt
-|-- ensemble
-| |-- 1
-| `-- config.pbtxt
-|-- generation
-| |-- 1
-| `-- config.pbtxt
-|-- postprocessing
-| |-- 1
-| | `-- model.py
-| `-- config.pbtxt
-`-- preprocessing
- |-- 1
- | `-- model.py
- `-- config.pbtxt
-```
-
-6. Rename the `tensorrt_llm` model in the `ensemble` config.pbtxt file to `disaggregated_serving_bls`.
-
-7. Launch the Triton Server:
-
-```
-python3 scripts/launch_triton_server.py --world_size 3 --tensorrt_llm_model_name context,generation --multi-model --disable-spawn-processes
-```
-
-> [!NOTE]
->
-> The world size should be equal to `tp*pp` of context model + `tp*pp` of generation model + 1.
-> The additional process is required for the orchestrator.
-
-8. Send a request to the server.
-
-```
-python3 inflight_batcher_llm/client/end_to_end_grpc_client.py -S -p "Machine learning is"
-```
-
-## Creating Multiple Copies of the Context and Generation Models (Data Parallelism)
-
-You can also create multiple copies of the context and generation models. This can be
-achieved by setting the `participant_ids` and `gpu_device_ids` for each instance.
-
-For example, if you have a context model with `tp=2` and you want to create 2
-copies of it, you can set the `participant_ids` to `1,2;3,4`,
-`gpu_device_ids` to `0,1;2,3` (assuming a 4-GPU system), and set the `count`
-in the `instance_groups` section of the model configuration to 2. This will create 2
-copies of the context model where the first copy will be on GPU 0 and 1, and the
-second copy will be on GPU 2 and 3.
-
-## Known Issues
-
-1. Only the C++ version of the backend is supported right now.
diff --git a/triton_backend/all_models/disaggregated_serving/disaggregated_serving.md b/triton_backend/all_models/disaggregated_serving/disaggregated_serving.md
deleted file mode 100644
index 9ebb917c752..00000000000
--- a/triton_backend/all_models/disaggregated_serving/disaggregated_serving.md
+++ /dev/null
@@ -1,123 +0,0 @@
-
-
-# Running Disaggregated Serving with Triton TensorRT LLM Backend
-
-## Overview
-
-Disaggregated serving refers to a technique that uses separate GPUs for
-running the context and generation phases of LLM inference.
-
-For Triton integration, a BLS model named
-[_disaggregated\_serving\_bls_](./disaggregated_serving_bls/1/model.py)
-has been created that orchestrates the disaggregated serving pipeline. This
-BLS model requires the TRT-LLM model names that are going to be used for
-context and generation phases.
-
-This example assumes access to a system with two GPUs and CUDA_VISIBLE_DEVICES
-set to `0,1`.
-
-## Model Repository Setup and Start Server
-
-1. Set up the model repository as instructed in the [LLaMa](../docs/llama.md)
-guide.
-
-2. Create context and generation models with the desired tensor-parallel
-configuration. We will use the `context` and `generation` model names for the
-context and generation models, respectively. The context and generation models
-should copy the config of the
-[tensorrt_llm](../inflight_batcher_llm/tensorrt_llm/) model.
-
-3. Set the `participant_ids` for context and generation models to `1` and `2` respectively.
-
-4. Set the `gpu_device_ids` for context and generation models to `0` and `1` respectively.
-
-5. Set the `context_model_name` and `generation_model_name` to `context` and `generation` in the
-[disaggregated_serving_bls](./disaggregated_serving_bls/config.pbtxt) model configuration.
-
-Your model repository should look like the following:
-
-```
-disaggregated_serving/
-|-- context
-| |-- 1
-| `-- config.pbtxt
-|-- disaggregated_serving_bls
-| |-- 1
-| | `-- model.py
-| `-- config.pbtxt
-|-- ensemble
-| |-- 1
-| `-- config.pbtxt
-|-- generation
-| |-- 1
-| `-- config.pbtxt
-|-- postprocessing
-| |-- 1
-| | `-- model.py
-| `-- config.pbtxt
-`-- preprocessing
- |-- 1
- | `-- model.py
- `-- config.pbtxt
-```
-
-6. Rename the `tensorrt_llm` model in the `ensemble` config.pbtxt file to `disaggregated_serving_bls`.
-
-7. Launch the Triton Server:
-
-```
-python3 scripts/launch_triton_server.py --world_size 3 --tensorrt_llm_model_name context,generation --multi-model --disable-spawn-processes
-```
-
-> [!NOTE]
->
-> The world size should be equal to `tp*pp` of context model + `tp*pp` of generation model + 1.
-> The additional process is required for the orchestrator.
-
-8. Send a request to the server.
-
-```
-python3 inflight_batcher_llm/client/end_to_end_grpc_client.py -S -p "Machine learning is"
-```
-
-## Creating Multiple Copies of the Context and Generation Models (Data Parallelism)
-
-You can also create multiple copies of the context and generation models. This can be
-achieved by setting the `participant_ids` and `gpu_device_ids` for each instance.
-
-For example, if you have a context model with `tp=2` and you want to create 2
-copies of it, you can set the `participant_ids` to `1,2;3,4`,
-`gpu_device_ids` to `0,1;2,3` (assuming a 4-GPU system), and set the `count`
-in the `instance_groups` section of the model configuration to 2.
This will create 2 -copies of the context model where the first copy will be on GPU 0 and 1, and the -second copy will be on GPU 2 and 3. - -## Known Issues - -1. Only C++ version of the backend is supported right now. diff --git a/triton_backend/all_models/disaggregated_serving/disaggregated_serving_bls/1/model.py b/triton_backend/all_models/disaggregated_serving/disaggregated_serving_bls/1/model.py deleted file mode 100644 index 14e8de75a87..00000000000 --- a/triton_backend/all_models/disaggregated_serving/disaggregated_serving_bls/1/model.py +++ /dev/null @@ -1,138 +0,0 @@ -import json - -import triton_python_backend_utils as pb_utils - - -def read_parameter_as_type(value, name, pytype=str): - if value == "": - return None - if value.startswith("${") and value.endswith("}"): - return None - if pytype is bool: - return value.lower() in ["1", "true"] - try: - result = pytype(value) - return result - except: - pb_utils.Logger.log_warning( - f"Could not read parameter '{name}' with value '{value}', will use default." - ) - return None - - -def get_parameter(model_config, name, pytype=str): - if name not in model_config['parameters']: - return None - return read_parameter_as_type( - model_config['parameters'][name]['string_value'], name, pytype) - - -class TritonPythonModel: - """Your Python model must use the same class name. Every Python model - that is created must have "TritonPythonModel" as the class name. - """ - - def initialize(self, args): - """`initialize` is called only once when the model is being loaded. - Implementing `initialize` function is optional. This function allows - the model to initialize any state associated with this model. - - Parameters - ---------- - args : dict - Both keys and values are strings. The dictionary keys and values are: - * model_config: A JSON string containing the model configuration - * model_instance_kind: A string containing model instance kind - * model_instance_device_id: A string containing model instance device ID - * model_repository: Model repository path - * model_version: Model version - * model_name: Model name - """ - model_config = json.loads(args['model_config']) - self.context_model_name = get_parameter(model_config, - "context_model_name") - self.generation_model_name = get_parameter(model_config, - "generation_model_name") - self.decoupled = pb_utils.using_decoupled_model_transaction_policy( - model_config) - - def create_context_request(self, request): - inputs = request.inputs() - triton_request = pb_utils.InferenceRequest( - model_name=self.context_model_name, - inputs=inputs, - parameters={"request_type": "context_only"}, - requested_output_names=[]) - return triton_request - - def create_generation_request(self, request, context_response): - inputs = request.inputs() - context_phase_params = pb_utils.get_output_tensor_by_name( - context_response, "context_phase_params") - if context_phase_params is None: - raise pb_utils.TritonModelException( - "Context response must have an output named context phase params" - ) - inputs.append(context_phase_params) - triton_request = pb_utils.InferenceRequest( - model_name=self.generation_model_name, - inputs=inputs, - parameters={"request_type": "generation_only"}, - requested_output_names=[]) - return triton_request - - def execute(self, requests): - """`execute` must be implemented in every Python model. `execute` - function receives a list of pb_utils.InferenceRequest as the only - argument. This function is called when an inference is requested - for this model. 
- - Parameters - ---------- - requests : list - A list of pb_utils.InferenceRequest - - Returns - ------- - list - A list of pb_utils.InferenceResponse. The length of this list must - be the same as `requests` - """ - for request in requests: - context_request = self.create_context_request(request) - context_responses = context_request.exec(decoupled=self.decoupled) - if self.decoupled: - context_responses = list(context_responses) - assert len( - context_responses) == 1, "Expected 1 context response" - - if self.decoupled: - context_response = context_responses[0] - else: - context_response = context_responses - if context_response.has_error(): - raise pb_utils.TritonModelException( - f"Context model {self.context_model_name} failed with error: {context_response.error().message()}" - ) - generation_request = self.create_generation_request( - request, context_response) - - # TODO(itabrizian): Send the context response to reduce TTFT in decoupled case. - # It requires adding the generated token to the generation request - # to avoid sending the first token multiple times. - responses = generation_request.exec(decoupled=self.decoupled) - - if self.decoupled: - for response in responses: - if response.has_error(): - raise pb_utils.TritonModelException( - f"Generation model {self.generation_model_name} failed with error: {response.error().message()}" - ) - request.get_response_sender().send(response) - - request.get_response_sender().send( - flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL) - else: - request.get_response_sender().send( - responses, - flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL) diff --git a/triton_backend/all_models/disaggregated_serving/disaggregated_serving_bls/config.pbtxt b/triton_backend/all_models/disaggregated_serving/disaggregated_serving_bls/config.pbtxt deleted file mode 100644 index 8243c2cf03b..00000000000 --- a/triton_backend/all_models/disaggregated_serving/disaggregated_serving_bls/config.pbtxt +++ /dev/null @@ -1,465 +0,0 @@ -# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions -# are met: -# * Redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer. -# * Redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the distribution. -# * Neither the name of NVIDIA CORPORATION nor the names of its -# contributors may be used to endorse or promote products derived -# from this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY -# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR -# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY -# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- -name: "disaggregated_serving_bls" -backend: "python" -max_batch_size: ${triton_max_batch_size} - -model_transaction_policy { - decoupled: ${decoupled_mode} -} - -input [ - { - name: "input_ids" - data_type: TYPE_INT32 - dims: [ -1 ] - allow_ragged_batch: true - }, - { - name: "input_lengths" - data_type: TYPE_INT32 - dims: [ 1 ] - }, - { - name: "request_output_len" - data_type: TYPE_INT32 - dims: [ 1 ] - }, - { - name: "num_return_sequences" - data_type: TYPE_INT32 - dims: [ 1 ] - optional: true - }, - { - name: "draft_input_ids" - data_type: TYPE_INT32 - dims: [ -1 ] - optional: true - allow_ragged_batch: true - }, - { - name: "decoder_input_ids" - data_type: TYPE_INT32 - dims: [ -1 ] - optional: true - allow_ragged_batch: true - }, - { - name: "decoder_input_lengths" - data_type: TYPE_INT32 - dims: [ 1 ] - optional: true - }, - { - name: "draft_logits" - data_type: TYPE_FP32 - dims: [ -1, -1 ] - optional: true - allow_ragged_batch: true - }, - { - name: "draft_acceptance_threshold" - data_type: TYPE_FP32 - dims: [ 1 ] - optional: true - }, - { - name: "end_id" - data_type: TYPE_INT32 - dims: [ 1 ] - optional: true - }, - { - name: "pad_id" - data_type: TYPE_INT32 - dims: [ 1 ] - optional: true - }, - { - name: "stop_words_list" - data_type: TYPE_INT32 - dims: [ 2, -1 ] - optional: true - allow_ragged_batch: true - }, - { - name: "bad_words_list" - data_type: TYPE_INT32 - dims: [ 2, -1 ] - optional: true - allow_ragged_batch: true - }, - { - name: "embedding_bias" - data_type: TYPE_FP32 - dims: [ -1 ] - optional: true - allow_ragged_batch: true - }, - { - name: "beam_width" - data_type: TYPE_INT32 - dims: [ 1 ] - optional: true - }, - { - name: "temperature" - data_type: TYPE_FP32 - dims: [ 1 ] - optional: true - }, - { - name: "runtime_top_k" - data_type: TYPE_INT32 - dims: [ 1 ] - optional: true - }, - { - name: "runtime_top_p" - data_type: TYPE_FP32 - dims: [ 1 ] - optional: true - }, - { - name: "runtime_top_p_min" - data_type: TYPE_FP32 - dims: [ 1 ] - optional: true - }, - { - name: "runtime_top_p_decay" - data_type: TYPE_FP32 - dims: [ 1 ] - optional: true - }, - { - name: "runtime_top_p_reset_ids" - data_type: TYPE_INT32 - dims: [ 1 ] - optional: true - }, - { - name: "len_penalty" - data_type: TYPE_FP32 - dims: [ 1 ] - optional: true - }, - { - name: "early_stopping" - data_type: TYPE_BOOL - dims: [ 1 ] - optional: true - }, - { - name: "repetition_penalty" - data_type: TYPE_FP32 - dims: [ 1 ] - optional: true - }, - { - name: "min_tokens" - data_type: TYPE_INT32 - dims: [ 1 ] - optional: true - }, - { - name: "beam_search_diversity_rate" - data_type: TYPE_FP32 - dims: [ 1 ] - optional: true - }, - { - name: "presence_penalty" - data_type: TYPE_FP32 - dims: [ 1 ] - optional: true - }, - { - name: "frequency_penalty" - data_type: TYPE_FP32 - dims: [ 1 ] - optional: true - }, - { - name: "seed" - data_type: TYPE_UINT64 - dims: [ 1 ] - optional: true - }, - { - name: "return_log_probs" - data_type: TYPE_BOOL - dims: [ 1 ] - optional: true - }, - { - name: "return_context_logits" - data_type: TYPE_BOOL - dims: [ 1 ] - optional: true - }, - { - name: "return_generation_logits" - data_type: TYPE_BOOL - dims: [ 1 ] - optional: true - }, - { - name: "return_perf_metrics" - data_type: TYPE_BOOL - dims: [ 1 ] - reshape: { shape: [ ] } - optional: true - }, - { - name: "exclude_input_in_output" - data_type: TYPE_BOOL - dims: [ 1 ] - optional: true - }, - { - name: "stop" - data_type: TYPE_BOOL - dims: [ 1 ] - optional: true - }, - { - name: "streaming" - data_type: TYPE_BOOL - dims: [ 1 
] - optional: true - }, - { - name: "prompt_embedding_table" - data_type: TYPE_FP16 - dims: [ -1, -1 ] - optional: true - allow_ragged_batch: true - }, - { - name: "prompt_table_extra_ids" - data_type: TYPE_UINT64 - dims: [ -1 ] - optional: true - allow_ragged_batch: true - }, - { - name: "prompt_vocab_size" - data_type: TYPE_INT32 - dims: [ 1 ] - optional: true - }, - # the unique task ID for the given LoRA. - # To perform inference with a specific LoRA for the first time `lora_task_id` `lora_weights` and `lora_config` must all be given. - # The LoRA will be cached, so that subsequent requests for the same task only require `lora_task_id`. - # If the cache is full the oldest LoRA will be evicted to make space for new ones. An error is returned if `lora_task_id` is not cached. - { - name: "lora_task_id" - data_type: TYPE_UINT64 - dims: [ 1 ] - optional: true - }, - # weights for a lora adapter shape [ num_lora_modules_layers, D x Hi + Ho x D ] - # where the last dimension holds the in / out adapter weights for the associated module (e.g. attn_qkv) and model layer - # each of the in / out tensors are first flattened and then concatenated together in the format above. - # D=adapter_size (R value), Hi=hidden_size_in, Ho=hidden_size_out. - { - name: "lora_weights" - data_type: TYPE_FP16 - dims: [ -1, -1 ] - optional: true - allow_ragged_batch: true - }, - # module identifier (same size a first dimension of lora_weights) - # See LoraModule::ModuleType for model id mapping - # - # "attn_qkv": 0 # compbined qkv adapter - # "attn_q": 1 # q adapter - # "attn_k": 2 # k adapter - # "attn_v": 3 # v adapter - # "attn_dense": 4 # adapter for the dense layer in attention - # "mlp_h_to_4h": 5 # for llama2 adapter for gated mlp layer after attention / RMSNorm: up projection - # "mlp_4h_to_h": 6 # for llama2 adapter for gated mlp layer after attention / RMSNorm: down projection - # "mlp_gate": 7 # for llama2 adapter for gated mlp later after attention / RMSNorm: gate - # - # last dim holds [ module_id, layer_idx, adapter_size (D aka R value) ] - { - name: "lora_config" - data_type: TYPE_INT32 - dims: [ -1, 3 ] - optional: true - allow_ragged_batch: true - }, - { - name: "guided_decoding_guide_type" - data_type: TYPE_STRING - dims: [ 1 ] - optional: true - allow_ragged_batch: true - }, - { - name: "guided_decoding_guide" - data_type: TYPE_STRING - dims: [ 1 ] - optional: true - allow_ragged_batch: true - }, - { - name: "return_num_output_tokens" - data_type: TYPE_BOOL - dims: [ 1 ] - optional: true - }, - { - name: "return_num_input_tokens" - data_type: TYPE_BOOL - dims: [ 1 ] - optional: true - } -] -output [ - { - name: "output_ids" - data_type: TYPE_INT32 - dims: [ -1, -1 ] - }, - { - name: "sequence_length" - data_type: TYPE_INT32 - dims: [ -1 ] - }, - { - name: "cum_log_probs" - data_type: TYPE_FP32 - dims: [ -1 ] - }, - { - name: "output_log_probs" - data_type: TYPE_FP32 - dims: [ -1, -1 ] - }, - { - name: "context_logits" - data_type: ${logits_datatype} - dims: [ -1, -1 ] - }, - { - name: "generation_logits" - data_type: ${logits_datatype} - dims: [ -1, -1, -1 ] - }, - { - name: "batch_index" - data_type: TYPE_INT32 - dims: [ 1 ] - }, - { - name: "sequence_index" - data_type: TYPE_INT32 - dims: [ 1 ] - }, - { - name: "kv_cache_alloc_new_blocks" - data_type: TYPE_INT32 - dims: [ 1 ] - }, - { - name: "kv_cache_reused_blocks" - data_type: TYPE_INT32 - dims: [ 1 ] - }, - { - name: "kv_cache_alloc_total_blocks" - data_type: TYPE_INT32 - dims: [ 1 ] - }, - { - name: "arrival_time_ns" - data_type: TYPE_INT64 
- dims: [ 1 ] - }, - { - name: "first_scheduled_time_ns" - data_type: TYPE_INT64 - dims: [ 1 ] - }, - { - name: "first_token_time_ns" - data_type: TYPE_INT64 - dims: [ 1 ] - }, - { - name: "last_token_time_ns" - data_type: TYPE_INT64 - dims: [ 1 ] - }, - { - name: "acceptance_rate" - data_type: TYPE_FP32 - dims: [ 1 ] - }, - { - name: "total_accepted_draft_tokens" - data_type: TYPE_INT32 - dims: [ 1 ] - }, - { - name: "total_draft_tokens" - data_type: TYPE_INT32 - dims: [ 1 ] - }, - { - name: "num_output_tokens" - data_type: TYPE_INT32 - dims: [ 1 ] - }, - { - name: "num_input_tokens" - data_type: TYPE_INT32 - dims: [ 1 ] - } -] -instance_group [ - { - count: ${disaggregated_serving_bls_count} - kind : KIND_CPU - } -] - -parameters: { - key: "context_model_name" - value: { - string_value: "${context_model_name}" - } -} -parameters: { - key: "generation_model_name" - value: { - string_value: "${generation_model_name}" - } -} diff --git a/triton_backend/all_models/gpt/ensemble/1/.tmp b/triton_backend/all_models/gpt/ensemble/1/.tmp deleted file mode 100644 index e69de29bb2d..00000000000 diff --git a/triton_backend/all_models/gpt/ensemble/config.pbtxt b/triton_backend/all_models/gpt/ensemble/config.pbtxt deleted file mode 100755 index e1bc29a2531..00000000000 --- a/triton_backend/all_models/gpt/ensemble/config.pbtxt +++ /dev/null @@ -1,230 +0,0 @@ -name: "ensemble" -platform: "ensemble" -max_batch_size: 1024 -input [ - { - name: "text_input" - data_type: TYPE_STRING - dims: [ -1 ] - }, - { - name: "max_tokens" - data_type: TYPE_INT32 - dims: [ -1 ] - }, - { - name: "bad_words" - data_type: TYPE_STRING - dims: [ -1 ] - }, - { - name: "stop_words" - data_type: TYPE_STRING - dims: [ -1 ] - }, - { - name: "end_id" - data_type: TYPE_INT32 - dims: [ 1 ] - optional: true - }, - { - name: "pad_id" - data_type: TYPE_INT32 - dims: [ 1 ] - optional: true - }, - { - name: "top_k" - data_type: TYPE_INT32 - dims: [ 1 ] - optional: true - }, - { - name: "top_p" - data_type: TYPE_FP32 - dims: [ 1 ] - optional: true - }, - { - name: "temperature" - data_type: TYPE_FP32 - dims: [ 1 ] - optional: true - }, - { - name: "length_penalty" - data_type: TYPE_FP32 - dims: [ 1 ] - optional: true - }, - { - name: "repetition_penalty" - data_type: TYPE_FP32 - dims: [ 1 ] - optional: true - }, - { - name: "min_tokens" - data_type: TYPE_INT32 - dims: [ 1 ] - optional: true - }, - { - name: "presence_penalty" - data_type: TYPE_FP32 - dims: [ 1 ] - optional: true - }, - { - name: "frequency_penalty" - data_type: TYPE_FP32 - dims: [ 1 ] - optional: true - }, - { - name: "seed" - data_type: TYPE_UINT64 - dims: [ 1 ] - optional: true - }, - { - name: "beam_width" - data_type: TYPE_INT32 - dims: [ 1 ] - optional: true - }, - { - name: "output_log_probs" - data_type: TYPE_BOOL - dims: [ 1 ] - optional: true - } -] -output [ - { - name: "text_output" - data_type: TYPE_STRING - dims: [ -1, -1 ] - } -] -ensemble_scheduling { - step [ - { - model_name: "preprocessing" - model_version: -1 - input_map { - key: "QUERY" - value: "text_input" - } - input_map { - key: "REQUEST_OUTPUT_LEN" - value: "max_tokens" - } - input_map { - key: "BAD_WORDS_DICT" - value: "bad_words" - } - input_map { - key: "STOP_WORDS_DICT" - value: "stop_words" - } - output_map { - key: "REQUEST_INPUT_LEN" - value: "_REQUEST_INPUT_LEN" - } - output_map { - key: "INPUT_ID" - value: "_INPUT_ID" - } - output_map { - key: "REQUEST_OUTPUT_LEN" - value: "_REQUEST_OUTPUT_LEN" - } - }, - { - model_name: "tensorrt_llm" - model_version: -1 - input_map { - key: "input_ids" - 
value: "_INPUT_ID" - } - input_map { - key: "input_lengths" - value: "_REQUEST_INPUT_LEN" - } - input_map { - key: "request_output_len" - value: "_REQUEST_OUTPUT_LEN" - } - input_map { - key: "end_id" - value: "end_id" - } - input_map { - key: "pad_id" - value: "pad_id" - } - input_map { - key: "runtime_top_k" - value: "top_k" - } - input_map { - key: "runtime_top_p" - value: "top_p" - } - input_map { - key: "temperature" - value: "temperature" - } - input_map { - key: "len_penalty" - value: "length_penalty" - } - input_map { - key: "repetition_penalty" - value: "repetition_penalty" - } - input_map { - key: "min_tokens" - value: "min_tokens" - } - input_map { - key: "presence_penalty" - value: "presence_penalty" - } - input_map { - key: "frequency_penalty" - value: "frequency_penalty" - } - input_map { - key: "seed" - value: "seed" - } - input_map { - key: "beam_width" - value: "beam_width" - } - input_map { - key: "output_log_probs" - value: "output_log_probs" - } - output_map { - key: "output_ids" - value: "_TOKENS_BATCH" - } - }, - { - model_name: "postprocessing" - model_version: -1 - input_map { - key: "TOKENS_BATCH" - value: "_TOKENS_BATCH" - } - output_map { - key: "OUTPUT" - value: "text_output" - } - } - ] -} diff --git a/triton_backend/all_models/gpt/postprocessing/1/model.py b/triton_backend/all_models/gpt/postprocessing/1/model.py deleted file mode 100644 index 006a98cf399..00000000000 --- a/triton_backend/all_models/gpt/postprocessing/1/model.py +++ /dev/null @@ -1,119 +0,0 @@ -# -*- coding: utf-8 -*- -import json - -import numpy as np -import triton_python_backend_utils as pb_utils -from transformers import AutoTokenizer - - -class TritonPythonModel: - """Your Python model must use the same class name. Every Python model - that is created must have "TritonPythonModel" as the class name. - """ - - def initialize(self, args): - """`initialize` is called only once when the model is being loaded. - Implementing `initialize` function is optional. This function allows - the model to initialize any state associated with this model. - Parameters - ---------- - args : dict - Both keys and values are strings. The dictionary keys and values are: - * model_config: A JSON string containing the model configuration - * model_instance_kind: A string containing model instance kind - * model_instance_device_id: A string containing model instance device ID - * model_repository: Model repository path - * model_version: Model version - * model_name: Model name - """ - # Parse model configs - model_config = json.loads(args['model_config']) - tokenizer_dir = model_config['parameters']['tokenizer_dir'][ - 'string_value'] - - self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_dir, - legacy=False, - padding_side="left", - trust_remote_code=True) - if not self.tokenizer.pad_token: - self.tokenizer.pad_token = self.tokenizer.eos_token - - # Parse model output configs - output_config = pb_utils.get_output_config_by_name( - model_config, "OUTPUT") - - # Convert Triton types to numpy types - self.output_dtype = pb_utils.triton_string_to_numpy( - output_config['data_type']) - - def execute(self, requests): - """`execute` must be implemented in every Python model. `execute` - function receives a list of pb_utils.InferenceRequest as the only - argument. This function is called when an inference is requested - for this model. Depending on the batching configuration (e.g. Dynamic - Batching) used, `requests` may contain multiple requests. 
Every - Python model, must create one pb_utils.InferenceResponse for every - pb_utils.InferenceRequest in `requests`. If there is an error, you can - set the error argument when creating a pb_utils.InferenceResponse. - Parameters - ---------- - requests : list - A list of pb_utils.InferenceRequest - Returns - ------- - list - A list of pb_utils.InferenceResponse. The length of this list must - be the same as `requests` - """ - - responses = [] - - # Every Python backend must iterate over everyone of the requests - # and create a pb_utils.InferenceResponse for each of them. - for idx, request in enumerate(requests): - # Get input tensors - tokens_batch = pb_utils.get_input_tensor_by_name( - request, 'TOKENS_BATCH').as_numpy() - - # Reshape Input - # tokens_batch = tokens_batch.reshape([-1, tokens_batch.shape[0]]) - # tokens_batch = tokens_batch.T - - # Postprocessing output data. - outputs = self._postprocessing(tokens_batch) - - # Create output tensors. You need pb_utils.Tensor - # objects to create pb_utils.InferenceResponse. - output_tensor = pb_utils.Tensor( - 'OUTPUT', - np.array(outputs).astype(self.output_dtype)) - - # Create InferenceResponse. You can set an error here in case - # there was a problem with handling this inference request. - # Below is an example of how you can set errors in inference - # response: - # - # pb_utils.InferenceResponse( - # output_tensors=..., TritonError("An error occurred")) - inference_response = pb_utils.InferenceResponse( - output_tensors=[output_tensor]) - responses.append(inference_response) - - # You should return a list of pb_utils.InferenceResponse. Length - # of this list must match the length of `requests` list. - return responses - - def finalize(self): - """`finalize` is called only once when the model is being unloaded. - Implementing `finalize` function is optional. This function allows - the model to perform any necessary clean ups before exit. - """ - print('Cleaning up...') - - def _postprocessing(self, tokens_batch): - outputs = [] - for beam_tokens in tokens_batch: - for tokens in beam_tokens: - output = self.tokenizer.decode(tokens) - outputs.append(output.encode('utf8')) - return outputs diff --git a/triton_backend/all_models/gpt/postprocessing/config.pbtxt b/triton_backend/all_models/gpt/postprocessing/config.pbtxt deleted file mode 100755 index 04b56cab226..00000000000 --- a/triton_backend/all_models/gpt/postprocessing/config.pbtxt +++ /dev/null @@ -1,32 +0,0 @@ -name: "postprocessing" -backend: "python" -max_batch_size: 1024 -dynamic_batching {} -input [ - { - name: "TOKENS_BATCH" - data_type: TYPE_INT32 - dims: [ -1, -1 ] - } -] -output [ - { - name: "OUTPUT" - data_type: TYPE_STRING - dims: [ -1, -1 ] - } -] - -parameters { - key: "tokenizer_dir" - value: { - string_value: "${tokenizer_dir}" - } -} - -instance_group [ - { - count: 1 - kind: KIND_CPU - } -] diff --git a/triton_backend/all_models/gpt/preprocessing/1/model.py b/triton_backend/all_models/gpt/preprocessing/1/model.py deleted file mode 100644 index a301e16b747..00000000000 --- a/triton_backend/all_models/gpt/preprocessing/1/model.py +++ /dev/null @@ -1,196 +0,0 @@ -# -*- coding: utf-8 -*- -import json -from typing import List - -import numpy as np -import torch -import triton_python_backend_utils as pb_utils -from torch.nn.utils.rnn import pad_sequence -from transformers import AutoTokenizer - - -class TritonPythonModel: - """Your Python model must use the same class name. Every Python model - that is created must have "TritonPythonModel" as the class name. 
- """ - - def initialize(self, args): - """`initialize` is called only once when the model is being loaded. - Implementing `initialize` function is optional. This function allows - the model to initialize any state associated with this model. - Parameters - ---------- - args : dict - Both keys and values are strings. The dictionary keys and values are: - * model_config: A JSON string containing the model configuration - * model_instance_kind: A string containing model instance kind - * model_instance_device_id: A string containing model instance device ID - * model_repository: Model repository path - * model_version: Model version - * model_name: Model name - """ - # Parse model configs - model_config = json.loads(args['model_config']) - tokenizer_dir = model_config['parameters']['tokenizer_dir'][ - 'string_value'] - - self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_dir, - padding_side='left', - legacy=False, - trust_remote_code=True) - if not self.tokenizer.pad_token: - self.tokenizer.pad_token = self.tokenizer.eos_token - - self.pad_id = self.tokenizer.encode(self.tokenizer.pad_token, - add_special_tokens=False)[0] - - # Parse model output configs and convert Triton types to numpy types - input_names = [ - "INPUT_ID", "REQUEST_INPUT_LEN", "BAD_WORDS_IDS", "STOP_WORDS_IDS" - ] - for input_name in input_names: - setattr( - self, - input_name.lower() + "_dtype", - pb_utils.triton_string_to_numpy( - pb_utils.get_output_config_by_name( - model_config, input_name)['data_type'])) - - def execute(self, requests): - """`execute` must be implemented in every Python model. `execute` - function receives a list of pb_utils.InferenceRequest as the only - argument. This function is called when an inference is requested - for this model. Depending on the batching configuration (e.g. Dynamic - Batching) used, `requests` may contain multiple requests. Every - Python model, must create one pb_utils.InferenceResponse for every - pb_utils.InferenceRequest in `requests`. If there is an error, you can - set the error argument when creating a pb_utils.InferenceResponse. - Parameters - ---------- - requests : list - A list of pb_utils.InferenceRequest - Returns - ------- - list - A list of pb_utils.InferenceResponse. The length of this list must - be the same as `requests` - """ - - responses = [] - - # Every Python backend must iterate over everyone of the requests - # and create a pb_utils.InferenceResponse for each of them. - for idx, request in enumerate(requests): - # Get input tensors - query = pb_utils.get_input_tensor_by_name(request, - 'QUERY').as_numpy() - request_output_len = pb_utils.get_input_tensor_by_name( - request, 'REQUEST_OUTPUT_LEN').as_numpy() - - bad_words_dict = pb_utils.get_input_tensor_by_name( - request, 'BAD_WORDS_DICT').as_numpy() - stop_words_dict = pb_utils.get_input_tensor_by_name( - request, 'STOP_WORDS_DICT').as_numpy() - - # Preprocessing input data. - input_id, request_input_len = self._create_request(query) - bad_words = self._to_word_list_format(bad_words_dict) - stop_words = self._to_word_list_format(stop_words_dict) - - # Create output tensors. You need pb_utils.Tensor - # objects to create pb_utils.InferenceResponse. 
- input_id_tensor = pb_utils.Tensor( - 'INPUT_ID', - np.array(input_id).astype(self.input_id_dtype)) - request_input_len_tensor = pb_utils.Tensor( - 'REQUEST_INPUT_LEN', - np.array(request_input_len).astype( - self.request_input_len_dtype)) - request_output_len_tensor = pb_utils.Tensor('REQUEST_OUTPUT_LEN', - request_output_len) - bad_words_ids_tensor = pb_utils.Tensor('BAD_WORDS_IDS', bad_words) - stop_words_ids_tensor = pb_utils.Tensor('STOP_WORDS_IDS', - stop_words) - - # Create InferenceResponse. You can set an error here in case - # there was a problem with handling this inference request. - # Below is an example of how you can set errors in inference - # response: - # - # pb_utils.InferenceResponse( - # output_tensors=..., TritonError("An error occurred")) - inference_response = pb_utils.InferenceResponse(output_tensors=[ - input_id_tensor, bad_words_ids_tensor, stop_words_ids_tensor, - request_input_len_tensor, request_output_len_tensor - ]) - responses.append(inference_response) - - # You should return a list of pb_utils.InferenceResponse. Length - # of this list must match the length of `requests` list. - return responses - - def finalize(self): - """`finalize` is called only once when the model is being unloaded. - Implementing `finalize` function is optional. This function allows - the model to perform any necessary clean ups before exit. - """ - print('Cleaning up...') - - def _create_request(self, query): - """ - query : batch string (2D numpy array) - """ - start_ids = [ - torch.IntTensor(self.tokenizer.encode(s[0].decode())) for s in query - ] - start_lengths = torch.IntTensor([[len(ids)] for ids in start_ids]) - - start_ids = pad_sequence(start_ids, - batch_first=True, - padding_value=self.pad_id) - # input_len = min(start_lengths) - #attn_mask = torch.ones((batch_size, input_len, input_len)).tril() - - return start_ids, start_lengths - - def _to_word_list_format(self, word_lists: List[List[str | bytes]]): - ''' - word_lists format: - len(word_lists) == batch_size - word_lists[i] means the words associated to batch item i. A "word" may actually be any string. Like "lorem" or "lorem ipsum". 
- ''' - assert self.tokenizer != None, "need to set tokenizer" - - if word_lists is None: - # Return an empty array of shape (1,2,0) - return np.empty([1, 2, 0], dtype="int32") - - flat_ids = [] - offsets = [] - for word_list in word_lists: - item_flat_ids = [] - item_offsets = [] - - for word in word_list: - if isinstance(word, bytes): - word = word.decode() - - ids = self.tokenizer.encode(word, add_special_tokens=False) - - if len(ids) == 0: - continue - - item_flat_ids += ids - item_offsets.append(len(ids)) - - flat_ids.append(np.array(item_flat_ids)) - offsets.append(np.cumsum(np.array(item_offsets))) - - pad_to = max(1, max(len(ids) for ids in flat_ids)) - - for i, (ids, offs) in enumerate(zip(flat_ids, offsets)): - flat_ids[i] = np.pad(ids, (0, pad_to - len(ids)), constant_values=0) - offsets[i] = np.pad(offs, (0, pad_to - len(offs)), - constant_values=-1) - - return np.array([flat_ids, offsets], dtype="int32").transpose((1, 0, 2)) diff --git a/triton_backend/all_models/gpt/preprocessing/config.pbtxt b/triton_backend/all_models/gpt/preprocessing/config.pbtxt deleted file mode 100644 index c36b2b81618..00000000000 --- a/triton_backend/all_models/gpt/preprocessing/config.pbtxt +++ /dev/null @@ -1,71 +0,0 @@ -name: "preprocessing" -backend: "python" -max_batch_size: 1024 -input [ - { - name: "QUERY" - data_type: TYPE_STRING - dims: [ -1 ] - }, - { - name: "BAD_WORDS_DICT" - data_type: TYPE_STRING - dims: [ -1 ] - }, - { - name: "STOP_WORDS_DICT" - data_type: TYPE_STRING - dims: [ -1 ] - }, - { - name: "REQUEST_OUTPUT_LEN" - data_type: TYPE_INT32 - dims: [ -1 ] - } -] -output [ - { - name: "INPUT_ID" - data_type: TYPE_INT32 - dims: [ -1 ] - }, - { - name: "REQUEST_INPUT_LEN" - data_type: TYPE_INT32 - dims: [ 1 ] - }, - { - name: "BAD_WORDS_IDS" - data_type: TYPE_INT32 - dims: [ 2, -1 ] - }, - { - name: "STOP_WORDS_IDS" - data_type: TYPE_INT32 - dims: [ 2, -1 ] - }, - { - name: "REQUEST_OUTPUT_LEN" - data_type: TYPE_INT32 - dims: [ -1 ] - }, - { - name: "PROMPT_LEARNING_TASK_NAME_IDS" - data_type: TYPE_INT32 - dims: [ 1 ] - } -] - -parameters { - key: "tokenizer_dir" - value: { - string_value: "${tokenizer_dir}" - } -} - -instance_group [ - { - count: 1 - kind: KIND_CPU - } -] diff --git a/triton_backend/all_models/gpt/tensorrt_llm/1/model.py b/triton_backend/all_models/gpt/tensorrt_llm/1/model.py deleted file mode 100644 index cb8cad377fe..00000000000 --- a/triton_backend/all_models/gpt/tensorrt_llm/1/model.py +++ /dev/null @@ -1,203 +0,0 @@ -import json - -import torch -import triton_python_backend_utils as pb_utils -from torch import from_numpy - -from tensorrt_llm.runtime import ModelRunner, SamplingConfig - - -def mpi_comm(): - from mpi4py import MPI - return MPI.COMM_WORLD - - -def mpi_rank(): - return mpi_comm().Get_rank() - - -def get_engine_name(model, dtype, tp_size, rank): - return '{}_{}_tp{}_rank{}.engine'.format(model, dtype, tp_size, rank) - - -def get_input_tensor_by_name(request, name): - tensor = pb_utils.get_input_tensor_by_name(request, name) - if tensor is not None: - # Triton tensor -> numpy tensor -> PyTorch tensor - return from_numpy(tensor.as_numpy()) - else: - return tensor - - -def get_input_scalar_by_name(request, name): - tensor = pb_utils.get_input_tensor_by_name(request, name) - if tensor is not None: - # Triton tensor -> numpy tensor -> first scalar - tensor = tensor.as_numpy() - return tensor.reshape((tensor.size, ))[0] - else: - return tensor - - -class TritonPythonModel: - """Your Python model must use the same class name. 
Every Python model - that is created must have "TritonPythonModel" as the class name. - """ - - def initialize(self, args): - """`initialize` is called only once when the model is being loaded. - Implementing `initialize` function is optional. This function allows - the model to initialize any state associated with this model. - - Parameters - ---------- - args : dict - Both keys and values are strings. The dictionary keys and values are: - * model_config: A JSON string containing the model configuration - * model_instance_kind: A string containing model instance kind - * model_instance_device_id: A string containing model instance device ID - * model_repository: Model repository path - * model_version: Model version - * model_name: Model name - """ - model_config = json.loads(args['model_config']) - engine_dir = model_config['parameters']['engine_dir']['string_value'] - self.comm = mpi_comm() - self.rank = mpi_rank() - self.runner = ModelRunner.from_dir(engine_dir=engine_dir, - rank=self.rank) - if self.rank != 0: - while (True): - self.execute([None]) - - def execute(self, requests): - """`execute` must be implemented in every Python model. `execute` - function receives a list of pb_utils.InferenceRequest as the only - argument. This function is called when an inference is requested - for this model. - - Parameters - ---------- - requests : list - A list of pb_utils.InferenceRequest - - Returns - ------- - list - A list of pb_utils.InferenceResponse. The length of this list must - be the same as `requests` - """ - responses = [] - - # Every Python backend must iterate through list of requests and create - # an instance of pb_utils.InferenceResponse class for each of them. You - # should avoid storing any of the input Tensors in the class attributes - # as they will be overridden in subsequent inference requests. You can - # make a copy of the underlying NumPy array and store it if it is - # required. - for request in requests: - # Perform inference on the request and append it to responses list... 
- inputs = {} - if self.rank == 0: - inputs['input_ids'] = get_input_tensor_by_name( - request, 'input_ids') - inputs['input_lengths'] = get_input_tensor_by_name( - request, 'input_lengths') - inputs['request_output_len'] = get_input_scalar_by_name( - request, 'request_output_len') - inputs['end_id'] = get_input_scalar_by_name(request, 'end_id') - inputs['pad_id'] = get_input_scalar_by_name(request, 'pad_id') - inputs['beam_width'] = get_input_scalar_by_name( - request, 'beam_width') - inputs['temperature'] = get_input_scalar_by_name( - request, 'temperature') - inputs['runtime_top_k'] = get_input_scalar_by_name( - request, 'runtime_top_k') - inputs['runtime_top_p'] = get_input_scalar_by_name( - request, 'runtime_top_p') - inputs['len_penalty'] = get_input_scalar_by_name( - request, 'len_penalty') - inputs['repetition_penalty'] = get_input_scalar_by_name( - request, 'repetition_penalty') - inputs['min_tokens'] = get_input_scalar_by_name( - request, 'min_tokens') - inputs['presence_penalty'] = get_input_scalar_by_name( - request, 'presence_penalty') - inputs['frequency_penalty'] = get_input_scalar_by_name( - request, 'frequency_penalty') - inputs['seed'] = get_input_scalar_by_name(request, 'seed') - inputs['output_log_probs'] = get_input_scalar_by_name( - request, 'output_log_probs') - - # Broadcast requests to other clients - inputs = self.comm.bcast(inputs, root=0) - input_ids = inputs['input_ids'].cuda() - end_id = inputs['end_id'] - pad_id = inputs['pad_id'] - - sampling_config = SamplingConfig(end_id=end_id, pad_id=pad_id) - if inputs['beam_width'] is not None: - sampling_config.num_beams = inputs['beam_width'] - if inputs['temperature'] is not None: - sampling_config.temperature = inputs['temperature'] - if inputs['runtime_top_k'] is not None: - sampling_config.top_k = inputs['runtime_top_k'] - if inputs['runtime_top_p'] is not None: - sampling_config.top_p = inputs['runtime_top_p'] - if inputs['len_penalty'] is not None: - sampling_config.length_penalty = inputs['len_penalty'] - if inputs['repetition_penalty'] is not None: - sampling_config.repetition_penalty = inputs[ - 'repetition_penalty'] - if inputs['min_tokens'] is not None: - sampling_config.min_tokens = inputs['min_tokens'] - if inputs['presence_penalty'] is not None: - sampling_config.presence_penalty = inputs['presence_penalty'] - if inputs['frequency_penalty'] is not None: - sampling_config.frequency_penalty = inputs['frequency_penalty'] - sampling_config.seed = inputs['seed'] - sampling_config.output_log_probs = inputs['output_log_probs'] - sampling_config.return_dict = True - - outputs = self.runner.generate(input_ids, None, sampling_config) - output_ids = outputs["output_ids"] - - if self.rank == 0: - # Create output tensors. You need pb_utils.Tensor - # objects to create pb_utils.InferenceResponse. - torch.cuda.synchronize() - output_tensors = [ - pb_utils.Tensor("output_ids", - output_ids.cpu().numpy()) - ] - - if sampling_config.output_log_probs: - # [max_seq_len, batch_size, num_beams] -> [batch_size, max_seq_len, num_beams] - log_probs = self.runner.session.log_probs_tiled.transpose( - 0, 1).cpu().numpy() - output_tensors.append( - pb_utils.Tensor("log_probs", log_probs)) - - # Create InferenceResponse. You can set an error here in case - # there was a problem with handling this inference request. 
- # Below is an example of how you can set errors in inference - # response: - # - # pb_utils.InferenceResponse( - # output_tensors=..., TritonError("An error occurred")) - - inference_response = pb_utils.InferenceResponse(output_tensors) - else: - inference_response = pb_utils.InferenceResponse([]) - responses.append(inference_response) - - # You must return a list of pb_utils.InferenceResponse. Length - # of this list must match the length of `requests` list. - return responses - - def finalize(self): - """`finalize` is called only once when the model is being unloaded. - Implementing `finalize` function is optional. This function allows - the model to perform any necessary clean ups before exit. - """ - return diff --git a/triton_backend/all_models/gpt/tensorrt_llm/config.pbtxt b/triton_backend/all_models/gpt/tensorrt_llm/config.pbtxt deleted file mode 100644 index 5d2108fc3ea..00000000000 --- a/triton_backend/all_models/gpt/tensorrt_llm/config.pbtxt +++ /dev/null @@ -1,146 +0,0 @@ -name: "tensorrt_llm" -backend: "python" -max_batch_size: 1024 - -# # Uncomment this for dynamic_batching -# dynamic_batching { -# max_queue_delay_microseconds: 50000 -# } - -input [ - { - name: "input_ids" - data_type: TYPE_INT32 - dims: [ -1 ] - }, - { - name: "input_lengths" - data_type: TYPE_INT32 - dims: [ 1 ] - reshape: { shape: [ ] } - }, - { - name: "request_output_len" - data_type: TYPE_INT32 - dims: [ -1 ] - }, - { - name: "end_id" - data_type: TYPE_INT32 - dims: [ 1 ] - reshape: { shape: [ ] } - }, - { - name: "pad_id" - data_type: TYPE_INT32 - dims: [ 1 ] - reshape: { shape: [ ] } - }, - { - name: "beam_width" - data_type: TYPE_INT32 - dims: [ 1 ] - reshape: { shape: [ ] } - optional: true - }, - { - name: "temperature" - data_type: TYPE_FP32 - dims: [ 1 ] - reshape: { shape: [ ] } - optional: true - }, - { - name: "runtime_top_k" - data_type: TYPE_INT32 - dims: [ 1 ] - reshape: { shape: [ ] } - optional: true - }, - { - name: "runtime_top_p" - data_type: TYPE_FP32 - dims: [ 1 ] - reshape: { shape: [ ] } - optional: true - }, - { - name: "len_penalty" - data_type: TYPE_FP32 - dims: [ 1 ] - reshape: { shape: [ ] } - optional: true - }, - { - name: "repetition_penalty" - data_type: TYPE_FP32 - dims: [ 1 ] - reshape: { shape: [ ] } - optional: true - }, - { - name: "min_tokens" - data_type: TYPE_INT32 - dims: [ 1 ] - reshape: { shape: [ ] } - optional: true - }, - { - name: "presence_penalty" - data_type: TYPE_FP32 - dims: [ 1 ] - reshape: { shape: [ ] } - optional: true - }, - { - name: "frequency_penalty" - data_type: TYPE_FP32 - dims: [ 1 ] - reshape: { shape: [ ] } - optional: true - }, - { - name: "seed" - data_type: TYPE_UINT64 - dims: [ 1 ] - reshape: { shape: [ ] } - optional: true - }, - { - name: "output_log_probs" - data_type: TYPE_BOOL - dims: [ 1 ] - reshape: { shape: [ ] } - optional: true - } -] -output [ - { - name: "output_ids" - data_type: TYPE_INT32 - dims: [ -1, -1 ] - }, - { - name: "log_probs" - data_type: TYPE_FP32 - dims: [ -1, -1 ] - } -] -instance_group [ - { - count: 1 - kind : KIND_CPU - } -] -parameters { - key: "engine_dir" - value: { - string_value: "${engine_dir}" - } -} -parameters: { - key: "FORCE_CPU_ONLY_INPUT_TENSORS" - value: { - string_value: "no" - } -} diff --git a/triton_backend/all_models/inflight_batcher_llm/ensemble/1/.tmp b/triton_backend/all_models/inflight_batcher_llm/ensemble/1/.tmp deleted file mode 100644 index e69de29bb2d..00000000000 diff --git a/triton_backend/all_models/inflight_batcher_llm/ensemble/config.pbtxt 
b/triton_backend/all_models/inflight_batcher_llm/ensemble/config.pbtxt deleted file mode 100644 index c07a5ab6de4..00000000000 --- a/triton_backend/all_models/inflight_batcher_llm/ensemble/config.pbtxt +++ /dev/null @@ -1,712 +0,0 @@ -# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions -# are met: -# * Redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer. -# * Redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the distribution. -# * Neither the name of NVIDIA CORPORATION nor the names of its -# contributors may be used to endorse or promote products derived -# from this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY -# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR -# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY -# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
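The ensemble configuration below strings preprocessing, tensorrt_llm and postprocessing together behind a single model name, so a client only sends text_input and max_tokens (plus optional sampling inputs) and reads back text_output. A rough sketch of such a call with the Triton HTTP client (hypothetical server address and prompt, not taken from this repository):

import numpy as np
import tritonclient.http as httpclient

client = httpclient.InferenceServerClient("localhost:8000")   # assumed endpoint
text = np.array([["What is in-flight batching?"]], dtype=object)
max_tokens = np.array([[64]], dtype=np.int32)

inputs = [
    httpclient.InferInput("text_input", list(text.shape), "BYTES"),
    httpclient.InferInput("max_tokens", list(max_tokens.shape), "INT32"),
]
inputs[0].set_data_from_numpy(text)
inputs[1].set_data_from_numpy(max_tokens)

result = client.infer("ensemble", inputs,
                      outputs=[httpclient.InferRequestedOutput("text_output")])
print(result.as_numpy("text_output"))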
- -name: "ensemble" -platform: "ensemble" -max_batch_size: ${triton_max_batch_size} -input [ - { - name: "text_input" - data_type: TYPE_STRING - dims: [ 1 ] - }, - { - name: "decoder_text_input" - data_type: TYPE_STRING - dims: [ 1 ] - optional: true - }, - { - name: "max_tokens" - data_type: TYPE_INT32 - dims: [ 1 ] - }, - { - name: "num_return_sequences" - data_type: TYPE_INT32 - dims: [ 1 ] - optional: true - }, - { - name: "bad_words" - data_type: TYPE_STRING - dims: [ -1 ] - optional: true - }, - { - name: "stop_words" - data_type: TYPE_STRING - dims: [ -1 ] - optional: true - }, - { - name: "exclude_input_in_output" - data_type: TYPE_BOOL - dims: [ 1 ] - optional: true - }, - { - name: "end_id" - data_type: TYPE_INT32 - dims: [ 1 ] - optional: true - }, - { - name: "pad_id" - data_type: TYPE_INT32 - dims: [ 1 ] - optional: true - }, - { - name: "top_k" - data_type: TYPE_INT32 - dims: [ 1 ] - optional: true - }, - { - name: "top_p" - data_type: TYPE_FP32 - dims: [ 1 ] - optional: true - }, - { - name: "temperature" - data_type: TYPE_FP32 - dims: [ 1 ] - optional: true - }, - { - name: "length_penalty" - data_type: TYPE_FP32 - dims: [ 1 ] - optional: true - }, - { - name: "repetition_penalty" - data_type: TYPE_FP32 - dims: [ 1 ] - optional: true - }, - { - name: "min_tokens" - data_type: TYPE_INT32 - dims: [ 1 ] - optional: true - }, - { - name: "presence_penalty" - data_type: TYPE_FP32 - dims: [ 1 ] - optional: true - }, - { - name: "frequency_penalty" - data_type: TYPE_FP32 - dims: [ 1 ] - optional: true - }, - { - name: "seed" - data_type: TYPE_UINT64 - dims: [ 1 ] - optional: true - }, - { - name: "return_log_probs" - data_type: TYPE_BOOL - dims: [ 1 ] - optional: true - }, - { - name: "return_context_logits" - data_type: TYPE_BOOL - dims: [ 1 ] - optional: true - }, - { - name: "return_generation_logits" - data_type: TYPE_BOOL - dims: [ 1 ] - optional: true - }, - { - name: "return_perf_metrics" - data_type: TYPE_BOOL - dims: [ 1 ] - optional: true - }, - { - name: "return_num_input_tokens" - data_type: TYPE_BOOL - dims: [ 1 ] - optional: true - }, - { - name: "return_num_output_tokens" - data_type: TYPE_BOOL - dims: [ 1 ] - optional: true - }, - { - name: "beam_width" - data_type: TYPE_INT32 - dims: [ 1 ] - optional: true - }, - { - name: "stream" - data_type: TYPE_BOOL - dims: [ 1 ] - optional: true - }, - { - name: "prompt_embedding_table" - data_type: TYPE_FP16 - dims: [ -1, -1 ] - optional: true - }, - { - name: "prompt_table_extra_id" - data_type: TYPE_UINT64 - dims: [ 1 ] - optional: true - }, - { - name: "prompt_vocab_size" - data_type: TYPE_INT32 - dims: [ 1 ] - optional: true - }, - { - name: "embedding_bias_words" - data_type: TYPE_STRING - dims: [ -1 ] - optional: true - }, - { - name: "embedding_bias_weights" - data_type: TYPE_FP32 - dims: [ -1 ] - optional: true - }, - # the unique task ID for the given LoRA. - # To perform inference with a specific LoRA for the first time `lora_task_id` `lora_weights` and `lora_config` must all be given. - # The LoRA will be cached, so that subsequent requests for the same task only require `lora_task_id`. - # If the cache is full the oldest LoRA will be evicted to make space for new ones. An error is returned if `lora_task_id` is not cached. - { - name: "lora_task_id" - data_type: TYPE_UINT64 - dims: [ 1 ] - optional: true - }, - # weights for a lora adapter shape [ num_lora_modules_layers, D x Hi + Ho x D ] - # where the last dimension holds the in / out adapter weights for the associated module (e.g. 
attn_qkv) and model layer - # each of the in / out tensors are first flattened and then concatenated together in the format above. - # D=adapter_size (R value), Hi=hidden_size_in, Ho=hidden_size_out. - { - name: "lora_weights" - data_type: TYPE_FP16 - dims: [ -1, -1 ] - optional: true - allow_ragged_batch: true - }, - # module identifier (same size a first dimension of lora_weights) - # See LoraModule::ModuleType for model id mapping - # - # "attn_qkv": 0 # compbined qkv adapter - # "attn_q": 1 # q adapter - # "attn_k": 2 # k adapter - # "attn_v": 3 # v adapter - # "attn_dense": 4 # adapter for the dense layer in attention - # "mlp_h_to_4h": 5 # for llama2 adapter for gated mlp layer after attention / RMSNorm: up projection - # "mlp_4h_to_h": 6 # for llama2 adapter for gated mlp layer after attention / RMSNorm: down projection - # "mlp_gate": 7 # for llama2 adapter for gated mlp later after attention / RMSNorm: gate - # - # last dim holds [ module_id, layer_idx, adapter_size (D aka R value) ] - { - name: "lora_config" - data_type: TYPE_INT32 - dims: [ -1, 3 ] - optional: true - allow_ragged_batch: true - }, - { - name: "guided_decoding_guide_type" - data_type: TYPE_STRING - dims: [ 1 ] - optional: true - allow_ragged_batch: true - }, - { - name: "guided_decoding_guide" - data_type: TYPE_STRING - dims: [ 1 ] - optional: true - allow_ragged_batch: true - } -] -output [ - { - name: "text_output" - data_type: TYPE_STRING - dims: [ -1 ] - }, - { - name: "cum_log_probs" - data_type: TYPE_FP32 - dims: [ -1 ] - }, - { - name: "output_log_probs" - data_type: TYPE_FP32 - dims: [ -1, -1 ] - }, - { - name: "context_logits" - data_type: ${logits_datatype} - dims: [ -1, -1 ] - }, - { - name: "generation_logits" - data_type: ${logits_datatype} - dims: [ -1, -1, -1 ] - }, - { - name: "batch_index" - data_type: TYPE_INT32 - dims: [ 1 ] - }, - { - name: "sequence_index" - data_type: TYPE_INT32 - dims: [ 1 ] - }, - { - name: "kv_cache_alloc_new_blocks" - data_type: TYPE_INT32 - dims: [ 1 ] - }, - { - name: "kv_cache_reused_blocks" - data_type: TYPE_INT32 - dims: [ 1 ] - }, - { - name: "kv_cache_alloc_total_blocks" - data_type: TYPE_INT32 - dims: [ 1 ] - }, - { - name: "kv_cache_alloc_total_blocks" - data_type: TYPE_INT32 - dims: [ 1 ] - }, - { - name: "arrival_time_ns" - data_type: TYPE_INT64 - dims: [ 1 ] - }, - { - name: "first_scheduled_time_ns" - data_type: TYPE_INT64 - dims: [ 1 ] - }, - { - name: "first_token_time_ns" - data_type: TYPE_INT64 - dims: [ 1 ] - }, - { - name: "last_token_time_ns" - data_type: TYPE_INT64 - dims: [ 1 ] - }, - { - name: "acceptance_rate" - data_type: TYPE_FP32 - dims: [ 1 ] - }, - { - name: "total_accepted_draft_tokens" - data_type: TYPE_INT32 - dims: [ 1 ] - }, - { - name: "total_draft_tokens" - data_type: TYPE_INT32 - dims: [ 1 ] - }, - { - name: "num_input_tokens" - data_type: TYPE_INT32 - dims: [ 1 ] - }, - { - name: "num_output_tokens" - data_type: TYPE_INT32 - dims: [ 1 ] - } -] -ensemble_scheduling { - step [ - { - model_name: "preprocessing" - model_version: -1 - input_map { - key: "QUERY" - value: "text_input" - } - input_map { - key: "DECODER_QUERY" - value: "decoder_text_input" - } - input_map { - key: "REQUEST_OUTPUT_LEN" - value: "max_tokens" - } - input_map { - key: "BAD_WORDS_DICT" - value: "bad_words" - } - input_map { - key: "STOP_WORDS_DICT" - value: "stop_words" - } - input_map { - key: "EMBEDDING_BIAS_WORDS" - value: "embedding_bias_words" - } - input_map { - key: "EMBEDDING_BIAS_WEIGHTS" - value: "embedding_bias_weights" - } - input_map { - key: "END_ID" 
- value: "end_id" - } - input_map { - key: "PAD_ID" - value: "pad_id" - } - input_map { - key: "PROMPT_TABLE_EXTRA_ID" - value: "prompt_table_extra_id" - } - output_map { - key: "REQUEST_INPUT_LEN" - value: "_REQUEST_INPUT_LEN" - } - output_map { - key: "INPUT_ID" - value: "_INPUT_ID" - } - output_map { - key: "REQUEST_DECODER_INPUT_LEN" - value: "_REQUEST_DECODER_INPUT_LEN" - } - output_map { - key: "DECODER_INPUT_ID" - value: "_DECODER_INPUT_ID" - } - output_map { - key: "REQUEST_OUTPUT_LEN" - value: "_REQUEST_OUTPUT_LEN" - } - output_map { - key: "STOP_WORDS_IDS" - value: "_STOP_WORDS_IDS" - } - output_map { - key: "BAD_WORDS_IDS" - value: "_BAD_WORDS_IDS" - } - output_map { - key: "EMBEDDING_BIAS" - value: "_EMBEDDING_BIAS" - } - output_map { - key: "OUT_END_ID" - value: "_PREPROCESSOR_END_ID" - } - output_map { - key: "OUT_PAD_ID" - value: "_PREPROCESSOR_PAD_ID" - } - output_map { - key: "OUT_PROMPT_TABLE_EXTRA_IDS" - value: "_OUT_PROMPT_TABLE_EXTRA_IDS" - } - }, - { - model_name: "tensorrt_llm" - model_version: -1 - input_map { - key: "input_ids" - value: "_INPUT_ID" - } - input_map { - key: "decoder_input_ids" - value: "_DECODER_INPUT_ID" - } - input_map { - key: "input_lengths" - value: "_REQUEST_INPUT_LEN" - } - input_map { - key: "decoder_input_lengths" - value: "_REQUEST_DECODER_INPUT_LEN" - } - input_map { - key: "exclude_input_in_output" - value: "exclude_input_in_output" - } - input_map { - key: "request_output_len" - value: "_REQUEST_OUTPUT_LEN" - } - input_map { - key: "end_id" - value: "_PREPROCESSOR_END_ID" - } - input_map { - key: "pad_id" - value: "_PREPROCESSOR_PAD_ID" - } - input_map { - key: "embedding_bias" - value: "_EMBEDDING_BIAS" - } - input_map { - key: "runtime_top_k" - value: "top_k" - } - input_map { - key: "runtime_top_p" - value: "top_p" - } - input_map { - key: "temperature" - value: "temperature" - } - input_map { - key: "len_penalty" - value: "length_penalty" - } - input_map { - key: "repetition_penalty" - value: "repetition_penalty" - } - input_map { - key: "min_tokens" - value: "min_tokens" - } - input_map { - key: "presence_penalty" - value: "presence_penalty" - } - input_map { - key: "frequency_penalty" - value: "frequency_penalty" - } - input_map { - key: "seed" - value: "seed" - } - input_map { - key: "return_log_probs" - value: "return_log_probs" - } - input_map { - key: "return_context_logits" - value: "return_context_logits" - } - input_map { - key: "return_generation_logits" - value: "return_generation_logits" - } - input_map { - key: "return_perf_metrics" - value: "return_perf_metrics" - } - input_map { - key: "return_num_input_tokens" - value: "return_num_input_tokens" - } - input_map { - key: "return_num_output_tokens" - value: "return_num_output_tokens" - } - input_map { - key: "num_return_sequences" - value: "num_return_sequences" - } - input_map { - key: "beam_width" - value: "beam_width" - } - input_map { - key: "streaming" - value: "stream" - } - input_map { - key: "prompt_embedding_table" - value: "prompt_embedding_table" - } - input_map { - key: "prompt_vocab_size" - value: "prompt_vocab_size" - } - input_map { - key: "stop_words_list" - value: "_STOP_WORDS_IDS" - } - input_map { - key: "bad_words_list" - value: "_BAD_WORDS_IDS" - } - input_map { - key: "prompt_table_extra_ids" - value: "_OUT_PROMPT_TABLE_EXTRA_IDS" - }, - input_map { - key: "lora_task_id", - value: "lora_task_id" - }, - input_map { - key: "lora_weights", - value: "lora_weights" - }, - input_map { - key: "lora_config", - value: "lora_config" - }, - input_map { - key: 
"guided_decoding_guide_type", - value: "guided_decoding_guide_type" - }, - input_map { - key: "guided_decoding_guide", - value: "guided_decoding_guide" - } - output_map { - key: "output_ids" - value: "_TOKENS_BATCH" - } - output_map { - key: "sequence_length" - value: "_SEQUENCE_LENGTH" - }, - output_map { - key: "cum_log_probs" - value: "cum_log_probs" - } - output_map { - key: "output_log_probs" - value: "output_log_probs" - }, - output_map { - key: "context_logits" - value: "context_logits" - }, - output_map { - key: "generation_logits" - value: "generation_logits" - }, - output_map { - key: "batch_index" - value: "batch_index" - }, - output_map { - key: "sequence_index" - value: "sequence_index" - }, - output_map { - key: "kv_cache_alloc_new_blocks" - value: "kv_cache_alloc_new_blocks" - }, - output_map { - key: "kv_cache_reused_blocks" - value: "kv_cache_reused_blocks" - }, - output_map { - key: "kv_cache_alloc_total_blocks" - value: "kv_cache_alloc_total_blocks" - }, - output_map { - key: "arrival_time_ns" - value: "arrival_time_ns" - }, - output_map { - key: "first_scheduled_time_ns" - value: "first_scheduled_time_ns" - }, - output_map { - key: "first_token_time_ns" - value: "first_token_time_ns" - }, - output_map { - key: "last_token_time_ns" - value: "last_token_time_ns" - }, - output_map { - key: "acceptance_rate" - value: "acceptance_rate" - }, - output_map { - key: "total_accepted_draft_tokens" - value: "total_accepted_draft_tokens" - }, - output_map { - key: "total_draft_tokens" - value: "total_draft_tokens" - }, - output_map { - key: "num_input_tokens" - value: "num_input_tokens" - }, - output_map { - key: "num_output_tokens" - value: "num_output_tokens" - } - }, - { - model_name: "postprocessing" - model_version: -1 - input_map { - key: "TOKENS_BATCH" - value: "_TOKENS_BATCH" - } - input_map { - key: "SEQUENCE_LENGTH" - value: "_SEQUENCE_LENGTH" - } - output_map { - key: "OUTPUT" - value: "text_output" - } - } - ] -} diff --git a/triton_backend/all_models/inflight_batcher_llm/postprocessing/1/model.py b/triton_backend/all_models/inflight_batcher_llm/postprocessing/1/model.py deleted file mode 100644 index 70d88ae785c..00000000000 --- a/triton_backend/all_models/inflight_batcher_llm/postprocessing/1/model.py +++ /dev/null @@ -1,177 +0,0 @@ -# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions -# are met: -# * Redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer. -# * Redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the distribution. -# * Neither the name of NVIDIA CORPORATION nor the names of its -# contributors may be used to endorse or promote products derived -# from this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY -# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -# PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR -# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY -# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -import json - -import numpy as np -import triton_python_backend_utils as pb_utils -from transformers import AutoTokenizer - - -class TritonPythonModel: - """Your Python model must use the same class name. Every Python model - that is created must have "TritonPythonModel" as the class name. - """ - - def initialize(self, args): - """`initialize` is called only once when the model is being loaded. - Implementing `initialize` function is optional. This function allows - the model to initialize any state associated with this model. - Parameters - ---------- - args : dict - Both keys and values are strings. The dictionary keys and values are: - * model_config: A JSON string containing the model configuration - * model_instance_kind: A string containing model instance kind - * model_instance_device_id: A string containing model instance device ID - * model_repository: Model repository path - * model_version: Model version - * model_name: Model name - """ - # Parse model configs - model_config = json.loads(args['model_config']) - tokenizer_dir = model_config['parameters']['tokenizer_dir'][ - 'string_value'] - - skip_special_tokens = model_config['parameters'].get( - 'skip_special_tokens') - if skip_special_tokens is not None: - skip_special_tokens_str = skip_special_tokens['string_value'].lower( - ) - if skip_special_tokens_str in [ - 'true', 'false', '1', '0', 't', 'f', 'y', 'n', 'yes', 'no' - ]: - self.skip_special_tokens = skip_special_tokens_str in [ - 'true', '1', 't', 'y', 'yes' - ] - else: - print( - f"[TensorRT-LLM][WARNING] Don't setup 'skip_special_tokens' correctly (set value is {skip_special_tokens['string_value']}). Set it as True by default." - ) - self.skip_special_tokens = True - else: - print( - f"[TensorRT-LLM][WARNING] Don't setup 'skip_special_tokens'. Set it as True by default." - ) - self.skip_special_tokens = True - - self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_dir, - legacy=False, - padding_side='left', - trust_remote_code=True) - if not self.tokenizer.pad_token: - self.tokenizer.pad_token = self.tokenizer.eos_token - - # Parse model output configs - output_config = pb_utils.get_output_config_by_name( - model_config, "OUTPUT") - - # Convert Triton types to numpy types - self.output_dtype = pb_utils.triton_string_to_numpy( - output_config['data_type']) - - def execute(self, requests): - """`execute` must be implemented in every Python model. `execute` - function receives a list of pb_utils.InferenceRequest as the only - argument. This function is called when an inference is requested - for this model. Depending on the batching configuration (e.g. Dynamic - Batching) used, `requests` may contain multiple requests. Every - Python model, must create one pb_utils.InferenceResponse for every - pb_utils.InferenceRequest in `requests`. If there is an error, you can - set the error argument when creating a pb_utils.InferenceResponse. 
- Parameters - ---------- - requests : list - A list of pb_utils.InferenceRequest - Returns - ------- - list - A list of pb_utils.InferenceResponse. The length of this list must - be the same as `requests` - """ - - tokens_batch = [] - sequence_lengths = [] - for idx, request in enumerate(requests): - for input_tensor in request.inputs(): - if input_tensor.name() == "TOKENS_BATCH": - tokens_batch.append(input_tensor.as_numpy()) - elif input_tensor.name() == "SEQUENCE_LENGTH": - sequence_lengths.append(input_tensor.as_numpy()) - else: - raise ValueError(f"unknown input {input_tensor.name}") - - # batch decode - list_of_tokens = [] - req_idx_offset = 0 - req_idx_offsets = [req_idx_offset] - for idx, token_batch in enumerate(tokens_batch): - for batch_idx, beam_tokens in enumerate(token_batch): - for beam_idx, tokens in enumerate(beam_tokens): - seq_len = sequence_lengths[idx][batch_idx][beam_idx] - list_of_tokens.append(tokens[:seq_len]) - req_idx_offset += 1 - - req_idx_offsets.append(req_idx_offset) - - all_outputs = self.tokenizer.batch_decode( - list_of_tokens, skip_special_tokens=self.skip_special_tokens) - - # construct responses - responses = [] - for idx, request in enumerate(requests): - req_outputs = [ - x.encode('utf8') - for x in all_outputs[req_idx_offsets[idx]:req_idx_offsets[idx + - 1]] - ] - - output_tensor = pb_utils.Tensor( - 'OUTPUT', - np.array(req_outputs).astype(self.output_dtype)) - - outputs = [output_tensor] - - # Create InferenceResponse. You can set an error here in case - # there was a problem with handling this inference request. - # Below is an example of how you can set errors in inference - # response: - # - # pb_utils.InferenceResponse( - # output_tensors=..., TritonError("An error occurred")) - inference_response = pb_utils.InferenceResponse( - output_tensors=outputs) - responses.append(inference_response) - # You should return a list of pb_utils.InferenceResponse. Length - # of this list must match the length of `requests` list. - return responses - - def finalize(self): - """`finalize` is called only once when the model is being unloaded. - Implementing `finalize` function is optional. This function allows - the model to perform any necessary clean ups before exit. - """ - print('Cleaning up...') diff --git a/triton_backend/all_models/inflight_batcher_llm/postprocessing/config.pbtxt b/triton_backend/all_models/inflight_batcher_llm/postprocessing/config.pbtxt deleted file mode 100644 index 9f3655ef4d7..00000000000 --- a/triton_backend/all_models/inflight_batcher_llm/postprocessing/config.pbtxt +++ /dev/null @@ -1,70 +0,0 @@ -# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions -# are met: -# * Redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer. -# * Redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the distribution. -# * Neither the name of NVIDIA CORPORATION nor the names of its -# contributors may be used to endorse or promote products derived -# from this software without specific prior written permission. 
-# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY -# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR -# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY -# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -name: "postprocessing" -backend: "python" -max_batch_size: ${triton_max_batch_size} -dynamic_batching {} -input [ - { - name: "TOKENS_BATCH" - data_type: TYPE_INT32 - dims: [ -1, -1 ] - }, - { - name: "SEQUENCE_LENGTH" - data_type: TYPE_INT32 - dims: [ -1 ] - } -] -output [ - { - name: "OUTPUT" - data_type: TYPE_STRING - dims: [ -1 ] - } -] - -parameters { - key: "tokenizer_dir" - value: { - string_value: "${tokenizer_dir}" - } -} - -parameters { - key: "skip_special_tokens" - value: { - string_value: "${skip_special_tokens}" - } -} - -instance_group [ - { - count: ${postprocessing_instance_count} - kind: KIND_CPU - } -] diff --git a/triton_backend/all_models/inflight_batcher_llm/preprocessing/1/model.py b/triton_backend/all_models/inflight_batcher_llm/preprocessing/1/model.py deleted file mode 100755 index 5a7d36ac1e8..00000000000 --- a/triton_backend/all_models/inflight_batcher_llm/preprocessing/1/model.py +++ /dev/null @@ -1,1003 +0,0 @@ -# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions -# are met: -# * Redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer. -# * Redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the distribution. -# * Neither the name of NVIDIA CORPORATION nor the names of its -# contributors may be used to endorse or promote products derived -# from this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY -# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR -# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY -# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
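The config.pbtxt files above are templates: values such as ${triton_max_batch_size}, ${tokenizer_dir}, ${skip_special_tokens} and ${postprocessing_instance_count} are placeholders that must be filled in before the model repository is loaded. A minimal stand-in for that substitution step using Python's string.Template (hypothetical path and values, shown only to illustrate the mechanism):

from pathlib import Path
from string import Template

cfg_path = Path("postprocessing/config.pbtxt")   # assumed location of the template
values = {                                       # example values, not project defaults
    "triton_max_batch_size": "64",
    "tokenizer_dir": "/models/tokenizer",
    "skip_special_tokens": "true",
    "postprocessing_instance_count": "1",
}
text = cfg_path.read_text()
cfg_path.write_text(Template(text).safe_substitute(values))  # ${...} placeholders filled in place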
- -import base64 -import io -import json -import os -from collections import defaultdict -from typing import List - -import numpy as np -import requests -import triton_python_backend_utils as pb_utils -from PIL import Image -from transformers import AutoProcessor, AutoTokenizer, T5Tokenizer - - -class TritonPythonModel: - """Your Python model must use the same class name. Every Python model - that is created must have "TritonPythonModel" as the class name. - """ - - def initialize(self, args): - """`initialize` is called only once when the model is being loaded. - Implementing `initialize` function is optional. This function allows - the model to initialize any state associated with this model. - Parameters - ---------- - args : dict - Both keys and values are strings. The dictionary keys and values are: - * model_config: A JSON string containing the model configuration - * model_instance_kind: A string containing model instance kind - * model_instance_device_id: A string containing model instance device ID - * model_repository: Model repository path - * model_version: Model version - * model_name: Model name - """ - # Parse model configs - model_config = json.loads(args['model_config']) - tokenizer_dir = model_config['parameters']['tokenizer_dir'][ - 'string_value'] - - add_special_tokens = model_config['parameters'].get( - 'add_special_tokens') - multimodal_model_path = model_config['parameters'][ - 'multimodal_model_path']['string_value'] - max_num_images = model_config['parameters'].get('max_num_images') - - if max_num_images is not None: - max_num_images_str = max_num_images['string_value'] - if max_num_images_str.isdigit(): - self.max_num_images = int(max_num_images_str) - else: - print( - f"[TensorRT-LLM][WARNING] 'max_num_images' parameter is not set correctly (value is {max_num_images_str}). Will be set to None" - ) - self.max_num_images = None - else: - print( - f"[TensorRT-LLM][WARNING] Don't setup 'max_num_images'. Set it as None by default." - ) - self.max_num_images = None - if multimodal_model_path == "${multimodal_model_path}" or multimodal_model_path == "": - multimodal_model_path = None - - if add_special_tokens is not None: - add_special_tokens_str = add_special_tokens['string_value'].lower() - if add_special_tokens_str in [ - 'true', 'false', '1', '0', 't', 'f', 'y', 'n', 'yes', 'no' - ]: - self.add_special_tokens = add_special_tokens_str in [ - 'true', '1', 't', 'y', 'yes' - ] - else: - print( - f"[TensorRT-LLM][WARNING] Don't setup 'add_special_tokens' correctly (set value is {add_special_tokens['string_value']}). Set it as True by default." - ) - self.add_special_tokens = True - else: - print( - f"[TensorRT-LLM][WARNING] Don't setup 'add_special_tokens'. Set it as True by default." 
- ) - self.add_special_tokens = True - - self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_dir, - legacy=False, - padding_side='left', - trust_remote_code=True) - - if isinstance(self.tokenizer, T5Tokenizer): - self.tokenizer_bos_id = self.tokenizer.sp_model.bos_id() - - if not self.tokenizer.pad_token: - self.tokenizer.pad_token = self.tokenizer.eos_token - - self.tokenizer_end_id = self.tokenizer.encode( - self.tokenizer.eos_token, add_special_tokens=False)[0] - self.tokenizer_pad_id = self.tokenizer.encode( - self.tokenizer.pad_token, add_special_tokens=False)[0] - self.vocab_size = self.tokenizer.vocab_size - - self.is_multimodal = False - self.model_type = None - self.vision_preprocessor = None - - if multimodal_model_path is not None: - self.is_multimodal = True - multimodal_model_path = os.path.join(multimodal_model_path, - 'config.json') - with open(multimodal_model_path, 'r') as f: - visual_model_config = json.load(f) - self.model_type = visual_model_config['builder_config'][ - 'model_type'] - - assert self.model_type in [ - 'llava', 'blip2-opt', 'vila', 'mllama', 'llava_onevision', - 'qwen2_vl' - ], f"[TensorRT-LLM][ERROR] Currently supported multi-modal models are llava, blip2-opt, vila, mllama, llava_onevision and qwen2_vl. Got {self.model_type}." - - assert self.model_type != 'llava_onevision' or self.max_num_images is None or self.max_num_images <= 1, "LLaVA-OneVision does not support multi-image inference currently." - - llm_model_path = model_config['parameters']['gpt_model_path'][ - 'string_value'] - llm_model_path = os.path.join(llm_model_path, 'config.json') - with open(llm_model_path, 'r') as f: - llm_model_config = json.load(f) - self.vocab_size = int( - llm_model_config["pretrained_config"]["vocab_size"]) - self._setup_ptable_shape(llm_model_config) - - if self.model_type in ['mllama', 'llava_onevision', 'qwen2_vl']: - self.vision_preprocessor = VisionPreProcessor( - self.model_type, - AutoProcessor.from_pretrained(tokenizer_dir), model_config) - - # Parse model output configs and convert Triton types to numpy types - output_names = [ - "INPUT_ID", "DECODER_INPUT_ID", "REQUEST_INPUT_LEN", - "REQUEST_DECODER_INPUT_LEN", "BAD_WORDS_IDS", "STOP_WORDS_IDS", - "OUT_END_ID", "OUT_PAD_ID", "OUT_PROMPT_TABLE_EXTRA_IDS", - "PIXEL_VALUES", "IMAGE_SIZES" - ] - input_names = ["EMBEDDING_BIAS_WORDS", "EMBEDDING_BIAS_WEIGHTS"] - for input_name in input_names: - setattr( - self, - input_name.lower() + "_dtype", - pb_utils.triton_string_to_numpy( - pb_utils.get_input_config_by_name(model_config, - input_name)['data_type'])) - - for output_name in output_names: - setattr( - self, - output_name.lower() + "_dtype", - pb_utils.triton_string_to_numpy( - pb_utils.get_output_config_by_name( - model_config, output_name)['data_type'])) - - def _setup_ptable_shape(self, llm_model_config): - max_prompt_embedding_table_size = llm_model_config['build_config'][ - 'max_prompt_embedding_table_size'] - max_batch_size = llm_model_config['build_config']['max_batch_size'] - - num_multimodal_features = max_prompt_embedding_table_size // max_batch_size - hidden_size = llm_model_config['pretrained_config']['hidden_size'] - if self.max_num_images is not None: - num_multimodal_features = num_multimodal_features // self.max_num_images - - self.ptable_shape = (-1, num_multimodal_features, hidden_size) - - def execute(self, requests): - """`execute` must be implemented in every Python model. `execute` - function receives a list of pb_utils.InferenceRequest as the only - argument.
This function is called when an inference is requested - for this model. Depending on the batching configuration (e.g. Dynamic - Batching) used, `requests` may contain multiple requests. Every - Python model, must create one pb_utils.InferenceResponse for every - pb_utils.InferenceRequest in `requests`. If there is an error, you can - set the error argument when creating a pb_utils.InferenceResponse. - Parameters - ---------- - requests : list - A list of pb_utils.InferenceRequest - Returns - ------- - list - A list of pb_utils.InferenceResponse. The length of this list must - be the same as `requests` - """ - - responses = [] - - # Every Python backend must iterate over everyone of the requests - # and create a pb_utils.InferenceResponse for each of them. - for idx, request in enumerate(requests): - # Get input tensors - query = pb_utils.get_input_tensor_by_name(request, - 'QUERY').as_numpy() - batch_size = query.shape[0] - - decoder_query = pb_utils.get_input_tensor_by_name( - request, 'DECODER_QUERY') - if decoder_query is not None: - decoder_query = decoder_query.as_numpy() - - request_output_len = pb_utils.get_input_tensor_by_name( - request, 'REQUEST_OUTPUT_LEN').as_numpy() - - bad_words_dict = pb_utils.get_input_tensor_by_name( - request, 'BAD_WORDS_DICT') - if bad_words_dict is not None: - bad_words_dict = bad_words_dict.as_numpy() - - stop_words_dict = pb_utils.get_input_tensor_by_name( - request, 'STOP_WORDS_DICT') - if stop_words_dict is not None: - stop_words_dict = stop_words_dict.as_numpy() - - embedding_bias_words = pb_utils.get_input_tensor_by_name( - request, 'EMBEDDING_BIAS_WORDS') - if embedding_bias_words is not None: - embedding_bias_words = embedding_bias_words.as_numpy() - - embedding_bias_weights = pb_utils.get_input_tensor_by_name( - request, 'EMBEDDING_BIAS_WEIGHTS') - if embedding_bias_weights is not None: - embedding_bias_weights = embedding_bias_weights.as_numpy() - - # Take the end_id from the input tensors - # If not specified, use tokenizer to get end_id - end_id = pb_utils.get_input_tensor_by_name(request, 'END_ID') - if end_id is not None: - end_id = end_id.as_numpy() - else: - end_id = [[self.tokenizer_end_id]] * batch_size - - # Take the pad_id from the input tensors - # If not specified, use tokenizer to get pad_id - pad_id = pb_utils.get_input_tensor_by_name(request, 'PAD_ID') - if pad_id is not None: - pad_id = pad_id.as_numpy() - else: - pad_id = [[self.tokenizer_pad_id]] * batch_size - - # Take the extra_id from the input tensors - # Extra id is used in kv cache reuse for p-tuning - prompt_table_extra_id = pb_utils.get_input_tensor_by_name( - request, 'PROMPT_TABLE_EXTRA_ID') - if prompt_table_extra_id is not None: - prompt_table_extra_id = prompt_table_extra_id.as_numpy() - assert prompt_table_extra_id.shape[ - 0] == batch_size, "Prompt table extra id must have the same batch size as Query" - assert prompt_table_extra_id.shape[ - 1] == 1, "Multiple IDs cannot be provided for a single image" - - # Preprocessing vision input passed as a url or bytes tensor - img_urls = pb_utils.get_input_tensor_by_name(request, 'IMAGE_URL') - image_bytes = pb_utils.get_input_tensor_by_name( - request, 'IMAGE_BYTES') - video_bytes = pb_utils.get_input_tensor_by_name( - request, 'VIDEO_BYTES') - vision_processed_tensors = [] - visual_tokens = [] - if self.is_multimodal and (img_urls or image_bytes or video_bytes): - assert self.vision_preprocessor != None, "Vision preprocessor for preparing images before encoding is None" - processed_tensors = {} - if self.model_type 
== 'mllama': - processed_tensors = self.vision_preprocessor.mllama_process( - queries=query.astype(str).tolist(), - img_urls=img_urls, - image_bytes=image_bytes, - ) - elif self.model_type == 'llava_onevision': - if video_bytes is None: - processed_tensors, visual_tokens = self.vision_preprocessor.llava_onevision_process_image( - queries=query.astype(str).tolist(), - img_urls=img_urls, - image_bytes=image_bytes, - ) - else: - processed_tensors, visual_tokens = self.vision_preprocessor.llava_onevision_process_video( - queries=query.astype(str).tolist(), - video_bytes=video_bytes, - ) - elif self.model_type == 'qwen2_vl': - processed_tensors = self.vision_preprocessor.qwen2_vl_process_image( - queries=query.astype(str).tolist(), - img_urls=img_urls, - image_bytes=image_bytes, - ) - qwen2vl_input_id_tensor = processed_tensors.get("INPUT_IDS") - processed_tensors.pop("INPUT_IDS") - qwen2vl_input_length_tensor = processed_tensors.get( - "REQUEST_INPUT_LEN") - processed_tensors.pop("REQUEST_INPUT_LEN") - else: - raise ValueError( - "Unsupported model type for IMAGE_BYTES or IMAGE_URL inputs" - ) - vision_processed_tensors = [ - pb_utils.Tensor.from_dlpack(k, v) - for k, v in processed_tensors.items() - ] - else: - assert self.model_type != "llava_onevision", "Image processing requires IMAGE_BYTES or IMAGE_URL to be provided" - - # Preprocessing input data. - # For the LLaVA_OneVision model, num_multimodal_features is not a fixed value - input_id, request_input_len = self._create_request( - query, visual_tokens) - if decoder_query is not None: - decoder_input_id, request_decoder_input_len = self._create_request( - decoder_query) - else: - decoder_input_id = pad_id * np.ones((batch_size, 1), np.int32) - request_decoder_input_len = 1 * np.ones( - (batch_size, 1), np.int32) - - bad_words = self._to_word_list_format(bad_words_dict, batch_size) - stop_words = self._to_word_list_format(stop_words_dict, batch_size) - - embedding_bias = self._get_embedding_bias( - embedding_bias_words, embedding_bias_weights, - self.embedding_bias_weights_dtype, batch_size) - - if prompt_table_extra_id is not None and self.model_type != 'qwen2_vl': - prompt_table_extra_ids = np.zeros_like(input_id) - for i in range(batch_size): - prompt_table_extra_ids[i] = np.where( - input_id[i] >= self.vocab_size, - prompt_table_extra_id[i], 0) - - # Create output tensors. You need pb_utils.Tensor - # objects to create pb_utils.InferenceResponse. 
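# --- Editor's illustrative aside (not part of the original file) ------------
# The np.where(...) above marks exactly the "fake" prompt positions: multimodal
# features are addressed with token ids >= vocab_size, and only those positions
# receive the per-request extra id used for KV-cache reuse with p-tuning.
# A tiny self-contained example of that masking, with made-up numbers:
import numpy as np

vocab_size = 32000
input_id_row = np.array([1, 15, 32000, 32001, 42])   # two fake prompt tokens
extra_id = 7                                         # hypothetical request extra id
extra_ids = np.where(input_id_row >= vocab_size, extra_id, 0)
print(extra_ids)                                     # -> [0 0 7 7 0]
# ----------------------------------------------------------------------------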
- # Qwen2-VL model has special logic to process input ids - if self.model_type == 'qwen2_vl': - input_id_tensor = pb_utils.Tensor.from_dlpack( - 'INPUT_ID', qwen2vl_input_id_tensor) - request_input_len_tensor = pb_utils.Tensor.from_dlpack( - 'REQUEST_INPUT_LEN', qwen2vl_input_length_tensor) - else: - input_id_tensor = pb_utils.Tensor( - 'INPUT_ID', input_id.astype(self.input_id_dtype)) - request_input_len_tensor = pb_utils.Tensor( - 'REQUEST_INPUT_LEN', - request_input_len.astype(self.request_input_len_dtype)) - decoder_input_id_tensor = pb_utils.Tensor( - 'DECODER_INPUT_ID', - decoder_input_id.astype(self.decoder_input_id_dtype)) - request_decoder_input_len_tensor = pb_utils.Tensor( - 'REQUEST_DECODER_INPUT_LEN', - request_decoder_input_len.astype( - self.request_decoder_input_len_dtype)) - request_output_len_tensor = pb_utils.Tensor('REQUEST_OUTPUT_LEN', - request_output_len) - bad_words_ids_tensor = pb_utils.Tensor('BAD_WORDS_IDS', bad_words) - stop_words_ids_tensor = pb_utils.Tensor('STOP_WORDS_IDS', - stop_words) - embedding_bias_tensor = pb_utils.Tensor('EMBEDDING_BIAS', - embedding_bias) - end_id_tensor = pb_utils.Tensor('OUT_END_ID', - np.array(end_id, dtype=np.int32)) - pad_id_tensor = pb_utils.Tensor('OUT_PAD_ID', - np.array(pad_id, dtype=np.int32)) - if prompt_table_extra_id is not None: - prompt_table_extra_ids_tensor = pb_utils.Tensor( - 'OUT_PROMPT_TABLE_EXTRA_IDS', - np.array(prompt_table_extra_ids, - dtype=self.out_prompt_table_extra_ids_dtype)) - inference_response = pb_utils.InferenceResponse(output_tensors=[ - input_id_tensor, decoder_input_id_tensor, - bad_words_ids_tensor, stop_words_ids_tensor, - request_input_len_tensor, request_decoder_input_len_tensor, - request_output_len_tensor, embedding_bias_tensor, - end_id_tensor, pad_id_tensor, prompt_table_extra_ids_tensor - ] + vision_processed_tensors) - else: - inference_response = pb_utils.InferenceResponse(output_tensors=[ - input_id_tensor, decoder_input_id_tensor, - bad_words_ids_tensor, stop_words_ids_tensor, - request_input_len_tensor, request_decoder_input_len_tensor, - request_output_len_tensor, embedding_bias_tensor, - end_id_tensor, pad_id_tensor - ] + vision_processed_tensors) - responses.append(inference_response) - # You should return a list of pb_utils.InferenceResponse. Length - # of this list must match the length of `requests` list. - return responses - - def finalize(self): - """`finalize` is called only once when the model is being unloaded. - Implementing `finalize` function is optional. This function allows - the model to perform any necessary clean ups before exit. - """ - print('Cleaning up...') - - def _split_prompt_by_images(self, concatenated_ids, image_token_index=-200): - """ - Splits tokenized prompts by image placeholders for each sample in the batch. - - Args: - concatenated_ids (np.ndarray): A batch of concatenated token IDs, where image placeholders are indicated by `image_token_index`. - - Returns: - List[List[np.ndarray]]: A list containing lists of token ID arrays for each prompt segment, per batch sample. 
- """ - batch_splits = [] - for batch in concatenated_ids: - zero_indices = np.where(batch == image_token_index)[0] - start_idx = 0 - splits = [] - for idx in zero_indices: - if start_idx != idx: - splits.append(batch[start_idx:idx].reshape(1, -1)) - start_idx = idx + 1 - if start_idx < len(batch): - splits.append(batch[start_idx:].reshape(1, -1)) - - splits = [split for split in splits if split.size > 0] - batch_splits.append(splits) - - return batch_splits - - def _setup_fake_prompts(self, batch_size, batch_split_prompts): - """ - Replaces image placeholders with unique fake prompt IDs for multi-image inputs. - - Args: - batch_size (int): The number of samples in the batch. - batch_split_prompts (List[List[np.ndarray]]): Tokenized prompt segments for each batch sample. - - Returns: - np.ndarray: An array of input IDs with image placeholders replaced by fake prompt IDs. - """ - - num_multimodal_features = self.ptable_shape[1] - input_ids_list = [] - - for batch_idx in range(batch_size): - splits = batch_split_prompts[batch_idx] - sample_input_ids = [splits[0]] - sample_fake_prompt_counter = self.vocab_size - - for split_idx in range(len(splits) - 1): - fake_prompt_id = np.arange( - sample_fake_prompt_counter, - sample_fake_prompt_counter + num_multimodal_features) - sample_fake_prompt_counter += num_multimodal_features - fake_prompt_id = np.expand_dims(fake_prompt_id, axis=0) - sample_input_ids.append(fake_prompt_id) - sample_input_ids.append(splits[split_idx + 1]) - - sample_input_ids = np.concatenate(sample_input_ids, axis=1) - input_ids_list.append(sample_input_ids) - - # Pad the input_ids to the same length for bs > 1 - max_seq_len = max( - [sample_input_ids.shape[1] for sample_input_ids in input_ids_list]) - input_ids_padded = [] - for sample_input_ids in input_ids_list: - seq_len = sample_input_ids.shape[1] - pad_width = max_seq_len - seq_len - if pad_width > 0: - sample_input_ids_padded = np.pad( - sample_input_ids, ((0, 0), (0, pad_width)), - 'constant', - constant_values=self.tokenizer_pad_id) - else: - sample_input_ids_padded = sample_input_ids - input_ids_padded.append(sample_input_ids_padded) - - input_ids = np.stack(input_ids_padded) - input_ids = input_ids.reshape(batch_size, -1).astype(np.int32) - - return input_ids - - def _process_multi_image_inputs(self, query, image_token_index=-200): - """ - Processes input queries that contain multiple images by tokenizing the input strings and inserting image_token_index between the parts. - - Args: - query (np.ndarray): Batch of input strings. - - Returns: - List[np.ndarray]: List of tokenized input IDs for each sample. - """ - start_ids = [] - for s in query: - parts = s[0].decode().split('') - num_images = len(parts) - 1 - if num_images > self.max_num_images: - raise ValueError( - f"The number of images in the request ({num_images}) exceeds the maximum allowed ({self.max_num_images})." 
- ) - tokenized_parts = [ - self.tokenizer.encode(part, add_special_tokens=False) - for part in parts - ] - - # Insert `image_token_index` between the parts to represent - final_ids = [] - for i, part in enumerate(tokenized_parts): - final_ids.extend(part) - if i < len(tokenized_parts) - 1: - final_ids.append(image_token_index) - - start_ids.append(np.array(final_ids).astype(int)) - - return start_ids - - def _create_request(self, query, visual_tokens=None): - """ - query : batch string (2D numpy array) - """ - if isinstance(self.tokenizer, T5Tokenizer): - start_ids = [ - np.array([self.tokenizer_bos_id] + self.tokenizer.encode( - s[0].decode(), add_special_tokens=self.add_special_tokens)). - astype(int) for s in query - ] - else: - # Qwen2-VL input id is calculated when processing image - if 'qwen2_vl' == self.model_type: - return None, None - if self.is_multimodal and self.max_num_images and self.max_num_images > 1: - start_ids = self._process_multi_image_inputs(query) - - else: - start_ids = [ - np.array( - self.tokenizer.encode( - s[0].decode(), - add_special_tokens=self.add_special_tokens)).astype( - int) for s in query - ] - - if self.is_multimodal: - if 'blip2' in self.model_type or 'mllama' == self.model_type: - pre_prompt = None - post_prompt = None - elif 'llava' == self.model_type: - pre_prompt = "USER:\n" - post_prompt = " ASSISTANT:" - elif 'vila' == self.model_type: - pre_prompt = "A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: " - post_prompt = " ASSISTANT:" - elif 'llava_onevision' == self.model_type: - pre_prompt = "<|im_start|>user " - post_prompt = "<|im_end|><|im_start|>assistant\n" - pre_prompt_id = np.array( - self.tokenizer.encode( - pre_prompt, - add_special_tokens=self.add_special_tokens, - padding=True)) if pre_prompt is not None else np.array( - [], dtype=int) - - post_prompt_id = np.array( - self.tokenizer.encode( - post_prompt, - add_special_tokens=self.add_special_tokens, - padding=True)) if post_prompt is not None else np.array( - [], dtype=int) - - if self.max_num_images and self.max_num_images > 1: - concatenated_ids = [ - np.concatenate((pre_prompt_id, ids, post_prompt_id), axis=0) - for ids in start_ids - ] - batch_split_prompts = self._split_prompt_by_images( - concatenated_ids) - start_ids = self._setup_fake_prompts(query.shape[0], - batch_split_prompts) - elif self.model_type == 'llava_onevision': - fake_prompt_ids = [] - extra_id = np.array( - self.tokenizer.encode( - '\n', - add_special_tokens=self.add_special_tokens, - padding=True)) - for tokens in visual_tokens: - prompt_id = np.arange(self.vocab_size, - self.vocab_size + tokens) - fake_prompt_ids.append(prompt_id) - start_ids = [ - np.concatenate((pre_prompt_id, prompt_id, extra_id, ids, - post_prompt_id), - axis=0) - for prompt_id, ids in zip(fake_prompt_ids, start_ids) - ] - else: - fake_prompt_id = np.arange( - self.vocab_size, self.vocab_size + self.ptable_shape[1]) - start_ids = [ - np.concatenate( - (pre_prompt_id, fake_prompt_id, ids, post_prompt_id), - axis=0) for ids in start_ids - ] - - start_lengths = np.array([[len(ids)] for ids in start_ids]).astype(int) - - max_len = 0 - for seq in start_ids: - max_len = max(max_len, seq.shape[0]) - start_ids = np.stack([ - np.pad(seq, (0, max_len - seq.shape[0]), - 'constant', - constant_values=(0, self.tokenizer_pad_id)) - for seq in start_ids - ]) - - return start_ids, start_lengths - - def _to_word_list_format(self, word_lists: 
List[List[str | bytes]], - batch_size): - ''' - word_lists format: - len(word_lists) == batch_size - word_lists[i] means the words associated to batch item i. A "word" may actually be any string. Like "lorem" or "lorem ipsum". - ''' - assert self.tokenizer != None, "need to set tokenizer" - - if word_lists is None: - # Return an empty array of shape (1,2,0) - return np.empty([batch_size, 2, 0], dtype="int32") - - flat_ids = [] - offsets = [] - for word_list in word_lists: - item_flat_ids = [] - item_offsets = [] - - for word in word_list: - if isinstance(word, bytes): - word = word.decode() - - ids = self.tokenizer.encode(word, add_special_tokens=False) - if len(ids) == 0: - continue - - item_flat_ids += ids - item_offsets.append(len(ids)) - - flat_ids.append(np.array(item_flat_ids)) - offsets.append(np.cumsum(np.array(item_offsets))) - - pad_to = max(1, max(len(ids) for ids in flat_ids)) - - for i, (ids, offs) in enumerate(zip(flat_ids, offsets)): - flat_ids[i] = np.pad(ids, (0, pad_to - len(ids)), constant_values=0) - offsets[i] = np.pad(offs, (0, pad_to - len(offs)), - constant_values=-1) - - return np.array([flat_ids, offsets], dtype="int32").transpose((1, 0, 2)) - - def _get_embedding_bias(self, embedding_bias_words, embedding_bias_weights, - bias_dtype, batch_size): - - assert self.tokenizer != None, "need to set tokenizer" - - if embedding_bias_words is None or embedding_bias_weights is None: - return np.empty([batch_size, 0], - dtype=self.embedding_bias_weights_dtype) - - batch_embedding_bias = [] - for words, weights in zip(embedding_bias_words, embedding_bias_weights): - - vocab_size = len(self.tokenizer.vocab) - embedding_bias = [0.] * vocab_size - - assert len(words) == len( - weights - ), "Embedding bias words must have same dimension as embedding bias weights" - - for word, weight in zip(words, weights): - if isinstance(word, bytes): - word = word.decode() - ids = self.tokenizer.encode(word) - - if len(ids) == 0: - continue - - for id in ids: - embedding_bias[id] += weight - - batch_embedding_bias.append(np.array(embedding_bias)) - - return np.array(batch_embedding_bias, dtype=bias_dtype) - - -class VisionPreProcessor: - """ A class that can load images from url requests, and process them via a vision model processor, - in preparation for the vision encoder. - """ - - def __init__(self, - vision_model_type, - vision_model_processor, - preprocessor_model_config={}): - # import libraries that are only relevant for multimodal models - import torch - from torch.utils.dlpack import from_dlpack - - # NOTE: Due to the behavior of MPI initialization, it is recommended to avoid using import tensorrt_llm - # except for the specific modules tensorrt_llm and multimodal_encoders. - # As a result, the function str_dtype_to_torch has been copied directly from tensorrt_llm._utils. 
- _str_to_torch_dtype_dict = dict( - bfloat16=torch.bfloat16, - float16=torch.float16, - float32=torch.float32, - int64=torch.int64, - int32=torch.int32, - int8=torch.int8, - bool=torch.bool, - fp8=torch.float8_e4m3fn, - ) - - def str_dtype_to_torch(dtype): - ret = _str_to_torch_dtype_dict.get(dtype) - assert ret is not None, f'Unsupported dtype: {dtype}' - return ret - - self.load_images_tensor = lambda tensor: tensor if not hasattr( - tensor, 'to_dlpack') else from_dlpack(tensor.to_dlpack()) - - # extract expected output tensor dtype - self.output_str_dtypes = {} - for properties in preprocessor_model_config.get('output', []): - dtype = properties['data_type'] - self.output_str_dtypes[properties['name']] = np.dtype( - pb_utils.triton_string_to_numpy(dtype)).name - - # create method for converting output tensors batch to the expected type - self.convert_tensor_list_to_tensor = lambda tensor_list: torch.concat( - [ - torch.from_numpy(x) if isinstance(x, np.ndarray) else x - for x in tensor_list - ], - dim=0) - self.convert_tensor_to_str_dtype = lambda tensor, dtype: tensor.to( - str_dtype_to_torch(dtype)) - - # create model-specific processor - self.vision_model_processor = vision_model_processor - self.vision_model_type = vision_model_type - - def load_images_from_urls(self, img_urls): - images = [] - for img_url in img_urls: - img_url = img_url.decode() - if img_url.startswith("data:image/jpeg;base64,"): - image_base64 = img_url.split(",")[1] - # Decode the base64 string - image_data = base64.b64decode(image_base64) - # Create a BytesIO object from the decoded data - image_buffer = io.BytesIO(image_data) - images.append(Image.open(image_buffer)) - else: - images.append(Image.open( - requests.get(img_url, stream=True).raw)) - return images - - def mllama_process(self, queries, img_urls=None, image_bytes=None): - vision_processed_tensors = {} - if img_urls is not None or image_bytes is not None: - if img_urls is not None: - # download and read images - images = [ - self.load_images_from_urls(urls) - for urls in img_urls.as_numpy() - ] - else: - images = [ - img for img_list in self.load_images_tensor(image_bytes) - for img in img_list - ] - - batch_size = len(images) - - preprocessor_outputs = {} - possible_output_names = [ - 'PIXEL_VALUES', 'ASPECT_RATIO_IDS', 'ASPECT_RATIO_MASK', - 'CROSS_ATTENTION_MASK' - ] - for batch_id in range(batch_size): - # Preprocess images and query - processed_vision_data = self.vision_model_processor( - images=images[batch_id], - text=queries[batch_id], - return_tensors="pt") - # Reshape pixel_values to [num_images, *HWC/CHW] - val = processed_vision_data["pixel_values"] - val = val.reshape(1, -1, *(val.shape[-3:])) - processed_vision_data["pixel_values"] = val - - # Create vision output tensors - for key in possible_output_names: - val = processed_vision_data.get(key.lower()) - if val is not None: - if key not in preprocessor_outputs: - preprocessor_outputs[key] = [] - preprocessor_outputs[key].append(val) - - for key, tensor_list in preprocessor_outputs.items(): - val = self.convert_tensor_list_to_tensor(tensor_list) - if key in self.output_str_dtypes: - val = self.convert_tensor_to_str_dtype( - val, self.output_str_dtypes[key]) - vision_processed_tensors[key] = val - return vision_processed_tensors - - def llava_onevision_process_image(self, - queries, - img_urls=None, - image_bytes=None): - - import torch - vision_processed_tensors = {} - if img_urls is not None: - # download and read images - images = [ - self.load_images_from_urls(urls) - for urls in 
img_urls.as_numpy() - ] - else: - images = [ - img for img_list in self.load_images_tensor(image_bytes) - for img in img_list - ] - - batch_size = len(images) - assert len( - queries - ) == batch_size, f"Image must have the same batch size as Query." - preprocessor_outputs = {} - possible_output_names = ['PIXEL_VALUES', 'IMAGE_SIZES'] - visual_tokens = [] - for batch_id in range(batch_size): - # Preprocess images and query - processed_vision_data = self.vision_model_processor( - images=images[batch_id], text='', return_tensors="pt") - visual_tokens.append(processed_vision_data['input_ids'].shape[1]) - # Create vision output tensors - for key in possible_output_names: - val = processed_vision_data.get(key.lower()) - if val is not None: - if key not in preprocessor_outputs: - preprocessor_outputs[key] = [] - preprocessor_outputs[key].append(val) - - max_patch = max(x.shape[1] - for x in preprocessor_outputs['PIXEL_VALUES']) - preprocessor_outputs['PIXEL_VALUES'] = [ - torch.nn.functional.pad( - image, (0, 0, 0, 0, 0, 0, 0, max_patch - image.shape[1], 0, 0), - mode='constant') - for image in preprocessor_outputs['PIXEL_VALUES'] - ] - for key, tensor_list in preprocessor_outputs.items(): - val = self.convert_tensor_list_to_tensor(tensor_list) - if key in self.output_str_dtypes: - val = self.convert_tensor_to_str_dtype( - val, self.output_str_dtypes[key]) - vision_processed_tensors[key] = val - return vision_processed_tensors, visual_tokens - - def llava_onevision_process_video(self, queries, video_bytes=None): - import torch - vision_processed_tensors = {} - videos = [video for video in self.load_images_tensor(video_bytes)] - - batch_size = len(videos) - assert len( - queries - ) == batch_size, f"Video must have the same batch size as Query." - preprocessor_outputs = {} - preprocessor_outputs['PIXEL_VALUES'] = [] - preprocessor_outputs['IS_VIDEO_INPUT'] = [] - visual_tokens = [] - for batch_id in range(len(queries)): - processed_vision_data = self.vision_model_processor( - videos=list(videos[batch_id]), - text='