From 798c20bb74f53d621543711164c01f607d391399 Mon Sep 17 00:00:00 2001 From: Phillip Kuznetsov Date: Thu, 2 Oct 2025 23:26:47 +0000 Subject: [PATCH] Remove triton_backend to make code-scanner happy Signed-off-by: Phillip Kuznetsov --- .../disaggregated_serving/README.md | 123 -- .../disaggregated_serving.md | 123 -- .../disaggregated_serving_bls/1/model.py | 138 -- .../disaggregated_serving_bls/config.pbtxt | 465 ----- triton_backend/all_models/gpt/ensemble/1/.tmp | 0 .../all_models/gpt/ensemble/config.pbtxt | 230 --- .../all_models/gpt/postprocessing/1/model.py | 119 -- .../gpt/postprocessing/config.pbtxt | 32 - .../all_models/gpt/preprocessing/1/model.py | 196 -- .../all_models/gpt/preprocessing/config.pbtxt | 71 - .../all_models/gpt/tensorrt_llm/1/model.py | 203 --- .../all_models/gpt/tensorrt_llm/config.pbtxt | 146 -- .../inflight_batcher_llm/ensemble/1/.tmp | 0 .../ensemble/config.pbtxt | 712 -------- .../postprocessing/1/model.py | 177 -- .../postprocessing/config.pbtxt | 70 - .../preprocessing/1/model.py | 1003 ---------- .../preprocessing/config.pbtxt | 261 --- .../tensorrt_llm/1/model.py | 1623 ----------------- .../tensorrt_llm/config.pbtxt | 887 --------- .../tensorrt_llm_bls/1/lib/decode.py | 459 ----- .../tensorrt_llm_bls/1/lib/triton_decoder.py | 597 ------ .../tensorrt_llm_bls/1/model.py | 253 --- .../tensorrt_llm_bls/config.pbtxt | 447 ----- .../llmapi/tensorrt_llm/1/helpers.py | 159 -- .../all_models/llmapi/tensorrt_llm/1/model.py | 692 ------- .../llmapi/tensorrt_llm/1/model.yaml | 19 - .../llmapi/tensorrt_llm/config.pbtxt | 237 --- .../multimodal/ensemble/config.pbtxt | 638 ------- .../multimodal/multimodal_encoders/1/model.py | 707 ------- .../multimodal_encoders/1/multimodal_utils.py | 350 ---- .../multimodal_encoders/config.pbtxt | 182 -- .../requirements-llava-onevision.txt | 1 - .../multimodal/requirements-mllama.txt | 2 - .../multimodal/requirements-qwen2vl.txt | 2 - .../multimodal/requirements-vila.txt | 5 - .../all_models/tests/test_decode.py | 376 ---- .../tests/test_llmapi_python_backend.py | 249 --- .../tests/test_multi_image_preprocess.py | 256 --- .../tests/test_multimodal_encoders.py | 155 -- .../all_models/tests/test_python_backend.py | 872 --------- .../all_models/tests/test_triton_decoder.py | 456 ----- .../all_models/whisper/whisper_bls/1/fbank.py | 99 - .../all_models/whisper/whisper_bls/1/model.py | 242 --- .../whisper/whisper_bls/1/tokenizer.py | 183 -- .../whisper/whisper_bls/config.pbtxt | 73 - .../base_metrics_verification_tests.py | 400 ---- .../custom_metrics_verification_tests.py | 169 -- .../ci/L0_backend_trtllm/generate_engines.sh | 82 - .../ci/L0_backend_trtllm/simple_data.json | 67 - triton_backend/ci/L0_backend_trtllm/test.sh | 662 ------- triton_backend/ci/README.md | 107 -- .../inflight_batcher_llm/CMakeLists.txt | 318 ---- .../inflight_batcher_llm/client/README.md | 31 - .../inflight_batcher_llm/client/__init__.py | 0 .../e2e_grpc_speculative_decoding_client.py | 562 ------ .../client/end_to_end_grpc_client.py | 736 -------- .../client/inflight_batcher_llm_client.py | 1250 ------------- .../TritonTensorRTLLMBackendConfig.cmake.in | 39 - .../cmake/modules/set_ifndef.cmake | 24 - .../inflight_batcher_llm/scripts/build.sh | 76 - .../custom_metrics_reporter.cc | 283 --- .../custom_metrics_reporter.h | 196 -- .../src/libtensorrtllm.cc | 225 --- .../src/libtriton_tensorrtllm.ldscript | 33 - .../src/model_instance_state.cc | 1569 ---------------- .../src/model_instance_state.h | 295 --- .../inflight_batcher_llm/src/model_state.cc | 336 ---- 
.../inflight_batcher_llm/src/model_state.h | 158 -- .../inflight_batcher_llm/src/namedTensor.cpp | 121 -- .../inflight_batcher_llm/src/namedTensor.h | 93 - .../inflight_batcher_llm/src/utils.cc | 1139 ------------ .../inflight_batcher_llm/src/utils.h | 443 ----- .../inflight_batcher_llm/tests/CMakeLists.txt | 66 - .../inflight_batcher_llm/tests/first.json | 25 - .../tests/modelInstanceStateTest.cpp | 94 - .../inflight_batcher_llm/tests/modelState.cpp | 105 -- .../inflight_batcher_llm/tests/second.json | 25 - .../inflight_batcher_llm/tests/third.json | 25 - .../inflight_batcher_llm/tests/utilsTest.cpp | 781 -------- triton_backend/requirements.txt | 7 - .../scripts/launch_triton_server.py | 249 --- triton_backend/tools/__init__.py | 0 triton_backend/tools/dataset/long_input.csv | 1 - triton_backend/tools/dataset/long_output.csv | 1 - .../tools/dataset/mini_cnn_eval.json | 502 ----- .../dataset/mini_cnn_eval_spec_decoding.json | 77 - .../tools/dataset/short_input_end_id.csv | 1 - .../dataset/short_input_end_id_medusa.csv | 1 - .../tools/dataset/short_output_end_id.csv | 1 - .../dataset/short_output_end_id_eagle.csv | 1 - .../dataset/short_output_end_id_medusa.csv | 1 - triton_backend/tools/fill_template.py | 70 - .../tools/gpt/benchmark_core_model.py | 177 -- triton_backend/tools/gpt/client.py | 120 -- triton_backend/tools/gpt/client_async.py | 133 -- triton_backend/tools/gpt/end_to_end_test.py | 249 --- triton_backend/tools/gpt/gen_input_data.py | 108 -- triton_backend/tools/gpt/input_data.json | 3 - .../benchmark_core_model.py | 518 ------ .../inflight_batcher_llm/end_to_end_test.py | 638 ------- .../speculative_decoding_test.py | 375 ---- .../test_max_queue_size.py | 638 ------- triton_backend/tools/llmapi_client.py | 275 --- triton_backend/tools/multimodal/client.py | 420 ----- triton_backend/tools/tests/__init__.py | 0 .../tools/tests/test_fill_template.py | 33 - triton_backend/tools/utils.sh | 35 - triton_backend/tools/utils/__init__.py | 0 triton_backend/tools/utils/utils.py | 483 ----- triton_backend/tools/whisper/client.py | 291 --- 111 files changed, 30233 deletions(-) delete mode 100644 triton_backend/all_models/disaggregated_serving/README.md delete mode 100644 triton_backend/all_models/disaggregated_serving/disaggregated_serving.md delete mode 100644 triton_backend/all_models/disaggregated_serving/disaggregated_serving_bls/1/model.py delete mode 100644 triton_backend/all_models/disaggregated_serving/disaggregated_serving_bls/config.pbtxt delete mode 100644 triton_backend/all_models/gpt/ensemble/1/.tmp delete mode 100755 triton_backend/all_models/gpt/ensemble/config.pbtxt delete mode 100644 triton_backend/all_models/gpt/postprocessing/1/model.py delete mode 100755 triton_backend/all_models/gpt/postprocessing/config.pbtxt delete mode 100644 triton_backend/all_models/gpt/preprocessing/1/model.py delete mode 100644 triton_backend/all_models/gpt/preprocessing/config.pbtxt delete mode 100644 triton_backend/all_models/gpt/tensorrt_llm/1/model.py delete mode 100644 triton_backend/all_models/gpt/tensorrt_llm/config.pbtxt delete mode 100644 triton_backend/all_models/inflight_batcher_llm/ensemble/1/.tmp delete mode 100644 triton_backend/all_models/inflight_batcher_llm/ensemble/config.pbtxt delete mode 100644 triton_backend/all_models/inflight_batcher_llm/postprocessing/1/model.py delete mode 100644 triton_backend/all_models/inflight_batcher_llm/postprocessing/config.pbtxt delete mode 100755 triton_backend/all_models/inflight_batcher_llm/preprocessing/1/model.py delete mode 100755 
triton_backend/all_models/inflight_batcher_llm/preprocessing/config.pbtxt delete mode 100755 triton_backend/all_models/inflight_batcher_llm/tensorrt_llm/1/model.py delete mode 100644 triton_backend/all_models/inflight_batcher_llm/tensorrt_llm/config.pbtxt delete mode 100644 triton_backend/all_models/inflight_batcher_llm/tensorrt_llm_bls/1/lib/decode.py delete mode 100644 triton_backend/all_models/inflight_batcher_llm/tensorrt_llm_bls/1/lib/triton_decoder.py delete mode 100644 triton_backend/all_models/inflight_batcher_llm/tensorrt_llm_bls/1/model.py delete mode 100644 triton_backend/all_models/inflight_batcher_llm/tensorrt_llm_bls/config.pbtxt delete mode 100644 triton_backend/all_models/llmapi/tensorrt_llm/1/helpers.py delete mode 100755 triton_backend/all_models/llmapi/tensorrt_llm/1/model.py delete mode 100644 triton_backend/all_models/llmapi/tensorrt_llm/1/model.yaml delete mode 100644 triton_backend/all_models/llmapi/tensorrt_llm/config.pbtxt delete mode 100755 triton_backend/all_models/multimodal/ensemble/config.pbtxt delete mode 100755 triton_backend/all_models/multimodal/multimodal_encoders/1/model.py delete mode 100644 triton_backend/all_models/multimodal/multimodal_encoders/1/multimodal_utils.py delete mode 100755 triton_backend/all_models/multimodal/multimodal_encoders/config.pbtxt delete mode 100644 triton_backend/all_models/multimodal/requirements-llava-onevision.txt delete mode 100644 triton_backend/all_models/multimodal/requirements-mllama.txt delete mode 100755 triton_backend/all_models/multimodal/requirements-qwen2vl.txt delete mode 100755 triton_backend/all_models/multimodal/requirements-vila.txt delete mode 100644 triton_backend/all_models/tests/test_decode.py delete mode 100644 triton_backend/all_models/tests/test_llmapi_python_backend.py delete mode 100755 triton_backend/all_models/tests/test_multi_image_preprocess.py delete mode 100644 triton_backend/all_models/tests/test_multimodal_encoders.py delete mode 100644 triton_backend/all_models/tests/test_python_backend.py delete mode 100644 triton_backend/all_models/tests/test_triton_decoder.py delete mode 100644 triton_backend/all_models/whisper/whisper_bls/1/fbank.py delete mode 100644 triton_backend/all_models/whisper/whisper_bls/1/model.py delete mode 100644 triton_backend/all_models/whisper/whisper_bls/1/tokenizer.py delete mode 100644 triton_backend/all_models/whisper/whisper_bls/config.pbtxt delete mode 100644 triton_backend/ci/L0_backend_trtllm/base_metrics_verification_tests.py delete mode 100644 triton_backend/ci/L0_backend_trtllm/custom_metrics_verification_tests.py delete mode 100644 triton_backend/ci/L0_backend_trtllm/generate_engines.sh delete mode 100644 triton_backend/ci/L0_backend_trtllm/simple_data.json delete mode 100644 triton_backend/ci/L0_backend_trtllm/test.sh delete mode 100644 triton_backend/ci/README.md delete mode 100644 triton_backend/inflight_batcher_llm/CMakeLists.txt delete mode 100644 triton_backend/inflight_batcher_llm/client/README.md delete mode 100644 triton_backend/inflight_batcher_llm/client/__init__.py delete mode 100644 triton_backend/inflight_batcher_llm/client/e2e_grpc_speculative_decoding_client.py delete mode 100644 triton_backend/inflight_batcher_llm/client/end_to_end_grpc_client.py delete mode 100755 triton_backend/inflight_batcher_llm/client/inflight_batcher_llm_client.py delete mode 100644 triton_backend/inflight_batcher_llm/cmake/TritonTensorRTLLMBackendConfig.cmake.in delete mode 100644 triton_backend/inflight_batcher_llm/cmake/modules/set_ifndef.cmake delete mode 100644 
triton_backend/inflight_batcher_llm/scripts/build.sh delete mode 100644 triton_backend/inflight_batcher_llm/src/custom_metrics_reporter/custom_metrics_reporter.cc delete mode 100644 triton_backend/inflight_batcher_llm/src/custom_metrics_reporter/custom_metrics_reporter.h delete mode 100644 triton_backend/inflight_batcher_llm/src/libtensorrtllm.cc delete mode 100644 triton_backend/inflight_batcher_llm/src/libtriton_tensorrtllm.ldscript delete mode 100644 triton_backend/inflight_batcher_llm/src/model_instance_state.cc delete mode 100644 triton_backend/inflight_batcher_llm/src/model_instance_state.h delete mode 100644 triton_backend/inflight_batcher_llm/src/model_state.cc delete mode 100644 triton_backend/inflight_batcher_llm/src/model_state.h delete mode 100644 triton_backend/inflight_batcher_llm/src/namedTensor.cpp delete mode 100644 triton_backend/inflight_batcher_llm/src/namedTensor.h delete mode 100644 triton_backend/inflight_batcher_llm/src/utils.cc delete mode 100644 triton_backend/inflight_batcher_llm/src/utils.h delete mode 100644 triton_backend/inflight_batcher_llm/tests/CMakeLists.txt delete mode 100644 triton_backend/inflight_batcher_llm/tests/first.json delete mode 100644 triton_backend/inflight_batcher_llm/tests/modelInstanceStateTest.cpp delete mode 100644 triton_backend/inflight_batcher_llm/tests/modelState.cpp delete mode 100644 triton_backend/inflight_batcher_llm/tests/second.json delete mode 100644 triton_backend/inflight_batcher_llm/tests/third.json delete mode 100644 triton_backend/inflight_batcher_llm/tests/utilsTest.cpp delete mode 100644 triton_backend/requirements.txt delete mode 100644 triton_backend/scripts/launch_triton_server.py delete mode 100644 triton_backend/tools/__init__.py delete mode 100644 triton_backend/tools/dataset/long_input.csv delete mode 100644 triton_backend/tools/dataset/long_output.csv delete mode 100644 triton_backend/tools/dataset/mini_cnn_eval.json delete mode 100644 triton_backend/tools/dataset/mini_cnn_eval_spec_decoding.json delete mode 100755 triton_backend/tools/dataset/short_input_end_id.csv delete mode 100644 triton_backend/tools/dataset/short_input_end_id_medusa.csv delete mode 100755 triton_backend/tools/dataset/short_output_end_id.csv delete mode 100644 triton_backend/tools/dataset/short_output_end_id_eagle.csv delete mode 100644 triton_backend/tools/dataset/short_output_end_id_medusa.csv delete mode 100644 triton_backend/tools/fill_template.py delete mode 100644 triton_backend/tools/gpt/benchmark_core_model.py delete mode 100644 triton_backend/tools/gpt/client.py delete mode 100644 triton_backend/tools/gpt/client_async.py delete mode 100644 triton_backend/tools/gpt/end_to_end_test.py delete mode 100644 triton_backend/tools/gpt/gen_input_data.py delete mode 100644 triton_backend/tools/gpt/input_data.json delete mode 100644 triton_backend/tools/inflight_batcher_llm/benchmark_core_model.py delete mode 100644 triton_backend/tools/inflight_batcher_llm/end_to_end_test.py delete mode 100644 triton_backend/tools/inflight_batcher_llm/speculative_decoding_test.py delete mode 100644 triton_backend/tools/inflight_batcher_llm/test_max_queue_size.py delete mode 100755 triton_backend/tools/llmapi_client.py delete mode 100755 triton_backend/tools/multimodal/client.py delete mode 100644 triton_backend/tools/tests/__init__.py delete mode 100644 triton_backend/tools/tests/test_fill_template.py delete mode 100644 triton_backend/tools/utils.sh delete mode 100644 triton_backend/tools/utils/__init__.py delete mode 100644 
triton_backend/tools/utils/utils.py
delete mode 100644 triton_backend/tools/whisper/client.py
diff --git a/triton_backend/all_models/disaggregated_serving/README.md b/triton_backend/all_models/disaggregated_serving/README.md
deleted file mode 100644
index 9ebb917c752..00000000000
--- a/triton_backend/all_models/disaggregated_serving/README.md
+++ /dev/null
@@ -1,123 +0,0 @@
-
-
-# Running Disaggregated Serving with Triton TensorRT LLM Backend
-
-## Overview
-
-Disaggregated serving refers to a technique that uses separate GPUs for
-running the context and generation phases of LLM inference.
-
-For Triton integration, a BLS model named
-[_disaggregated\_serving\_bls_](./disaggregated_serving_bls/1/model.py)
-has been created that orchestrates the disaggregated serving pipeline. This
-BLS model requires the TRT-LLM model names that are going to be used for
-context and generation phases.
-
-This example assumes access to a system with two GPUs and CUDA_VISIBLE_DEVICES
-set to `0,1`.
-
-## Model Repository Setup and Start Server
-
-1. Set up the model repository as instructed in the [LLaMa](../docs/llama.md)
-guide.
-
-2. Create context and generation models with the desired tensor-parallel
-configuration. We will use the `context` and `generation` model names for the
-context and generation models, respectively. The context and generation models
-should copy the config of the
-[tensorrt_llm](../inflight_batcher_llm/tensorrt_llm/) model.
-
-3. Set the `participant_ids` for context and generation models to `1` and `2` respectively.
-
-4. Set the `gpu_device_ids` for context and generation models to `0` and `1` respectively.
-
-5. Set the `context_model_name` and `generation_model_name` to `context` and `generation` in the
-[disaggregated_serving_bls](./disaggregated_serving_bls/config.pbtxt) model configuration.
-
-Your model repository should look like the following:
-
-```
-disaggregated_serving/
-|-- context
-| |-- 1
-| `-- config.pbtxt
-|-- disaggregated_serving_bls
-| |-- 1
-| | `-- model.py
-| `-- config.pbtxt
-|-- ensemble
-| |-- 1
-| `-- config.pbtxt
-|-- generation
-| |-- 1
-| `-- config.pbtxt
-|-- postprocessing
-| |-- 1
-| | `-- model.py
-| `-- config.pbtxt
-`-- preprocessing
- |-- 1
- | `-- model.py
- `-- config.pbtxt
-```
-
-6. Rename the `tensorrt_llm` model in the `ensemble` config.pbtxt file to `disaggregated_serving_bls`.
-
-7. Launch the Triton Server:
-
-```
-python3 scripts/launch_triton_server.py --world_size 3 --tensorrt_llm_model_name context,generation --multi-model --disable-spawn-processes
-```
-
-> [!NOTE]
->
-> The world size should be equal to `tp*pp` of context model + `tp*pp` of generation model + 1.
-> The additional process is required for the orchestrator.
-
-8. Send a request to the server.
-
-```
-python3 inflight_batcher_llm/client/end_to_end_grpc_client.py -S -p "Machine learning is"
-```
-
-## Creating Multiple Copies of the Context and Generation Models (Data Parallelism)
-
-You can also create multiple copies of the context and generation models. This can be
-achieved by setting the `participant_ids` and `gpu_device_ids` for each instance.
-
-For example, if you have a context model with `tp=2` and you want to create 2
-copies of it, you can set the `participant_ids` to `1,2;3,4`,
-`gpu_device_ids` to `0,1;2,3` (assuming a 4-GPU system), and set the `count`
-in the `instance_groups` section of the model configuration to 2. This will create 2
-copies of the context model where the first copy will be on GPU 0 and 1, and the
-second copy will be on GPU 2 and 3.
-
-## Known Issues
-
-1. Only the C++ version of the backend is supported right now.
diff --git a/triton_backend/all_models/disaggregated_serving/disaggregated_serving.md b/triton_backend/all_models/disaggregated_serving/disaggregated_serving.md
deleted file mode 100644
index 9ebb917c752..00000000000
--- a/triton_backend/all_models/disaggregated_serving/disaggregated_serving.md
+++ /dev/null
@@ -1,123 +0,0 @@
-
-
-# Running Disaggregated Serving with Triton TensorRT LLM Backend
-
-## Overview
-
-Disaggregated serving refers to a technique that uses separate GPUs for
-running the context and generation phases of LLM inference.
-
-For Triton integration, a BLS model named
-[_disaggregated\_serving\_bls_](./disaggregated_serving_bls/1/model.py)
-has been created that orchestrates the disaggregated serving pipeline. This
-BLS model requires the TRT-LLM model names that are going to be used for
-context and generation phases.
-
-This example assumes access to a system with two GPUs and CUDA_VISIBLE_DEVICES
-set to `0,1`.
-
-## Model Repository Setup and Start Server
-
-1. Set up the model repository as instructed in the [LLaMa](../docs/llama.md)
-guide.
-
-2. Create context and generation models with the desired tensor-parallel
-configuration. We will use the `context` and `generation` model names for the
-context and generation models, respectively. The context and generation models
-should copy the config of the
-[tensorrt_llm](../inflight_batcher_llm/tensorrt_llm/) model.
-
-3. Set the `participant_ids` for context and generation models to `1` and `2` respectively.
-
-4. Set the `gpu_device_ids` for context and generation models to `0` and `1` respectively.
-
-5. Set the `context_model_name` and `generation_model_name` to `context` and `generation` in the
-[disaggregated_serving_bls](./disaggregated_serving_bls/config.pbtxt) model configuration.
-
-Your model repository should look like the following:
-
-```
-disaggregated_serving/
-|-- context
-| |-- 1
-| `-- config.pbtxt
-|-- disaggregated_serving_bls
-| |-- 1
-| | `-- model.py
-| `-- config.pbtxt
-|-- ensemble
-| |-- 1
-| `-- config.pbtxt
-|-- generation
-| |-- 1
-| `-- config.pbtxt
-|-- postprocessing
-| |-- 1
-| | `-- model.py
-| `-- config.pbtxt
-`-- preprocessing
- |-- 1
- | `-- model.py
- `-- config.pbtxt
-```
-
-6. Rename the `tensorrt_llm` model in the `ensemble` config.pbtxt file to `disaggregated_serving_bls`.
-
-7. Launch the Triton Server:
-
-```
-python3 scripts/launch_triton_server.py --world_size 3 --tensorrt_llm_model_name context,generation --multi-model --disable-spawn-processes
-```
-
-> [!NOTE]
->
-> The world size should be equal to `tp*pp` of context model + `tp*pp` of generation model + 1.
-> The additional process is required for the orchestrator.
-
-8. Send a request to the server.
-
-```
-python3 inflight_batcher_llm/client/end_to_end_grpc_client.py -S -p "Machine learning is"
-```
-
-## Creating Multiple Copies of the Context and Generation Models (Data Parallelism)
-
-You can also create multiple copies of the context and generation models. This can be
-achieved by setting the `participant_ids` and `gpu_device_ids` for each instance.
-
-For example, if you have a context model with `tp=2` and you want to create 2
-copies of it, you can set the `participant_ids` to `1,2;3,4`,
-`gpu_device_ids` to `0,1;2,3` (assuming a 4-GPU system), and set the `count`
-in the `instance_groups` section of the model configuration to 2.
This will create 2 -copies of the context model where the first copy will be on GPU 0 and 1, and the -second copy will be on GPU 2 and 3. - -## Known Issues - -1. Only C++ version of the backend is supported right now. diff --git a/triton_backend/all_models/disaggregated_serving/disaggregated_serving_bls/1/model.py b/triton_backend/all_models/disaggregated_serving/disaggregated_serving_bls/1/model.py deleted file mode 100644 index 14e8de75a87..00000000000 --- a/triton_backend/all_models/disaggregated_serving/disaggregated_serving_bls/1/model.py +++ /dev/null @@ -1,138 +0,0 @@ -import json - -import triton_python_backend_utils as pb_utils - - -def read_parameter_as_type(value, name, pytype=str): - if value == "": - return None - if value.startswith("${") and value.endswith("}"): - return None - if pytype is bool: - return value.lower() in ["1", "true"] - try: - result = pytype(value) - return result - except: - pb_utils.Logger.log_warning( - f"Could not read parameter '{name}' with value '{value}', will use default." - ) - return None - - -def get_parameter(model_config, name, pytype=str): - if name not in model_config['parameters']: - return None - return read_parameter_as_type( - model_config['parameters'][name]['string_value'], name, pytype) - - -class TritonPythonModel: - """Your Python model must use the same class name. Every Python model - that is created must have "TritonPythonModel" as the class name. - """ - - def initialize(self, args): - """`initialize` is called only once when the model is being loaded. - Implementing `initialize` function is optional. This function allows - the model to initialize any state associated with this model. - - Parameters - ---------- - args : dict - Both keys and values are strings. The dictionary keys and values are: - * model_config: A JSON string containing the model configuration - * model_instance_kind: A string containing model instance kind - * model_instance_device_id: A string containing model instance device ID - * model_repository: Model repository path - * model_version: Model version - * model_name: Model name - """ - model_config = json.loads(args['model_config']) - self.context_model_name = get_parameter(model_config, - "context_model_name") - self.generation_model_name = get_parameter(model_config, - "generation_model_name") - self.decoupled = pb_utils.using_decoupled_model_transaction_policy( - model_config) - - def create_context_request(self, request): - inputs = request.inputs() - triton_request = pb_utils.InferenceRequest( - model_name=self.context_model_name, - inputs=inputs, - parameters={"request_type": "context_only"}, - requested_output_names=[]) - return triton_request - - def create_generation_request(self, request, context_response): - inputs = request.inputs() - context_phase_params = pb_utils.get_output_tensor_by_name( - context_response, "context_phase_params") - if context_phase_params is None: - raise pb_utils.TritonModelException( - "Context response must have an output named context phase params" - ) - inputs.append(context_phase_params) - triton_request = pb_utils.InferenceRequest( - model_name=self.generation_model_name, - inputs=inputs, - parameters={"request_type": "generation_only"}, - requested_output_names=[]) - return triton_request - - def execute(self, requests): - """`execute` must be implemented in every Python model. `execute` - function receives a list of pb_utils.InferenceRequest as the only - argument. This function is called when an inference is requested - for this model. 
- - Parameters - ---------- - requests : list - A list of pb_utils.InferenceRequest - - Returns - ------- - list - A list of pb_utils.InferenceResponse. The length of this list must - be the same as `requests` - """ - for request in requests: - context_request = self.create_context_request(request) - context_responses = context_request.exec(decoupled=self.decoupled) - if self.decoupled: - context_responses = list(context_responses) - assert len( - context_responses) == 1, "Expected 1 context response" - - if self.decoupled: - context_response = context_responses[0] - else: - context_response = context_responses - if context_response.has_error(): - raise pb_utils.TritonModelException( - f"Context model {self.context_model_name} failed with error: {context_response.error().message()}" - ) - generation_request = self.create_generation_request( - request, context_response) - - # TODO(itabrizian): Send the context response to reduce TTFT in decoupled case. - # It requires adding the generated token to the generation request - # to avoid sending the first token multiple times. - responses = generation_request.exec(decoupled=self.decoupled) - - if self.decoupled: - for response in responses: - if response.has_error(): - raise pb_utils.TritonModelException( - f"Generation model {self.generation_model_name} failed with error: {response.error().message()}" - ) - request.get_response_sender().send(response) - - request.get_response_sender().send( - flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL) - else: - request.get_response_sender().send( - responses, - flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL) diff --git a/triton_backend/all_models/disaggregated_serving/disaggregated_serving_bls/config.pbtxt b/triton_backend/all_models/disaggregated_serving/disaggregated_serving_bls/config.pbtxt deleted file mode 100644 index 8243c2cf03b..00000000000 --- a/triton_backend/all_models/disaggregated_serving/disaggregated_serving_bls/config.pbtxt +++ /dev/null @@ -1,465 +0,0 @@ -# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions -# are met: -# * Redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer. -# * Redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the distribution. -# * Neither the name of NVIDIA CORPORATION nor the names of its -# contributors may be used to endorse or promote products derived -# from this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY -# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR -# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY -# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- -name: "disaggregated_serving_bls" -backend: "python" -max_batch_size: ${triton_max_batch_size} - -model_transaction_policy { - decoupled: ${decoupled_mode} -} - -input [ - { - name: "input_ids" - data_type: TYPE_INT32 - dims: [ -1 ] - allow_ragged_batch: true - }, - { - name: "input_lengths" - data_type: TYPE_INT32 - dims: [ 1 ] - }, - { - name: "request_output_len" - data_type: TYPE_INT32 - dims: [ 1 ] - }, - { - name: "num_return_sequences" - data_type: TYPE_INT32 - dims: [ 1 ] - optional: true - }, - { - name: "draft_input_ids" - data_type: TYPE_INT32 - dims: [ -1 ] - optional: true - allow_ragged_batch: true - }, - { - name: "decoder_input_ids" - data_type: TYPE_INT32 - dims: [ -1 ] - optional: true - allow_ragged_batch: true - }, - { - name: "decoder_input_lengths" - data_type: TYPE_INT32 - dims: [ 1 ] - optional: true - }, - { - name: "draft_logits" - data_type: TYPE_FP32 - dims: [ -1, -1 ] - optional: true - allow_ragged_batch: true - }, - { - name: "draft_acceptance_threshold" - data_type: TYPE_FP32 - dims: [ 1 ] - optional: true - }, - { - name: "end_id" - data_type: TYPE_INT32 - dims: [ 1 ] - optional: true - }, - { - name: "pad_id" - data_type: TYPE_INT32 - dims: [ 1 ] - optional: true - }, - { - name: "stop_words_list" - data_type: TYPE_INT32 - dims: [ 2, -1 ] - optional: true - allow_ragged_batch: true - }, - { - name: "bad_words_list" - data_type: TYPE_INT32 - dims: [ 2, -1 ] - optional: true - allow_ragged_batch: true - }, - { - name: "embedding_bias" - data_type: TYPE_FP32 - dims: [ -1 ] - optional: true - allow_ragged_batch: true - }, - { - name: "beam_width" - data_type: TYPE_INT32 - dims: [ 1 ] - optional: true - }, - { - name: "temperature" - data_type: TYPE_FP32 - dims: [ 1 ] - optional: true - }, - { - name: "runtime_top_k" - data_type: TYPE_INT32 - dims: [ 1 ] - optional: true - }, - { - name: "runtime_top_p" - data_type: TYPE_FP32 - dims: [ 1 ] - optional: true - }, - { - name: "runtime_top_p_min" - data_type: TYPE_FP32 - dims: [ 1 ] - optional: true - }, - { - name: "runtime_top_p_decay" - data_type: TYPE_FP32 - dims: [ 1 ] - optional: true - }, - { - name: "runtime_top_p_reset_ids" - data_type: TYPE_INT32 - dims: [ 1 ] - optional: true - }, - { - name: "len_penalty" - data_type: TYPE_FP32 - dims: [ 1 ] - optional: true - }, - { - name: "early_stopping" - data_type: TYPE_BOOL - dims: [ 1 ] - optional: true - }, - { - name: "repetition_penalty" - data_type: TYPE_FP32 - dims: [ 1 ] - optional: true - }, - { - name: "min_tokens" - data_type: TYPE_INT32 - dims: [ 1 ] - optional: true - }, - { - name: "beam_search_diversity_rate" - data_type: TYPE_FP32 - dims: [ 1 ] - optional: true - }, - { - name: "presence_penalty" - data_type: TYPE_FP32 - dims: [ 1 ] - optional: true - }, - { - name: "frequency_penalty" - data_type: TYPE_FP32 - dims: [ 1 ] - optional: true - }, - { - name: "seed" - data_type: TYPE_UINT64 - dims: [ 1 ] - optional: true - }, - { - name: "return_log_probs" - data_type: TYPE_BOOL - dims: [ 1 ] - optional: true - }, - { - name: "return_context_logits" - data_type: TYPE_BOOL - dims: [ 1 ] - optional: true - }, - { - name: "return_generation_logits" - data_type: TYPE_BOOL - dims: [ 1 ] - optional: true - }, - { - name: "return_perf_metrics" - data_type: TYPE_BOOL - dims: [ 1 ] - reshape: { shape: [ ] } - optional: true - }, - { - name: "exclude_input_in_output" - data_type: TYPE_BOOL - dims: [ 1 ] - optional: true - }, - { - name: "stop" - data_type: TYPE_BOOL - dims: [ 1 ] - optional: true - }, - { - name: "streaming" - data_type: TYPE_BOOL - dims: [ 1 
] - optional: true - }, - { - name: "prompt_embedding_table" - data_type: TYPE_FP16 - dims: [ -1, -1 ] - optional: true - allow_ragged_batch: true - }, - { - name: "prompt_table_extra_ids" - data_type: TYPE_UINT64 - dims: [ -1 ] - optional: true - allow_ragged_batch: true - }, - { - name: "prompt_vocab_size" - data_type: TYPE_INT32 - dims: [ 1 ] - optional: true - }, - # the unique task ID for the given LoRA. - # To perform inference with a specific LoRA for the first time `lora_task_id` `lora_weights` and `lora_config` must all be given. - # The LoRA will be cached, so that subsequent requests for the same task only require `lora_task_id`. - # If the cache is full the oldest LoRA will be evicted to make space for new ones. An error is returned if `lora_task_id` is not cached. - { - name: "lora_task_id" - data_type: TYPE_UINT64 - dims: [ 1 ] - optional: true - }, - # weights for a lora adapter shape [ num_lora_modules_layers, D x Hi + Ho x D ] - # where the last dimension holds the in / out adapter weights for the associated module (e.g. attn_qkv) and model layer - # each of the in / out tensors are first flattened and then concatenated together in the format above. - # D=adapter_size (R value), Hi=hidden_size_in, Ho=hidden_size_out. - { - name: "lora_weights" - data_type: TYPE_FP16 - dims: [ -1, -1 ] - optional: true - allow_ragged_batch: true - }, - # module identifier (same size a first dimension of lora_weights) - # See LoraModule::ModuleType for model id mapping - # - # "attn_qkv": 0 # compbined qkv adapter - # "attn_q": 1 # q adapter - # "attn_k": 2 # k adapter - # "attn_v": 3 # v adapter - # "attn_dense": 4 # adapter for the dense layer in attention - # "mlp_h_to_4h": 5 # for llama2 adapter for gated mlp layer after attention / RMSNorm: up projection - # "mlp_4h_to_h": 6 # for llama2 adapter for gated mlp layer after attention / RMSNorm: down projection - # "mlp_gate": 7 # for llama2 adapter for gated mlp later after attention / RMSNorm: gate - # - # last dim holds [ module_id, layer_idx, adapter_size (D aka R value) ] - { - name: "lora_config" - data_type: TYPE_INT32 - dims: [ -1, 3 ] - optional: true - allow_ragged_batch: true - }, - { - name: "guided_decoding_guide_type" - data_type: TYPE_STRING - dims: [ 1 ] - optional: true - allow_ragged_batch: true - }, - { - name: "guided_decoding_guide" - data_type: TYPE_STRING - dims: [ 1 ] - optional: true - allow_ragged_batch: true - }, - { - name: "return_num_output_tokens" - data_type: TYPE_BOOL - dims: [ 1 ] - optional: true - }, - { - name: "return_num_input_tokens" - data_type: TYPE_BOOL - dims: [ 1 ] - optional: true - } -] -output [ - { - name: "output_ids" - data_type: TYPE_INT32 - dims: [ -1, -1 ] - }, - { - name: "sequence_length" - data_type: TYPE_INT32 - dims: [ -1 ] - }, - { - name: "cum_log_probs" - data_type: TYPE_FP32 - dims: [ -1 ] - }, - { - name: "output_log_probs" - data_type: TYPE_FP32 - dims: [ -1, -1 ] - }, - { - name: "context_logits" - data_type: ${logits_datatype} - dims: [ -1, -1 ] - }, - { - name: "generation_logits" - data_type: ${logits_datatype} - dims: [ -1, -1, -1 ] - }, - { - name: "batch_index" - data_type: TYPE_INT32 - dims: [ 1 ] - }, - { - name: "sequence_index" - data_type: TYPE_INT32 - dims: [ 1 ] - }, - { - name: "kv_cache_alloc_new_blocks" - data_type: TYPE_INT32 - dims: [ 1 ] - }, - { - name: "kv_cache_reused_blocks" - data_type: TYPE_INT32 - dims: [ 1 ] - }, - { - name: "kv_cache_alloc_total_blocks" - data_type: TYPE_INT32 - dims: [ 1 ] - }, - { - name: "arrival_time_ns" - data_type: TYPE_INT64 
- dims: [ 1 ] - }, - { - name: "first_scheduled_time_ns" - data_type: TYPE_INT64 - dims: [ 1 ] - }, - { - name: "first_token_time_ns" - data_type: TYPE_INT64 - dims: [ 1 ] - }, - { - name: "last_token_time_ns" - data_type: TYPE_INT64 - dims: [ 1 ] - }, - { - name: "acceptance_rate" - data_type: TYPE_FP32 - dims: [ 1 ] - }, - { - name: "total_accepted_draft_tokens" - data_type: TYPE_INT32 - dims: [ 1 ] - }, - { - name: "total_draft_tokens" - data_type: TYPE_INT32 - dims: [ 1 ] - }, - { - name: "num_output_tokens" - data_type: TYPE_INT32 - dims: [ 1 ] - }, - { - name: "num_input_tokens" - data_type: TYPE_INT32 - dims: [ 1 ] - } -] -instance_group [ - { - count: ${disaggregated_serving_bls_count} - kind : KIND_CPU - } -] - -parameters: { - key: "context_model_name" - value: { - string_value: "${context_model_name}" - } -} -parameters: { - key: "generation_model_name" - value: { - string_value: "${generation_model_name}" - } -} diff --git a/triton_backend/all_models/gpt/ensemble/1/.tmp b/triton_backend/all_models/gpt/ensemble/1/.tmp deleted file mode 100644 index e69de29bb2d..00000000000 diff --git a/triton_backend/all_models/gpt/ensemble/config.pbtxt b/triton_backend/all_models/gpt/ensemble/config.pbtxt deleted file mode 100755 index e1bc29a2531..00000000000 --- a/triton_backend/all_models/gpt/ensemble/config.pbtxt +++ /dev/null @@ -1,230 +0,0 @@ -name: "ensemble" -platform: "ensemble" -max_batch_size: 1024 -input [ - { - name: "text_input" - data_type: TYPE_STRING - dims: [ -1 ] - }, - { - name: "max_tokens" - data_type: TYPE_INT32 - dims: [ -1 ] - }, - { - name: "bad_words" - data_type: TYPE_STRING - dims: [ -1 ] - }, - { - name: "stop_words" - data_type: TYPE_STRING - dims: [ -1 ] - }, - { - name: "end_id" - data_type: TYPE_INT32 - dims: [ 1 ] - optional: true - }, - { - name: "pad_id" - data_type: TYPE_INT32 - dims: [ 1 ] - optional: true - }, - { - name: "top_k" - data_type: TYPE_INT32 - dims: [ 1 ] - optional: true - }, - { - name: "top_p" - data_type: TYPE_FP32 - dims: [ 1 ] - optional: true - }, - { - name: "temperature" - data_type: TYPE_FP32 - dims: [ 1 ] - optional: true - }, - { - name: "length_penalty" - data_type: TYPE_FP32 - dims: [ 1 ] - optional: true - }, - { - name: "repetition_penalty" - data_type: TYPE_FP32 - dims: [ 1 ] - optional: true - }, - { - name: "min_tokens" - data_type: TYPE_INT32 - dims: [ 1 ] - optional: true - }, - { - name: "presence_penalty" - data_type: TYPE_FP32 - dims: [ 1 ] - optional: true - }, - { - name: "frequency_penalty" - data_type: TYPE_FP32 - dims: [ 1 ] - optional: true - }, - { - name: "seed" - data_type: TYPE_UINT64 - dims: [ 1 ] - optional: true - }, - { - name: "beam_width" - data_type: TYPE_INT32 - dims: [ 1 ] - optional: true - }, - { - name: "output_log_probs" - data_type: TYPE_BOOL - dims: [ 1 ] - optional: true - } -] -output [ - { - name: "text_output" - data_type: TYPE_STRING - dims: [ -1, -1 ] - } -] -ensemble_scheduling { - step [ - { - model_name: "preprocessing" - model_version: -1 - input_map { - key: "QUERY" - value: "text_input" - } - input_map { - key: "REQUEST_OUTPUT_LEN" - value: "max_tokens" - } - input_map { - key: "BAD_WORDS_DICT" - value: "bad_words" - } - input_map { - key: "STOP_WORDS_DICT" - value: "stop_words" - } - output_map { - key: "REQUEST_INPUT_LEN" - value: "_REQUEST_INPUT_LEN" - } - output_map { - key: "INPUT_ID" - value: "_INPUT_ID" - } - output_map { - key: "REQUEST_OUTPUT_LEN" - value: "_REQUEST_OUTPUT_LEN" - } - }, - { - model_name: "tensorrt_llm" - model_version: -1 - input_map { - key: "input_ids" - 
value: "_INPUT_ID" - } - input_map { - key: "input_lengths" - value: "_REQUEST_INPUT_LEN" - } - input_map { - key: "request_output_len" - value: "_REQUEST_OUTPUT_LEN" - } - input_map { - key: "end_id" - value: "end_id" - } - input_map { - key: "pad_id" - value: "pad_id" - } - input_map { - key: "runtime_top_k" - value: "top_k" - } - input_map { - key: "runtime_top_p" - value: "top_p" - } - input_map { - key: "temperature" - value: "temperature" - } - input_map { - key: "len_penalty" - value: "length_penalty" - } - input_map { - key: "repetition_penalty" - value: "repetition_penalty" - } - input_map { - key: "min_tokens" - value: "min_tokens" - } - input_map { - key: "presence_penalty" - value: "presence_penalty" - } - input_map { - key: "frequency_penalty" - value: "frequency_penalty" - } - input_map { - key: "seed" - value: "seed" - } - input_map { - key: "beam_width" - value: "beam_width" - } - input_map { - key: "output_log_probs" - value: "output_log_probs" - } - output_map { - key: "output_ids" - value: "_TOKENS_BATCH" - } - }, - { - model_name: "postprocessing" - model_version: -1 - input_map { - key: "TOKENS_BATCH" - value: "_TOKENS_BATCH" - } - output_map { - key: "OUTPUT" - value: "text_output" - } - } - ] -} diff --git a/triton_backend/all_models/gpt/postprocessing/1/model.py b/triton_backend/all_models/gpt/postprocessing/1/model.py deleted file mode 100644 index 006a98cf399..00000000000 --- a/triton_backend/all_models/gpt/postprocessing/1/model.py +++ /dev/null @@ -1,119 +0,0 @@ -# -*- coding: utf-8 -*- -import json - -import numpy as np -import triton_python_backend_utils as pb_utils -from transformers import AutoTokenizer - - -class TritonPythonModel: - """Your Python model must use the same class name. Every Python model - that is created must have "TritonPythonModel" as the class name. - """ - - def initialize(self, args): - """`initialize` is called only once when the model is being loaded. - Implementing `initialize` function is optional. This function allows - the model to initialize any state associated with this model. - Parameters - ---------- - args : dict - Both keys and values are strings. The dictionary keys and values are: - * model_config: A JSON string containing the model configuration - * model_instance_kind: A string containing model instance kind - * model_instance_device_id: A string containing model instance device ID - * model_repository: Model repository path - * model_version: Model version - * model_name: Model name - """ - # Parse model configs - model_config = json.loads(args['model_config']) - tokenizer_dir = model_config['parameters']['tokenizer_dir'][ - 'string_value'] - - self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_dir, - legacy=False, - padding_side="left", - trust_remote_code=True) - if not self.tokenizer.pad_token: - self.tokenizer.pad_token = self.tokenizer.eos_token - - # Parse model output configs - output_config = pb_utils.get_output_config_by_name( - model_config, "OUTPUT") - - # Convert Triton types to numpy types - self.output_dtype = pb_utils.triton_string_to_numpy( - output_config['data_type']) - - def execute(self, requests): - """`execute` must be implemented in every Python model. `execute` - function receives a list of pb_utils.InferenceRequest as the only - argument. This function is called when an inference is requested - for this model. Depending on the batching configuration (e.g. Dynamic - Batching) used, `requests` may contain multiple requests. 
Every - Python model, must create one pb_utils.InferenceResponse for every - pb_utils.InferenceRequest in `requests`. If there is an error, you can - set the error argument when creating a pb_utils.InferenceResponse. - Parameters - ---------- - requests : list - A list of pb_utils.InferenceRequest - Returns - ------- - list - A list of pb_utils.InferenceResponse. The length of this list must - be the same as `requests` - """ - - responses = [] - - # Every Python backend must iterate over everyone of the requests - # and create a pb_utils.InferenceResponse for each of them. - for idx, request in enumerate(requests): - # Get input tensors - tokens_batch = pb_utils.get_input_tensor_by_name( - request, 'TOKENS_BATCH').as_numpy() - - # Reshape Input - # tokens_batch = tokens_batch.reshape([-1, tokens_batch.shape[0]]) - # tokens_batch = tokens_batch.T - - # Postprocessing output data. - outputs = self._postprocessing(tokens_batch) - - # Create output tensors. You need pb_utils.Tensor - # objects to create pb_utils.InferenceResponse. - output_tensor = pb_utils.Tensor( - 'OUTPUT', - np.array(outputs).astype(self.output_dtype)) - - # Create InferenceResponse. You can set an error here in case - # there was a problem with handling this inference request. - # Below is an example of how you can set errors in inference - # response: - # - # pb_utils.InferenceResponse( - # output_tensors=..., TritonError("An error occurred")) - inference_response = pb_utils.InferenceResponse( - output_tensors=[output_tensor]) - responses.append(inference_response) - - # You should return a list of pb_utils.InferenceResponse. Length - # of this list must match the length of `requests` list. - return responses - - def finalize(self): - """`finalize` is called only once when the model is being unloaded. - Implementing `finalize` function is optional. This function allows - the model to perform any necessary clean ups before exit. - """ - print('Cleaning up...') - - def _postprocessing(self, tokens_batch): - outputs = [] - for beam_tokens in tokens_batch: - for tokens in beam_tokens: - output = self.tokenizer.decode(tokens) - outputs.append(output.encode('utf8')) - return outputs diff --git a/triton_backend/all_models/gpt/postprocessing/config.pbtxt b/triton_backend/all_models/gpt/postprocessing/config.pbtxt deleted file mode 100755 index 04b56cab226..00000000000 --- a/triton_backend/all_models/gpt/postprocessing/config.pbtxt +++ /dev/null @@ -1,32 +0,0 @@ -name: "postprocessing" -backend: "python" -max_batch_size: 1024 -dynamic_batching {} -input [ - { - name: "TOKENS_BATCH" - data_type: TYPE_INT32 - dims: [ -1, -1 ] - } -] -output [ - { - name: "OUTPUT" - data_type: TYPE_STRING - dims: [ -1, -1 ] - } -] - -parameters { - key: "tokenizer_dir" - value: { - string_value: "${tokenizer_dir}" - } -} - -instance_group [ - { - count: 1 - kind: KIND_CPU - } -] diff --git a/triton_backend/all_models/gpt/preprocessing/1/model.py b/triton_backend/all_models/gpt/preprocessing/1/model.py deleted file mode 100644 index a301e16b747..00000000000 --- a/triton_backend/all_models/gpt/preprocessing/1/model.py +++ /dev/null @@ -1,196 +0,0 @@ -# -*- coding: utf-8 -*- -import json -from typing import List - -import numpy as np -import torch -import triton_python_backend_utils as pb_utils -from torch.nn.utils.rnn import pad_sequence -from transformers import AutoTokenizer - - -class TritonPythonModel: - """Your Python model must use the same class name. Every Python model - that is created must have "TritonPythonModel" as the class name. 
- """ - - def initialize(self, args): - """`initialize` is called only once when the model is being loaded. - Implementing `initialize` function is optional. This function allows - the model to initialize any state associated with this model. - Parameters - ---------- - args : dict - Both keys and values are strings. The dictionary keys and values are: - * model_config: A JSON string containing the model configuration - * model_instance_kind: A string containing model instance kind - * model_instance_device_id: A string containing model instance device ID - * model_repository: Model repository path - * model_version: Model version - * model_name: Model name - """ - # Parse model configs - model_config = json.loads(args['model_config']) - tokenizer_dir = model_config['parameters']['tokenizer_dir'][ - 'string_value'] - - self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_dir, - padding_side='left', - legacy=False, - trust_remote_code=True) - if not self.tokenizer.pad_token: - self.tokenizer.pad_token = self.tokenizer.eos_token - - self.pad_id = self.tokenizer.encode(self.tokenizer.pad_token, - add_special_tokens=False)[0] - - # Parse model output configs and convert Triton types to numpy types - input_names = [ - "INPUT_ID", "REQUEST_INPUT_LEN", "BAD_WORDS_IDS", "STOP_WORDS_IDS" - ] - for input_name in input_names: - setattr( - self, - input_name.lower() + "_dtype", - pb_utils.triton_string_to_numpy( - pb_utils.get_output_config_by_name( - model_config, input_name)['data_type'])) - - def execute(self, requests): - """`execute` must be implemented in every Python model. `execute` - function receives a list of pb_utils.InferenceRequest as the only - argument. This function is called when an inference is requested - for this model. Depending on the batching configuration (e.g. Dynamic - Batching) used, `requests` may contain multiple requests. Every - Python model, must create one pb_utils.InferenceResponse for every - pb_utils.InferenceRequest in `requests`. If there is an error, you can - set the error argument when creating a pb_utils.InferenceResponse. - Parameters - ---------- - requests : list - A list of pb_utils.InferenceRequest - Returns - ------- - list - A list of pb_utils.InferenceResponse. The length of this list must - be the same as `requests` - """ - - responses = [] - - # Every Python backend must iterate over everyone of the requests - # and create a pb_utils.InferenceResponse for each of them. - for idx, request in enumerate(requests): - # Get input tensors - query = pb_utils.get_input_tensor_by_name(request, - 'QUERY').as_numpy() - request_output_len = pb_utils.get_input_tensor_by_name( - request, 'REQUEST_OUTPUT_LEN').as_numpy() - - bad_words_dict = pb_utils.get_input_tensor_by_name( - request, 'BAD_WORDS_DICT').as_numpy() - stop_words_dict = pb_utils.get_input_tensor_by_name( - request, 'STOP_WORDS_DICT').as_numpy() - - # Preprocessing input data. - input_id, request_input_len = self._create_request(query) - bad_words = self._to_word_list_format(bad_words_dict) - stop_words = self._to_word_list_format(stop_words_dict) - - # Create output tensors. You need pb_utils.Tensor - # objects to create pb_utils.InferenceResponse. 
- input_id_tensor = pb_utils.Tensor( - 'INPUT_ID', - np.array(input_id).astype(self.input_id_dtype)) - request_input_len_tensor = pb_utils.Tensor( - 'REQUEST_INPUT_LEN', - np.array(request_input_len).astype( - self.request_input_len_dtype)) - request_output_len_tensor = pb_utils.Tensor('REQUEST_OUTPUT_LEN', - request_output_len) - bad_words_ids_tensor = pb_utils.Tensor('BAD_WORDS_IDS', bad_words) - stop_words_ids_tensor = pb_utils.Tensor('STOP_WORDS_IDS', - stop_words) - - # Create InferenceResponse. You can set an error here in case - # there was a problem with handling this inference request. - # Below is an example of how you can set errors in inference - # response: - # - # pb_utils.InferenceResponse( - # output_tensors=..., TritonError("An error occurred")) - inference_response = pb_utils.InferenceResponse(output_tensors=[ - input_id_tensor, bad_words_ids_tensor, stop_words_ids_tensor, - request_input_len_tensor, request_output_len_tensor - ]) - responses.append(inference_response) - - # You should return a list of pb_utils.InferenceResponse. Length - # of this list must match the length of `requests` list. - return responses - - def finalize(self): - """`finalize` is called only once when the model is being unloaded. - Implementing `finalize` function is optional. This function allows - the model to perform any necessary clean ups before exit. - """ - print('Cleaning up...') - - def _create_request(self, query): - """ - query : batch string (2D numpy array) - """ - start_ids = [ - torch.IntTensor(self.tokenizer.encode(s[0].decode())) for s in query - ] - start_lengths = torch.IntTensor([[len(ids)] for ids in start_ids]) - - start_ids = pad_sequence(start_ids, - batch_first=True, - padding_value=self.pad_id) - # input_len = min(start_lengths) - #attn_mask = torch.ones((batch_size, input_len, input_len)).tril() - - return start_ids, start_lengths - - def _to_word_list_format(self, word_lists: List[List[str | bytes]]): - ''' - word_lists format: - len(word_lists) == batch_size - word_lists[i] means the words associated to batch item i. A "word" may actually be any string. Like "lorem" or "lorem ipsum". 
- ''' - assert self.tokenizer != None, "need to set tokenizer" - - if word_lists is None: - # Return an empty array of shape (1,2,0) - return np.empty([1, 2, 0], dtype="int32") - - flat_ids = [] - offsets = [] - for word_list in word_lists: - item_flat_ids = [] - item_offsets = [] - - for word in word_list: - if isinstance(word, bytes): - word = word.decode() - - ids = self.tokenizer.encode(word, add_special_tokens=False) - - if len(ids) == 0: - continue - - item_flat_ids += ids - item_offsets.append(len(ids)) - - flat_ids.append(np.array(item_flat_ids)) - offsets.append(np.cumsum(np.array(item_offsets))) - - pad_to = max(1, max(len(ids) for ids in flat_ids)) - - for i, (ids, offs) in enumerate(zip(flat_ids, offsets)): - flat_ids[i] = np.pad(ids, (0, pad_to - len(ids)), constant_values=0) - offsets[i] = np.pad(offs, (0, pad_to - len(offs)), - constant_values=-1) - - return np.array([flat_ids, offsets], dtype="int32").transpose((1, 0, 2)) diff --git a/triton_backend/all_models/gpt/preprocessing/config.pbtxt b/triton_backend/all_models/gpt/preprocessing/config.pbtxt deleted file mode 100644 index c36b2b81618..00000000000 --- a/triton_backend/all_models/gpt/preprocessing/config.pbtxt +++ /dev/null @@ -1,71 +0,0 @@ -name: "preprocessing" -backend: "python" -max_batch_size: 1024 -input [ - { - name: "QUERY" - data_type: TYPE_STRING - dims: [ -1 ] - }, - { - name: "BAD_WORDS_DICT" - data_type: TYPE_STRING - dims: [ -1 ] - }, - { - name: "STOP_WORDS_DICT" - data_type: TYPE_STRING - dims: [ -1 ] - }, - { - name: "REQUEST_OUTPUT_LEN" - data_type: TYPE_INT32 - dims: [ -1 ] - } -] -output [ - { - name: "INPUT_ID" - data_type: TYPE_INT32 - dims: [ -1 ] - }, - { - name: "REQUEST_INPUT_LEN" - data_type: TYPE_INT32 - dims: [ 1 ] - }, - { - name: "BAD_WORDS_IDS" - data_type: TYPE_INT32 - dims: [ 2, -1 ] - }, - { - name: "STOP_WORDS_IDS" - data_type: TYPE_INT32 - dims: [ 2, -1 ] - }, - { - name: "REQUEST_OUTPUT_LEN" - data_type: TYPE_INT32 - dims: [ -1 ] - }, - { - name: "PROMPT_LEARNING_TASK_NAME_IDS" - data_type: TYPE_INT32 - dims: [ 1 ] - } -] - -parameters { - key: "tokenizer_dir" - value: { - string_value: "${tokenizer_dir}" - } -} - -instance_group [ - { - count: 1 - kind: KIND_CPU - } -] diff --git a/triton_backend/all_models/gpt/tensorrt_llm/1/model.py b/triton_backend/all_models/gpt/tensorrt_llm/1/model.py deleted file mode 100644 index cb8cad377fe..00000000000 --- a/triton_backend/all_models/gpt/tensorrt_llm/1/model.py +++ /dev/null @@ -1,203 +0,0 @@ -import json - -import torch -import triton_python_backend_utils as pb_utils -from torch import from_numpy - -from tensorrt_llm.runtime import ModelRunner, SamplingConfig - - -def mpi_comm(): - from mpi4py import MPI - return MPI.COMM_WORLD - - -def mpi_rank(): - return mpi_comm().Get_rank() - - -def get_engine_name(model, dtype, tp_size, rank): - return '{}_{}_tp{}_rank{}.engine'.format(model, dtype, tp_size, rank) - - -def get_input_tensor_by_name(request, name): - tensor = pb_utils.get_input_tensor_by_name(request, name) - if tensor is not None: - # Triton tensor -> numpy tensor -> PyTorch tensor - return from_numpy(tensor.as_numpy()) - else: - return tensor - - -def get_input_scalar_by_name(request, name): - tensor = pb_utils.get_input_tensor_by_name(request, name) - if tensor is not None: - # Triton tensor -> numpy tensor -> first scalar - tensor = tensor.as_numpy() - return tensor.reshape((tensor.size, ))[0] - else: - return tensor - - -class TritonPythonModel: - """Your Python model must use the same class name. 
Every Python model - that is created must have "TritonPythonModel" as the class name. - """ - - def initialize(self, args): - """`initialize` is called only once when the model is being loaded. - Implementing `initialize` function is optional. This function allows - the model to initialize any state associated with this model. - - Parameters - ---------- - args : dict - Both keys and values are strings. The dictionary keys and values are: - * model_config: A JSON string containing the model configuration - * model_instance_kind: A string containing model instance kind - * model_instance_device_id: A string containing model instance device ID - * model_repository: Model repository path - * model_version: Model version - * model_name: Model name - """ - model_config = json.loads(args['model_config']) - engine_dir = model_config['parameters']['engine_dir']['string_value'] - self.comm = mpi_comm() - self.rank = mpi_rank() - self.runner = ModelRunner.from_dir(engine_dir=engine_dir, - rank=self.rank) - if self.rank != 0: - while (True): - self.execute([None]) - - def execute(self, requests): - """`execute` must be implemented in every Python model. `execute` - function receives a list of pb_utils.InferenceRequest as the only - argument. This function is called when an inference is requested - for this model. - - Parameters - ---------- - requests : list - A list of pb_utils.InferenceRequest - - Returns - ------- - list - A list of pb_utils.InferenceResponse. The length of this list must - be the same as `requests` - """ - responses = [] - - # Every Python backend must iterate through list of requests and create - # an instance of pb_utils.InferenceResponse class for each of them. You - # should avoid storing any of the input Tensors in the class attributes - # as they will be overridden in subsequent inference requests. You can - # make a copy of the underlying NumPy array and store it if it is - # required. - for request in requests: - # Perform inference on the request and append it to responses list... 
- inputs = {} - if self.rank == 0: - inputs['input_ids'] = get_input_tensor_by_name( - request, 'input_ids') - inputs['input_lengths'] = get_input_tensor_by_name( - request, 'input_lengths') - inputs['request_output_len'] = get_input_scalar_by_name( - request, 'request_output_len') - inputs['end_id'] = get_input_scalar_by_name(request, 'end_id') - inputs['pad_id'] = get_input_scalar_by_name(request, 'pad_id') - inputs['beam_width'] = get_input_scalar_by_name( - request, 'beam_width') - inputs['temperature'] = get_input_scalar_by_name( - request, 'temperature') - inputs['runtime_top_k'] = get_input_scalar_by_name( - request, 'runtime_top_k') - inputs['runtime_top_p'] = get_input_scalar_by_name( - request, 'runtime_top_p') - inputs['len_penalty'] = get_input_scalar_by_name( - request, 'len_penalty') - inputs['repetition_penalty'] = get_input_scalar_by_name( - request, 'repetition_penalty') - inputs['min_tokens'] = get_input_scalar_by_name( - request, 'min_tokens') - inputs['presence_penalty'] = get_input_scalar_by_name( - request, 'presence_penalty') - inputs['frequency_penalty'] = get_input_scalar_by_name( - request, 'frequency_penalty') - inputs['seed'] = get_input_scalar_by_name(request, 'seed') - inputs['output_log_probs'] = get_input_scalar_by_name( - request, 'output_log_probs') - - # Broadcast requests to other clients - inputs = self.comm.bcast(inputs, root=0) - input_ids = inputs['input_ids'].cuda() - end_id = inputs['end_id'] - pad_id = inputs['pad_id'] - - sampling_config = SamplingConfig(end_id=end_id, pad_id=pad_id) - if inputs['beam_width'] is not None: - sampling_config.num_beams = inputs['beam_width'] - if inputs['temperature'] is not None: - sampling_config.temperature = inputs['temperature'] - if inputs['runtime_top_k'] is not None: - sampling_config.top_k = inputs['runtime_top_k'] - if inputs['runtime_top_p'] is not None: - sampling_config.top_p = inputs['runtime_top_p'] - if inputs['len_penalty'] is not None: - sampling_config.length_penalty = inputs['len_penalty'] - if inputs['repetition_penalty'] is not None: - sampling_config.repetition_penalty = inputs[ - 'repetition_penalty'] - if inputs['min_tokens'] is not None: - sampling_config.min_tokens = inputs['min_tokens'] - if inputs['presence_penalty'] is not None: - sampling_config.presence_penalty = inputs['presence_penalty'] - if inputs['frequency_penalty'] is not None: - sampling_config.frequency_penalty = inputs['frequency_penalty'] - sampling_config.seed = inputs['seed'] - sampling_config.output_log_probs = inputs['output_log_probs'] - sampling_config.return_dict = True - - outputs = self.runner.generate(input_ids, None, sampling_config) - output_ids = outputs["output_ids"] - - if self.rank == 0: - # Create output tensors. You need pb_utils.Tensor - # objects to create pb_utils.InferenceResponse. - torch.cuda.synchronize() - output_tensors = [ - pb_utils.Tensor("output_ids", - output_ids.cpu().numpy()) - ] - - if sampling_config.output_log_probs: - # [max_seq_len, batch_size, num_beams] -> [batch_size, max_seq_len, num_beams] - log_probs = self.runner.session.log_probs_tiled.transpose( - 0, 1).cpu().numpy() - output_tensors.append( - pb_utils.Tensor("log_probs", log_probs)) - - # Create InferenceResponse. You can set an error here in case - # there was a problem with handling this inference request. 
- # Below is an example of how you can set errors in inference - # response: - # - # pb_utils.InferenceResponse( - # output_tensors=..., TritonError("An error occurred")) - - inference_response = pb_utils.InferenceResponse(output_tensors) - else: - inference_response = pb_utils.InferenceResponse([]) - responses.append(inference_response) - - # You must return a list of pb_utils.InferenceResponse. Length - # of this list must match the length of `requests` list. - return responses - - def finalize(self): - """`finalize` is called only once when the model is being unloaded. - Implementing `finalize` function is optional. This function allows - the model to perform any necessary clean ups before exit. - """ - return diff --git a/triton_backend/all_models/gpt/tensorrt_llm/config.pbtxt b/triton_backend/all_models/gpt/tensorrt_llm/config.pbtxt deleted file mode 100644 index 5d2108fc3ea..00000000000 --- a/triton_backend/all_models/gpt/tensorrt_llm/config.pbtxt +++ /dev/null @@ -1,146 +0,0 @@ -name: "tensorrt_llm" -backend: "python" -max_batch_size: 1024 - -# # Uncomment this for dynamic_batching -# dynamic_batching { -# max_queue_delay_microseconds: 50000 -# } - -input [ - { - name: "input_ids" - data_type: TYPE_INT32 - dims: [ -1 ] - }, - { - name: "input_lengths" - data_type: TYPE_INT32 - dims: [ 1 ] - reshape: { shape: [ ] } - }, - { - name: "request_output_len" - data_type: TYPE_INT32 - dims: [ -1 ] - }, - { - name: "end_id" - data_type: TYPE_INT32 - dims: [ 1 ] - reshape: { shape: [ ] } - }, - { - name: "pad_id" - data_type: TYPE_INT32 - dims: [ 1 ] - reshape: { shape: [ ] } - }, - { - name: "beam_width" - data_type: TYPE_INT32 - dims: [ 1 ] - reshape: { shape: [ ] } - optional: true - }, - { - name: "temperature" - data_type: TYPE_FP32 - dims: [ 1 ] - reshape: { shape: [ ] } - optional: true - }, - { - name: "runtime_top_k" - data_type: TYPE_INT32 - dims: [ 1 ] - reshape: { shape: [ ] } - optional: true - }, - { - name: "runtime_top_p" - data_type: TYPE_FP32 - dims: [ 1 ] - reshape: { shape: [ ] } - optional: true - }, - { - name: "len_penalty" - data_type: TYPE_FP32 - dims: [ 1 ] - reshape: { shape: [ ] } - optional: true - }, - { - name: "repetition_penalty" - data_type: TYPE_FP32 - dims: [ 1 ] - reshape: { shape: [ ] } - optional: true - }, - { - name: "min_tokens" - data_type: TYPE_INT32 - dims: [ 1 ] - reshape: { shape: [ ] } - optional: true - }, - { - name: "presence_penalty" - data_type: TYPE_FP32 - dims: [ 1 ] - reshape: { shape: [ ] } - optional: true - }, - { - name: "frequency_penalty" - data_type: TYPE_FP32 - dims: [ 1 ] - reshape: { shape: [ ] } - optional: true - }, - { - name: "seed" - data_type: TYPE_UINT64 - dims: [ 1 ] - reshape: { shape: [ ] } - optional: true - }, - { - name: "output_log_probs" - data_type: TYPE_BOOL - dims: [ 1 ] - reshape: { shape: [ ] } - optional: true - } -] -output [ - { - name: "output_ids" - data_type: TYPE_INT32 - dims: [ -1, -1 ] - }, - { - name: "log_probs" - data_type: TYPE_FP32 - dims: [ -1, -1 ] - } -] -instance_group [ - { - count: 1 - kind : KIND_CPU - } -] -parameters { - key: "engine_dir" - value: { - string_value: "${engine_dir}" - } -} -parameters: { - key: "FORCE_CPU_ONLY_INPUT_TENSORS" - value: { - string_value: "no" - } -} diff --git a/triton_backend/all_models/inflight_batcher_llm/ensemble/1/.tmp b/triton_backend/all_models/inflight_batcher_llm/ensemble/1/.tmp deleted file mode 100644 index e69de29bb2d..00000000000 diff --git a/triton_backend/all_models/inflight_batcher_llm/ensemble/config.pbtxt 
b/triton_backend/all_models/inflight_batcher_llm/ensemble/config.pbtxt deleted file mode 100644 index c07a5ab6de4..00000000000 --- a/triton_backend/all_models/inflight_batcher_llm/ensemble/config.pbtxt +++ /dev/null @@ -1,712 +0,0 @@ -# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions -# are met: -# * Redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer. -# * Redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the distribution. -# * Neither the name of NVIDIA CORPORATION nor the names of its -# contributors may be used to endorse or promote products derived -# from this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY -# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR -# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY -# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
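The ensemble configuration below strings preprocessing, tensorrt_llm and postprocessing together behind a single model name, so a client only sends text_input and max_tokens (plus optional sampling inputs) and reads back text_output. A rough sketch of such a call with the Triton HTTP client (hypothetical server address and prompt, not taken from this repository):

import numpy as np
import tritonclient.http as httpclient

client = httpclient.InferenceServerClient("localhost:8000")   # assumed endpoint
text = np.array([["What is in-flight batching?"]], dtype=object)
max_tokens = np.array([[64]], dtype=np.int32)

inputs = [
    httpclient.InferInput("text_input", list(text.shape), "BYTES"),
    httpclient.InferInput("max_tokens", list(max_tokens.shape), "INT32"),
]
inputs[0].set_data_from_numpy(text)
inputs[1].set_data_from_numpy(max_tokens)

result = client.infer("ensemble", inputs,
                      outputs=[httpclient.InferRequestedOutput("text_output")])
print(result.as_numpy("text_output"))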
- -name: "ensemble" -platform: "ensemble" -max_batch_size: ${triton_max_batch_size} -input [ - { - name: "text_input" - data_type: TYPE_STRING - dims: [ 1 ] - }, - { - name: "decoder_text_input" - data_type: TYPE_STRING - dims: [ 1 ] - optional: true - }, - { - name: "max_tokens" - data_type: TYPE_INT32 - dims: [ 1 ] - }, - { - name: "num_return_sequences" - data_type: TYPE_INT32 - dims: [ 1 ] - optional: true - }, - { - name: "bad_words" - data_type: TYPE_STRING - dims: [ -1 ] - optional: true - }, - { - name: "stop_words" - data_type: TYPE_STRING - dims: [ -1 ] - optional: true - }, - { - name: "exclude_input_in_output" - data_type: TYPE_BOOL - dims: [ 1 ] - optional: true - }, - { - name: "end_id" - data_type: TYPE_INT32 - dims: [ 1 ] - optional: true - }, - { - name: "pad_id" - data_type: TYPE_INT32 - dims: [ 1 ] - optional: true - }, - { - name: "top_k" - data_type: TYPE_INT32 - dims: [ 1 ] - optional: true - }, - { - name: "top_p" - data_type: TYPE_FP32 - dims: [ 1 ] - optional: true - }, - { - name: "temperature" - data_type: TYPE_FP32 - dims: [ 1 ] - optional: true - }, - { - name: "length_penalty" - data_type: TYPE_FP32 - dims: [ 1 ] - optional: true - }, - { - name: "repetition_penalty" - data_type: TYPE_FP32 - dims: [ 1 ] - optional: true - }, - { - name: "min_tokens" - data_type: TYPE_INT32 - dims: [ 1 ] - optional: true - }, - { - name: "presence_penalty" - data_type: TYPE_FP32 - dims: [ 1 ] - optional: true - }, - { - name: "frequency_penalty" - data_type: TYPE_FP32 - dims: [ 1 ] - optional: true - }, - { - name: "seed" - data_type: TYPE_UINT64 - dims: [ 1 ] - optional: true - }, - { - name: "return_log_probs" - data_type: TYPE_BOOL - dims: [ 1 ] - optional: true - }, - { - name: "return_context_logits" - data_type: TYPE_BOOL - dims: [ 1 ] - optional: true - }, - { - name: "return_generation_logits" - data_type: TYPE_BOOL - dims: [ 1 ] - optional: true - }, - { - name: "return_perf_metrics" - data_type: TYPE_BOOL - dims: [ 1 ] - optional: true - }, - { - name: "return_num_input_tokens" - data_type: TYPE_BOOL - dims: [ 1 ] - optional: true - }, - { - name: "return_num_output_tokens" - data_type: TYPE_BOOL - dims: [ 1 ] - optional: true - }, - { - name: "beam_width" - data_type: TYPE_INT32 - dims: [ 1 ] - optional: true - }, - { - name: "stream" - data_type: TYPE_BOOL - dims: [ 1 ] - optional: true - }, - { - name: "prompt_embedding_table" - data_type: TYPE_FP16 - dims: [ -1, -1 ] - optional: true - }, - { - name: "prompt_table_extra_id" - data_type: TYPE_UINT64 - dims: [ 1 ] - optional: true - }, - { - name: "prompt_vocab_size" - data_type: TYPE_INT32 - dims: [ 1 ] - optional: true - }, - { - name: "embedding_bias_words" - data_type: TYPE_STRING - dims: [ -1 ] - optional: true - }, - { - name: "embedding_bias_weights" - data_type: TYPE_FP32 - dims: [ -1 ] - optional: true - }, - # the unique task ID for the given LoRA. - # To perform inference with a specific LoRA for the first time `lora_task_id` `lora_weights` and `lora_config` must all be given. - # The LoRA will be cached, so that subsequent requests for the same task only require `lora_task_id`. - # If the cache is full the oldest LoRA will be evicted to make space for new ones. An error is returned if `lora_task_id` is not cached. - { - name: "lora_task_id" - data_type: TYPE_UINT64 - dims: [ 1 ] - optional: true - }, - # weights for a lora adapter shape [ num_lora_modules_layers, D x Hi + Ho x D ] - # where the last dimension holds the in / out adapter weights for the associated module (e.g. 
attn_qkv) and model layer - # each of the in / out tensors are first flattened and then concatenated together in the format above. - # D=adapter_size (R value), Hi=hidden_size_in, Ho=hidden_size_out. - { - name: "lora_weights" - data_type: TYPE_FP16 - dims: [ -1, -1 ] - optional: true - allow_ragged_batch: true - }, - # module identifier (same size a first dimension of lora_weights) - # See LoraModule::ModuleType for model id mapping - # - # "attn_qkv": 0 # compbined qkv adapter - # "attn_q": 1 # q adapter - # "attn_k": 2 # k adapter - # "attn_v": 3 # v adapter - # "attn_dense": 4 # adapter for the dense layer in attention - # "mlp_h_to_4h": 5 # for llama2 adapter for gated mlp layer after attention / RMSNorm: up projection - # "mlp_4h_to_h": 6 # for llama2 adapter for gated mlp layer after attention / RMSNorm: down projection - # "mlp_gate": 7 # for llama2 adapter for gated mlp later after attention / RMSNorm: gate - # - # last dim holds [ module_id, layer_idx, adapter_size (D aka R value) ] - { - name: "lora_config" - data_type: TYPE_INT32 - dims: [ -1, 3 ] - optional: true - allow_ragged_batch: true - }, - { - name: "guided_decoding_guide_type" - data_type: TYPE_STRING - dims: [ 1 ] - optional: true - allow_ragged_batch: true - }, - { - name: "guided_decoding_guide" - data_type: TYPE_STRING - dims: [ 1 ] - optional: true - allow_ragged_batch: true - } -] -output [ - { - name: "text_output" - data_type: TYPE_STRING - dims: [ -1 ] - }, - { - name: "cum_log_probs" - data_type: TYPE_FP32 - dims: [ -1 ] - }, - { - name: "output_log_probs" - data_type: TYPE_FP32 - dims: [ -1, -1 ] - }, - { - name: "context_logits" - data_type: ${logits_datatype} - dims: [ -1, -1 ] - }, - { - name: "generation_logits" - data_type: ${logits_datatype} - dims: [ -1, -1, -1 ] - }, - { - name: "batch_index" - data_type: TYPE_INT32 - dims: [ 1 ] - }, - { - name: "sequence_index" - data_type: TYPE_INT32 - dims: [ 1 ] - }, - { - name: "kv_cache_alloc_new_blocks" - data_type: TYPE_INT32 - dims: [ 1 ] - }, - { - name: "kv_cache_reused_blocks" - data_type: TYPE_INT32 - dims: [ 1 ] - }, - { - name: "kv_cache_alloc_total_blocks" - data_type: TYPE_INT32 - dims: [ 1 ] - }, - { - name: "kv_cache_alloc_total_blocks" - data_type: TYPE_INT32 - dims: [ 1 ] - }, - { - name: "arrival_time_ns" - data_type: TYPE_INT64 - dims: [ 1 ] - }, - { - name: "first_scheduled_time_ns" - data_type: TYPE_INT64 - dims: [ 1 ] - }, - { - name: "first_token_time_ns" - data_type: TYPE_INT64 - dims: [ 1 ] - }, - { - name: "last_token_time_ns" - data_type: TYPE_INT64 - dims: [ 1 ] - }, - { - name: "acceptance_rate" - data_type: TYPE_FP32 - dims: [ 1 ] - }, - { - name: "total_accepted_draft_tokens" - data_type: TYPE_INT32 - dims: [ 1 ] - }, - { - name: "total_draft_tokens" - data_type: TYPE_INT32 - dims: [ 1 ] - }, - { - name: "num_input_tokens" - data_type: TYPE_INT32 - dims: [ 1 ] - }, - { - name: "num_output_tokens" - data_type: TYPE_INT32 - dims: [ 1 ] - } -] -ensemble_scheduling { - step [ - { - model_name: "preprocessing" - model_version: -1 - input_map { - key: "QUERY" - value: "text_input" - } - input_map { - key: "DECODER_QUERY" - value: "decoder_text_input" - } - input_map { - key: "REQUEST_OUTPUT_LEN" - value: "max_tokens" - } - input_map { - key: "BAD_WORDS_DICT" - value: "bad_words" - } - input_map { - key: "STOP_WORDS_DICT" - value: "stop_words" - } - input_map { - key: "EMBEDDING_BIAS_WORDS" - value: "embedding_bias_words" - } - input_map { - key: "EMBEDDING_BIAS_WEIGHTS" - value: "embedding_bias_weights" - } - input_map { - key: "END_ID" 
- value: "end_id" - } - input_map { - key: "PAD_ID" - value: "pad_id" - } - input_map { - key: "PROMPT_TABLE_EXTRA_ID" - value: "prompt_table_extra_id" - } - output_map { - key: "REQUEST_INPUT_LEN" - value: "_REQUEST_INPUT_LEN" - } - output_map { - key: "INPUT_ID" - value: "_INPUT_ID" - } - output_map { - key: "REQUEST_DECODER_INPUT_LEN" - value: "_REQUEST_DECODER_INPUT_LEN" - } - output_map { - key: "DECODER_INPUT_ID" - value: "_DECODER_INPUT_ID" - } - output_map { - key: "REQUEST_OUTPUT_LEN" - value: "_REQUEST_OUTPUT_LEN" - } - output_map { - key: "STOP_WORDS_IDS" - value: "_STOP_WORDS_IDS" - } - output_map { - key: "BAD_WORDS_IDS" - value: "_BAD_WORDS_IDS" - } - output_map { - key: "EMBEDDING_BIAS" - value: "_EMBEDDING_BIAS" - } - output_map { - key: "OUT_END_ID" - value: "_PREPROCESSOR_END_ID" - } - output_map { - key: "OUT_PAD_ID" - value: "_PREPROCESSOR_PAD_ID" - } - output_map { - key: "OUT_PROMPT_TABLE_EXTRA_IDS" - value: "_OUT_PROMPT_TABLE_EXTRA_IDS" - } - }, - { - model_name: "tensorrt_llm" - model_version: -1 - input_map { - key: "input_ids" - value: "_INPUT_ID" - } - input_map { - key: "decoder_input_ids" - value: "_DECODER_INPUT_ID" - } - input_map { - key: "input_lengths" - value: "_REQUEST_INPUT_LEN" - } - input_map { - key: "decoder_input_lengths" - value: "_REQUEST_DECODER_INPUT_LEN" - } - input_map { - key: "exclude_input_in_output" - value: "exclude_input_in_output" - } - input_map { - key: "request_output_len" - value: "_REQUEST_OUTPUT_LEN" - } - input_map { - key: "end_id" - value: "_PREPROCESSOR_END_ID" - } - input_map { - key: "pad_id" - value: "_PREPROCESSOR_PAD_ID" - } - input_map { - key: "embedding_bias" - value: "_EMBEDDING_BIAS" - } - input_map { - key: "runtime_top_k" - value: "top_k" - } - input_map { - key: "runtime_top_p" - value: "top_p" - } - input_map { - key: "temperature" - value: "temperature" - } - input_map { - key: "len_penalty" - value: "length_penalty" - } - input_map { - key: "repetition_penalty" - value: "repetition_penalty" - } - input_map { - key: "min_tokens" - value: "min_tokens" - } - input_map { - key: "presence_penalty" - value: "presence_penalty" - } - input_map { - key: "frequency_penalty" - value: "frequency_penalty" - } - input_map { - key: "seed" - value: "seed" - } - input_map { - key: "return_log_probs" - value: "return_log_probs" - } - input_map { - key: "return_context_logits" - value: "return_context_logits" - } - input_map { - key: "return_generation_logits" - value: "return_generation_logits" - } - input_map { - key: "return_perf_metrics" - value: "return_perf_metrics" - } - input_map { - key: "return_num_input_tokens" - value: "return_num_input_tokens" - } - input_map { - key: "return_num_output_tokens" - value: "return_num_output_tokens" - } - input_map { - key: "num_return_sequences" - value: "num_return_sequences" - } - input_map { - key: "beam_width" - value: "beam_width" - } - input_map { - key: "streaming" - value: "stream" - } - input_map { - key: "prompt_embedding_table" - value: "prompt_embedding_table" - } - input_map { - key: "prompt_vocab_size" - value: "prompt_vocab_size" - } - input_map { - key: "stop_words_list" - value: "_STOP_WORDS_IDS" - } - input_map { - key: "bad_words_list" - value: "_BAD_WORDS_IDS" - } - input_map { - key: "prompt_table_extra_ids" - value: "_OUT_PROMPT_TABLE_EXTRA_IDS" - }, - input_map { - key: "lora_task_id", - value: "lora_task_id" - }, - input_map { - key: "lora_weights", - value: "lora_weights" - }, - input_map { - key: "lora_config", - value: "lora_config" - }, - input_map { - key: 
"guided_decoding_guide_type", - value: "guided_decoding_guide_type" - }, - input_map { - key: "guided_decoding_guide", - value: "guided_decoding_guide" - } - output_map { - key: "output_ids" - value: "_TOKENS_BATCH" - } - output_map { - key: "sequence_length" - value: "_SEQUENCE_LENGTH" - }, - output_map { - key: "cum_log_probs" - value: "cum_log_probs" - } - output_map { - key: "output_log_probs" - value: "output_log_probs" - }, - output_map { - key: "context_logits" - value: "context_logits" - }, - output_map { - key: "generation_logits" - value: "generation_logits" - }, - output_map { - key: "batch_index" - value: "batch_index" - }, - output_map { - key: "sequence_index" - value: "sequence_index" - }, - output_map { - key: "kv_cache_alloc_new_blocks" - value: "kv_cache_alloc_new_blocks" - }, - output_map { - key: "kv_cache_reused_blocks" - value: "kv_cache_reused_blocks" - }, - output_map { - key: "kv_cache_alloc_total_blocks" - value: "kv_cache_alloc_total_blocks" - }, - output_map { - key: "arrival_time_ns" - value: "arrival_time_ns" - }, - output_map { - key: "first_scheduled_time_ns" - value: "first_scheduled_time_ns" - }, - output_map { - key: "first_token_time_ns" - value: "first_token_time_ns" - }, - output_map { - key: "last_token_time_ns" - value: "last_token_time_ns" - }, - output_map { - key: "acceptance_rate" - value: "acceptance_rate" - }, - output_map { - key: "total_accepted_draft_tokens" - value: "total_accepted_draft_tokens" - }, - output_map { - key: "total_draft_tokens" - value: "total_draft_tokens" - }, - output_map { - key: "num_input_tokens" - value: "num_input_tokens" - }, - output_map { - key: "num_output_tokens" - value: "num_output_tokens" - } - }, - { - model_name: "postprocessing" - model_version: -1 - input_map { - key: "TOKENS_BATCH" - value: "_TOKENS_BATCH" - } - input_map { - key: "SEQUENCE_LENGTH" - value: "_SEQUENCE_LENGTH" - } - output_map { - key: "OUTPUT" - value: "text_output" - } - } - ] -} diff --git a/triton_backend/all_models/inflight_batcher_llm/postprocessing/1/model.py b/triton_backend/all_models/inflight_batcher_llm/postprocessing/1/model.py deleted file mode 100644 index 70d88ae785c..00000000000 --- a/triton_backend/all_models/inflight_batcher_llm/postprocessing/1/model.py +++ /dev/null @@ -1,177 +0,0 @@ -# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions -# are met: -# * Redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer. -# * Redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the distribution. -# * Neither the name of NVIDIA CORPORATION nor the names of its -# contributors may be used to endorse or promote products derived -# from this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY -# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -# PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR -# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY -# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -import json - -import numpy as np -import triton_python_backend_utils as pb_utils -from transformers import AutoTokenizer - - -class TritonPythonModel: - """Your Python model must use the same class name. Every Python model - that is created must have "TritonPythonModel" as the class name. - """ - - def initialize(self, args): - """`initialize` is called only once when the model is being loaded. - Implementing `initialize` function is optional. This function allows - the model to initialize any state associated with this model. - Parameters - ---------- - args : dict - Both keys and values are strings. The dictionary keys and values are: - * model_config: A JSON string containing the model configuration - * model_instance_kind: A string containing model instance kind - * model_instance_device_id: A string containing model instance device ID - * model_repository: Model repository path - * model_version: Model version - * model_name: Model name - """ - # Parse model configs - model_config = json.loads(args['model_config']) - tokenizer_dir = model_config['parameters']['tokenizer_dir'][ - 'string_value'] - - skip_special_tokens = model_config['parameters'].get( - 'skip_special_tokens') - if skip_special_tokens is not None: - skip_special_tokens_str = skip_special_tokens['string_value'].lower( - ) - if skip_special_tokens_str in [ - 'true', 'false', '1', '0', 't', 'f', 'y', 'n', 'yes', 'no' - ]: - self.skip_special_tokens = skip_special_tokens_str in [ - 'true', '1', 't', 'y', 'yes' - ] - else: - print( - f"[TensorRT-LLM][WARNING] Don't setup 'skip_special_tokens' correctly (set value is {skip_special_tokens['string_value']}). Set it as True by default." - ) - self.skip_special_tokens = True - else: - print( - f"[TensorRT-LLM][WARNING] Don't setup 'skip_special_tokens'. Set it as True by default." - ) - self.skip_special_tokens = True - - self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_dir, - legacy=False, - padding_side='left', - trust_remote_code=True) - if not self.tokenizer.pad_token: - self.tokenizer.pad_token = self.tokenizer.eos_token - - # Parse model output configs - output_config = pb_utils.get_output_config_by_name( - model_config, "OUTPUT") - - # Convert Triton types to numpy types - self.output_dtype = pb_utils.triton_string_to_numpy( - output_config['data_type']) - - def execute(self, requests): - """`execute` must be implemented in every Python model. `execute` - function receives a list of pb_utils.InferenceRequest as the only - argument. This function is called when an inference is requested - for this model. Depending on the batching configuration (e.g. Dynamic - Batching) used, `requests` may contain multiple requests. Every - Python model, must create one pb_utils.InferenceResponse for every - pb_utils.InferenceRequest in `requests`. If there is an error, you can - set the error argument when creating a pb_utils.InferenceResponse. 
- Parameters - ---------- - requests : list - A list of pb_utils.InferenceRequest - Returns - ------- - list - A list of pb_utils.InferenceResponse. The length of this list must - be the same as `requests` - """ - - tokens_batch = [] - sequence_lengths = [] - for idx, request in enumerate(requests): - for input_tensor in request.inputs(): - if input_tensor.name() == "TOKENS_BATCH": - tokens_batch.append(input_tensor.as_numpy()) - elif input_tensor.name() == "SEQUENCE_LENGTH": - sequence_lengths.append(input_tensor.as_numpy()) - else: - raise ValueError(f"unknown input {input_tensor.name}") - - # batch decode - list_of_tokens = [] - req_idx_offset = 0 - req_idx_offsets = [req_idx_offset] - for idx, token_batch in enumerate(tokens_batch): - for batch_idx, beam_tokens in enumerate(token_batch): - for beam_idx, tokens in enumerate(beam_tokens): - seq_len = sequence_lengths[idx][batch_idx][beam_idx] - list_of_tokens.append(tokens[:seq_len]) - req_idx_offset += 1 - - req_idx_offsets.append(req_idx_offset) - - all_outputs = self.tokenizer.batch_decode( - list_of_tokens, skip_special_tokens=self.skip_special_tokens) - - # construct responses - responses = [] - for idx, request in enumerate(requests): - req_outputs = [ - x.encode('utf8') - for x in all_outputs[req_idx_offsets[idx]:req_idx_offsets[idx + - 1]] - ] - - output_tensor = pb_utils.Tensor( - 'OUTPUT', - np.array(req_outputs).astype(self.output_dtype)) - - outputs = [output_tensor] - - # Create InferenceResponse. You can set an error here in case - # there was a problem with handling this inference request. - # Below is an example of how you can set errors in inference - # response: - # - # pb_utils.InferenceResponse( - # output_tensors=..., TritonError("An error occurred")) - inference_response = pb_utils.InferenceResponse( - output_tensors=outputs) - responses.append(inference_response) - # You should return a list of pb_utils.InferenceResponse. Length - # of this list must match the length of `requests` list. - return responses - - def finalize(self): - """`finalize` is called only once when the model is being unloaded. - Implementing `finalize` function is optional. This function allows - the model to perform any necessary clean ups before exit. - """ - print('Cleaning up...') diff --git a/triton_backend/all_models/inflight_batcher_llm/postprocessing/config.pbtxt b/triton_backend/all_models/inflight_batcher_llm/postprocessing/config.pbtxt deleted file mode 100644 index 9f3655ef4d7..00000000000 --- a/triton_backend/all_models/inflight_batcher_llm/postprocessing/config.pbtxt +++ /dev/null @@ -1,70 +0,0 @@ -# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions -# are met: -# * Redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer. -# * Redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the distribution. -# * Neither the name of NVIDIA CORPORATION nor the names of its -# contributors may be used to endorse or promote products derived -# from this software without specific prior written permission. 
-# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY -# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR -# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY -# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -name: "postprocessing" -backend: "python" -max_batch_size: ${triton_max_batch_size} -dynamic_batching {} -input [ - { - name: "TOKENS_BATCH" - data_type: TYPE_INT32 - dims: [ -1, -1 ] - }, - { - name: "SEQUENCE_LENGTH" - data_type: TYPE_INT32 - dims: [ -1 ] - } -] -output [ - { - name: "OUTPUT" - data_type: TYPE_STRING - dims: [ -1 ] - } -] - -parameters { - key: "tokenizer_dir" - value: { - string_value: "${tokenizer_dir}" - } -} - -parameters { - key: "skip_special_tokens" - value: { - string_value: "${skip_special_tokens}" - } -} - -instance_group [ - { - count: ${postprocessing_instance_count} - kind: KIND_CPU - } -] diff --git a/triton_backend/all_models/inflight_batcher_llm/preprocessing/1/model.py b/triton_backend/all_models/inflight_batcher_llm/preprocessing/1/model.py deleted file mode 100755 index 5a7d36ac1e8..00000000000 --- a/triton_backend/all_models/inflight_batcher_llm/preprocessing/1/model.py +++ /dev/null @@ -1,1003 +0,0 @@ -# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions -# are met: -# * Redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer. -# * Redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the distribution. -# * Neither the name of NVIDIA CORPORATION nor the names of its -# contributors may be used to endorse or promote products derived -# from this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY -# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR -# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY -# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
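The config.pbtxt files above are templates: values such as ${triton_max_batch_size}, ${tokenizer_dir}, ${skip_special_tokens} and ${postprocessing_instance_count} are placeholders that must be filled in before the model repository is loaded. A minimal stand-in for that substitution step using Python's string.Template (hypothetical path and values, shown only to illustrate the mechanism):

from pathlib import Path
from string import Template

cfg_path = Path("postprocessing/config.pbtxt")   # assumed location of the template
values = {                                       # example values, not project defaults
    "triton_max_batch_size": "64",
    "tokenizer_dir": "/models/tokenizer",
    "skip_special_tokens": "true",
    "postprocessing_instance_count": "1",
}
text = cfg_path.read_text()
cfg_path.write_text(Template(text).safe_substitute(values))  # ${...} placeholders filled in place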
- -import base64 -import io -import json -import os -from collections import defaultdict -from typing import List - -import numpy as np -import requests -import triton_python_backend_utils as pb_utils -from PIL import Image -from transformers import AutoProcessor, AutoTokenizer, T5Tokenizer - - -class TritonPythonModel: - """Your Python model must use the same class name. Every Python model - that is created must have "TritonPythonModel" as the class name. - """ - - def initialize(self, args): - """`initialize` is called only once when the model is being loaded. - Implementing `initialize` function is optional. This function allows - the model to initialize any state associated with this model. - Parameters - ---------- - args : dict - Both keys and values are strings. The dictionary keys and values are: - * model_config: A JSON string containing the model configuration - * model_instance_kind: A string containing model instance kind - * model_instance_device_id: A string containing model instance device ID - * model_repository: Model repository path - * model_version: Model version - * model_name: Model name - """ - # Parse model configs - model_config = json.loads(args['model_config']) - tokenizer_dir = model_config['parameters']['tokenizer_dir'][ - 'string_value'] - - add_special_tokens = model_config['parameters'].get( - 'add_special_tokens') - multimodal_model_path = model_config['parameters'][ - 'multimodal_model_path']['string_value'] - max_num_images = model_config['parameters'].get('max_num_images') - - if max_num_images is not None: - max_num_images_str = max_num_images['string_value'] - if max_num_images_str.isdigit(): - self.max_num_images = int(max_num_images_str) - else: - print( - f"[TensorRT-LLM][WARNING] 'max_num_images' parameter is not set correctly (value is {max_num_images_str}). Will be set to None" - ) - self.max_num_images = None - else: - print( - f"[TensorRT-LLM][WARNING] Don't setup 'max_num_images'. Set it as None by default." - ) - self.max_num_images = None - if multimodal_model_path == "${multimodal_model_path}" or multimodal_model_path == "": - multimodal_model_path = None - - if add_special_tokens is not None: - add_special_tokens_str = add_special_tokens['string_value'].lower() - if add_special_tokens_str in [ - 'true', 'false', '1', '0', 't', 'f', 'y', 'n', 'yes', 'no' - ]: - self.add_special_tokens = add_special_tokens_str in [ - 'true', '1', 't', 'y', 'yes' - ] - else: - print( - f"[TensorRT-LLM][WARNING] Don't setup 'add_special_tokens' correctly (set value is {add_special_tokens['string_value']}). Set it as True by default." - ) - self.add_special_tokens = True - else: - print( - f"[TensorRT-LLM][WARNING] Don't setup 'add_special_tokens'. Set it as True by default." 
- ) - self.add_special_tokens = True - - self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_dir, - legacy=False, - padding_side='left', - trust_remote_code=True) - - if isinstance(self.tokenizer, T5Tokenizer): - self.tokenizer_bos_id = self.tokenizer.sp_model.bos_id() - - if not self.tokenizer.pad_token: - self.tokenizer.pad_token = self.tokenizer.eos_token - - self.tokenizer_end_id = self.tokenizer.encode( - self.tokenizer.eos_token, add_special_tokens=False)[0] - self.tokenizer_pad_id = self.tokenizer.encode( - self.tokenizer.pad_token, add_special_tokens=False)[0] - self.vocab_size = self.tokenizer.vocab_size - - self.is_multimodal = False - self.model_type = None - self.vision_preprocessor = None - - if multimodal_model_path is not None: - self.is_multimodal = True - multimodal_model_path = os.path.join(multimodal_model_path, - 'config.json') - with open(multimodal_model_path, 'r') as f: - visual_model_config = json.load(f) - self.model_type = visual_model_config['builder_config'][ - 'model_type'] - - assert self.model_type in [ - 'llava', 'blip2-opt', 'vila', 'mllama', 'llava_onevision', - 'qwen2_vl' - ], f"[TensorRT-LLM][ERROR] Currently supported multi-modal models are llava, blip2-opt, vila, mllama, llava_onevision and qwen2_vl. Got {self.model_type}." - - assert self.model_type != 'llava_onevision' or self.max_num_images is None or self.max_num_images <= 1, "LLaVA-OneVision does not support multi-image inference currently." - - llm_model_path = model_config['parameters']['gpt_model_path'][ - 'string_value'] - llm_model_path = os.path.join(llm_model_path, 'config.json') - with open(llm_model_path, 'r') as f: - llm_model_config = json.load(f) - self.vocab_size = int( - llm_model_config["pretrained_config"]["vocab_size"]) - self._setup_ptable_shape(llm_model_config) - - if self.model_type in ['mllama', 'llava_onevision', 'qwen2_vl']: - self.vision_preprocessor = VisionPreProcessor( - self.model_type, - AutoProcessor.from_pretrained(tokenizer_dir), model_config) - - # Parse model output configs and convert Triton types to numpy types - output_names = [ - "INPUT_ID", "DECODER_INPUT_ID", "REQUEST_INPUT_LEN", - "REQUEST_DECODER_INPUT_LEN", "BAD_WORDS_IDS", "STOP_WORDS_IDS", - "OUT_END_ID", "OUT_PAD_ID", "OUT_PROMPT_TABLE_EXTRA_IDS", - "PIXEL_VALUES", "IMAGE_SIZES" - ] - input_names = ["EMBEDDING_BIAS_WORDS", "EMBEDDING_BIAS_WEIGHTS"] - for input_name in input_names: - setattr( - self, - input_name.lower() + "_dtype", - pb_utils.triton_string_to_numpy( - pb_utils.get_input_config_by_name(model_config, - input_name)['data_type'])) - - for output_name in output_names: - setattr( - self, - output_name.lower() + "_dtype", - pb_utils.triton_string_to_numpy( - pb_utils.get_output_config_by_name( - model_config, output_name)['data_type'])) - - def _setup_ptable_shape(self, llm_model_config): - max_prompt_embedding_table_size = llm_model_config['build_config'][ - 'max_prompt_embedding_table_size'] - max_batch_size = llm_model_config['build_config']['max_batch_size'] - - num_multimodal_features = max_prompt_embedding_table_size // max_batch_size - hidden_size = llm_model_config['pretrained_config']['hidden_size'] - if self.max_num_images is not None: - num_multimodal_features = num_multimodal_features // self.max_num_images - - self.ptable_shape = (-1, num_multimodal_features, hidden_size) - - def execute(self, requests): - """`execute` must be implemented in every Python model. `execute` - function receives a list of pb_utils.InferenceRequest as the only - argument.
This function is called when an inference is requested - for this model. Depending on the batching configuration (e.g. Dynamic - Batching) used, `requests` may contain multiple requests. Every - Python model, must create one pb_utils.InferenceResponse for every - pb_utils.InferenceRequest in `requests`. If there is an error, you can - set the error argument when creating a pb_utils.InferenceResponse. - Parameters - ---------- - requests : list - A list of pb_utils.InferenceRequest - Returns - ------- - list - A list of pb_utils.InferenceResponse. The length of this list must - be the same as `requests` - """ - - responses = [] - - # Every Python backend must iterate over everyone of the requests - # and create a pb_utils.InferenceResponse for each of them. - for idx, request in enumerate(requests): - # Get input tensors - query = pb_utils.get_input_tensor_by_name(request, - 'QUERY').as_numpy() - batch_size = query.shape[0] - - decoder_query = pb_utils.get_input_tensor_by_name( - request, 'DECODER_QUERY') - if decoder_query is not None: - decoder_query = decoder_query.as_numpy() - - request_output_len = pb_utils.get_input_tensor_by_name( - request, 'REQUEST_OUTPUT_LEN').as_numpy() - - bad_words_dict = pb_utils.get_input_tensor_by_name( - request, 'BAD_WORDS_DICT') - if bad_words_dict is not None: - bad_words_dict = bad_words_dict.as_numpy() - - stop_words_dict = pb_utils.get_input_tensor_by_name( - request, 'STOP_WORDS_DICT') - if stop_words_dict is not None: - stop_words_dict = stop_words_dict.as_numpy() - - embedding_bias_words = pb_utils.get_input_tensor_by_name( - request, 'EMBEDDING_BIAS_WORDS') - if embedding_bias_words is not None: - embedding_bias_words = embedding_bias_words.as_numpy() - - embedding_bias_weights = pb_utils.get_input_tensor_by_name( - request, 'EMBEDDING_BIAS_WEIGHTS') - if embedding_bias_weights is not None: - embedding_bias_weights = embedding_bias_weights.as_numpy() - - # Take the end_id from the input tensors - # If not specified, use tokenizer to get end_id - end_id = pb_utils.get_input_tensor_by_name(request, 'END_ID') - if end_id is not None: - end_id = end_id.as_numpy() - else: - end_id = [[self.tokenizer_end_id]] * batch_size - - # Take the pad_id from the input tensors - # If not specified, use tokenizer to get pad_id - pad_id = pb_utils.get_input_tensor_by_name(request, 'PAD_ID') - if pad_id is not None: - pad_id = pad_id.as_numpy() - else: - pad_id = [[self.tokenizer_pad_id]] * batch_size - - # Take the extra_id from the input tensors - # Extra id is used in kv cache reuse for p-tuning - prompt_table_extra_id = pb_utils.get_input_tensor_by_name( - request, 'PROMPT_TABLE_EXTRA_ID') - if prompt_table_extra_id is not None: - prompt_table_extra_id = prompt_table_extra_id.as_numpy() - assert prompt_table_extra_id.shape[ - 0] == batch_size, "Prompt table extra id must have the same batch size as Query" - assert prompt_table_extra_id.shape[ - 1] == 1, "Multiple IDs cannot be provided for a single image" - - # Preprocessing vision input passed as a url or bytes tensor - img_urls = pb_utils.get_input_tensor_by_name(request, 'IMAGE_URL') - image_bytes = pb_utils.get_input_tensor_by_name( - request, 'IMAGE_BYTES') - video_bytes = pb_utils.get_input_tensor_by_name( - request, 'VIDEO_BYTES') - vision_processed_tensors = [] - visual_tokens = [] - if self.is_multimodal and (img_urls or image_bytes or video_bytes): - assert self.vision_preprocessor != None, "Vision preprocessor for preparing images before encoding is None" - processed_tensors = {} - if self.model_type 
== 'mllama': - processed_tensors = self.vision_preprocessor.mllama_process( - queries=query.astype(str).tolist(), - img_urls=img_urls, - image_bytes=image_bytes, - ) - elif self.model_type == 'llava_onevision': - if video_bytes is None: - processed_tensors, visual_tokens = self.vision_preprocessor.llava_onevision_process_image( - queries=query.astype(str).tolist(), - img_urls=img_urls, - image_bytes=image_bytes, - ) - else: - processed_tensors, visual_tokens = self.vision_preprocessor.llava_onevision_process_video( - queries=query.astype(str).tolist(), - video_bytes=video_bytes, - ) - elif self.model_type == 'qwen2_vl': - processed_tensors = self.vision_preprocessor.qwen2_vl_process_image( - queries=query.astype(str).tolist(), - img_urls=img_urls, - image_bytes=image_bytes, - ) - qwen2vl_input_id_tensor = processed_tensors.get("INPUT_IDS") - processed_tensors.pop("INPUT_IDS") - qwen2vl_input_length_tensor = processed_tensors.get( - "REQUEST_INPUT_LEN") - processed_tensors.pop("REQUEST_INPUT_LEN") - else: - raise ValueError( - "Unsupported model type for IMAGE_BYTES or IMAGE_URL inputs" - ) - vision_processed_tensors = [ - pb_utils.Tensor.from_dlpack(k, v) - for k, v in processed_tensors.items() - ] - else: - assert self.model_type != "llava_onevision", "Image processing requires IMAGE_BYTES or IMAGE_URL to be provided" - - # Preprocessing input data. - # For the LLaVA_OneVision model, num_multimodal_features is not a fixed value - input_id, request_input_len = self._create_request( - query, visual_tokens) - if decoder_query is not None: - decoder_input_id, request_decoder_input_len = self._create_request( - decoder_query) - else: - decoder_input_id = pad_id * np.ones((batch_size, 1), np.int32) - request_decoder_input_len = 1 * np.ones( - (batch_size, 1), np.int32) - - bad_words = self._to_word_list_format(bad_words_dict, batch_size) - stop_words = self._to_word_list_format(stop_words_dict, batch_size) - - embedding_bias = self._get_embedding_bias( - embedding_bias_words, embedding_bias_weights, - self.embedding_bias_weights_dtype, batch_size) - - if prompt_table_extra_id is not None and self.model_type != 'qwen2_vl': - prompt_table_extra_ids = np.zeros_like(input_id) - for i in range(batch_size): - prompt_table_extra_ids[i] = np.where( - input_id[i] >= self.vocab_size, - prompt_table_extra_id[i], 0) - - # Create output tensors. You need pb_utils.Tensor - # objects to create pb_utils.InferenceResponse. 
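# --- Editor's illustrative aside (not part of the original file) ------------
# The np.where(...) above marks exactly the "fake" prompt positions: multimodal
# features are addressed with token ids >= vocab_size, and only those positions
# receive the per-request extra id used for KV-cache reuse with p-tuning.
# A tiny self-contained example of that masking, with made-up numbers:
import numpy as np

vocab_size = 32000
input_id_row = np.array([1, 15, 32000, 32001, 42])   # two fake prompt tokens
extra_id = 7                                         # hypothetical request extra id
extra_ids = np.where(input_id_row >= vocab_size, extra_id, 0)
print(extra_ids)                                     # -> [0 0 7 7 0]
# ----------------------------------------------------------------------------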
- # Qwen2-VL model has special logic to process input ids - if self.model_type == 'qwen2_vl': - input_id_tensor = pb_utils.Tensor.from_dlpack( - 'INPUT_ID', qwen2vl_input_id_tensor) - request_input_len_tensor = pb_utils.Tensor.from_dlpack( - 'REQUEST_INPUT_LEN', qwen2vl_input_length_tensor) - else: - input_id_tensor = pb_utils.Tensor( - 'INPUT_ID', input_id.astype(self.input_id_dtype)) - request_input_len_tensor = pb_utils.Tensor( - 'REQUEST_INPUT_LEN', - request_input_len.astype(self.request_input_len_dtype)) - decoder_input_id_tensor = pb_utils.Tensor( - 'DECODER_INPUT_ID', - decoder_input_id.astype(self.decoder_input_id_dtype)) - request_decoder_input_len_tensor = pb_utils.Tensor( - 'REQUEST_DECODER_INPUT_LEN', - request_decoder_input_len.astype( - self.request_decoder_input_len_dtype)) - request_output_len_tensor = pb_utils.Tensor('REQUEST_OUTPUT_LEN', - request_output_len) - bad_words_ids_tensor = pb_utils.Tensor('BAD_WORDS_IDS', bad_words) - stop_words_ids_tensor = pb_utils.Tensor('STOP_WORDS_IDS', - stop_words) - embedding_bias_tensor = pb_utils.Tensor('EMBEDDING_BIAS', - embedding_bias) - end_id_tensor = pb_utils.Tensor('OUT_END_ID', - np.array(end_id, dtype=np.int32)) - pad_id_tensor = pb_utils.Tensor('OUT_PAD_ID', - np.array(pad_id, dtype=np.int32)) - if prompt_table_extra_id is not None: - prompt_table_extra_ids_tensor = pb_utils.Tensor( - 'OUT_PROMPT_TABLE_EXTRA_IDS', - np.array(prompt_table_extra_ids, - dtype=self.out_prompt_table_extra_ids_dtype)) - inference_response = pb_utils.InferenceResponse(output_tensors=[ - input_id_tensor, decoder_input_id_tensor, - bad_words_ids_tensor, stop_words_ids_tensor, - request_input_len_tensor, request_decoder_input_len_tensor, - request_output_len_tensor, embedding_bias_tensor, - end_id_tensor, pad_id_tensor, prompt_table_extra_ids_tensor - ] + vision_processed_tensors) - else: - inference_response = pb_utils.InferenceResponse(output_tensors=[ - input_id_tensor, decoder_input_id_tensor, - bad_words_ids_tensor, stop_words_ids_tensor, - request_input_len_tensor, request_decoder_input_len_tensor, - request_output_len_tensor, embedding_bias_tensor, - end_id_tensor, pad_id_tensor - ] + vision_processed_tensors) - responses.append(inference_response) - # You should return a list of pb_utils.InferenceResponse. Length - # of this list must match the length of `requests` list. - return responses - - def finalize(self): - """`finalize` is called only once when the model is being unloaded. - Implementing `finalize` function is optional. This function allows - the model to perform any necessary clean ups before exit. - """ - print('Cleaning up...') - - def _split_prompt_by_images(self, concatenated_ids, image_token_index=-200): - """ - Splits tokenized prompts by image placeholders for each sample in the batch. - - Args: - concatenated_ids (np.ndarray): A batch of concatenated token IDs, where image placeholders are indicated by `image_token_index`. - - Returns: - List[List[np.ndarray]]: A list containing lists of token ID arrays for each prompt segment, per batch sample. 
- """ - batch_splits = [] - for batch in concatenated_ids: - zero_indices = np.where(batch == image_token_index)[0] - start_idx = 0 - splits = [] - for idx in zero_indices: - if start_idx != idx: - splits.append(batch[start_idx:idx].reshape(1, -1)) - start_idx = idx + 1 - if start_idx < len(batch): - splits.append(batch[start_idx:].reshape(1, -1)) - - splits = [split for split in splits if split.size > 0] - batch_splits.append(splits) - - return batch_splits - - def _setup_fake_prompts(self, batch_size, batch_split_prompts): - """ - Replaces image placeholders with unique fake prompt IDs for multi-image inputs. - - Args: - batch_size (int): The number of samples in the batch. - batch_split_prompts (List[List[np.ndarray]]): Tokenized prompt segments for each batch sample. - - Returns: - np.ndarray: An array of input IDs with image placeholders replaced by fake prompt IDs. - """ - - num_multimodal_features = self.ptable_shape[1] - input_ids_list = [] - - for batch_idx in range(batch_size): - splits = batch_split_prompts[batch_idx] - sample_input_ids = [splits[0]] - sample_fake_prompt_counter = self.vocab_size - - for split_idx in range(len(splits) - 1): - fake_prompt_id = np.arange( - sample_fake_prompt_counter, - sample_fake_prompt_counter + num_multimodal_features) - sample_fake_prompt_counter += num_multimodal_features - fake_prompt_id = np.expand_dims(fake_prompt_id, axis=0) - sample_input_ids.append(fake_prompt_id) - sample_input_ids.append(splits[split_idx + 1]) - - sample_input_ids = np.concatenate(sample_input_ids, axis=1) - input_ids_list.append(sample_input_ids) - - # Pad the input_ids to the same length for bs > 1 - max_seq_len = max( - [sample_input_ids.shape[1] for sample_input_ids in input_ids_list]) - input_ids_padded = [] - for sample_input_ids in input_ids_list: - seq_len = sample_input_ids.shape[1] - pad_width = max_seq_len - seq_len - if pad_width > 0: - sample_input_ids_padded = np.pad( - sample_input_ids, ((0, 0), (0, pad_width)), - 'constant', - constant_values=self.tokenizer_pad_id) - else: - sample_input_ids_padded = sample_input_ids - input_ids_padded.append(sample_input_ids_padded) - - input_ids = np.stack(input_ids_padded) - input_ids = input_ids.reshape(batch_size, -1).astype(np.int32) - - return input_ids - - def _process_multi_image_inputs(self, query, image_token_index=-200): - """ - Processes input queries that contain multiple images by tokenizing the input strings and inserting image_token_index between the parts. - - Args: - query (np.ndarray): Batch of input strings. - - Returns: - List[np.ndarray]: List of tokenized input IDs for each sample. - """ - start_ids = [] - for s in query: - parts = s[0].decode().split('') - num_images = len(parts) - 1 - if num_images > self.max_num_images: - raise ValueError( - f"The number of images in the request ({num_images}) exceeds the maximum allowed ({self.max_num_images})." 
- ) - tokenized_parts = [ - self.tokenizer.encode(part, add_special_tokens=False) - for part in parts - ] - - # Insert `image_token_index` between the parts to represent - final_ids = [] - for i, part in enumerate(tokenized_parts): - final_ids.extend(part) - if i < len(tokenized_parts) - 1: - final_ids.append(image_token_index) - - start_ids.append(np.array(final_ids).astype(int)) - - return start_ids - - def _create_request(self, query, visual_tokens=None): - """ - query : batch string (2D numpy array) - """ - if isinstance(self.tokenizer, T5Tokenizer): - start_ids = [ - np.array([self.tokenizer_bos_id] + self.tokenizer.encode( - s[0].decode(), add_special_tokens=self.add_special_tokens)). - astype(int) for s in query - ] - else: - # Qwen2-VL input id is calculated when processing image - if 'qwen2_vl' == self.model_type: - return None, None - if self.is_multimodal and self.max_num_images and self.max_num_images > 1: - start_ids = self._process_multi_image_inputs(query) - - else: - start_ids = [ - np.array( - self.tokenizer.encode( - s[0].decode(), - add_special_tokens=self.add_special_tokens)).astype( - int) for s in query - ] - - if self.is_multimodal: - if 'blip2' in self.model_type or 'mllama' == self.model_type: - pre_prompt = None - post_prompt = None - elif 'llava' == self.model_type: - pre_prompt = "USER:\n" - post_prompt = " ASSISTANT:" - elif 'vila' == self.model_type: - pre_prompt = "A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: " - post_prompt = " ASSISTANT:" - elif 'llava_onevision' == self.model_type: - pre_prompt = "<|im_start|>user " - post_prompt = "<|im_end|><|im_start|>assistant\n" - pre_prompt_id = np.array( - self.tokenizer.encode( - pre_prompt, - add_special_tokens=self.add_special_tokens, - padding=True)) if pre_prompt is not None else np.array( - [], dtype=int) - - post_prompt_id = np.array( - self.tokenizer.encode( - post_prompt, - add_special_tokens=self.add_special_tokens, - padding=True)) if post_prompt is not None else np.array( - [], dtype=int) - - if self.max_num_images and self.max_num_images > 1: - concatenated_ids = [ - np.concatenate((pre_prompt_id, ids, post_prompt_id), axis=0) - for ids in start_ids - ] - batch_split_prompts = self._split_prompt_by_images( - concatenated_ids) - start_ids = self._setup_fake_prompts(query.shape[0], - batch_split_prompts) - elif self.model_type == 'llava_onevision': - fake_prompt_ids = [] - extra_id = np.array( - self.tokenizer.encode( - '\n', - add_special_tokens=self.add_special_tokens, - padding=True)) - for tokens in visual_tokens: - prompt_id = np.arange(self.vocab_size, - self.vocab_size + tokens) - fake_prompt_ids.append(prompt_id) - start_ids = [ - np.concatenate((pre_prompt_id, prompt_id, extra_id, ids, - post_prompt_id), - axis=0) - for prompt_id, ids in zip(fake_prompt_ids, start_ids) - ] - else: - fake_prompt_id = np.arange( - self.vocab_size, self.vocab_size + self.ptable_shape[1]) - start_ids = [ - np.concatenate( - (pre_prompt_id, fake_prompt_id, ids, post_prompt_id), - axis=0) for ids in start_ids - ] - - start_lengths = np.array([[len(ids)] for ids in start_ids]).astype(int) - - max_len = 0 - for seq in start_ids: - max_len = max(max_len, seq.shape[0]) - start_ids = np.stack([ - np.pad(seq, (0, max_len - seq.shape[0]), - 'constant', - constant_values=(0, self.tokenizer_pad_id)) - for seq in start_ids - ]) - - return start_ids, start_lengths - - def _to_word_list_format(self, word_lists: 
List[List[str | bytes]], - batch_size): - ''' - word_lists format: - len(word_lists) == batch_size - word_lists[i] means the words associated to batch item i. A "word" may actually be any string. Like "lorem" or "lorem ipsum". - ''' - assert self.tokenizer != None, "need to set tokenizer" - - if word_lists is None: - # Return an empty array of shape (1,2,0) - return np.empty([batch_size, 2, 0], dtype="int32") - - flat_ids = [] - offsets = [] - for word_list in word_lists: - item_flat_ids = [] - item_offsets = [] - - for word in word_list: - if isinstance(word, bytes): - word = word.decode() - - ids = self.tokenizer.encode(word, add_special_tokens=False) - if len(ids) == 0: - continue - - item_flat_ids += ids - item_offsets.append(len(ids)) - - flat_ids.append(np.array(item_flat_ids)) - offsets.append(np.cumsum(np.array(item_offsets))) - - pad_to = max(1, max(len(ids) for ids in flat_ids)) - - for i, (ids, offs) in enumerate(zip(flat_ids, offsets)): - flat_ids[i] = np.pad(ids, (0, pad_to - len(ids)), constant_values=0) - offsets[i] = np.pad(offs, (0, pad_to - len(offs)), - constant_values=-1) - - return np.array([flat_ids, offsets], dtype="int32").transpose((1, 0, 2)) - - def _get_embedding_bias(self, embedding_bias_words, embedding_bias_weights, - bias_dtype, batch_size): - - assert self.tokenizer != None, "need to set tokenizer" - - if embedding_bias_words is None or embedding_bias_weights is None: - return np.empty([batch_size, 0], - dtype=self.embedding_bias_weights_dtype) - - batch_embedding_bias = [] - for words, weights in zip(embedding_bias_words, embedding_bias_weights): - - vocab_size = len(self.tokenizer.vocab) - embedding_bias = [0.] * vocab_size - - assert len(words) == len( - weights - ), "Embedding bias words must have same dimension as embedding bias weights" - - for word, weight in zip(words, weights): - if isinstance(word, bytes): - word = word.decode() - ids = self.tokenizer.encode(word) - - if len(ids) == 0: - continue - - for id in ids: - embedding_bias[id] += weight - - batch_embedding_bias.append(np.array(embedding_bias)) - - return np.array(batch_embedding_bias, dtype=bias_dtype) - - -class VisionPreProcessor: - """ A class that can load images from url requests, and process them via a vision model processor, - in preparation for the vision encoder. - """ - - def __init__(self, - vision_model_type, - vision_model_processor, - preprocessor_model_config={}): - # import libraries that are only relevant for multimodal models - import torch - from torch.utils.dlpack import from_dlpack - - # NOTE: Due to the behavior of MPI initialization, it is recommended to avoid using import tensorrt_llm - # except for the specific modules tensorrt_llm and multimodal_encoders. - # As a result, the function str_dtype_to_torch has been copied directly from tensorrt_llm._utils. 
- _str_to_torch_dtype_dict = dict( - bfloat16=torch.bfloat16, - float16=torch.float16, - float32=torch.float32, - int64=torch.int64, - int32=torch.int32, - int8=torch.int8, - bool=torch.bool, - fp8=torch.float8_e4m3fn, - ) - - def str_dtype_to_torch(dtype): - ret = _str_to_torch_dtype_dict.get(dtype) - assert ret is not None, f'Unsupported dtype: {dtype}' - return ret - - self.load_images_tensor = lambda tensor: tensor if not hasattr( - tensor, 'to_dlpack') else from_dlpack(tensor.to_dlpack()) - - # extract expected output tensor dtype - self.output_str_dtypes = {} - for properties in preprocessor_model_config.get('output', []): - dtype = properties['data_type'] - self.output_str_dtypes[properties['name']] = np.dtype( - pb_utils.triton_string_to_numpy(dtype)).name - - # create method for converting output tensors batch to the expected type - self.convert_tensor_list_to_tensor = lambda tensor_list: torch.concat( - [ - torch.from_numpy(x) if isinstance(x, np.ndarray) else x - for x in tensor_list - ], - dim=0) - self.convert_tensor_to_str_dtype = lambda tensor, dtype: tensor.to( - str_dtype_to_torch(dtype)) - - # create model-specific processor - self.vision_model_processor = vision_model_processor - self.vision_model_type = vision_model_type - - def load_images_from_urls(self, img_urls): - images = [] - for img_url in img_urls: - img_url = img_url.decode() - if img_url.startswith("data:image/jpeg;base64,"): - image_base64 = img_url.split(",")[1] - # Decode the base64 string - image_data = base64.b64decode(image_base64) - # Create a BytesIO object from the decoded data - image_buffer = io.BytesIO(image_data) - images.append(Image.open(image_buffer)) - else: - images.append(Image.open( - requests.get(img_url, stream=True).raw)) - return images - - def mllama_process(self, queries, img_urls=None, image_bytes=None): - vision_processed_tensors = {} - if img_urls is not None or image_bytes is not None: - if img_urls is not None: - # download and read images - images = [ - self.load_images_from_urls(urls) - for urls in img_urls.as_numpy() - ] - else: - images = [ - img for img_list in self.load_images_tensor(image_bytes) - for img in img_list - ] - - batch_size = len(images) - - preprocessor_outputs = {} - possible_output_names = [ - 'PIXEL_VALUES', 'ASPECT_RATIO_IDS', 'ASPECT_RATIO_MASK', - 'CROSS_ATTENTION_MASK' - ] - for batch_id in range(batch_size): - # Preprocess images and query - processed_vision_data = self.vision_model_processor( - images=images[batch_id], - text=queries[batch_id], - return_tensors="pt") - # Reshape pixel_values to [num_images, *HWC/CHW] - val = processed_vision_data["pixel_values"] - val = val.reshape(1, -1, *(val.shape[-3:])) - processed_vision_data["pixel_values"] = val - - # Create vision output tensors - for key in possible_output_names: - val = processed_vision_data.get(key.lower()) - if val is not None: - if key not in preprocessor_outputs: - preprocessor_outputs[key] = [] - preprocessor_outputs[key].append(val) - - for key, tensor_list in preprocessor_outputs.items(): - val = self.convert_tensor_list_to_tensor(tensor_list) - if key in self.output_str_dtypes: - val = self.convert_tensor_to_str_dtype( - val, self.output_str_dtypes[key]) - vision_processed_tensors[key] = val - return vision_processed_tensors - - def llava_onevision_process_image(self, - queries, - img_urls=None, - image_bytes=None): - - import torch - vision_processed_tensors = {} - if img_urls is not None: - # download and read images - images = [ - self.load_images_from_urls(urls) - for urls in 
img_urls.as_numpy() - ] - else: - images = [ - img for img_list in self.load_images_tensor(image_bytes) - for img in img_list - ] - - batch_size = len(images) - assert len( - queries - ) == batch_size, f"Image must have the same batch size as Query." - preprocessor_outputs = {} - possible_output_names = ['PIXEL_VALUES', 'IMAGE_SIZES'] - visual_tokens = [] - for batch_id in range(batch_size): - # Preprocess images and query - processed_vision_data = self.vision_model_processor( - images=images[batch_id], text='', return_tensors="pt") - visual_tokens.append(processed_vision_data['input_ids'].shape[1]) - # Create vision output tensors - for key in possible_output_names: - val = processed_vision_data.get(key.lower()) - if val is not None: - if key not in preprocessor_outputs: - preprocessor_outputs[key] = [] - preprocessor_outputs[key].append(val) - - max_patch = max(x.shape[1] - for x in preprocessor_outputs['PIXEL_VALUES']) - preprocessor_outputs['PIXEL_VALUES'] = [ - torch.nn.functional.pad( - image, (0, 0, 0, 0, 0, 0, 0, max_patch - image.shape[1], 0, 0), - mode='constant') - for image in preprocessor_outputs['PIXEL_VALUES'] - ] - for key, tensor_list in preprocessor_outputs.items(): - val = self.convert_tensor_list_to_tensor(tensor_list) - if key in self.output_str_dtypes: - val = self.convert_tensor_to_str_dtype( - val, self.output_str_dtypes[key]) - vision_processed_tensors[key] = val - return vision_processed_tensors, visual_tokens - - def llava_onevision_process_video(self, queries, video_bytes=None): - import torch - vision_processed_tensors = {} - videos = [video for video in self.load_images_tensor(video_bytes)] - - batch_size = len(videos) - assert len( - queries - ) == batch_size, f"Video must have the same batch size as Query." - preprocessor_outputs = {} - preprocessor_outputs['PIXEL_VALUES'] = [] - preprocessor_outputs['IS_VIDEO_INPUT'] = [] - visual_tokens = [] - for batch_id in range(len(queries)): - processed_vision_data = self.vision_model_processor( - videos=list(videos[batch_id]), - text='