From 631185613d0203b334fdab17224f2c9b413808e0 Mon Sep 17 00:00:00 2001
From: QI JUN <22017000+QiJune@users.noreply.github.com>
Date: Wed, 18 Jun 2025 14:17:37 +0800
Subject: [PATCH 1/4] init

Signed-off-by: QI JUN <22017000+QiJune@users.noreply.github.com>
---
 tensorrt_llm/llmapi/llm_utils.py | 113 +++++++++++++++++++++++++++++--
 1 file changed, 106 insertions(+), 7 deletions(-)

diff --git a/tensorrt_llm/llmapi/llm_utils.py b/tensorrt_llm/llmapi/llm_utils.py
index 367b79ffb76..caba5e276b7 100644
--- a/tensorrt_llm/llmapi/llm_utils.py
+++ b/tensorrt_llm/llmapi/llm_utils.py
@@ -19,9 +19,10 @@
 # yapf: disable
 from ..bindings.executor import (BatchingType, CapacitySchedulerPolicy,
                                  ContextChunkingPolicy, ExecutorConfig,
-                                 KvCacheRetentionConfig, SchedulerConfig)
+                                 GuidedDecodingConfig, KvCacheRetentionConfig,
+                                 PeftCacheConfig, SchedulerConfig)
 # yapf: enable
-from ..builder import BuildConfig, Engine, build
+from ..builder import BuildConfig, Engine, EngineConfig, build
 from ..llmapi.llm_args import TrtLlmArgs
 from ..logger import logger
 from ..mapping import Mapping
@@ -30,15 +31,16 @@
 from ..module import Module
 from .build_cache import (BuildCache, BuildCacheConfig, CachedStage,
                           get_build_cache_config_from_env)
-from .llm_args import (CalibConfig, DraftTargetDecodingConfig,
+from .llm_args import (BaseLlmArgs, CalibConfig, DraftTargetDecodingConfig,
                        EagleDecodingConfig, KvCacheConfig, LlmArgs,
                        LookaheadDecodingConfig, MedusaDecodingConfig,
-                       MTPDecodingConfig, NGramDecodingConfig, _ModelFormatKind,
-                       _ModelWrapper, _ParallelConfig, get_model_format,
-                       update_llm_args_with_extra_dict,
+                       MTPDecodingConfig, NGramDecodingConfig, PybindMirror,
+                       _ModelFormatKind, _ModelWrapper, _ParallelConfig,
+                       get_model_format, update_llm_args_with_extra_dict,
                        update_llm_args_with_extra_options)
 from .mpi_session import MPINodeState, MpiSession
-from .tokenizer import TransformersTokenizer, load_hf_tokenizer
+from .tokenizer import (TransformersTokenizer, _xgrammar_tokenizer_info,
+                        load_hf_tokenizer)
 # TODO[chunweiy]: move the following symbols back to utils scope, and remove the following import
 from .utils import (download_hf_model, download_hf_pretrained_config,
                     enable_llm_debug, get_directory_size_in_gb, print_colored,
@@ -855,6 +857,103 @@ class LlmBuildStats:
     build_steps_info: List[Tuple[str, float]] = field(default_factory=list)
 
 
+def llm_args_to_executor_config(args: BaseLlmArgs, tokenizer) -> ExecutorConfig:
+    max_batch_size = args.max_batch_size
+    max_num_tokens = args.max_num_tokens
+    max_seq_len = args.max_seq_len
+
+    build_config = args.build_config if isinstance(
+        args, TrtLlmArgs) else BuildConfig()
+
+    max_batch_size = max_batch_size or build_config.max_batch_size
+    max_num_tokens = max_num_tokens or build_config.max_num_tokens
+    max_seq_len = max_seq_len or build_config.max_seq_len
+
+    executor_config = ExecutorConfig(
+        max_beam_width=args.max_beam_width,
+        scheduler_config=PybindMirror.maybe_to_pybind(args.scheduler_config),
+        batching_type=PybindMirror.maybe_to_pybind(args.batching_type)
+        or BatchingType.INFLIGHT,
+        max_batch_size=max_batch_size,
+        max_num_tokens=max_num_tokens,
+        gather_generation_logits=args.gather_generation_logits)
+    if args.backend is None:
+        # also set executor_config.max_seq_len in TRT workflow, to deduce default max_tokens
+        if max_seq_len is not None:
+            executor_config.max_seq_len = max_seq_len
+        else:
+            engine_config = EngineConfig.from_json_file(args.model /
+                                                        "config.json")
+            executor_config.max_seq_len = engine_config.build_config.max_seq_len
+
+    if args.kv_cache_config is not None:
+        executor_config.kv_cache_config = PybindMirror.maybe_to_pybind(
+            args.kv_cache_config)
+    if os.getenv("FORCE_DETERMINISTIC", "0") == "1":
+        # Disable KV cache reuse for deterministic mode
+        executor_config.kv_cache_config.enable_block_reuse = False
+        executor_config.kv_cache_config.enable_partial_reuse = False
+
+    if args.peft_cache_config is not None:
+        executor_config.peft_cache_config = PybindMirror.maybe_to_pybind(
+            args.peft_cache_config)
+    elif isinstance(args,
+                    TrtLlmArgs) and args.build_config.plugin_config.lora_plugin:
+        engine_config = EngineConfig.from_json_file(args.model / "config.json")
+        lora_config = engine_config.build_config.lora_config
+        max_lora_rank = lora_config.max_lora_rank
+        num_lora_modules = engine_config.pretrained_config.num_hidden_layers * \
+            len(lora_config.lora_target_modules + lora_config.missing_qkv_modules)
+        executor_config.peft_cache_config = PeftCacheConfig(
+            num_device_module_layer=max_lora_rank * num_lora_modules *
+            args.max_loras,
+            num_host_module_layer=max_lora_rank * num_lora_modules *
+            args.max_cpu_loras,
+        )
+    if args.decoding_config is not None:
+        executor_config.decoding_config = args.decoding_config
+
+    if args.guided_decoding_backend == 'xgrammar':
+        executor_config.guided_decoding_config = GuidedDecodingConfig(
+            backend=GuidedDecodingConfig.GuidedDecodingBackend.XGRAMMAR,
+            **_xgrammar_tokenizer_info(tokenizer))
+    elif args.guided_decoding_backend is not None:
+        raise ValueError(
+            f"Unrecognized guided decoding backend {args.guided_decoding_backend}"
+        )
+
+    executor_config.normalize_log_probs = args.normalize_log_probs
+    executor_config.enable_chunked_context = args.enable_chunked_prefill
+    executor_config.max_beam_width = args.max_beam_width or args.build_config.max_beam_width
+    if isinstance(
+            args,
+            TrtLlmArgs) and args.extended_runtime_perf_knob_config is not None:
+        executor_config.extended_runtime_perf_knob_config = PybindMirror.maybe_to_pybind(
+            args.extended_runtime_perf_knob_config)
+
+    if args.cache_transceiver_config is not None:
+        executor_config.cache_transceiver_config = PybindMirror.maybe_to_pybind(
+            args.cache_transceiver_config)
+
+    from tensorrt_llm._torch.pyexecutor.config import update_executor_config
+    update_executor_config(
+        executor_config,
+        backend=args.backend,
+        pytorch_backend_config=args.get_pytorch_backend_config()
+        if args.backend in ["pytorch", "_autodeploy"] else None,
+        mapping=args.parallel_config.to_mapping(),
+        build_config=args.build_config
+        if isinstance(args, TrtLlmArgs) else None,
+        speculative_config=args.speculative_config,
+        hf_model_dir=self._hf_model_dir,
+        trt_engine_dir=self._engine_dir,
+        max_input_len=args.max_input_len,
+        max_seq_len=max_seq_len)
+
+    executor_config.llm_parallel_config = args.parallel_config
+    return executor_config
+
+
 __all__ = [
     'LlmArgs',
     'LlmBuildStats',
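Note on PATCH 1/4: llm_args_to_executor_config is a free function, so it can be driven directly from an LLM-API args object. Below is a minimal sketch of a call site, assuming a TrtLlmArgs instance and a tokenizer loaded via the load_hf_tokenizer helper imported in this patch; the model path and the TrtLlmArgs construction are illustrative, not taken from this series.

    from tensorrt_llm.llmapi.llm_args import TrtLlmArgs
    from tensorrt_llm.llmapi.llm_utils import llm_args_to_executor_config
    from tensorrt_llm.llmapi.tokenizer import load_hf_tokenizer

    # Hypothetical setup: build the LLM-API args and tokenizer first.
    args = TrtLlmArgs(model="/path/to/engine_or_checkpoint")
    tokenizer = load_hf_tokenizer(args.model)

    # Translate the high-level args into a pybind ExecutorConfig.
    executor_config = llm_args_to_executor_config(args, tokenizer)
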
From bd281ca4f52ecf986852c087a861c48cf462dca7 Mon Sep 17 00:00:00 2001
From: QI JUN <22017000+QiJune@users.noreply.github.com>
Date: Wed, 18 Jun 2025 14:25:39 +0800
Subject: [PATCH 2/4] remove unused variables

Signed-off-by: QI JUN <22017000+QiJune@users.noreply.github.com>
---
 tensorrt_llm/_torch/auto_deploy/shim/ad_executor.py   | 4 +---
 tensorrt_llm/_torch/pyexecutor/config.py              | 7 -------
 tensorrt_llm/_torch/pyexecutor/llm_request.py         | 2 --
 tensorrt_llm/_torch/pyexecutor/py_executor_creator.py | 1 -
 tensorrt_llm/executor/worker.py                       | 3 +--
 tensorrt_llm/llmapi/llm.py                            | 2 --
 6 files changed, 2 insertions(+), 17 deletions(-)

diff --git a/tensorrt_llm/_torch/auto_deploy/shim/ad_executor.py b/tensorrt_llm/_torch/auto_deploy/shim/ad_executor.py
index bfe4f3c0dec..49c8b783e18 100644
--- a/tensorrt_llm/_torch/auto_deploy/shim/ad_executor.py
+++ b/tensorrt_llm/_torch/auto_deploy/shim/ad_executor.py
@@ -241,9 +241,7 @@ def forward(
     return {"logits": logits_flat}
 
 
-def create_autodeploy_executor(
-    executor_config: ExecutorConfig, checkpoint_dir: str = None, engine_dir: str = None
-):
+def create_autodeploy_executor(executor_config: ExecutorConfig, checkpoint_dir: str = None):
     """Create an AutoDeploy executor from the given configuration and checkpoint directory.
 
     This is the entrypoint API to the _autodeploy backend.

diff --git a/tensorrt_llm/_torch/pyexecutor/config.py b/tensorrt_llm/_torch/pyexecutor/config.py
index 8ee4c2d5487..9e5540914ff 100644
--- a/tensorrt_llm/_torch/pyexecutor/config.py
+++ b/tensorrt_llm/_torch/pyexecutor/config.py
@@ -94,8 +94,6 @@ class PyTorchConfig:
     'max_seq_len',
     'tokens_per_block',
     'mapping',
-    'hf_model_dir',
-    'trt_engine_dir',
 ]
 
 
@@ -106,8 +104,6 @@ def update_executor_config(
         mapping: Optional[Mapping] = None,
         build_config: Optional[BuildConfig] = None,
         speculative_config: Optional[SpecConfig] = None,
-        hf_model_dir: Optional[str] = None,
-        trt_engine_dir: Optional[str] = None,
         max_input_len: Optional[int] = None,
         max_seq_len: Optional[int] = None):
     if backend is None:
@@ -130,9 +126,6 @@ def update_executor_config(
     # TODO: move to pure-Python KvCacheConfig, and remove dependency on build_config.
     executor_config.tokens_per_block = executor_config.tokens_per_block or build_config.plugin_config.tokens_per_block
 
-    executor_config.hf_model_dir = hf_model_dir
-    executor_config.trt_engine_dir = trt_engine_dir
-
     if max_input_len is not None:
         executor_config.max_input_len = max_input_len
 

diff --git a/tensorrt_llm/_torch/pyexecutor/llm_request.py b/tensorrt_llm/_torch/pyexecutor/llm_request.py
index c692a8b0c3b..01e9324e987 100644
--- a/tensorrt_llm/_torch/pyexecutor/llm_request.py
+++ b/tensorrt_llm/_torch/pyexecutor/llm_request.py
@@ -282,8 +282,6 @@ def __init__(
         self.is_cuda_graph_dummy = False
         self.py_lora_task_layer_module_configs = None
 
-        self.py_tokens = super().get_tokens()
-
         self.py_return_log_probs = return_log_probs
         self.py_return_context_logits = return_context_logits
         self.py_return_generation_logits = return_generation_logits

diff --git a/tensorrt_llm/_torch/pyexecutor/py_executor_creator.py b/tensorrt_llm/_torch/pyexecutor/py_executor_creator.py
index 9f23bddd264..95f899557f7 100644
--- a/tensorrt_llm/_torch/pyexecutor/py_executor_creator.py
+++ b/tensorrt_llm/_torch/pyexecutor/py_executor_creator.py
@@ -179,7 +179,6 @@ def _get_mapping(executor_config: ExecutorConfig) -> Mapping:
 def create_py_executor(
         executor_config: ExecutorConfig,
         checkpoint_dir: str = None,
-        engine_dir: str = None,
         lora_config: Optional[LoraConfig] = None,
         garbage_collection_gen0_threshold: Optional[int] = None) -> PyExecutor:
     _mangle_executor_config(executor_config)

diff --git a/tensorrt_llm/executor/worker.py b/tensorrt_llm/executor/worker.py
index f9e3fb5e898..9a2fd17d816 100644
--- a/tensorrt_llm/executor/worker.py
+++ b/tensorrt_llm/executor/worker.py
@@ -119,7 +119,6 @@ def _create_engine():
         args = {
             "executor_config": executor_config,
             "checkpoint_dir": executor_config.hf_model_dir,
-            "engine_dir": executor_config.trt_engine_dir,
         }
         if executor_config.backend == "pytorch":
             from tensorrt_llm._torch.pyexecutor.py_executor_creator import \
@@ -135,7 +134,7 @@ def _create_engine():
         else:
             raise ValueError(
f"Unsupported backend config: {executor_config.backend}") - + args["engine_dir"] = executor_config.trt_engine_dir return create_executor(**args) self.engine = _create_engine() diff --git a/tensorrt_llm/llmapi/llm.py b/tensorrt_llm/llmapi/llm.py index 4ddf97b665d..f7e2d04a19f 100644 --- a/tensorrt_llm/llmapi/llm.py +++ b/tensorrt_llm/llmapi/llm.py @@ -685,8 +685,6 @@ def _build_model(self): build_config=self.args.build_config if self._on_trt_backend else None, speculative_config=self.args.speculative_config, - hf_model_dir=self._hf_model_dir, - trt_engine_dir=self._engine_dir, max_input_len=self.args.max_input_len, max_seq_len=max_seq_len) self._executor_config.llm_parallel_config = self.args.parallel_config From 2e123ef15318eed4bf875490d7cb4782a319e134 Mon Sep 17 00:00:00 2001 From: QI JUN <22017000+QiJune@users.noreply.github.com> Date: Wed, 18 Jun 2025 16:07:02 +0800 Subject: [PATCH 3/4] clean Signed-off-by: QI JUN <22017000+QiJune@users.noreply.github.com> --- tensorrt_llm/_torch/pyexecutor/config.py | 4 ++++ tensorrt_llm/executor/worker.py | 1 - tensorrt_llm/llmapi/llm.py | 1 + 3 files changed, 5 insertions(+), 1 deletion(-) diff --git a/tensorrt_llm/_torch/pyexecutor/config.py b/tensorrt_llm/_torch/pyexecutor/config.py index 9e5540914ff..bd861e63ca5 100644 --- a/tensorrt_llm/_torch/pyexecutor/config.py +++ b/tensorrt_llm/_torch/pyexecutor/config.py @@ -94,6 +94,7 @@ class PyTorchConfig: 'max_seq_len', 'tokens_per_block', 'mapping', + 'hf_model_dir', ] @@ -104,6 +105,7 @@ def update_executor_config( mapping: Optional[Mapping] = None, build_config: Optional[BuildConfig] = None, speculative_config: Optional[SpecConfig] = None, + hf_model_dir: Optional[str] = None, max_input_len: Optional[int] = None, max_seq_len: Optional[int] = None): if backend is None: @@ -126,6 +128,8 @@ def update_executor_config( # TODO: move to pure-Python KvCacheConfig, and remove dependency on build_config. 
From 2e123ef15318eed4bf875490d7cb4782a319e134 Mon Sep 17 00:00:00 2001
From: QI JUN <22017000+QiJune@users.noreply.github.com>
Date: Wed, 18 Jun 2025 16:07:02 +0800
Subject: [PATCH 3/4] clean

Signed-off-by: QI JUN <22017000+QiJune@users.noreply.github.com>
---
 tensorrt_llm/_torch/pyexecutor/config.py | 4 ++++
 tensorrt_llm/executor/worker.py          | 1 -
 tensorrt_llm/llmapi/llm.py               | 1 +
 3 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/tensorrt_llm/_torch/pyexecutor/config.py b/tensorrt_llm/_torch/pyexecutor/config.py
index 9e5540914ff..bd861e63ca5 100644
--- a/tensorrt_llm/_torch/pyexecutor/config.py
+++ b/tensorrt_llm/_torch/pyexecutor/config.py
@@ -94,6 +94,7 @@ class PyTorchConfig:
     'max_seq_len',
     'tokens_per_block',
     'mapping',
+    'hf_model_dir',
 ]
 
 
@@ -104,6 +105,7 @@ def update_executor_config(
         mapping: Optional[Mapping] = None,
         build_config: Optional[BuildConfig] = None,
         speculative_config: Optional[SpecConfig] = None,
+        hf_model_dir: Optional[str] = None,
         max_input_len: Optional[int] = None,
         max_seq_len: Optional[int] = None):
     if backend is None:
@@ -126,6 +128,8 @@ def update_executor_config(
     # TODO: move to pure-Python KvCacheConfig, and remove dependency on build_config.
     executor_config.tokens_per_block = executor_config.tokens_per_block or build_config.plugin_config.tokens_per_block
 
+    executor_config.hf_model_dir = hf_model_dir
+
     if max_input_len is not None:
         executor_config.max_input_len = max_input_len
 

diff --git a/tensorrt_llm/executor/worker.py b/tensorrt_llm/executor/worker.py
index 9a2fd17d816..dfe95ff2b77 100644
--- a/tensorrt_llm/executor/worker.py
+++ b/tensorrt_llm/executor/worker.py
@@ -134,7 +134,6 @@ def _create_engine():
         else:
             raise ValueError(
                 f"Unsupported backend config: {executor_config.backend}")
-        args["engine_dir"] = executor_config.trt_engine_dir
         return create_executor(**args)
 
     self.engine = _create_engine()

diff --git a/tensorrt_llm/llmapi/llm.py b/tensorrt_llm/llmapi/llm.py
index f7e2d04a19f..367dad1743c 100644
--- a/tensorrt_llm/llmapi/llm.py
+++ b/tensorrt_llm/llmapi/llm.py
@@ -685,6 +685,7 @@ def _build_model(self):
             build_config=self.args.build_config
             if self._on_trt_backend else None,
             speculative_config=self.args.speculative_config,
+            hf_model_dir=self._hf_model_dir,
             max_input_len=self.args.max_input_len,
             max_seq_len=max_seq_len)
         self._executor_config.llm_parallel_config = self.args.parallel_config

From 169eeec77bdbf6c994944fee812719ad0e9fabb3 Mon Sep 17 00:00:00 2001
From: QI JUN <22017000+QiJune@users.noreply.github.com>
Date: Tue, 24 Jun 2025 10:49:54 +0800
Subject: [PATCH 4/4] clean

Signed-off-by: QI JUN <22017000+QiJune@users.noreply.github.com>
---
 tensorrt_llm/llmapi/llm_utils.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/tensorrt_llm/llmapi/llm_utils.py b/tensorrt_llm/llmapi/llm_utils.py
index caba5e276b7..fbe7a486660 100644
--- a/tensorrt_llm/llmapi/llm_utils.py
+++ b/tensorrt_llm/llmapi/llm_utils.py
@@ -946,7 +946,6 @@ def llm_args_to_executor_config(args: BaseLlmArgs, tokenizer) -> ExecutorConfig:
         if isinstance(args, TrtLlmArgs) else None,
         speculative_config=args.speculative_config,
         hf_model_dir=self._hf_model_dir,
-        trt_engine_dir=self._engine_dir,
         max_input_len=args.max_input_len,
         max_seq_len=max_seq_len)
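Net effect of the series on update_executor_config: the trt_engine_dir parameter is removed for good, while hf_model_dir survives as an explicit parameter (restored by PATCH 3/4). A sketch of a call against the final signature; every argument value here is illustrative and assumed to be prepared by the caller beforehand:

    from tensorrt_llm._torch.pyexecutor.config import update_executor_config

    update_executor_config(
        executor_config,
        backend="pytorch",
        pytorch_backend_config=pytorch_backend_config,  # assumed built by the caller
        mapping=mapping,                                # assumed tensorrt_llm Mapping
        speculative_config=None,
        hf_model_dir=hf_model_dir,                      # kept; trt_engine_dir is gone
        max_input_len=max_input_len,
        max_seq_len=max_seq_len)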