Commit 55170ec

fix: llmapi-launch add add trtllm-bench test with engine building (#4… (NVIDIA#4550)
Signed-off-by: Superjomn <[email protected]>
1 parent 00e0837 commit 55170ec

6 files changed: +173 −21 lines changed


tensorrt_llm/executor/ipc.py

Lines changed: 5 additions & 5 deletions
@@ -37,7 +37,10 @@ def __init__(self,
         '''
         Parameters:
             address (tuple[str, Optional[bytes]], optional): The address (tcp-ip_port, hmac_auth_key) for the IPC. Defaults to None. If hmac_auth_key is None and use_hmac_encryption is False, the queue will not use HMAC encryption.
+            socket_type (int): The type of socket to use. Defaults to zmq.PAIR.
             is_server (bool): Whether the current process is the server or the client.
+            is_async (bool): Whether to use asyncio for the socket. Defaults to False.
+            name (str, optional): The name of the queue. Defaults to None.
             use_hmac_encryption (bool): Whether to use HMAC encryption for pickled data. Defaults to True.
         '''

@@ -57,10 +60,7 @@ def __init__(self,
         self.use_hmac_encryption = use_hmac_encryption

         # Check HMAC key condition
-        if self.use_hmac_encryption and self.is_server and self.hmac_key is not None:
-            raise ValueError(
-                "Server should not receive HMAC key when encryption is enabled")
-        elif self.use_hmac_encryption and not self.is_server and self.hmac_key is None:
+        if self.use_hmac_encryption and not self.is_server and self.hmac_key is None:
             raise ValueError(
                 "Client must receive HMAC key when encryption is enabled")
         elif not self.use_hmac_encryption and self.hmac_key is not None:

@@ -79,7 +79,7 @@ def __init__(self,
                 f"Server [{name}] bound to {self.address_endpoint} in {self.socket_type_str[socket_type]}\n",
                 "green")

-            if self.use_hmac_encryption:
+            if self.use_hmac_encryption and not self.hmac_key:
                 # Initialize HMAC key for pickle encryption
                 logger.info(f"Generating a new HMAC key for server {self.name}")
                 self.hmac_key = os.urandom(32)
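
The net effect of these ipc.py changes: a queue server may now be handed a pre-shared HMAC key instead of being forbidden from receiving one; it only falls back to generating a fresh 32-byte key when none was supplied, while a client still must receive the key explicitly. Below is a minimal sketch of that key-handling logic, not the actual ZeroMqQueue implementation; the helper name and the second error message are illustrative.

import os

def resolve_hmac_key(is_server: bool, use_hmac_encryption: bool,
                     hmac_key: bytes | None) -> bytes | None:
    # Sketch only: condenses the checks and the server-side key generation
    # from the diff above into one helper for illustration.
    if use_hmac_encryption and not is_server and hmac_key is None:
        raise ValueError(
            "Client must receive HMAC key when encryption is enabled")
    if not use_hmac_encryption and hmac_key is not None:
        raise ValueError("HMAC key provided but encryption is disabled")  # illustrative message
    if use_hmac_encryption and is_server and not hmac_key:
        hmac_key = os.urandom(32)  # server generates a key only when none was supplied
    return hmac_key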

tensorrt_llm/executor/utils.py

Lines changed: 7 additions & 4 deletions
@@ -23,6 +23,12 @@ def get_spawn_proxy_process_ipc_addr_env() -> str | None:
     return os.getenv("TLLM_SPAWN_PROXY_PROCESS_IPC_ADDR")


+def get_spawn_proxy_process_ipc_hmac_key_env() -> bytes | None:
+    ''' Get the HMAC key for the spawn proxy process dynamically. '''
+    if key := os.getenv("TLLM_SPAWN_PROXY_PROCESS_IPC_HMAC_KEY"):
+        return bytes.fromhex(key)
+
+
 def get_spawn_proxy_process_env() -> bool:
     ''' Get the environment variable for the spawn proxy process dynamically. '''
     return os.getenv("TLLM_SPAWN_PROXY_PROCESS") == "1"

@@ -42,10 +48,7 @@ def create_mpi_comm_session(
         print_colored_debug(
             f"Using RemoteMpiPoolSessionClient to bind to external MPI processes at {get_spawn_proxy_process_ipc_addr_env()}\n",
             "yellow")
-        hmac_key = os.getenv("TLLM_SPAWN_PROXY_PROCESS_IPC_HMAC_KEY")
-        # Convert the hex string to bytes
-        if hmac_key is not None:
-            hmac_key = bytes.fromhex(hmac_key)
+        hmac_key = get_spawn_proxy_process_ipc_hmac_key_env()
         return RemoteMpiCommSessionClient(
             addr=get_spawn_proxy_process_ipc_addr_env(), hmac_key=hmac_key)
     else:
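
A quick, hypothetical round-trip of the new helper (the environment variable name and the hex encoding come from the diff above; the surrounding script is illustrative only). Both mgmn_leader_node.py (server side) and create_mpi_comm_session (client side) now read the key through this helper, so both ends of the remote MPI session see the same secret.

import os
from tensorrt_llm.executor.utils import get_spawn_proxy_process_ipc_hmac_key_env

key = os.urandom(32)
os.environ["TLLM_SPAWN_PROXY_PROCESS_IPC_HMAC_KEY"] = key.hex()
assert get_spawn_proxy_process_ipc_hmac_key_env() == key  # hex string decoded back to bytes

del os.environ["TLLM_SPAWN_PROXY_PROCESS_IPC_HMAC_KEY"]
assert get_spawn_proxy_process_ipc_hmac_key_env() is None  # unset -> implicit None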

tensorrt_llm/llmapi/mgmn_leader_node.py

Lines changed: 4 additions & 1 deletion
@@ -3,7 +3,9 @@
 MPI Proxy process to connect and get the MPI task to run.
 '''
 from tensorrt_llm._utils import mpi_world_size
-from tensorrt_llm.executor.utils import get_spawn_proxy_process_ipc_addr_env
+from tensorrt_llm.executor.utils import (
+    get_spawn_proxy_process_ipc_addr_env,
+    get_spawn_proxy_process_ipc_hmac_key_env)
 from tensorrt_llm.llmapi.mpi_session import RemoteMpiCommSessionServer
 from tensorrt_llm.llmapi.utils import print_colored_debug

@@ -17,6 +19,7 @@ def launch_server_main(sub_comm=None):
         comm=sub_comm,
         n_workers=num_ranks,
         addr=get_spawn_proxy_process_ipc_addr_env(),
+        hmac_key=get_spawn_proxy_process_ipc_hmac_key_env(),
         is_comm=True)
     print_colored_debug(
         f"MPI Comm Server started at {get_spawn_proxy_process_ipc_addr_env()}")

tests/integration/defs/test_e2e.py

Lines changed: 155 additions & 10 deletions
@@ -19,7 +19,7 @@
 import sys
 import tempfile
 from pathlib import Path
-from typing import Optional, Tuple, Union
+from typing import Any, Optional, Tuple, Union

 import pytest
 import yaml

@@ -434,6 +434,148 @@ def test_qwen_e2e_cpprunner_large_new_tokens(model_name, model_path, llm_venv,
     ), f"Found zero length in sequence_lengths tensor: {seq_lengths}"


+# TODO replace the trtllm_bench_prolog
+class BenchRunner:
+
+    def __init__(self,
+                 llm_root: str,
+                 llm_venv: Any,
+                 model_subdir: str,
+                 model_name: str,
+                 streaming: bool,
+                 tp_size: int,
+                 use_pytorch_backend: bool = False,
+                 skip_engine_build: bool = False,
+                 quant: Optional[str] = None,
+                 extra_llm_api_options: Optional[str] = None,
+                 use_mpirun: bool = False):
+
+        llm_models = llm_models_root()
+        assert llm_models is not None
+        self.llm_root = llm_root
+        self.llm_venv = llm_venv
+        self.model_path = Path(llm_models, model_subdir).absolute()
+        self.model_name = model_name
+        self.quant = quant
+        self.streaming = streaming
+        self.skip_engine_build = skip_engine_build
+        self.use_pytorch_backend = use_pytorch_backend
+        self.use_mpirun = use_mpirun
+        self.tp_size = tp_size
+        self.quant_name = self.quant if self.quant is not None else "FP16"
+        self.extra_llm_api_options = extra_llm_api_options
+
+        self.work_dir = Path(tempfile.TemporaryDirectory().name)
+
+        self.dataset_path = os.path.join(self.work_dir, f"data.txt")
+        if self.use_mpirun:
+            self.mpirun_cmd = f"mpirun --allow-run-as-root -n {self.tp_size} trtllm-llmapi-launch"
+        else:
+            self.mpirun_cmd = ""
+        self.engine_path = None
+
+    def __call__(self):
+        self.prepare_dataset()
+        if not (self.skip_engine_build or self.use_pytorch_backend):
+            self.build_engine()
+        self.run_bench()
+
+    def prepare_dataset(self):
+        dataset_tool = Path(self.llm_root, "benchmarks", "cpp",
+                            "prepare_dataset.py")
+
+        # Generate a small dataset to run a test.
+        self.work_dir.mkdir(parents=True)
+        command = [
+            f"{dataset_tool.resolve()}",
+            "--stdout",
+            "--tokenizer",
+            f"{self.model_path}",
+            "token-norm-dist",
+            "--input-mean",
+            "128",
+            "--output-mean",
+            "128",
+            "--input-stdev",
+            "0",
+            "--output-stdev",
+            "0",
+            "--num-requests",
+            "10",
+        ]
+        print(f"Running command: {' '.join(command)}")
+        dataset_output = self.llm_venv.run_cmd(
+            command,
+            caller=check_output,
+        )
+        # Grab the stdout and write it to a dataset file for passing to suite.
+        with open(self.dataset_path, "w") as dataset:
+            dataset.write(dataset_output)
+
+    def build_engine(self):
+        if self.skip_engine_build:
+            return
+
+        build_cmd = \
+            f"{self.mpirun_cmd} " \
+            f"trtllm-bench " \
+            f"--model {self.model_name} " \
+            f"--model_path {self.model_path} " \
+            f"--workspace {self.work_dir} " \
+            f"build --tp_size {self.tp_size}"
+
+        if self.quant is not None:
+            build_cmd = f"{build_cmd} --quantization {self.quant}"
+
+        build_cmd = f"{build_cmd} --dataset {self.dataset_path}"
+        build_output = check_output(build_cmd,
+                                    shell=True,
+                                    env=self.llm_venv._new_env)
+
+        for line in build_output.split("\n")[::-1]:
+            if line.startswith("ENGINE SAVED:"):
+                self.engine_path = Path(line.split(":")[1])
+                break
+
+    def run_bench(self):
+        streaming = "--streaming" if self.streaming else ""
+        benchmark_cmd = \
+            f"{self.mpirun_cmd} " \
+            f"trtllm-bench --model {self.model_name} --model_path {self.model_path} " \
+            f"throughput " \
+            f"--tp {self.tp_size} "
+        if self.engine_path:
+            benchmark_cmd += f"--engine_dir {self.engine_path} "
+        benchmark_cmd += f" --dataset {self.dataset_path} {streaming}"
+
+        if self.use_pytorch_backend:
+            benchmark_cmd += " --backend pytorch"
+
+        if self.extra_llm_api_options:
+            benchmark_cmd += f" --extra_llm_api_options {self.extra_llm_api_options}"
+        check_call(benchmark_cmd, shell=True, env=self.llm_venv._new_env)
+
+
+@pytest.mark.parametrize("model_name", ["meta-llama/Meta-Llama-3-8B-Instruct"],
+                         ids=["llama3-8b"])
+@pytest.mark.parametrize("model_subdir",
+                         ["llama-models-v3/llama-v3-8b-instruct-hf"],
+                         ids=["llama-v3"])
+@pytest.mark.parametrize("use_pytorch_backend", [True, False],
+                         ids=["pytorch_backend", "trt_backend"])
+def test_trtllm_bench_llmapi_launch(llm_root, llm_venv, model_name,
+                                    model_subdir, use_pytorch_backend):
+    runner = BenchRunner(llm_root=llm_root,
+                         llm_venv=llm_venv,
+                         model_name=model_name,
+                         model_subdir=model_subdir,
+                         streaming=False,
+                         use_pytorch_backend=use_pytorch_backend,
+                         use_mpirun=True,
+                         tp_size=2)
+    runner()
+
+
 def trtllm_bench_prolog(
         llm_root,
         llm_venv,

@@ -664,14 +806,14 @@ def test_trtllm_bench_mgmn(llm_root, llm_venv):
     model_name = "meta-llama/Llama-3.1-8B"
     llama_model_dir = Path(
         llm_models_root()) / "llama-3.1-model/Llama-3.1-8B-Instruct"
-    dataset_path = trtllm_bench_prolog(llm_root,
-                                       llm_venv,
-                                       engine_dir=None,
-                                       model_subdir=llama_model_dir,
-                                       model_name=model_name,
-                                       quant=None,
-                                       streaming=False,
-                                       skip_engine_build=True)
+    _, _, dataset_path = trtllm_bench_prolog(llm_root,
+                                             llm_venv,
+                                             engine_dir=None,
+                                             model_subdir=llama_model_dir,
+                                             model_name=model_name,
+                                             quant=None,
+                                             streaming=False,
+                                             skip_engine_build=True)

     benchmark_cmd = \
         f"mpirun -n 2 trtllm-llmapi-launch trtllm-bench --model {model_name} " \

@@ -685,7 +827,10 @@ def test_trtllm_bench_mgmn(llm_root, llm_venv):
                                     dir="./",
                                     delete=True,
                                     delete_on_close=True) as running_log:
-        check_call(benchmark_cmd, shell=True, stdout=running_log)
+        check_call(benchmark_cmd,
+                   shell=True,
+                   running_log=running_log,
+                   env=llm_venv._new_env)
         _check_mem_usage(running_log, [30, 0, 0, 0])

691836

tests/integration/test_lists/test-db/l0_dgx_h100.yml

Lines changed: 2 additions & 0 deletions
@@ -298,3 +298,5 @@ l0_dgx_h100:
 - examples/test_enc_dec.py::test_llm_enc_dec_general[compare_hf-t5-small-float16-enable_gemm_plugin-enable_attention_plugin-disable_paged_kv_cache-tp:2-pp:2-nb:1-disable_fp8]
 - examples/test_gpt.py::test_llm_gpt2_next_prompt_tuning[use_py_session-tp2]
 - unittest/llmapi/apps/_test_openai_multi_gpu.py -m "part0"
+- test_e2e.py::test_trtllm_bench_llmapi_launch[pytorch_backend-llama-v3-llama3-8b]
+- test_e2e.py::test_trtllm_bench_llmapi_launch[trt_backend-llama-v3-llama3-8b]

tests/integration/test_lists/waives.txt

Lines changed: 0 additions & 1 deletion
@@ -378,7 +378,6 @@ examples/test_eagle.py::test_mistral_eagle_1gpu[mistral-7b-v0.1-eagle2] SKIP (ht
 examples/test_multimodal.py::test_llm_multimodal_general[neva-22b-pp:1-tp:1-bfloat16-bs:8-cpp_e2e:False-nb:1] SKIP (https://nvbugs/5220761)
 examples/test_multimodal.py::test_llm_multimodal_general[neva-22b-pp:1-tp:1-bfloat16-bs:8-cpp_e2e:True-nb:1] SKIP (https://nvbugs/5220761)
 examples/test_cli_flow.py::TestSantacoder::test_auto_dtype SKIP (https://nvbugs/5219531)
-test_e2e.py::test_trtllm_bench_mgmn SKIP (https://nvbugs/5220766)
 examples/test_medusa.py::test_llama_medusa_1gpu[llama-3.1-8b] SKIP (https://nvbugs/5219535)
 examples/test_phi.py::test_llm_phi_quantization_1gpu[Phi-4-mini-instruct-fp8-bfloat16] SKIP (https://nvbugspro.nvidia.com/bug/5226339)
 examples/test_multimodal.py::test_llm_multimodal_general[neva-22b-pp:1-tp:1-bfloat16-bs:1-cpp_e2e:True-nb:1] SKIP (https://nvbugs/5227342)
