Changes from all commits (33 commits)
99fc347  move weights loading related logic to ModelLoader (QiJune, Sep 6, 2025)
018b022  fix (QiJune, Sep 6, 2025)
0242a6e  rebase (QiJune, Sep 8, 2025)
e4a542c  Merge branch 'main' into model_loader (QiJune, Sep 8, 2025)
cf84a58  clean (QiJune, Sep 8, 2025)
746e486  fix (QiJune, Sep 8, 2025)
2c92b6e  Merge branch 'main' into model_loader (QiJune, Sep 9, 2025)
38505f5  Merge branch 'main' into model_loader (QiJune, Sep 9, 2025)
c64d320  fix ci (QiJune, Sep 9, 2025)
e0e5bf8  rebase (QiJune, Sep 9, 2025)
18e79fe  Merge branch 'main' into model_loader (QiJune, Sep 16, 2025)
96e700e  Merge branch 'main' into model_loader (QiJune, Sep 16, 2025)
1a9b420  Merge branch 'main' into model_loader (QiJune, Sep 17, 2025)
91d79d6  Merge branch 'main' into model_loader (QiJune, Sep 17, 2025)
9656901  Merge branch 'main' into model_loader (QiJune, Sep 18, 2025)
133a9eb  Merge branch 'main' into model_loader (QiJune, Sep 18, 2025)
9556a94  Merge branch 'main' into model_loader (QiJune, Sep 18, 2025)
1ae8cfd  rebase (QiJune, Sep 19, 2025)
fd2a93e  Merge branch 'main' into model_loader (QiJune, Sep 19, 2025)
41291db  Merge branch 'main' into model_loader (QiJune, Sep 20, 2025)
bd0a47a  Merge branch 'main' into model_loader (QiJune, Sep 22, 2025)
199face  clean (QiJune, Sep 22, 2025)
c3a01d6  fix ci (QiJune, Sep 22, 2025)
7169c51  fix (QiJune, Sep 22, 2025)
cf13d0f  Merge branch 'main' into model_loader (QiJune, Sep 22, 2025)
4a05fb8  Merge branch 'main' into model_loader (QiJune, Sep 22, 2025)
21ff657  fix (QiJune, Sep 23, 2025)
810e37d  Merge branch 'main' into model_loader (QiJune, Sep 23, 2025)
8904e9c  clean (QiJune, Sep 24, 2025)
1ff33a0  Merge branch 'main' into model_loader (QiJune, Sep 24, 2025)
797f5e1  fix (QiJune, Sep 24, 2025)
6ff0fad  [TRTLLM-7015] [feat] Enable `prompt_logprobs` in pytorch backend (#7580) (venkywonka, Sep 24, 2025)
fa85991  Merge branch 'main' into model_loader (QiJune, Sep 24, 2025)
45 changes: 41 additions & 4 deletions tensorrt_llm/_torch/pyexecutor/llm_request.py
@@ -278,6 +278,17 @@ class LlmResponse:
def has_error(self):
return self.error_msg is not None

def clear_context_logits(self):
"""Clear context logits from the response result.

This is used to drop context logits after prompt_logprobs have been computed,
when the user did not explicitly request the context logits themselves.
"""
if self.result and hasattr(self.result, '_py_result'):
py_result = self.result._py_result
if hasattr(py_result, '_context_logits'):
py_result._context_logits = None

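For context, here is a minimal sketch of how this hook might be used downstream. The wrapper function and the two request flags are assumptions for illustration, not part of this diff:

    # Hypothetical post-processing step: once prompt logprobs have been
    # computed from the context logits, the logits can be dropped if the
    # user never asked for them.
    def maybe_drop_context_logits(response: LlmResponse,
                                  prompt_logprobs_requested: bool,
                                  context_logits_requested: bool) -> LlmResponse:
        if prompt_logprobs_requested and not context_logits_requested:
            # Context logits were only materialized to derive prompt
            # logprobs; free them before handing the response to the client.
            response.clear_context_logits()
        return response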

class LlmRequest(tensorrt_llm.bindings.internal.batch_manager.LlmRequest):
"""LlmRequest wraps `bindings.internal.batch_manager.LlmRequest`
@@ -377,10 +388,36 @@ def __init__(
def is_generation_only_request(self):
return self.py_llm_request_type == LlmRequestType.LLMREQUEST_TYPE_GENERATION_ONLY

-    def create_response(
-            self,
-            use_fast_logits=False,
-            mpi_world_rank=0) -> tensorrt_llm.bindings.executor.Response | None:
+    def create_response(self,
+                        use_fast_logits=False,
+                        mpi_world_rank=0) -> LlmResponse | None:
"""Create an LlmResponse from the current request state.

This method generates a response containing the request's execution results,
including generated tokens, logits, and completion status. It wraps the
parent class's serialized result in a PyTorch-specific LlmResponse object.

Args:
use_fast_logits (bool, optional, default=False): Only applicable to the TRT backend with
    speculative decoding enabled. When returning generation logits under speculative decoding,
    `use_fast_logits=True` replaces the tensor payload with small metadata so that the target
    pulls the logits directly (zero-copy/IPC), reducing overhead. Ignored on the PyTorch backend.
mpi_world_rank (int, optional, default=0): Only applicable to the TRT backend with speculative
    decoding enabled and `use_fast_logits=True`. The MPI world rank of the process hosting the
    draft model that produces the generation logits. This allows the logits to be transferred
    from the draft model to the target model without going through the serialization/transport path.

Returns:
LlmResponse | None: An LlmResponse object containing the request results
if there is valid output, otherwise None.
The response includes:
- request_id: The request identifier (parent ID for child requests)
- result: LlmResult wrapping both serialized and PyTorch-specific results
- client_id: The client identifier for request routing

Note:
Returns None if the serialized result is empty (len(result) == 0),
indicating no output was generated for this request iteration.
"""
result, is_final = super().create_serialized_result(
use_fast_logits, mpi_world_rank)
return LlmResponse(
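As a usage illustration, a hedged sketch of how an executor-side loop might consume `create_response`; the surrounding function and error handling are assumptions, not part of this diff:

    # Hypothetical collection loop: build a response for each request and
    # forward it only if the request produced output this iteration.
    def collect_responses(requests) -> list:
        responses = []
        for request in requests:  # each item is an LlmRequest
            # create_response returns None when the serialized result is
            # empty, i.e. the request produced no output this iteration.
            response = request.create_response(use_fast_logits=False,
                                               mpi_world_rank=0)
            if response is None:
                continue
            if response.has_error():
                # Surface the error instead of returning a partial result.
                raise RuntimeError(response.error_msg)
            responses.append(response)
        return responses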