67 commits
31a15db  add CUDAGraphModelEngine (QiJune, Jul 30, 2025)
52fd48a  rebase (QiJune, Jul 31, 2025)
6d9673d  rebase (QiJune, Aug 1, 2025)
fe6fa01  rebase (QiJune, Aug 9, 2025)
3841576  rebase (QiJune, Aug 11, 2025)
57b9db1  update (QiJune, Aug 11, 2025)
fdfacbc  fix (QiJune, Aug 11, 2025)
2eb3030  fix (QiJune, Aug 11, 2025)
0640458  fix (QiJune, Aug 11, 2025)
4fb2876  fix (QiJune, Aug 11, 2025)
a0d4d52  fix (QiJune, Aug 11, 2025)
96793c8  fix (QiJune, Aug 11, 2025)
df4d782  fix (QiJune, Aug 11, 2025)
677c218  fix (QiJune, Aug 11, 2025)
5b1ed33  fix (QiJune, Aug 11, 2025)
215d0d5  fix (QiJune, Aug 11, 2025)
cf36b99  update (QiJune, Aug 11, 2025)
66a0398  rebase (QiJune, Aug 11, 2025)
492f6c0  rename (QiJune, Aug 12, 2025)
91e87a5  rebase (QiJune, Aug 12, 2025)
7b470e9  fix (QiJune, Aug 12, 2025)
d821fa0  clean (QiJune, Aug 12, 2025)
22df791  polish (QiJune, Aug 12, 2025)
c10dba4  fix (QiJune, Aug 12, 2025)
ed19860  fix (QiJune, Aug 12, 2025)
1f64f35  fix (QiJune, Aug 12, 2025)
c13d9fb  fix (QiJune, Aug 12, 2025)
e20fbab  fix (QiJune, Aug 12, 2025)
fd5d5cb  fix (QiJune, Aug 12, 2025)
73a8c90  rebase (QiJune, Aug 13, 2025)
f416639  fix (QiJune, Aug 14, 2025)
821d54a  fix test (QiJune, Aug 14, 2025)
6c8d995  rebase (QiJune, Aug 14, 2025)
f2ab53a  fix (QiJune, Aug 14, 2025)
2d7a780  fix ci (QiJune, Aug 14, 2025)
5dd8d17  fix (QiJune, Aug 15, 2025)
58f9482  fix (QiJune, Aug 15, 2025)
801c3d4  fix (QiJune, Aug 15, 2025)
6690c36  clean moe (QiJune, Aug 15, 2025)
48eab08  clean (QiJune, Aug 15, 2025)
f8c12ec  fix (QiJune, Aug 15, 2025)
16ebd43  fix (QiJune, Aug 15, 2025)
9d8f9f6  fix test_pad_generation_requests (QiJune, Aug 15, 2025)
70b4d50  Merge branch 'main' into cuda_graph (QiJune, Aug 15, 2025)
0a08c5a  rebase (QiJune, Aug 15, 2025)
50c7c22  Merge branch 'main' into cuda_graph (QiJune, Aug 19, 2025)
3ff27b3  rebase (QiJune, Aug 19, 2025)
b97069a  fix attention dp (QiJune, Aug 19, 2025)
95f488b  polish (QiJune, Aug 19, 2025)
b57ba3b  fix (QiJune, Aug 19, 2025)
9fae269  fix (QiJune, Aug 19, 2025)
debca3c  clean (QiJune, Aug 20, 2025)
544f41a  Merge branch 'main' into cuda_graph (QiJune, Aug 20, 2025)
e69cf93  fix conflicts (QiJune, Aug 21, 2025)
afe209f  rebase (QiJune, Aug 21, 2025)
3307c43  Merge branch 'main' into cuda_graph (QiJune, Aug 21, 2025)
43b18bd  clean tests (QiJune, Aug 22, 2025)
530ab91  rebase (QiJune, Aug 22, 2025)
907bc22  [None][chore] Bump version to 1.1.0rc2 (#7167) (yiqingy0, Aug 22, 2025)
e3de575  [#7136][feat] trtllm-serve + autodeploy integration (#7141) (suyoggupta, Aug 22, 2025)
c232ba8  [TRTLLM-4921][feat] Enable chunked prefill for Nemotron-H (#6334) (tomeras91, Aug 22, 2025)
37543a9  [None][refactor] Simplify decoder state initialization for speculativ… (Funatiq, Aug 22, 2025)
b36460d  [None][feat] Deepseek: Start Eagle work (#6210) (IzzyPutterman, Aug 22, 2025)
81fd468  [None][fix] Correct KV cache percentage report out. (#7102) (FrankD412, Aug 22, 2025)
3d54a1a  [None] [feat] nsys profile output kernel classifier (#7020) (gracehonv, Aug 23, 2025)
96ff82e  [None][fix] Waive test (#7185) (Tabrizian, Aug 24, 2025)
7ff0405  Merge branch 'main' into cuda_graph (QiJune, Aug 24, 2025)
2 changes: 1 addition & 1 deletion README.md
@@ -9,7 +9,7 @@ TensorRT-LLM
 [![python](https://img.shields.io/badge/python-3.10-green)](https://www.python.org/downloads/release/python-31012/)
 [![cuda](https://img.shields.io/badge/cuda-12.9.1-green)](https://developer.nvidia.com/cuda-downloads)
 [![trt](https://img.shields.io/badge/TRT-10.11.0-green)](https://developer.nvidia.com/tensorrt)
-[![version](https://img.shields.io/badge/release-1.1.0rc1-green)](./tensorrt_llm/version.py)
+[![version](https://img.shields.io/badge/release-1.1.0rc2-green)](./tensorrt_llm/version.py)
 [![license](https://img.shields.io/badge/license-Apache%202-blue)](./LICENSE)

 [Architecture](./docs/source/torch/arch_overview.md)   |   [Performance](./docs/source/performance/perf-overview.md)   |   [Examples](https://nvidia.github.io/TensorRT-LLM/quick-start-guide.html)   |   [Documentation](./docs/source/)   |   [Roadmap](https://github.com/NVIDIA/TensorRT-LLM/issues?q=is%3Aissue%20state%3Aopen%20label%3Aroadmap)
32 changes: 0 additions & 32 deletions cpp/include/tensorrt_llm/batch_manager/createNewDecoderRequests.h
@@ -24,7 +24,6 @@
 #include "tensorrt_llm/runtime/common.h"
 #include "tensorrt_llm/runtime/iTensor.h"
 #include "tensorrt_llm/runtime/modelConfig.h"
-#include "tensorrt_llm/runtime/request.h"
 #include "tensorrt_llm/runtime/worldConfig.h"

 namespace tensorrt_llm::runtime
@@ -88,37 +87,6 @@ class CreateNewDecoderRequests : Algorithm
     SizeType32 maxSequenceLength, OptionalRef<MedusaBuffers const> medusaBuffers) const;

 private:
-    //! @brief Setups decoder internal tensors for new speculative decoding request
-    static void newRequestSpeculativeDecoding(SizeType32 batchIdx, runtime::decoder_batch::Request const& request,
-        SamplingConfig const& samplingConfig, runtime::ModelConfig const& modelConfig,
-        DecodingInput& jointDecodingInput, DecodingOutput& jointDecodingOutput, CudaStream const& runtimeStream,
-        CudaStream const& decoderStream, SpeculativeDecodingMode const& speculativeDecodingMode,
-        SizeType32 maxDecodingEngineTokens);
-
-    //! @brief Setups decoder internal tensors for new request in Draft model Sps mode
-    static void newRequestDraftTokensExternal(SizeType32 batchIdx, runtime::decoder_batch::Request const& request,
-        SamplingConfig const& samplingConfig, DecodingInput& jointDecodingInput, CudaStream const& decoderStream);
-
-    //! @brief Setups decoder internal tensors for new Medusa request
-    static void newRequestMedusa(SizeType32 batchIdx, runtime::decoder_batch::Request const& request,
-        DecodingInput& jointDecodingInput, CudaStream const& decoderStream, SizeType32 maxDecodingEngineTokens);
-
-    //! @brief Setups decoder internal tensors for new Lookahead request
-    static void newRequestLookahead(SizeType32 batchIdx, runtime::decoder_batch::Request const& request,
-        DecodingInput& jointDecodingInput, DecodingOutput& jointDecodingOutput, CudaStream const& runtimeStream);
-
-    //! @brief Setups decoder internal tensors for new Explicit draft tokens request
-    static void newRequestExplicitDraftTokens(SizeType32 batchIdx, runtime::decoder_batch::Request const& request,
-        DecodingOutput& jointDecodingOutput, CudaStream const& runtimeStream);
-
-    //! @brief Setups decoder internal tensors for new Eagle request
-    static void newRequestEagle(SizeType32 batchIdx, runtime::decoder_batch::Request const& request,
-        runtime::ModelConfig const& modelConfig, DecodingOutput& jointDecodingOutput, CudaStream const& runtimeStream);
-
-    [[nodiscard]] std::shared_ptr<runtime::ITensor> retrieveDraftLogits(runtime::ModelConfig const& modelConfig,
-        runtime::WorldConfig const& worldConfig, std::shared_ptr<runtime::ITensor> const& tensor,
-        runtime::BufferManager const& bufferManager) const;
-
     bool mSpeculativeDecodingFastLogits;
     bool mIsLeaderInOrchMode;
     bool mIsNormalizeLogProbs;
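The deleted declarations describe one entry point (`newRequestSpeculativeDecoding`) that dispatched on the speculative decoding mode to a per-mode helper. A minimal C++ sketch of that dispatch pattern, for readers following the refactor; the enum, `Request` struct, and empty helper bodies are hypothetical stand-ins, not the real TensorRT-LLM types (the actual helpers took the decoder tensors shown in the removed signatures above):

```cpp
#include <stdexcept>

// Hypothetical stand-ins for the runtime types used by the removed helpers.
enum class SpecMode { kDraftTokensExternal, kMedusa, kLookahead, kExplicitDraftTokens, kEagle };
struct Request { /* decoder_batch::Request fields elided */ };

struct DecoderSetupSketch
{
    // Mirrors the shape of the removed newRequestSpeculativeDecoding: select
    // the per-mode helper that initializes decoder tensors for a new request.
    static void newRequestSpeculativeDecoding(int batchIdx, Request const& request, SpecMode mode)
    {
        switch (mode)
        {
        case SpecMode::kDraftTokensExternal: newRequestDraftTokensExternal(batchIdx, request); break;
        case SpecMode::kMedusa: newRequestMedusa(batchIdx, request); break;
        case SpecMode::kLookahead: newRequestLookahead(batchIdx, request); break;
        case SpecMode::kExplicitDraftTokens: newRequestExplicitDraftTokens(batchIdx, request); break;
        case SpecMode::kEagle: newRequestEagle(batchIdx, request); break;
        default: throw std::invalid_argument("unknown speculative decoding mode");
        }
    }

    // Bodies elided: each helper filled mode-specific decoder input/output tensors.
    static void newRequestDraftTokensExternal(int, Request const&) {}
    static void newRequestMedusa(int, Request const&) {}
    static void newRequestLookahead(int, Request const&) {}
    static void newRequestExplicitDraftTokens(int, Request const&) {}
    static void newRequestEagle(int, Request const&) {}
};
```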
2 changes: 1 addition & 1 deletion cpp/include/tensorrt_llm/batch_manager/llmRequest.h
@@ -1110,7 +1110,7 @@ class GenericLlmRequest

     [[nodiscard]] SizeType32 getNumDraftTokens() const
     {
-        return mDraftTokens->size();
+        return hasDraftTokens() ? mDraftTokens->size() : 0;
     }

     void discardDraftTokens(SizeType32 numTokensToDiscard)
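This change guards the accessor for requests that carry no draft tokens. A minimal sketch of the guarded-accessor pattern, assuming `hasDraftTokens()` checks that the shared vector exists and is non-empty; the class below is illustrative, not the real `GenericLlmRequest`:

```cpp
#include <cstdint>
#include <memory>
#include <vector>

using SizeType32 = std::int32_t;
using VecTokens = std::vector<SizeType32>;

class RequestSketch
{
public:
    [[nodiscard]] bool hasDraftTokens() const
    {
        // Guard both the null-pointer and the empty-vector case.
        return mDraftTokens && !mDraftTokens->empty();
    }

    [[nodiscard]] SizeType32 getNumDraftTokens() const
    {
        // Before the fix, a request without draft tokens would dereference a
        // null shared_ptr here; now it simply reports zero.
        return hasDraftTokens() ? static_cast<SizeType32>(mDraftTokens->size()) : 0;
    }

private:
    std::shared_ptr<VecTokens> mDraftTokens; // may be null for non-speculative requests
};
```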
2 changes: 2 additions & 0 deletions cpp/include/tensorrt_llm/runtime/decodingInput.h
@@ -102,11 +102,13 @@ class DecodingInput
 {
 public:
     TensorPtr draftLogits;
+    TensorPtr draftLogitsHost;
     TensorPtr draftProbs;
     TensorPtr targetProbs;
     TensorPtr numDraftTokens;
     TensorPtr numDraftTokensHost;
     TensorPtr draftTokenIds;
+    TensorPtr draftTokenIdsHost;
     TensorPtr useDraftLogits;
     TensorPtr useDraftLogitsHost;
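The new `*Host` members mirror device-side draft tensors on the CPU, alongside the existing `numDraftTokensHost` and `useDraftLogitsHost`. A common reason for such mirrors, and a plausible one here given this PR's CUDA-graph focus, is to stage per-step values in pinned host memory and copy them asynchronously into preallocated device buffers whose addresses stay fixed across graph replays. A minimal CUDA C++ sketch of that pattern, with illustrative names and sizes rather than TensorRT-LLM's API:

```cpp
#include <cuda_runtime.h>
#include <cstdint>
#include <cstdio>

// Stage draft token ids in a pinned host mirror, then copy them into a
// preallocated device buffer on a stream. The device pointer never changes,
// which is what CUDA graph capture/replay requires. Error checks elided.
int main()
{
    constexpr int kMaxDraftTokens = 8;
    std::int32_t* draftTokenIdsHost = nullptr; // pinned host mirror
    std::int32_t* draftTokenIds = nullptr;     // stable device buffer

    cudaHostAlloc(reinterpret_cast<void**>(&draftTokenIdsHost),
        kMaxDraftTokens * sizeof(std::int32_t), cudaHostAllocDefault);
    cudaMalloc(reinterpret_cast<void**>(&draftTokenIds), kMaxDraftTokens * sizeof(std::int32_t));

    cudaStream_t stream;
    cudaStreamCreate(&stream);

    // Fill the host mirror on the CPU (no device synchronization needed)...
    for (int i = 0; i < kMaxDraftTokens; ++i)
        draftTokenIdsHost[i] = 1000 + i;

    // ...then push it to the fixed device buffer asynchronously.
    cudaMemcpyAsync(draftTokenIds, draftTokenIdsHost, kMaxDraftTokens * sizeof(std::int32_t),
        cudaMemcpyHostToDevice, stream);
    cudaStreamSynchronize(stream);

    std::printf("staged %d draft tokens\n", kMaxDraftTokens);

    cudaFree(draftTokenIds);
    cudaFreeHost(draftTokenIdsHost);
    cudaStreamDestroy(stream);
    return 0;
}
```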
54 changes: 0 additions & 54 deletions cpp/include/tensorrt_llm/runtime/request.h

This file was deleted.
