6 changes: 6 additions & 0 deletions cpp/include/tensorrt_llm/batch_manager/microBatchScheduler.h
@@ -31,6 +31,12 @@ struct ContextChunkingConfig
 {
     ContextChunkingConfig() = default;
 
+    ContextChunkingConfig(executor::ContextChunkingPolicy policy, tensorrt_llm::runtime::SizeType32 unitSize)
+        : chunkingPolicy(policy)
+        , chunkUnitSize(unitSize)
+    {
+    }
+
     executor::ContextChunkingPolicy chunkingPolicy;
     /// The minimum size, also known as the chunk unit size. It generally
     /// needs to be equal to the size of the kv cache block or its integer
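The added constructor lets callers initialize both fields in a single expression instead of default-constructing the struct and assigning afterwards. A minimal usage sketch, assuming the struct lives in `tensorrt_llm::batch_manager` (per the header path) and that `kFIRST_COME_FIRST_SERVED` is an available policy value; the unit size of 64 is illustrative, not taken from this change:

#include "tensorrt_llm/batch_manager/microBatchScheduler.h"

using namespace tensorrt_llm;

// The chunk unit size should equal the KV-cache block size or an
// integer multiple of it; 64 here is an illustrative value.
batch_manager::ContextChunkingConfig chunkingConfig(
    executor::ContextChunkingPolicy::kFIRST_COME_FIRST_SERVED, /*unitSize=*/64);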
11 changes: 6 additions & 5 deletions cpp/include/tensorrt_llm/common/cudaUtils.h
@@ -37,13 +37,13 @@
#include <string>
#ifndef _WIN32 // Linux
#include <sys/sysinfo.h>
#endif // not WIN32
#include <vector>
#ifdef _WIN32 // Windows
#include <windows.h>
#undef ERROR // A Windows header file defines ERROR as 0, but it's used in our logger.h enum. Logging breaks without
// this undef.
#endif // WIN32

namespace tensorrt_llm::common
{
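For context on the `#undef` above: `<windows.h>` (via `wingdi.h`) defines `ERROR` as a macro expanding to 0, so any later use of `ERROR` as an identifier is silently rewritten by the preprocessor. A minimal sketch of the clash, with a hypothetical `Level` enum standing in for the one in `logger.h`:

#include <windows.h>
// Without the #undef, ERROR expands to 0 below and the enum fails to
// compile: enum class Level { TRACE, DEBUG, INFO, WARNING, ERROR };
#undef ERROR
enum class Level { TRACE, DEBUG, INFO, WARNING, ERROR }; // now compiles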
@@ -146,6 +146,7 @@ void checkEx(

 inline std::optional<bool> isCudaLaunchBlocking()
 {
+    return true;
     thread_local bool firstCall = true;
     thread_local std::optional<bool> result = std::nullopt;
     if (!firstCall)
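The unconditional `return true;` added here converts implicitly to `std::optional<bool>` and short-circuits the function, so the thread-local caching below it becomes dead code. For reference, a sketch of the compute-once-per-thread pattern the truncated body appears to implement, assuming it consults the standard `CUDA_LAUNCH_BLOCKING` environment variable (an inference from the function name, not from code visible in this diff):

#include <cstdlib>
#include <optional>
#include <string>

// Sketch only; not the actual body of isCudaLaunchBlocking().
inline std::optional<bool> isCudaLaunchBlockingSketch()
{
    thread_local std::optional<bool> result = [] {
        std::optional<bool> value;
        if (char const* env = std::getenv("CUDA_LAUNCH_BLOCKING"))
        {
            value = std::string(env) != "0";
        }
        return value;
    }();
    return result;
}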
2 changes: 1 addition & 1 deletion cpp/include/tensorrt_llm/deep_gemm/mma_utils.cuh
@@ -802,7 +802,7 @@ __device__ void warpgroup_fence_operand(float& reg)
 __forceinline__ __device__ uint32_t get_lane_id()
 {
     uint32_t lane_id;
-    asm("mov.u32 %0, %laneid;" : "=r"(lane_id));
+    asm("mov.u32 %0, %%laneid;" : "=r"(lane_id));
     return lane_id;
 }

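In GCC-style extended `asm`, `%` in the template string introduces an operand reference (`%0`, `%1`, ...), so the PTX special register `%laneid` must be spelled `%%laneid` to emit a literal percent sign; the original single `%` is malformed. A small device-side sketch using the corrected helper (the kernel and launch configuration are illustrative):

#include <cstdint>
#include <cstdio>

__global__ void printLaneIds()
{
    // %%laneid in the asm template emits the literal PTX register %laneid.
    uint32_t lane_id;
    asm("mov.u32 %0, %%laneid;" : "=r"(lane_id));
    printf("thread %u -> lane %u\n", threadIdx.x, lane_id);
}

// Usage: printLaneIds<<<1, 64>>>(); lane ids repeat 0..31 across each warp.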
4 changes: 2 additions & 2 deletions cpp/include/tensorrt_llm/kernels/kvCacheIndex.h
@@ -39,12 +39,12 @@ class KVCacheIndex
         TLLM_CHECK_DEBUG(value >= 0);
     }
 
-    __host__ __device__ [[nodiscard]] UnderlyingType get() const
+    [[nodiscard]] __host__ __device__ UnderlyingType get() const
     {
         return value & (~kSecondaryPoolFlag);
     }
 
-    __host__ __device__ [[nodiscard]] bool isPrimary() const
+    [[nodiscard]] __host__ __device__ bool isPrimary() const
     {
         return (value & kSecondaryPoolFlag) == 0;
     }
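The reorder moves the standard `[[nodiscard]]` attribute to the front of each declaration, ahead of the CUDA `__host__ __device__` specifiers, which is the placement compilers generally accept for attributes appertaining to the function. Both accessors treat `kSecondaryPoolFlag` as a tag bit packed into the index value. A self-contained sketch of that scheme; the 32-bit type and most-significant-bit position are assumptions mirroring what the visible code implies, not the full header:

#include <cstdint>

// Sketch: a 31-bit index with the MSB reserved as a "secondary pool" tag.
using UnderlyingType = std::uint32_t;
constexpr UnderlyingType kSecondaryPoolFlag = UnderlyingType{1} << 31;

constexpr UnderlyingType get(UnderlyingType value)
{
    return value & (~kSecondaryPoolFlag); // strip the tag, keep the index
}

constexpr bool isPrimary(UnderlyingType value)
{
    return (value & kSecondaryPoolFlag) == 0; // tag clear => primary pool
}

static_assert(get(kSecondaryPoolFlag | 42u) == 42u, "tag stripped");
static_assert(!isPrimary(kSecondaryPoolFlag | 42u), "tag detected");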
1 change: 0 additions & 1 deletion cpp/kernels/fmha_v2/.gitignore
@@ -1,4 +1,3 @@
-generated
 bin
 cubin
 obj
31 changes: 31 additions & 0 deletions cpp/kernels/fmha_v2/generate_fmha_v2_sources.sh
@@ -0,0 +1,31 @@
#!/bin/bash

# Script to generate FMHA v2 kernel sources
# Based on the genrule from tensorrt_llm.BUILD

set -e # Exit on any error

# Set up the environment for SM80-SM100 generation
export ENABLE_SM100=1
export TORCH_CUDA_ARCH_LIST="8.0,8.6,8.9,9.0,10.0"
export GENERATE_CU_TRTLLM="true"
export GENERATE_CUBIN="true"
export ENABLE_HMMA_FP32="true"

# Path to the TensorRT-LLM setup script
SETUP_SCRIPT="/home/philkuz/dev/TensorRT-LLM/cpp/kernels/fmha_v2/setup.py"
SETUP_DIR=$(dirname "$SETUP_SCRIPT")

cd "$SETUP_DIR"
# Run the setup script to generate kernel sources
echo "Running setup.py to generate kernel sources..."
python3 setup.py

# Show what was generated
echo "Generated files:"
find "$SETUP_DIR/generated" -name "*.cu" | head -20
echo "..."
echo "Total .cu files: $(find "$SETUP_DIR/generated" -name "*.cu" | wc -l)"

cd "$(git rev-parse --show-toplevel)"
cp cpp/kernels/fmha_v2/generated/fmha_cubin.h cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_cubin.h
3,951 changes: 3,951 additions & 0 deletions cpp/kernels/fmha_v2/generated/fmha_cubin.h

Large diffs are not rendered by default.
