6 changes: 6 additions & 0 deletions cpp/include/tensorrt_llm/batch_manager/microBatchScheduler.h
@@ -31,6 +31,12 @@ struct ContextChunkingConfig
 {
     ContextChunkingConfig() = default;
 
+    ContextChunkingConfig(executor::ContextChunkingPolicy policy, tensorrt_llm::runtime::SizeType32 unitSize)
+        : chunkingPolicy(policy)
+        , chunkUnitSize(unitSize)
+    {
+    }
+
     executor::ContextChunkingPolicy chunkingPolicy;
     /// The minimum size, also known as the chunk unit size. It generally
     /// needs to be equal to the size of the kv cache block or its integer
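The added constructor lets callers initialize both fields in a single expression instead of default-constructing the struct and assigning afterwards. A minimal usage sketch, assuming the struct lives in `tensorrt_llm::batch_manager` (per the header path) and that `kFIRST_COME_FIRST_SERVED` is an available policy value; the unit size of 64 is illustrative, not taken from this change:

#include "tensorrt_llm/batch_manager/microBatchScheduler.h"

using namespace tensorrt_llm;

// The chunk unit size should equal the KV-cache block size or an
// integer multiple of it; 64 here is an illustrative value.
batch_manager::ContextChunkingConfig chunkingConfig(
    executor::ContextChunkingPolicy::kFIRST_COME_FIRST_SERVED, /*unitSize=*/64);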
11 changes: 6 additions & 5 deletions cpp/include/tensorrt_llm/common/cudaUtils.h
@@ -37,13 +37,13 @@
#include <string>
#ifndef _WIN32 // Linux
#include <sys/sysinfo.h>
#endif // not WIN32
#include <vector>
#ifdef _WIN32 // Windows
#include <windows.h>
#undef ERROR // A Windows header file defines ERROR as 0, but it's used in our logger.h enum. Logging breaks without
// this undef.
#endif // WIN32

namespace tensorrt_llm::common
{
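For context on the `#undef` above: `<windows.h>` (via `wingdi.h`) defines `ERROR` as a macro expanding to 0, so any later use of `ERROR` as an identifier is silently rewritten by the preprocessor. A minimal sketch of the clash, with a hypothetical `Level` enum standing in for the one in `logger.h`:

#include <windows.h>
// Without the #undef, ERROR expands to 0 below and the enum fails to
// compile: enum class Level { TRACE, DEBUG, INFO, WARNING, ERROR };
#undef ERROR
enum class Level { TRACE, DEBUG, INFO, WARNING, ERROR }; // now compiles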
@@ -146,6 +146,7 @@ void checkEx(

 inline std::optional<bool> isCudaLaunchBlocking()
 {
+    return true;
     thread_local bool firstCall = true;
     thread_local std::optional<bool> result = std::nullopt;
     if (!firstCall)
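The unconditional `return true;` added here converts implicitly to `std::optional<bool>` and short-circuits the function, so the thread-local caching below it becomes dead code. For reference, a sketch of the compute-once-per-thread pattern the truncated body appears to implement, assuming it consults the standard `CUDA_LAUNCH_BLOCKING` environment variable (an inference from the function name, not from code visible in this diff):

#include <cstdlib>
#include <optional>
#include <string>

// Sketch only; not the actual body of isCudaLaunchBlocking().
inline std::optional<bool> isCudaLaunchBlockingSketch()
{
    thread_local std::optional<bool> result = [] {
        std::optional<bool> value;
        if (char const* env = std::getenv("CUDA_LAUNCH_BLOCKING"))
        {
            value = std::string(env) != "0";
        }
        return value;
    }();
    return result;
}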
2 changes: 1 addition & 1 deletion cpp/include/tensorrt_llm/deep_gemm/mma_utils.cuh
@@ -802,7 +802,7 @@ __device__ void warpgroup_fence_operand(float& reg)
 __forceinline__ __device__ uint32_t get_lane_id()
 {
     uint32_t lane_id;
-    asm("mov.u32 %0, %laneid;" : "=r"(lane_id));
+    asm("mov.u32 %0, %%laneid;" : "=r"(lane_id));
     return lane_id;
 }

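In GCC-style extended `asm`, `%` in the template string introduces an operand reference (`%0`, `%1`, ...), so the PTX special register `%laneid` must be spelled `%%laneid` to emit a literal percent sign; the original single `%` is malformed. A small device-side sketch using the corrected helper (the kernel and launch configuration are illustrative):

#include <cstdint>
#include <cstdio>

__global__ void printLaneIds()
{
    // %%laneid in the asm template emits the literal PTX register %laneid.
    uint32_t lane_id;
    asm("mov.u32 %0, %%laneid;" : "=r"(lane_id));
    printf("thread %u -> lane %u\n", threadIdx.x, lane_id);
}

// Usage: printLaneIds<<<1, 64>>>(); lane ids repeat 0..31 across each warp.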
4 changes: 2 additions & 2 deletions cpp/include/tensorrt_llm/kernels/kvCacheIndex.h
@@ -39,12 +39,12 @@ class KVCacheIndex
         TLLM_CHECK_DEBUG(value >= 0);
     }
 
-    __host__ __device__ [[nodiscard]] UnderlyingType get() const
+    [[nodiscard]] __host__ __device__ UnderlyingType get() const
     {
         return value & (~kSecondaryPoolFlag);
     }
 
-    __host__ __device__ [[nodiscard]] bool isPrimary() const
+    [[nodiscard]] __host__ __device__ bool isPrimary() const
     {
         return (value & kSecondaryPoolFlag) == 0;
     }
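The reorder moves the standard `[[nodiscard]]` attribute to the front of each declaration, ahead of the CUDA `__host__ __device__` specifiers, which is the placement compilers generally accept for attributes appertaining to the function. Both accessors treat `kSecondaryPoolFlag` as a tag bit packed into the index value. A self-contained sketch of that scheme; the 32-bit type and most-significant-bit position are assumptions mirroring what the visible code implies, not the full header:

#include <cstdint>

// Sketch: a 31-bit index with the MSB reserved as a "secondary pool" tag.
using UnderlyingType = std::uint32_t;
constexpr UnderlyingType kSecondaryPoolFlag = UnderlyingType{1} << 31;

constexpr UnderlyingType get(UnderlyingType value)
{
    return value & (~kSecondaryPoolFlag); // strip the tag, keep the index
}

constexpr bool isPrimary(UnderlyingType value)
{
    return (value & kSecondaryPoolFlag) == 0; // tag clear => primary pool
}

static_assert(get(kSecondaryPoolFlag | 42u) == 42u, "tag stripped");
static_assert(!isPrimary(kSecondaryPoolFlag | 42u), "tag detected");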
1 change: 0 additions & 1 deletion cpp/kernels/fmha_v2/.gitignore
@@ -1,4 +1,3 @@
-generated
 bin
 cubin
 obj
31 changes: 31 additions & 0 deletions cpp/kernels/fmha_v2/generate_fmha_v2_sources.sh
@@ -0,0 +1,31 @@
#!/bin/bash

# Script to generate FMHA v2 kernel sources
# Based on the genrule from tensorrt_llm.BUILD

set -e # Exit on any error

# Set up the environment for SM80-SM100 generation
export ENABLE_SM100=1
export TORCH_CUDA_ARCH_LIST="8.0,8.6,8.9,9.0,10.0"
export GENERATE_CU_TRTLLM="true"
export GENERATE_CUBIN="true"
export ENABLE_HMMA_FP32="true"

# Path to the TensorRT-LLM setup script
SETUP_SCRIPT="/home/philkuz/dev/TensorRT-LLM/cpp/kernels/fmha_v2/setup.py"
SETUP_DIR=$(dirname "$SETUP_SCRIPT")

cd "$SETUP_DIR"
# Run the setup script to generate kernel sources
echo "Running setup.py to generate kernel sources..."
python3 setup.py

# Show what was generated
echo "Generated files:"
find "$SETUP_DIR/generated" -name "*.cu" | head -20
echo "..."
echo "Total .cu files: $(find "$SETUP_DIR/generated" -name "*.cu" | wc -l)"

cd "$(git rev-parse --show-toplevel)"
cp cpp/kernels/fmha_v2/generated/fmha_cubin.h cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_cubin.h
3,951 changes: 3,951 additions & 0 deletions cpp/kernels/fmha_v2/generated/fmha_cubin.h

Large diffs are not rendered by default.
