diff --git a/.github/workflows/gradle-wrapper-validation.yml b/.github/workflows/gradle-wrapper-validation.yml index 0e5ea60f61402..04177b11e9c30 100644 --- a/.github/workflows/gradle-wrapper-validation.yml +++ b/.github/workflows/gradle-wrapper-validation.yml @@ -16,7 +16,7 @@ jobs: runs-on: ["self-hosted", "1ES.Pool=onnxruntime-github-Ubuntu2204-AMD-CPU"] steps: - uses: actions/checkout@v5 - - uses: gradle/actions/wrapper-validation@v4 + - uses: gradle/actions/wrapper-validation@v5 concurrency: group: ${{ github.workflow }}-${{ github.event_name == 'pull_request' && github.ref || github.sha }} cancel-in-progress: true diff --git a/.github/workflows/publish-csharp-apidocs.yml b/.github/workflows/publish-csharp-apidocs.yml index 42d1bdc295785..683c5594e82f2 100644 --- a/.github/workflows/publish-csharp-apidocs.yml +++ b/.github/workflows/publish-csharp-apidocs.yml @@ -20,7 +20,7 @@ permissions: jobs: build: - runs-on: ["self-hosted", "1ES.Pool=onnxruntime-github-vs2022-mms"] + runs-on: ["self-hosted", "1ES.Pool=onnxruntime-github-vs2022-latest"] env: DOCFXVERSION: 2.62.2 steps: diff --git a/.github/workflows/windows_cuda.yml b/.github/workflows/windows_cuda.yml index 437fc0e2c6334..3d24d4b6b75b6 100644 --- a/.github/workflows/windows_cuda.yml +++ b/.github/workflows/windows_cuda.yml @@ -19,7 +19,7 @@ concurrency: jobs: build: name: Windows GPU CUDA CI Pipeline - runs-on: ["self-hosted", "1ES.Pool=onnxruntime-github-vs2022-mms"] + runs-on: ["self-hosted", "1ES.Pool=onnxruntime-github-vs2022-latest"] steps: - uses: actions/checkout@v5 with: @@ -41,10 +41,10 @@ jobs: working-directory: ${{ github.workspace }} shell: cmd - - name: Download CUDA SDK v12.2 + - name: Download CUDA SDK v12.8 working-directory: ${{ runner.temp }} run: | - azcopy.exe cp --recursive "https://lotusscus.blob.core.windows.net/models/cuda_sdk/v12.2" . + azcopy.exe cp --recursive "https://lotusscus.blob.core.windows.net/models/cuda_sdk/v12.8" . 
dir shell: pwsh @@ -52,9 +52,9 @@ jobs: shell: powershell run: | Write-Host "Adding CUDA to PATH" - Write-Host "CUDA Path: $env:RUNNER_TEMP\v12.2\bin" - Add-Content -Path $env:GITHUB_PATH -Value "$env:RUNNER_TEMP\v12.2\bin" - Add-Content -Path $env:GITHUB_PATH -Value "$env:RUNNER_TEMP\v12.2\extras\CUPTI\lib64" + Write-Host "CUDA Path: $env:RUNNER_TEMP\v12.8\bin" + Add-Content -Path $env:GITHUB_PATH -Value "$env:RUNNER_TEMP\v12.8\bin" + Add-Content -Path $env:GITHUB_PATH -Value "$env:RUNNER_TEMP\v12.8\extras\CUPTI\lib64" - uses: actions/setup-node@v5 with: @@ -111,7 +111,7 @@ jobs: exit $lastExitCode } # Execute the build process - python.exe ${{ github.workspace }}\tools\ci_build\build.py --update --build --config RelWithDebInfo --build_dir build --skip_submodule_sync --build_csharp --parallel --use_binskim_compliant_compile_flags --cmake_generator "Visual Studio 17 2022" --build_shared_lib --build_wheel --build_java --use_cuda --cuda_home="$env:RUNNER_TEMP\v12.2" --enable_cuda_profiling --use_vcpkg --use_vcpkg_ms_internal_asset_cache --enable_transformers_tool_test --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86 --cmake_extra_defines onnxruntime_ENABLE_CUDA_EP_INTERNAL_TESTS=ON + python.exe ${{ github.workspace }}\tools\ci_build\build.py --update --build --config RelWithDebInfo --build_dir build --skip_submodule_sync --build_csharp --parallel --use_binskim_compliant_compile_flags --cmake_generator "Visual Studio 17 2022" --build_shared_lib --build_wheel --build_java --use_cuda --cuda_home="$env:RUNNER_TEMP\v12.8" --enable_cuda_profiling --use_vcpkg --use_vcpkg_ms_internal_asset_cache --enable_transformers_tool_test --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86 --cmake_extra_defines onnxruntime_ENABLE_CUDA_EP_INTERNAL_TESTS=ON if ($lastExitCode -ne 0) { exit $lastExitCode } @@ -188,10 +188,10 @@ jobs: working-directory: ${{ github.workspace }} shell: cmd - - name: Download CUDA SDK v12.2 + - name: Download CUDA SDK v12.8 working-directory: ${{ runner.temp }} run: | - azcopy.exe cp --recursive "https://lotusscus.blob.core.windows.net/models/cuda_sdk/v12.2" . + azcopy.exe cp --recursive "https://lotusscus.blob.core.windows.net/models/cuda_sdk/v12.8" . 
dir shell: pwsh @@ -199,9 +199,9 @@ jobs: shell: powershell run: | Write-Host "Adding CUDA to PATH" - Write-Host "CUDA Path: $env:RUNNER_TEMP\v12.2\bin" - Add-Content -Path $env:GITHUB_PATH -Value "$env:RUNNER_TEMP\v12.2\bin" - Add-Content -Path $env:GITHUB_PATH -Value "$env:RUNNER_TEMP\v12.2\extras\CUPTI\lib64" + Write-Host "CUDA Path: $env:RUNNER_TEMP\v12.8\bin" + Add-Content -Path $env:GITHUB_PATH -Value "$env:RUNNER_TEMP\v12.8\bin" + Add-Content -Path $env:GITHUB_PATH -Value "$env:RUNNER_TEMP\v12.8\extras\CUPTI\lib64" - name: Set OnnxRuntimeBuildDirectory shell: pwsh @@ -227,7 +227,7 @@ jobs: exit $lastExitCode } - python.exe ${{ github.workspace }}\tools\ci_build\build.py --test --config RelWithDebInfo --build_dir build --skip_submodule_sync --build_csharp --parallel --use_binskim_compliant_compile_flags --cmake_generator "Visual Studio 17 2022" --build_shared_lib --build_wheel --build_java --use_cuda --cuda_home="$env:RUNNER_TEMP\v12.2" --enable_cuda_profiling --use_vcpkg --use_vcpkg_ms_internal_asset_cache --enable_transformers_tool_test --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86 --cmake_extra_defines onnxruntime_ENABLE_CUDA_EP_INTERNAL_TESTS=ON + python.exe ${{ github.workspace }}\tools\ci_build\build.py --test --config RelWithDebInfo --build_dir build --skip_submodule_sync --build_csharp --parallel --use_binskim_compliant_compile_flags --cmake_generator "Visual Studio 17 2022" --build_shared_lib --build_wheel --build_java --use_cuda --cuda_home="$env:RUNNER_TEMP\v12.8" --enable_cuda_profiling --use_vcpkg --use_vcpkg_ms_internal_asset_cache --enable_transformers_tool_test --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86 --cmake_extra_defines onnxruntime_ENABLE_CUDA_EP_INTERNAL_TESTS=ON if ($lastExitCode -ne 0) { exit $lastExitCode } diff --git a/.github/workflows/windows_openvino.yml b/.github/workflows/windows_openvino.yml index 395ccfbe70244..b608c0879aa45 100644 --- a/.github/workflows/windows_openvino.yml +++ b/.github/workflows/windows_openvino.yml @@ -18,7 +18,7 @@ concurrency: jobs: BUILD_OPENVINO_EP: name: Windows OpenVINO CI Pipeline - runs-on: ["self-hosted", "1ES.Pool=onnxruntime-github-vs2022-mms"] + runs-on: ["self-hosted", "1ES.Pool=onnxruntime-github-vs2022-latest"] timeout-minutes: 240 env: AZCOPY_AUTO_LOGIN_TYPE: MSI diff --git a/.github/workflows/windows_qnn_x64.yml b/.github/workflows/windows_qnn_x64.yml index 9788792b94fa8..1906fcb18c841 100644 --- a/.github/workflows/windows_qnn_x64.yml +++ b/.github/workflows/windows_qnn_x64.yml @@ -18,7 +18,7 @@ concurrency: jobs: build_test_qnn_ep: name: Windows x64 QNN CI Pipeline (${{ matrix.QnnLibKind }}) - runs-on: ["self-hosted", "1ES.Pool=onnxruntime-github-vs2022-mms"] + runs-on: ["self-hosted", "1ES.Pool=onnxruntime-github-vs2022-latest"] timeout-minutes: 120 strategy: matrix: diff --git a/.github/workflows/windows_tensorrt.yml b/.github/workflows/windows_tensorrt.yml index 5f3dcb9607a47..2a1fe97d9b7b7 100644 --- a/.github/workflows/windows_tensorrt.yml +++ b/.github/workflows/windows_tensorrt.yml @@ -19,7 +19,7 @@ concurrency: jobs: build: name: Windows GPU TensorRT CI Pipeline - runs-on: ["self-hosted", "1ES.Pool=onnxruntime-github-vs2022-mms"] + runs-on: ["self-hosted", "1ES.Pool=onnxruntime-github-vs2022-latest"] steps: - uses: actions/checkout@v5 with: @@ -41,10 +41,10 @@ jobs: working-directory: ${{ github.workspace }} shell: cmd - - name: Download CUDA SDK v12.2 + - name: Download CUDA SDK v12.8 working-directory: ${{ runner.temp }} run: | - azcopy.exe cp --recursive 
"https://lotusscus.blob.core.windows.net/models/cuda_sdk/v12.2" . + azcopy.exe cp --recursive "https://lotusscus.blob.core.windows.net/models/cuda_sdk/v12.8" . dir shell: pwsh @@ -56,9 +56,9 @@ jobs: shell: powershell run: | Write-Host "Adding CUDA to PATH" - Write-Host "CUDA Path: $env:RUNNER_TEMP\v12.2\bin" - Add-Content -Path $env:GITHUB_PATH -Value "$env:RUNNER_TEMP\v12.2\bin" - Add-Content -Path $env:GITHUB_PATH -Value "$env:RUNNER_TEMP\v12.2\extras\CUPTI\lib64" + Write-Host "CUDA Path: $env:RUNNER_TEMP\v12.8\bin" + Add-Content -Path $env:GITHUB_PATH -Value "$env:RUNNER_TEMP\v12.8\bin" + Add-Content -Path $env:GITHUB_PATH -Value "$env:RUNNER_TEMP\v12.8\extras\CUPTI\lib64" Add-Content -Path $env:GITHUB_PATH -Value "$env:RUNNER_TEMP\TensorRT-10.9.0.34.Windows10.x86_64.cuda-12.8\lib" - uses: actions/setup-node@v5 @@ -116,7 +116,7 @@ jobs: exit $lastExitCode } # Execute the build process - python ${{ github.workspace }}\tools\ci_build\build.py --config RelWithDebInfo --parallel --use_binskim_compliant_compile_flags --build_dir build --skip_submodule_sync --build_shared_lib --build --update --cmake_generator "Visual Studio 17 2022" --build_wheel --enable_onnx_tests --use_tensorrt --tensorrt_home="${{ runner.temp }}\TensorRT-10.9.0.34.Windows10.x86_64.cuda-12.8" --cuda_home="${{ runner.temp }}\v12.2" --use_vcpkg --use_vcpkg_ms_internal_asset_cache --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86 + python ${{ github.workspace }}\tools\ci_build\build.py --config RelWithDebInfo --parallel --use_binskim_compliant_compile_flags --build_dir build --skip_submodule_sync --build_shared_lib --build --update --cmake_generator "Visual Studio 17 2022" --build_wheel --enable_onnx_tests --use_tensorrt --tensorrt_home="${{ runner.temp }}\TensorRT-10.9.0.34.Windows10.x86_64.cuda-12.8" --cuda_home="${{ runner.temp }}\v12.8" --use_vcpkg --use_vcpkg_ms_internal_asset_cache --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86 if ($lastExitCode -ne 0) { exit $lastExitCode } @@ -193,10 +193,10 @@ jobs: working-directory: ${{ github.workspace }} shell: cmd - - name: Download CUDA SDK v12.2 + - name: Download CUDA SDK v12.8 working-directory: ${{ runner.temp }} run: | - azcopy.exe cp --recursive "https://lotusscus.blob.core.windows.net/models/cuda_sdk/v12.2" . + azcopy.exe cp --recursive "https://lotusscus.blob.core.windows.net/models/cuda_sdk/v12.8" . 
dir shell: pwsh @@ -208,9 +208,9 @@ jobs: shell: powershell run: | Write-Host "Adding CUDA to PATH" - Write-Host "CUDA Path: $env:RUNNER_TEMP\v12.2\bin" - Add-Content -Path $env:GITHUB_PATH -Value "$env:RUNNER_TEMP\v12.2\bin" - Add-Content -Path $env:GITHUB_PATH -Value "$env:RUNNER_TEMP\v12.2\extras\CUPTI\lib64" + Write-Host "CUDA Path: $env:RUNNER_TEMP\v12.8\bin" + Add-Content -Path $env:GITHUB_PATH -Value "$env:RUNNER_TEMP\v12.8\bin" + Add-Content -Path $env:GITHUB_PATH -Value "$env:RUNNER_TEMP\v12.8\extras\CUPTI\lib64" Add-Content -Path $env:GITHUB_PATH -Value "$env:RUNNER_TEMP\TensorRT-10.9.0.34.Windows10.x86_64.cuda-12.8\lib" - name: Set OnnxRuntimeBuildDirectory @@ -237,7 +237,7 @@ jobs: exit $lastExitCode } - python ${{ github.workspace }}\tools\ci_build\build.py --config RelWithDebInfo --parallel --use_binskim_compliant_compile_flags --build_dir build --skip_submodule_sync --build_shared_lib --test --cmake_generator "Visual Studio 17 2022" --build_wheel --enable_onnx_tests --use_tensorrt --tensorrt_home="${{ runner.temp }}\TensorRT-10.9.0.34.Windows10.x86_64.cuda-12.8" --cuda_home="${{ runner.temp }}\v12.2" --use_vcpkg --use_vcpkg_ms_internal_asset_cache --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86 + python ${{ github.workspace }}\tools\ci_build\build.py --config RelWithDebInfo --parallel --use_binskim_compliant_compile_flags --build_dir build --skip_submodule_sync --build_shared_lib --test --cmake_generator "Visual Studio 17 2022" --build_wheel --enable_onnx_tests --use_tensorrt --tensorrt_home="${{ runner.temp }}\TensorRT-10.9.0.34.Windows10.x86_64.cuda-12.8" --cuda_home="${{ runner.temp }}\v12.8" --use_vcpkg --use_vcpkg_ms_internal_asset_cache --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86 if ($lastExitCode -ne 0) { exit $lastExitCode } diff --git a/.github/workflows/windows_x64_debug_build_x64_debug.yml b/.github/workflows/windows_x64_debug_build_x64_debug.yml index 6165375e7a54a..6a1b43e54ed89 100644 --- a/.github/workflows/windows_x64_debug_build_x64_debug.yml +++ b/.github/workflows/windows_x64_debug_build_x64_debug.yml @@ -13,7 +13,7 @@ concurrency: jobs: build_x64_debug: - runs-on: ["self-hosted", "1ES.Pool=onnxruntime-github-vs2022-mms"] + runs-on: ["self-hosted", "1ES.Pool=onnxruntime-github-vs2022-latest"] timeout-minutes: 300 steps: diff --git a/.github/workflows/windows_x64_release_build_x64_release.yml b/.github/workflows/windows_x64_release_build_x64_release.yml index f9d7b0d9e9e04..0bcd282e8dc50 100644 --- a/.github/workflows/windows_x64_release_build_x64_release.yml +++ b/.github/workflows/windows_x64_release_build_x64_release.yml @@ -13,7 +13,7 @@ concurrency: jobs: build_x64_release: - runs-on: ["self-hosted", "1ES.Pool=onnxruntime-github-vs2022-mms"] + runs-on: ["self-hosted", "1ES.Pool=onnxruntime-github-vs2022-latest"] timeout-minutes: 300 steps: diff --git a/.github/workflows/windows_x64_release_ep_generic_interface_build_x64_release_ep_generic_interface.yml b/.github/workflows/windows_x64_release_ep_generic_interface_build_x64_release_ep_generic_interface.yml index 54c13e1e04b0a..3934047266f59 100644 --- a/.github/workflows/windows_x64_release_ep_generic_interface_build_x64_release_ep_generic_interface.yml +++ b/.github/workflows/windows_x64_release_ep_generic_interface_build_x64_release_ep_generic_interface.yml @@ -13,7 +13,7 @@ concurrency: jobs: build_x64_release_ep_generic_interface: - runs-on: ["self-hosted", "1ES.Pool=onnxruntime-github-vs2022-mms"] + runs-on: ["self-hosted", "1ES.Pool=onnxruntime-github-vs2022-latest"] timeout-minutes: 300 
steps: diff --git a/.github/workflows/windows_x64_release_vitisai_build_x64_release.yml b/.github/workflows/windows_x64_release_vitisai_build_x64_release.yml index 06230962b39be..1c38d8e58970c 100644 --- a/.github/workflows/windows_x64_release_vitisai_build_x64_release.yml +++ b/.github/workflows/windows_x64_release_vitisai_build_x64_release.yml @@ -13,7 +13,7 @@ concurrency: jobs: build_x64_release_vitisai: - runs-on: ["self-hosted", "1ES.Pool=onnxruntime-github-vs2022-mms"] + runs-on: ["self-hosted", "1ES.Pool=onnxruntime-github-vs2022-latest"] timeout-minutes: 300 steps: diff --git a/.github/workflows/windows_x64_release_xnnpack.yml b/.github/workflows/windows_x64_release_xnnpack.yml index 21033ef4cbe3c..6eb9f00d3997d 100644 --- a/.github/workflows/windows_x64_release_xnnpack.yml +++ b/.github/workflows/windows_x64_release_xnnpack.yml @@ -13,7 +13,7 @@ concurrency: jobs: build_x64_release_xnnpack: - runs-on: ["self-hosted", "1ES.Pool=onnxruntime-github-vs2022-mms"] + runs-on: ["self-hosted", "1ES.Pool=onnxruntime-github-vs2022-latest"] timeout-minutes: 300 steps: diff --git a/.github/workflows/windows_x86.yml b/.github/workflows/windows_x86.yml index fa1e9362e2f34..597c1c7f4b6cf 100644 --- a/.github/workflows/windows_x86.yml +++ b/.github/workflows/windows_x86.yml @@ -13,7 +13,7 @@ concurrency: jobs: build_x86_release: - runs-on: ["self-hosted", "1ES.Pool=onnxruntime-github-vs2022-mms"] + runs-on: ["self-hosted", "1ES.Pool=onnxruntime-github-vs2022-latest"] timeout-minutes: 300 steps: diff --git a/VERSION_NUMBER b/VERSION_NUMBER index a6c2798a482eb..53cc1a6f9292c 100644 --- a/VERSION_NUMBER +++ b/VERSION_NUMBER @@ -1 +1 @@ -1.23.0 +1.24.0 diff --git a/cmake/CMakeLists.txt b/cmake/CMakeLists.txt index 793207f5b6d76..8186da507a442 100644 --- a/cmake/CMakeLists.txt +++ b/cmake/CMakeLists.txt @@ -101,7 +101,7 @@ option(onnxruntime_USE_VSINPU "Build with VSINPU support" OFF) cmake_dependent_option(onnxruntime_USE_FLASH_ATTENTION "Build flash attention kernel for scaled dot product attention" ON "onnxruntime_USE_CUDA" OFF) option(onnxruntime_USE_LEAN_ATTENTION "Build lean attention kernel for scaled dot product attention" OFF) cmake_dependent_option(onnxruntime_USE_MEMORY_EFFICIENT_ATTENTION "Build memory efficient attention kernel for scaled dot product attention" ON "onnxruntime_USE_CUDA" OFF) -cmake_dependent_option(onnxruntime_USE_FPA_INTB_GEMM "Build FpA IntB gemm cuda kernels" ON "onnxruntime_USE_CUDA" OFF) +option(onnxruntime_USE_FPA_INTB_GEMM "Build FpA IntB gemm cuda kernels" OFF) option(onnxruntime_BUILD_FOR_NATIVE_MACHINE "Enable this option for turning on optimization specific to this machine" OFF) option(onnxruntime_USE_AVX "Use AVX instructions" OFF) @@ -287,9 +287,13 @@ if (onnxruntime_ENABLE_TRAINING_APIS) endif() -# Single output director for all binaries +# Single output directory for all binaries set(RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin CACHE PATH "Single output directory for all binaries.") +# Local mirror directory of cmake dependencies +set(REPO_ROOT ${PROJECT_SOURCE_DIR}/..) +set(onnxruntime_CMAKE_DEPS_MIRROR_DIR ${REPO_ROOT}/mirror CACHE PATH "Path to the local mirror of cmake dependencies") + include(FetchContent) @@ -425,7 +429,6 @@ if (onnxruntime_EXTENDED_MINIMAL_BUILD AND NOT onnxruntime_MINIMAL_BUILD) set(onnxruntime_MINIMAL_BUILD ON) endif() -set(REPO_ROOT ${PROJECT_SOURCE_DIR}/..) 
set(ONNXRUNTIME_ROOT ${PROJECT_SOURCE_DIR}/../onnxruntime) set(ORTTRAINING_ROOT ${PROJECT_SOURCE_DIR}/../orttraining) set(ORTTRAINING_SOURCE_DIR ${ORTTRAINING_ROOT}/orttraining) diff --git a/cmake/deps.txt b/cmake/deps.txt index 7b243ff15cd80..bf76753c1b3c0 100644 --- a/cmake/deps.txt +++ b/cmake/deps.txt @@ -34,7 +34,7 @@ microsoft_gsl;https://github.com/microsoft/GSL/archive/refs/tags/v4.0.0.zip;cf36 microsoft_wil;https://github.com/microsoft/wil/archive/refs/tags/v1.0.230629.1.zip;e4a542a323c070376f7c2d1973d0f7ddbc1d2fa5 mimalloc;https://github.com/microsoft/mimalloc/archive/refs/tags/v2.1.1.zip;d5ee7d34223d0567892db5179849939c8769dc41 mp11;https://github.com/boostorg/mp11/archive/refs/tags/boost-1.82.0.zip;9bc9e01dffb64d9e0773b2e44d2f22c51aace063 -onnx;https://github.com/onnx/onnx/archive/refs/tags/v1.19.0.zip;4c798b73e131438c196e6dcb9f3393968a8936f1 +onnx;https://github.com/onnx/onnx/archive/refs/tags/v1.19.1.zip;c5215b5697dcdfd71799f001b8c4054a6bba6b09 # Use the latest commit of 10.9-GA onnx_tensorrt;https://github.com/onnx/onnx-tensorrt/archive/d5dce67db7c2e64b07e055571f5ec06f7f254de2.zip;01114d3b67650857281fa50faa2e412130a63b69 protobuf;https://github.com/protocolbuffers/protobuf/archive/refs/tags/v21.12.zip;7cf2733949036c7d52fda017badcab093fe73bfa diff --git a/cmake/external/emsdk b/cmake/external/emsdk index d49219d03a41c..419021fa04042 160000 --- a/cmake/external/emsdk +++ b/cmake/external/emsdk @@ -1 +1 @@ -Subproject commit d49219d03a41cd12f95a33ba84273c20d41fd350 +Subproject commit 419021fa040428bc69ef1559b325addb8e10211f diff --git a/cmake/external/helper_functions.cmake b/cmake/external/helper_functions.cmake index 55059b9500a8e..e8044411e4201 100644 --- a/cmake/external/helper_functions.cmake +++ b/cmake/external/helper_functions.cmake @@ -4,11 +4,11 @@ # 2. Set the cmake property COMPILE_WARNING_AS_ERROR to OFF for these external projects. 
function(onnxruntime_fetchcontent_declare contentName) + cmake_parse_arguments(PARSE_ARGV 1 ARG "" "URL;SOURCE_SUBDIR" "") + message(STATUS "Fetch ${contentName} from ${ARG_URL}") FetchContent_Declare(${ARGV}) string(TOLOWER ${contentName} contentNameLower) - list(FIND ARGN SOURCE_SUBDIR index_SOURCE_SUBDIR) - if(index_SOURCE_SUBDIR GREATER_EQUAL 0) - cmake_parse_arguments(PARSE_ARGV 1 ARG "" "SOURCE_SUBDIR" "") + if(NOT "${ARG_SOURCE_SUBDIR}" STREQUAL "") set(onnxruntime_${contentNameLower}_cmake_src_dir "${ARG_SOURCE_SUBDIR}" PARENT_SCOPE) endif() endfunction() diff --git a/cmake/external/onnx b/cmake/external/onnx index 54b72a5edd399..e709452ef2bbc 160000 --- a/cmake/external/onnx +++ b/cmake/external/onnx @@ -1 +1 @@ -Subproject commit 54b72a5edd399eb096ee09fecdef03201e9bde89 +Subproject commit e709452ef2bbc1d113faf678c24e6d3467696e83 diff --git a/cmake/external/onnxruntime_external_deps.cmake b/cmake/external/onnxruntime_external_deps.cmake index 8e1a880579b34..b6a741d8b0fe7 100644 --- a/cmake/external/onnxruntime_external_deps.cmake +++ b/cmake/external/onnxruntime_external_deps.cmake @@ -20,7 +20,7 @@ foreach(ONNXRUNTIME_DEP IN LISTS ONNXRUNTIME_DEPS_LIST) if(ONNXRUNTIME_DEP_URL MATCHES "^https://") # Search a local mirror folder - string(REGEX REPLACE "^https://" "${REPO_ROOT}/mirror/" LOCAL_URL "${ONNXRUNTIME_DEP_URL}") + string(REGEX REPLACE "^https://" "${onnxruntime_CMAKE_DEPS_MIRROR_DIR}/" LOCAL_URL "${ONNXRUNTIME_DEP_URL}") if(EXISTS "${LOCAL_URL}") cmake_path(ABSOLUTE_PATH LOCAL_URL) @@ -498,13 +498,7 @@ else() endif() if(Patch_FOUND) - set(ONNXRUNTIME_ONNX_PATCH_COMMAND - ${Patch_EXECUTABLE} --binary --ignore-whitespace -p1 < ${PROJECT_SOURCE_DIR}/patches/onnx/onnx.patch && - # Patch changes from https://github.com/onnx/onnx/pull/7253 to avoid unnecessary rebuilding. - # This change should be included in ONNX 1.19.1. 
- ${Patch_EXECUTABLE} --binary --ignore-whitespace -p1 < - ${PROJECT_SOURCE_DIR}/patches/onnx/avoid_regenerating_proto_files.patch - ) + set(ONNXRUNTIME_ONNX_PATCH_COMMAND ${Patch_EXECUTABLE} --binary --ignore-whitespace -p1 < ${PROJECT_SOURCE_DIR}/patches/onnx/onnx.patch) else() set(ONNXRUNTIME_ONNX_PATCH_COMMAND "") endif() diff --git a/cmake/onnxruntime_providers_cuda.cmake b/cmake/onnxruntime_providers_cuda.cmake index 68a3e9014b7b0..1d31eb1fbd207 100644 --- a/cmake/onnxruntime_providers_cuda.cmake +++ b/cmake/onnxruntime_providers_cuda.cmake @@ -182,8 +182,8 @@ # Since CUDA 12.8, compiling diagnostics become stricter if (CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 12.8) - target_compile_options(${target} PRIVATE "$<$:--relocatable-device-code=true>") - set_target_properties(${target} PROPERTIES CUDA_SEPARABLE_COMPILATION ON) + target_compile_options(${target} PRIVATE "$<$:--static-global-template-stub=false>") + if (MSVC) target_compile_options(${target} PRIVATE "$<$:SHELL:-Xcompiler /wd4505>") endif() diff --git a/cmake/onnxruntime_test_pch.cmake b/cmake/onnxruntime_test_pch.cmake index f989774ade35b..4a8735a9c346c 100644 --- a/cmake/onnxruntime_test_pch.cmake +++ b/cmake/onnxruntime_test_pch.cmake @@ -5,9 +5,11 @@ if(CMAKE_CXX_COMPILER_ID MATCHES "MSVC") target_precompile_headers(onnxruntime_test_all PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}/test_pch.h" ) - target_precompile_headers(onnxruntime_provider_test PRIVATE - "${CMAKE_CURRENT_SOURCE_DIR}/test_pch.h" - ) + if (TARGET onnxruntime_provider_test) + target_precompile_headers(onnxruntime_provider_test PRIVATE + "${CMAKE_CURRENT_SOURCE_DIR}/test_pch.h" + ) + endif() endif() # Exclude certain files that might conflict with PCH diff --git a/cmake/onnxruntime_unittests.cmake b/cmake/onnxruntime_unittests.cmake index 177bc4229df31..460736ff8506e 100644 --- a/cmake/onnxruntime_unittests.cmake +++ b/cmake/onnxruntime_unittests.cmake @@ -1228,6 +1228,11 @@ block() LIBS ${onnxruntime_provider_test_libs} DEPENDS ${onnxruntime_provider_test_deps} ) + if (UNIX AND (onnxruntime_USE_TENSORRT OR onnxruntime_USE_NV)) + # The test_main.cc includes NvInfer.h where it has many deprecated declarations + # simply ignore them for TensorRT EP build + set_property(TARGET onnxruntime_provider_test APPEND_STRING PROPERTY COMPILE_FLAGS "-Wno-deprecated-declarations") + endif() # enable dynamic plugin EP usage target_compile_definitions(onnxruntime_provider_test PRIVATE ORT_UNIT_TEST_ENABLE_DYNAMIC_PLUGIN_EP_USAGE) diff --git a/cmake/patches/onnx/avoid_regenerating_proto_files.patch b/cmake/patches/onnx/avoid_regenerating_proto_files.patch deleted file mode 100644 index 804dfeb8f59c2..0000000000000 --- a/cmake/patches/onnx/avoid_regenerating_proto_files.patch +++ /dev/null @@ -1,46 +0,0 @@ -diff --git a/CMakeLists.txt b/CMakeLists.txt -index 479955793..cc3ef1400 100644 ---- a/CMakeLists.txt -+++ b/CMakeLists.txt -@@ -321,7 +321,7 @@ function(RELATIVE_PROTOBUF_GENERATE_CPP SRCS) - set(${SRCS}) - - set(GEN_PROTO_PY "${ONNX_ROOT}/onnx/gen_proto.py") -- set(GENERATED_FILE_TARGETS) -+ set(GENERATED_FILES) - foreach(INFILE ${ARGN}) - set(ABS_FILE "${ONNX_ROOT}/${INFILE}") - get_filename_component(FILE_DIR ${ABS_FILE} DIRECTORY) -@@ -371,12 +371,11 @@ function(RELATIVE_PROTOBUF_GENERATE_CPP SRCS) - list(APPEND GEN_PROTO_ARGS "${ONNX_PROTOC_EXECUTABLE}") - endif() - -- add_custom_target("${GENERATED_FILE_WE}_proto_file" -- COMMAND ${ONNX_PYTHON_INTERPRETER} "${GEN_PROTO_PY}" ${GEN_PROTO_ARGS} -- BYPRODUCTS "${GENERATED_PROTO}" -- DEPENDS ${INFILE} -- COMMENT 
"Running gen_proto.py on ${INFILE}" -- ) -+ # Use add_custom_command to avoid re-generate of PROTO files -+ add_custom_command(OUTPUT "${GENERATED_PROTO}" -+ COMMAND ${ONNX_PYTHON_INTERPRETER} "${GEN_PROTO_PY}" ${GEN_PROTO_ARGS} -+ DEPENDS ${INFILE} -+ COMMENT "Running gen_proto.py on ${INFILE}") - message("Generated: ${GENERATED_PROTO}") - - set(PROTOC_ARGS -@@ -393,11 +392,10 @@ function(RELATIVE_PROTOBUF_GENERATE_CPP SRCS) - list(APPEND PROTOC_ARGS ${CMAKE_CURRENT_BINARY_DIR}) - endif() - endif() -- list(APPEND GENERATED_FILE_TARGETS ${GENERATED_FILE_WE}_proto_file) -- add_custom_target(${GENERATED_FILE_WE}_src -+ list(APPEND GENERATED_FILES "${GENERATED_PROTO}") -+ add_custom_command(OUTPUT "${OUTPUT_PB_SRC}" - COMMAND "${ONNX_PROTOC_EXECUTABLE}" ${PROTOC_ARGS} -- BYPRODUCTS "${OUTPUT_PB_SRC}" -- DEPENDS ${GENERATED_FILE_TARGETS} -+ DEPENDS ${GENERATED_FILES} - COMMENT "Running C++ protocol buffer compiler on ${GENERATED_PROTO}") - endforeach() - diff --git a/cmake/patches/onnx/onnx.patch b/cmake/patches/onnx/onnx.patch index e8ae766062d08..047cb527bb4da 100644 --- a/cmake/patches/onnx/onnx.patch +++ b/cmake/patches/onnx/onnx.patch @@ -1,5 +1,5 @@ diff --git a/CMakeLists.txt b/CMakeLists.txt -index 47995579..6cc439f6 100644 +index cc3ef140..f70312ba 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -57,6 +57,7 @@ option(ONNX_USE_LITE_PROTO "Use lite protobuf instead of full." OFF) @@ -10,7 +10,7 @@ index 47995579..6cc439f6 100644 if(WIN32) option(ONNX_USE_MSVC_STATIC_RUNTIME "Build with MSVC static runtime" OFF) endif() -@@ -411,14 +412,28 @@ relative_protobuf_generate_cpp(ONNX_PROTO_SRCS +@@ -409,14 +410,28 @@ relative_protobuf_generate_cpp(ONNX_PROTO_SRCS add_library(onnx_proto ${ONNX_PROTO_SRCS}) @@ -47,7 +47,7 @@ index 47995579..6cc439f6 100644 # Hide all symbols we don't need set_target_properties(onnx_proto PROPERTIES CXX_VISIBILITY_PRESET hidden) -@@ -440,19 +455,6 @@ add_onnx_global_defines(onnx_proto) +@@ -438,19 +453,6 @@ add_onnx_global_defines(onnx_proto) target_include_directories(onnx_proto PUBLIC $ $) @@ -68,10 +68,10 @@ index 47995579..6cc439f6 100644 if(CMAKE_SYSTEM_NAME STREQUAL "AIX") # whole-archive linker option not available on AIX. 
diff --git a/onnx/defs/nn/old.cc b/onnx/defs/nn/old.cc -index 40635f97..44770774 100644 +index ad6dd0c1..50259f32 100644 --- a/onnx/defs/nn/old.cc +++ b/onnx/defs/nn/old.cc -@@ -4090,7 +4090,6 @@ ONNX_OPERATOR_SET_SCHEMA( +@@ -4091,7 +4091,6 @@ ONNX_OPERATOR_SET_SCHEMA( GroupNormalization, 18, OpSchema() @@ -80,7 +80,7 @@ index 40635f97..44770774 100644 .Attr("epsilon", "The epsilon value to use to avoid division by zero.", AttributeProto::FLOAT, 1e-5f) .Attr( diff --git a/onnx/defs/schema.h b/onnx/defs/schema.h -index ddd95454..34647987 100644 +index 7e9bc27f..4b87c5a5 100644 --- a/onnx/defs/schema.h +++ b/onnx/defs/schema.h @@ -999,7 +999,7 @@ class OpSchemaRegistry final : public ISchemaRegistry { diff --git a/cmake/vcpkg-ports/onnx/avoid_regenerating_proto_files.patch b/cmake/vcpkg-ports/onnx/avoid_regenerating_proto_files.patch deleted file mode 100644 index 804dfeb8f59c2..0000000000000 --- a/cmake/vcpkg-ports/onnx/avoid_regenerating_proto_files.patch +++ /dev/null @@ -1,46 +0,0 @@ -diff --git a/CMakeLists.txt b/CMakeLists.txt -index 479955793..cc3ef1400 100644 ---- a/CMakeLists.txt -+++ b/CMakeLists.txt -@@ -321,7 +321,7 @@ function(RELATIVE_PROTOBUF_GENERATE_CPP SRCS) - set(${SRCS}) - - set(GEN_PROTO_PY "${ONNX_ROOT}/onnx/gen_proto.py") -- set(GENERATED_FILE_TARGETS) -+ set(GENERATED_FILES) - foreach(INFILE ${ARGN}) - set(ABS_FILE "${ONNX_ROOT}/${INFILE}") - get_filename_component(FILE_DIR ${ABS_FILE} DIRECTORY) -@@ -371,12 +371,11 @@ function(RELATIVE_PROTOBUF_GENERATE_CPP SRCS) - list(APPEND GEN_PROTO_ARGS "${ONNX_PROTOC_EXECUTABLE}") - endif() - -- add_custom_target("${GENERATED_FILE_WE}_proto_file" -- COMMAND ${ONNX_PYTHON_INTERPRETER} "${GEN_PROTO_PY}" ${GEN_PROTO_ARGS} -- BYPRODUCTS "${GENERATED_PROTO}" -- DEPENDS ${INFILE} -- COMMENT "Running gen_proto.py on ${INFILE}" -- ) -+ # Use add_custom_command to avoid re-generate of PROTO files -+ add_custom_command(OUTPUT "${GENERATED_PROTO}" -+ COMMAND ${ONNX_PYTHON_INTERPRETER} "${GEN_PROTO_PY}" ${GEN_PROTO_ARGS} -+ DEPENDS ${INFILE} -+ COMMENT "Running gen_proto.py on ${INFILE}") - message("Generated: ${GENERATED_PROTO}") - - set(PROTOC_ARGS -@@ -393,11 +392,10 @@ function(RELATIVE_PROTOBUF_GENERATE_CPP SRCS) - list(APPEND PROTOC_ARGS ${CMAKE_CURRENT_BINARY_DIR}) - endif() - endif() -- list(APPEND GENERATED_FILE_TARGETS ${GENERATED_FILE_WE}_proto_file) -- add_custom_target(${GENERATED_FILE_WE}_src -+ list(APPEND GENERATED_FILES "${GENERATED_PROTO}") -+ add_custom_command(OUTPUT "${OUTPUT_PB_SRC}" - COMMAND "${ONNX_PROTOC_EXECUTABLE}" ${PROTOC_ARGS} -- BYPRODUCTS "${OUTPUT_PB_SRC}" -- DEPENDS ${GENERATED_FILE_TARGETS} -+ DEPENDS ${GENERATED_FILES} - COMMENT "Running C++ protocol buffer compiler on ${GENERATED_PROTO}") - endforeach() - diff --git a/cmake/vcpkg-ports/onnx/binskim.patch b/cmake/vcpkg-ports/onnx/binskim.patch index e8ae766062d08..047cb527bb4da 100644 --- a/cmake/vcpkg-ports/onnx/binskim.patch +++ b/cmake/vcpkg-ports/onnx/binskim.patch @@ -1,5 +1,5 @@ diff --git a/CMakeLists.txt b/CMakeLists.txt -index 47995579..6cc439f6 100644 +index cc3ef140..f70312ba 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -57,6 +57,7 @@ option(ONNX_USE_LITE_PROTO "Use lite protobuf instead of full." 
OFF) @@ -10,7 +10,7 @@ index 47995579..6cc439f6 100644 if(WIN32) option(ONNX_USE_MSVC_STATIC_RUNTIME "Build with MSVC static runtime" OFF) endif() -@@ -411,14 +412,28 @@ relative_protobuf_generate_cpp(ONNX_PROTO_SRCS +@@ -409,14 +410,28 @@ relative_protobuf_generate_cpp(ONNX_PROTO_SRCS add_library(onnx_proto ${ONNX_PROTO_SRCS}) @@ -47,7 +47,7 @@ index 47995579..6cc439f6 100644 # Hide all symbols we don't need set_target_properties(onnx_proto PROPERTIES CXX_VISIBILITY_PRESET hidden) -@@ -440,19 +455,6 @@ add_onnx_global_defines(onnx_proto) +@@ -438,19 +453,6 @@ add_onnx_global_defines(onnx_proto) target_include_directories(onnx_proto PUBLIC $ $) @@ -68,10 +68,10 @@ index 47995579..6cc439f6 100644 if(CMAKE_SYSTEM_NAME STREQUAL "AIX") # whole-archive linker option not available on AIX. diff --git a/onnx/defs/nn/old.cc b/onnx/defs/nn/old.cc -index 40635f97..44770774 100644 +index ad6dd0c1..50259f32 100644 --- a/onnx/defs/nn/old.cc +++ b/onnx/defs/nn/old.cc -@@ -4090,7 +4090,6 @@ ONNX_OPERATOR_SET_SCHEMA( +@@ -4091,7 +4091,6 @@ ONNX_OPERATOR_SET_SCHEMA( GroupNormalization, 18, OpSchema() @@ -80,7 +80,7 @@ index 40635f97..44770774 100644 .Attr("epsilon", "The epsilon value to use to avoid division by zero.", AttributeProto::FLOAT, 1e-5f) .Attr( diff --git a/onnx/defs/schema.h b/onnx/defs/schema.h -index ddd95454..34647987 100644 +index 7e9bc27f..4b87c5a5 100644 --- a/onnx/defs/schema.h +++ b/onnx/defs/schema.h @@ -999,7 +999,7 @@ class OpSchemaRegistry final : public ISchemaRegistry { diff --git a/cmake/vcpkg-ports/onnx/portfile.cmake b/cmake/vcpkg-ports/onnx/portfile.cmake index 27f5ea5fadd79..882850963a0c0 100644 --- a/cmake/vcpkg-ports/onnx/portfile.cmake +++ b/cmake/vcpkg-ports/onnx/portfile.cmake @@ -4,12 +4,9 @@ vcpkg_from_github( OUT_SOURCE_PATH SOURCE_PATH REPO onnx/onnx REF "v${VERSION}" - SHA512 e6f7b5782a43a91783607549e4d0f0a9cbd46dfb67a602f81aaffc7bcdd8f450fe9c225f0bc314704f2923e396f0df5b03ea91af4a7887203c0b8372bc2749d0 + SHA512 cf6ff4c0bb6cc16ce5f4d6267480d35f3c7a5fde94d10e1358928ff6e4ec6d756a7c5d34a500e60bbd8eb1912c8af21aa763719321b330f56a0eb6b9b810ef60 PATCHES fix-cmakelists.patch - # Patch changes from https://github.com/onnx/onnx/pull/7253 to avoid unnecessary rebuilding. - # This change should be included in ONNX 1.19.1. - avoid_regenerating_proto_files.patch fix-dependency-protobuf.patch binskim.patch ) diff --git a/cmake/vcpkg-ports/onnx/vcpkg.json b/cmake/vcpkg-ports/onnx/vcpkg.json index 350db2e35061a..ad0d1aaf15f51 100644 --- a/cmake/vcpkg-ports/onnx/vcpkg.json +++ b/cmake/vcpkg-ports/onnx/vcpkg.json @@ -1,6 +1,6 @@ { "name": "onnx", - "version-semver": "1.19.0", + "version-semver": "1.19.1", "port-version": 1, "description": "Open standard for machine learning interoperability", "homepage": "https://onnx.ai", diff --git a/docs/How_To_Update_ONNX_Dev_Notes.md b/docs/How_To_Update_ONNX_Dev_Notes.md index 8da19ddc51cb7..8c1280431c384 100644 --- a/docs/How_To_Update_ONNX_Dev_Notes.md +++ b/docs/How_To_Update_ONNX_Dev_Notes.md @@ -35,7 +35,7 @@ git add onnx 1. Modify [cmake/vcpkg-ports/onnx/binskim.patch](/cmake/vcpkg-ports/onnx/binskim.patch) to be the same as [cmake/patches/onnx/onnx.patch](/cmake/patches/onnx/onnx.patch). 2. The other patches are required/created by vcpkg repository to build ONNX. We just need to re-run diff to makes sure the patches can be applied in the updated ONNX version. 3. Update [cmake/vcpkg-ports/onnx/portfile.cmake](/cmake/vcpkg-ports/onnx/portfile.cmake) with the correct commit id and SHA512. 
(alternatively, build it with the wrong SHA and ORT should tell you the expected one.) -4. Upload your package: [Follow the instructions](https://microsoft.sharepoint.com/teams/ONNX2/_layouts/15/Doc.aspx?sourcedoc={170774be-e1c6-4f8b-a3ae-984f211fe410}&action=edit&wd=target%28Development.)one%7C63d3ab47-51d1-4a62-9965-66882234bd44%2FAdd%20or%20Update%20a%20C%2B%2B%20dependency%7Cb6ae6a97-94fc-4436-8fc6-08c21ae895da%2F%29&wdorigin=NavigationUrl +4. Upload your package: [Follow the instructions](https://microsoft.sharepoint.com/:o:/r/teams/ONNX2/_layouts/15/Doc.aspx?sourcedoc=%7B170774BE-E1C6-4F8B-A3AE-984F211FE410%7D&wd=target(Development.one%7C63D3AB47-51D1-4A62-9965-66882234BD44%2FUpdate%20a%20VCPKG%20package%7CB6AE6A97-94FC-4436-8FC6-08C21AE895DA%2F)&wdpartid=%7BB5CF19CC-40FE-0EC7-32B6-8119B427B32A%7D%7B1%7D&wdsectionfileid=%7B9DD25660-A195-48EA-B9E0-DF8B902AFDD7%7D&ovuser=72f988bf-86f1-41af-91ab-2d7cd011db47%2Ctitaiwang%40microsoft.com&clickparams=eyJBcHBOYW1lIjoiVGVhbXMtRGVza3RvcCIsIkFwcFZlcnNpb24iOiI0OS8yNTA5MTExNjAxNiIsIkhhc0ZlZGVyYXRlZFVzZXIiOmZhbHNlfQ%3D%3D&CID=fb9dcaa1-c0b5-1000-5597-c19e3adf468c&cidOR=SPO)one%7C63d3ab47-51d1-4a62-9965-66882234bd44%2FAdd%20or%20Update%20a%20C%2B%2B%20dependency%7Cb6ae6a97-94fc-4436-8fc6-08c21ae895da%2F%29&wdorigin=NavigationUrl Alternatively, directly run Terrapin to upload ONNX package (need SHA512): diff --git a/docs/python/README.rst b/docs/python/README.rst index fdef200c1d0de..f610b36958fe1 100644 --- a/docs/python/README.rst +++ b/docs/python/README.rst @@ -8,6 +8,11 @@ For more information on ONNX Runtime, please see `aka.ms/onnxruntime = 2.6.0 diff --git a/js/.nvmrc b/js/.nvmrc new file mode 100644 index 0000000000000..0a39d73000b91 --- /dev/null +++ b/js/.nvmrc @@ -0,0 +1 @@ +v24.9.0 \ No newline at end of file diff --git a/js/common/lib/inference-session.ts b/js/common/lib/inference-session.ts index 4a670e24aa6b7..09316966a2fd1 100644 --- a/js/common/lib/inference-session.ts +++ b/js/common/lib/inference-session.ts @@ -245,7 +245,23 @@ export declare namespace InferenceSession { } export interface WebGpuExecutionProviderOption extends ExecutionProviderOption { readonly name: 'webgpu'; + + /** + * Specify the preferred layout when running layout sensitive operators. + * + * @default 'NCHW' + */ preferredLayout?: 'NCHW' | 'NHWC'; + + /** + * Specify a list of node names that should be executed on CPU even when WebGPU EP is used. + */ + forceCpuNodeNames?: readonly string[]; + + /** + * Specify an optional WebGPU device to be used by the WebGPU execution provider. + */ + device?: TryGetGlobalType<'GPUDevice'>; } // #region WebNN options diff --git a/js/common/lib/version.ts b/js/common/lib/version.ts index 994eb6f4300c1..1bf7e3ff6b819 100644 --- a/js/common/lib/version.ts +++ b/js/common/lib/version.ts @@ -4,4 +4,4 @@ // This file is generated by /js/scripts/update-version.ts // Do not modify file content manually. 
-export const version = '1.23.0'; +export const version = '1.24.0'; diff --git a/js/common/package-lock.json b/js/common/package-lock.json index 12e960e239b29..8b8fe876a16d1 100644 --- a/js/common/package-lock.json +++ b/js/common/package-lock.json @@ -1,12 +1,12 @@ { "name": "onnxruntime-common", - "version": "1.23.0", + "version": "1.24.0", "lockfileVersion": 2, "requires": true, "packages": { "": { "name": "onnxruntime-common", - "version": "1.23.0", + "version": "1.24.0", "license": "MIT", "devDependencies": { "typedoc": "^0.25.7" diff --git a/js/common/package.json b/js/common/package.json index a0eff9095e6d7..df1e50f995390 100644 --- a/js/common/package.json +++ b/js/common/package.json @@ -2,7 +2,7 @@ "license": "MIT", "type": "module", "name": "onnxruntime-common", - "version": "1.23.0", + "version": "1.24.0", "repository": { "url": "https://github.com/Microsoft/onnxruntime.git", "type": "git" diff --git a/js/node/lib/version.ts b/js/node/lib/version.ts index 994eb6f4300c1..1bf7e3ff6b819 100644 --- a/js/node/lib/version.ts +++ b/js/node/lib/version.ts @@ -4,4 +4,4 @@ // This file is generated by /js/scripts/update-version.ts // Do not modify file content manually. -export const version = '1.23.0'; +export const version = '1.24.0'; diff --git a/js/node/package-lock.json b/js/node/package-lock.json index 740be4dd8d9a3..145d11ada7aa3 100644 --- a/js/node/package-lock.json +++ b/js/node/package-lock.json @@ -1,12 +1,12 @@ { "name": "onnxruntime-node", - "version": "1.23.0", + "version": "1.24.0", "lockfileVersion": 2, "requires": true, "packages": { "": { "name": "onnxruntime-node", - "version": "1.23.0", + "version": "1.24.0", "hasInstallScript": true, "license": "MIT", "os": [ @@ -30,7 +30,7 @@ }, "../common": { "name": "onnxruntime-common", - "version": "1.23.0", + "version": "1.24.0", "license": "MIT", "devDependencies": { "typedoc": "^0.25.7" diff --git a/js/node/package.json b/js/node/package.json index 5520a48aa124a..3490ae8cf0cce 100644 --- a/js/node/package.json +++ b/js/node/package.json @@ -11,7 +11,7 @@ 6 ] }, - "version": "1.23.0", + "version": "1.24.0", "dependencies": { "adm-zip": "^0.5.16", "global-agent": "^3.0.0", diff --git a/js/node/script/install-metadata-versions.js b/js/node/script/install-metadata-versions.js index 3147f90904e7a..f03a78878788b 100644 --- a/js/node/script/install-metadata-versions.js +++ b/js/node/script/install-metadata-versions.js @@ -4,4 +4,4 @@ // This file is generated by /js/scripts/update-version.ts // Do not modify file content manually. 
-module.exports = { nuget: [{ feed: 'nuget', version: '1.23.0' }] }; +module.exports = { nuget: [{ feed: 'nuget', version: '1.24.0' }] }; diff --git a/js/node/src/session_options_helper.cc b/js/node/src/session_options_helper.cc index 7fff751a29186..9f979110fd644 100644 --- a/js/node/src/session_options_helper.cc +++ b/js/node/src/session_options_helper.cc @@ -73,12 +73,37 @@ void ParseExecutionProviders(const Napi::Array epList, Ort::SessionOptions& sess for (const auto& nameIter : obj.GetPropertyNames()) { Napi::Value nameVar = nameIter.second; std::string name = nameVar.As().Utf8Value(); - if (name != "name") { - Napi::Value valueVar = obj.Get(nameVar); - ORT_NAPI_THROW_TYPEERROR_IF(!valueVar.IsString(), epList.Env(), "Invalid argument: sessionOptions.executionProviders must be a string or an object with property 'name'."); - std::string value = valueVar.As().Utf8Value(); - webgpu_options[name] = value; + Napi::Value valueVar = obj.Get(nameVar); + std::string value; + if (name == "preferredLayout" || + name == "validationMode" || + name == "storageBufferCacheMode" || + name == "uniformBufferCacheMode" || + name == "queryResolveBufferCacheMode" || + name == "defaultBufferCacheMode") { + ORT_NAPI_THROW_TYPEERROR_IF(!valueVar.IsString(), epList.Env(), + "Invalid argument: \"", name, "\" must be a string."); + value = valueVar.As().Utf8Value(); + } else if (name == "forceCpuNodeNames") { + ORT_NAPI_THROW_TYPEERROR_IF(!valueVar.IsArray(), epList.Env(), + "Invalid argument: \"forceCpuNodeNames\" must be a string array."); + auto arr = valueVar.As(); + for (uint32_t i = 0; i < arr.Length(); i++) { + Napi::Value v = arr[i]; + ORT_NAPI_THROW_TYPEERROR_IF(!v.IsString(), epList.Env(), + "Invalid argument: elements of \"forceCpuNodeNames\" must be strings."); + if (i > 0) { + value += '\n'; + } + value += v.As().Utf8Value(); + } + } else { + // unrecognized option + ORT_NAPI_THROW_TYPEERROR_IF(name != "name", epList.Env(), + "Invalid argument: WebGPU EP has an unrecognized option: '", name, "'."); + continue; } + webgpu_options[name] = value; } } #endif diff --git a/js/react_native/lib/version.ts b/js/react_native/lib/version.ts index 994eb6f4300c1..1bf7e3ff6b819 100644 --- a/js/react_native/lib/version.ts +++ b/js/react_native/lib/version.ts @@ -4,4 +4,4 @@ // This file is generated by /js/scripts/update-version.ts // Do not modify file content manually. 
-export const version = '1.23.0'; +export const version = '1.24.0'; diff --git a/js/react_native/package-lock.json b/js/react_native/package-lock.json index ec2147b2cc4ba..f83bc60642247 100644 --- a/js/react_native/package-lock.json +++ b/js/react_native/package-lock.json @@ -1,12 +1,12 @@ { "name": "onnxruntime-react-native", - "version": "1.23.0", + "version": "1.24.0", "lockfileVersion": 3, "requires": true, "packages": { "": { "name": "onnxruntime-react-native", - "version": "1.23.0", + "version": "1.24.0", "license": "MIT", "dependencies": { "buffer": "^6.0.3", @@ -31,7 +31,7 @@ }, "../common": { "name": "onnxruntime-common", - "version": "1.23.0", + "version": "1.24.0", "license": "MIT", "devDependencies": { "typedoc": "^0.25.7" diff --git a/js/react_native/package.json b/js/react_native/package.json index 7a5ee35bdb25a..e776222c56f12 100644 --- a/js/react_native/package.json +++ b/js/react_native/package.json @@ -37,7 +37,7 @@ "registry": "https://registry.npmjs.org/" }, "source": "lib/index", - "version": "1.23.0", + "version": "1.24.0", "main": "dist/commonjs/index", "homepage": "https://github.com/microsoft/onnxruntime/blob/main/js/react_native/README.md", "files": [ diff --git a/js/web/docs/webnn-operators.md b/js/web/docs/webnn-operators.md index 295aacc6fffa3..ea88f291e5597 100644 --- a/js/web/docs/webnn-operators.md +++ b/js/web/docs/webnn-operators.md @@ -46,7 +46,7 @@ platforms. Check the [WebNN status](https://webmachinelearning.github.io/webnn-s | GatherElements | ai.onnx(11-12, 13+) | gatherElements | | | GatherND | ai.onnx(11, 12, 13+) | gatherND | Only supports 'batch_dims' == 0 | | Gelu | ai.onnx(20+) | gelu | | -| Gemm | ai.onnx(7-8, 9-10, 11-12, 13+) | gemm | Only supports 1-D 'C' input | +| Gemm | ai.onnx(7-8, 9-10, 11-12, 13+) | gemm | | | GlobalAveragePool | ai.onnx(7+) | averagePool2d | Only supports 4-D input | | GlobalMaxPool | ai.onnx(7+) | maxPool2d | Only supports 4-D input | | GlobalLpPool| ai.onnx(7+) | l2Pool2d | Only supports 4-D input, 'p' value is 2 | diff --git a/js/web/lib/version.ts b/js/web/lib/version.ts index 994eb6f4300c1..1bf7e3ff6b819 100644 --- a/js/web/lib/version.ts +++ b/js/web/lib/version.ts @@ -4,4 +4,4 @@ // This file is generated by /js/scripts/update-version.ts // Do not modify file content manually. -export const version = '1.23.0'; +export const version = '1.24.0'; diff --git a/js/web/lib/wasm/session-options.ts b/js/web/lib/wasm/session-options.ts index 52d40bb403c77..d9f3ad70f0c23 100644 --- a/js/web/lib/wasm/session-options.ts +++ b/js/web/lib/wasm/session-options.ts @@ -72,9 +72,10 @@ const appendEpOption = (epOptions: Array<[number, number]>, key: string, value: const setExecutionProviders = async ( sessionOptionsHandle: number, - executionProviders: readonly InferenceSession.ExecutionProviderConfig[], + sessionOptions: InferenceSession.SessionOptions, allocs: number[], ): Promise => { + const executionProviders = sessionOptions.executionProviders!; for (const ep of executionProviders) { let epName = typeof ep === 'string' ? 
ep : ep.name; const epOptions: Array<[number, number]> = []; @@ -98,16 +99,36 @@ const setExecutionProviders = async ( let customDevice: GPUDevice | undefined; if (typeof ep !== 'string') { - const customOptions = ep as unknown as { device: GPUDevice }; - if (customOptions.device) { - if (typeof GPUDevice !== 'undefined' && customOptions.device instanceof GPUDevice) { - customDevice = customOptions.device; + const webgpuOptions = ep as InferenceSession.WebGpuExecutionProviderOption; + + // set custom GPU device + if (webgpuOptions.device) { + if (typeof GPUDevice !== 'undefined' && webgpuOptions.device instanceof GPUDevice) { + customDevice = webgpuOptions.device; } else { throw new Error('Invalid GPU device set in WebGPU EP options.'); } } - // TODO: handle more options + // set graph capture option from session options + const { enableGraphCapture } = sessionOptions; + if (typeof enableGraphCapture === 'boolean' && enableGraphCapture) { + appendEpOption(epOptions, 'enableGraphCapture', '1', allocs); + } + + // set layout option + if (typeof webgpuOptions.preferredLayout === 'string') { + appendEpOption(epOptions, 'preferredLayout', webgpuOptions.preferredLayout, allocs); + } + + // set force CPU fallback nodes + if (webgpuOptions.forceCpuNodeNames) { + const names = Array.isArray(webgpuOptions.forceCpuNodeNames) + ? webgpuOptions.forceCpuNodeNames + : [webgpuOptions.forceCpuNodeNames]; + + appendEpOption(epOptions, 'forceCpuNodeNames', names.join('\n'), allocs); + } } const info = getInstance().webgpuRegisterDevice!(customDevice); @@ -211,7 +232,7 @@ export const setSessionOptions = async (options?: InferenceSession.SessionOption } if (sessionOptions.executionProviders) { - await setExecutionProviders(sessionOptionsHandle, sessionOptions.executionProviders, allocs); + await setExecutionProviders(sessionOptionsHandle, sessionOptions, allocs); } if (sessionOptions.enableGraphCapture !== undefined) { diff --git a/js/web/package-lock.json b/js/web/package-lock.json index 2b0a353b59832..86438200886e3 100644 --- a/js/web/package-lock.json +++ b/js/web/package-lock.json @@ -1,12 +1,12 @@ { "name": "onnxruntime-web", - "version": "1.23.0", + "version": "1.24.0", "lockfileVersion": 2, "requires": true, "packages": { "": { "name": "onnxruntime-web", - "version": "1.23.0", + "version": "1.24.0", "license": "MIT", "dependencies": { "flatbuffers": "^25.1.24", @@ -50,7 +50,7 @@ }, "../common": { "name": "onnxruntime-common", - "version": "1.23.0", + "version": "1.24.0", "license": "MIT", "devDependencies": { "typedoc": "^0.25.7" diff --git a/js/web/package.json b/js/web/package.json index d5425931bfc9e..ecd87fab4302b 100644 --- a/js/web/package.json +++ b/js/web/package.json @@ -7,7 +7,7 @@ "type": "git" }, "author": "fs-eire", - "version": "1.23.0", + "version": "1.24.0", "jsdelivr": "dist/ort.min.js", "dependencies": { "flatbuffers": "^25.1.24", diff --git a/js/web/test/e2e/exports/testcases/vite-default/package-lock.json b/js/web/test/e2e/exports/testcases/vite-default/package-lock.json index 48f0a8f3e9d5c..e880f6bca2ac4 100644 --- a/js/web/test/e2e/exports/testcases/vite-default/package-lock.json +++ b/js/web/test/e2e/exports/testcases/vite-default/package-lock.json @@ -12,7 +12,7 @@ }, "devDependencies": { "@vitejs/plugin-vue": "^5.2.1", - "vite": "^6.3.5" + "vite": "^6.3.6" } }, "node_modules/@babel/helper-string-parser": { @@ -1114,9 +1114,9 @@ } }, "node_modules/vite": { - "version": "6.3.5", - "resolved": "https://registry.npmjs.org/vite/-/vite-6.3.5.tgz", - "integrity": 
"sha512-cZn6NDFE7wdTpINgs++ZJ4N49W2vRp8LCKrn3Ob1kYNtOo21vfDoaV5GzBfLU4MovSAB8uNRm4jgzVQZ+mBzPQ==", + "version": "6.3.6", + "resolved": "https://registry.npmjs.org/vite/-/vite-6.3.6.tgz", + "integrity": "sha512-0msEVHJEScQbhkbVTb/4iHZdJ6SXp/AvxL2sjwYQFfBqleHtnCqv1J3sa9zbWz/6kW1m9Tfzn92vW+kZ1WV6QA==", "dev": true, "license": "MIT", "dependencies": { diff --git a/js/web/test/e2e/exports/testcases/vite-default/package.json b/js/web/test/e2e/exports/testcases/vite-default/package.json index f7d5751354905..84013e2aecb88 100644 --- a/js/web/test/e2e/exports/testcases/vite-default/package.json +++ b/js/web/test/e2e/exports/testcases/vite-default/package.json @@ -13,6 +13,6 @@ }, "devDependencies": { "@vitejs/plugin-vue": "^5.2.1", - "vite": "^6.3.5" + "vite": "^6.3.6" } } diff --git a/js/web/test/suite-test-list.jsonc b/js/web/test/suite-test-list.jsonc index 3f1face2a043c..80991a3ebbb5f 100644 --- a/js/web/test/suite-test-list.jsonc +++ b/js/web/test/suite-test-list.jsonc @@ -2147,66 +2147,66 @@ "test_reduce_log_sum_default", "test_reduce_log_sum_desc_axes", // tests "test_reduce_log_sum_exp_*" on opset17/opset18 are excluded because they use float64. - // "opset{7,8,9}/test_reduce_log_sum_exp_default_axes_keepdims_example", - // "opset{7,8,9}/test_reduce_log_sum_exp_default_axes_keepdims_random", - // "opset{7,8,9}/test_reduce_log_sum_exp_do_not_keepdims_example", - // "opset{7,8,9}/test_reduce_log_sum_exp_do_not_keepdims_random", - // "opset{7,8,9}/test_reduce_log_sum_exp_keepdims_example", - // "opset{7,8,9}/test_reduce_log_sum_exp_keepdims_random", - // "opset11/test_reduce_log_sum_exp_negative_axes_keepdims_example", - // "opset11/test_reduce_log_sum_exp_negative_axes_keepdims_random", + "opset{7,8,9}/test_reduce_log_sum_exp_default_axes_keepdims_example", + "opset{7,8,9}/test_reduce_log_sum_exp_default_axes_keepdims_random", + "opset{7,8,9}/test_reduce_log_sum_exp_do_not_keepdims_example", + "opset{7,8,9}/test_reduce_log_sum_exp_do_not_keepdims_random", + "opset{7,8,9}/test_reduce_log_sum_exp_keepdims_example", + "opset{7,8,9}/test_reduce_log_sum_exp_keepdims_random", + "opset11/test_reduce_log_sum_exp_negative_axes_keepdims_example", + "opset11/test_reduce_log_sum_exp_negative_axes_keepdims_random", "test_reduce_log_sum_negative_axes", "test_reduce_log_sum", "test_reduce_max_default_axes_keepdim_example", - // "test_reduce_max_default_axes_keepdims_random", - // "test_reduce_max_do_not_keepdims_example", - // "test_reduce_max_do_not_keepdims_random", - // "test_reduce_max_keepdims_example", - // "test_reduce_max_keepdims_random", - // "test_reduce_max_negative_axes_keepdims_example", - // "test_reduce_max_negative_axes_keepdims_random", - // "test_reduce_mean_default_axes_keepdims_example", - // "test_reduce_mean_default_axes_keepdims_random", - // "test_reduce_mean_do_not_keepdims_example", - // "test_reduce_mean_do_not_keepdims_random", - // "test_reduce_mean_keepdims_example", - // "test_reduce_mean_keepdims_random", - // "test_reduce_mean_negative_axes_keepdims_example", - // "test_reduce_mean_negative_axes_keepdims_random", - // "test_reduce_min_default_axes_keepdims_example", - // "test_reduce_min_default_axes_keepdims_random", - // "test_reduce_min_do_not_keepdims_example", - // "test_reduce_min_do_not_keepdims_random", - // "test_reduce_min_keepdims_example", - // "test_reduce_min_keepdims_random", - // "test_reduce_min_negative_axes_keepdims_example", - // "test_reduce_min_negative_axes_keepdims_random", - // "test_reduce_prod_default_axes_keepdims_example", - // 
"test_reduce_prod_default_axes_keepdims_random", - // "test_reduce_prod_do_not_keepdims_example", - // "test_reduce_prod_do_not_keepdims_random", - // "test_reduce_prod_keepdims_example", - // "test_reduce_prod_keepdims_random", - // "test_reduce_prod_negative_axes_keepdims_example", - // "test_reduce_prod_negative_axes_keepdims_random", - // "test_reduce_sum_default_axes_keepdims_example", - // "test_reduce_sum_default_axes_keepdims_random", - // "test_reduce_sum_do_not_keepdims_example", - // "test_reduce_sum_do_not_keepdims_random", + "test_reduce_max_default_axes_keepdims_random", + "test_reduce_max_do_not_keepdims_example", + "test_reduce_max_do_not_keepdims_random", + "test_reduce_max_keepdims_example", + "test_reduce_max_keepdims_random", + "test_reduce_max_negative_axes_keepdims_example", + "test_reduce_max_negative_axes_keepdims_random", + "test_reduce_mean_default_axes_keepdims_example", + "test_reduce_mean_default_axes_keepdims_random", + "test_reduce_mean_do_not_keepdims_example", + "test_reduce_mean_do_not_keepdims_random", + "test_reduce_mean_keepdims_example", + "test_reduce_mean_keepdims_random", + "test_reduce_mean_negative_axes_keepdims_example", + "test_reduce_mean_negative_axes_keepdims_random", + "test_reduce_min_default_axes_keepdims_example", + "test_reduce_min_default_axes_keepdims_random", + "test_reduce_min_do_not_keepdims_example", + "test_reduce_min_do_not_keepdims_random", + "test_reduce_min_keepdims_example", + "test_reduce_min_keepdims_random", + "test_reduce_min_negative_axes_keepdims_example", + "test_reduce_min_negative_axes_keepdims_random", + "test_reduce_prod_default_axes_keepdims_example", + "test_reduce_prod_default_axes_keepdims_random", + "test_reduce_prod_do_not_keepdims_example", + "test_reduce_prod_do_not_keepdims_random", + "test_reduce_prod_keepdims_example", + "test_reduce_prod_keepdims_random", + "test_reduce_prod_negative_axes_keepdims_example", + "test_reduce_prod_negative_axes_keepdims_random", + "test_reduce_sum_default_axes_keepdims_example", + "test_reduce_sum_default_axes_keepdims_random", + "test_reduce_sum_do_not_keepdims_example", + "test_reduce_sum_do_not_keepdims_random", "test_reduce_sum_empty_axes_input_noop_example", "test_reduce_sum_empty_axes_input_noop_random", - // "test_reduce_sum_keepdims_example", - // "test_reduce_sum_keepdims_random", - // "test_reduce_sum_negative_axes_keepdims_example", - // "test_reduce_sum_negative_axes_keepdims_random", - // "test_reduce_sum_square_default_axes_keepdims_example", - // "test_reduce_sum_square_default_axes_keepdims_random", - // "test_reduce_sum_square_do_not_keepdims_example", - // "test_reduce_sum_square_do_not_keepdims_random", - // "test_reduce_sum_square_keepdims_example", - // "test_reduce_sum_square_keepdims_random", - // "test_reduce_sum_square_negative_axes_keepdims_example", - // "test_reduce_sum_square_negative_axes_keepdims_random", + "test_reduce_sum_keepdims_example", + "test_reduce_sum_keepdims_random", + "test_reduce_sum_negative_axes_keepdims_example", + "test_reduce_sum_negative_axes_keepdims_random", + "test_reduce_sum_square_default_axes_keepdims_example", + "test_reduce_sum_square_default_axes_keepdims_random", + "test_reduce_sum_square_do_not_keepdims_example", + "test_reduce_sum_square_do_not_keepdims_random", + "test_reduce_sum_square_keepdims_example", + "test_reduce_sum_square_keepdims_random", + "test_reduce_sum_square_negative_axes_keepdims_example", + "test_reduce_sum_square_negative_axes_keepdims_random", // "test_reflect_pad", "test_relu", 
"test_reshape_allowzero_reordered", diff --git a/onnxruntime/__init__.py b/onnxruntime/__init__.py index 8b019f60d3e99..6f303acb4e97a 100644 --- a/onnxruntime/__init__.py +++ b/onnxruntime/__init__.py @@ -8,7 +8,7 @@ or the `Github project `_. """ -__version__ = "1.23.0" +__version__ = "1.24.0" __author__ = "Microsoft" # we need to do device version validation (for example to check Cuda version for an onnxruntime-training package). diff --git a/onnxruntime/contrib_ops/cpu/sparse/sparse_attention.cc b/onnxruntime/contrib_ops/cpu/sparse/sparse_attention.cc index 469084e7b4491..c51fc1cf54815 100644 --- a/onnxruntime/contrib_ops/cpu/sparse/sparse_attention.cc +++ b/onnxruntime/contrib_ops/cpu/sparse/sparse_attention.cc @@ -130,6 +130,11 @@ Status SparseAttention::Compute(OpKernelContext* context) const { allocator, batch_size, kv_num_heads_, sequence_length, head_size, value, V)); } + OrtValue RotaryQKV; + OrtValue RotaryQ; + OrtValue RotaryK; + T* q_rotary = Q.GetMutable()->MutableData(); + T* k_rotary = packed_qkv ? nullptr : K.GetMutable()->MutableData(); if (do_rotary_) { rotary_embedding_helper::RotaryParameters rotary_params = {}; rotary_params.batch_size = batch_size; @@ -167,30 +172,22 @@ Status SparseAttention::Compute(OpKernelContext* context) const { const T* q_input; const T* k_input; - T* q_rotary; - T* k_rotary; if (packed_qkv) { - OrtValue RotaryQKV; TensorShape qkv_shape({batch_size, num_heads_ + 2 * kv_num_heads_, sequence_length, head_size}); Tensor::InitOrtValue(element_type, qkv_shape, allocator, RotaryQKV); q_input = Q.Get().Data(); k_input = q_input + num_heads_ * sequence_length * head_size; q_rotary = RotaryQKV.GetMutable()->MutableData(); k_rotary = q_rotary + num_heads_ * sequence_length * head_size; - Q = RotaryQKV; } else { - OrtValue RotaryQ; TensorShape q_shape({batch_size, num_heads_, sequence_length, head_size}); Tensor::InitOrtValue(element_type, q_shape, allocator, RotaryQ); - OrtValue RotaryK; TensorShape k_shape({batch_size, kv_num_heads_, sequence_length, head_size}); Tensor::InitOrtValue(element_type, k_shape, allocator, RotaryK); q_input = Q.Get().Data(); k_input = K.Get().Data(); q_rotary = RotaryQ.GetMutable()->MutableData(); k_rotary = RotaryK.GetMutable()->MutableData(); - Q = RotaryQ; - K = RotaryK; } ORT_RETURN_IF_ERROR(RunRotaryEmbedding(tp, rotary_params, q_input, @@ -221,9 +218,8 @@ Status SparseAttention::Compute(OpKernelContext* context) const { ORT_RETURN_IF_ERROR(context->GetTempSpaceAllocator(&allocator)); // Compute the attention score and apply the score to V - return ApplyAttention(Q.Get().Data(), packed_qkv ? nullptr : K.Get().Data(), - packed_qkv ? nullptr : V.Get().Data(), past_key, past_value, - output, present_key, present_value, + return ApplyAttention(q_rotary, packed_qkv ? nullptr : k_rotary, packed_qkv ? 
nullptr : V.Get().Data(), + past_key, past_value, output, present_key, present_value, total_key_lengths, block_row_indices, block_col_indices, parameters, allocator, context); } } // namespace contrib diff --git a/onnxruntime/contrib_ops/cuda/llm/fpA_intB_gemm/launchers/fpA_intB_launcher_sm90.inl b/onnxruntime/contrib_ops/cuda/llm/fpA_intB_gemm/launchers/fpA_intB_launcher_sm90.inl index 4f901a550e8bf..588f37051b534 100644 --- a/onnxruntime/contrib_ops/cuda/llm/fpA_intB_gemm/launchers/fpA_intB_launcher_sm90.inl +++ b/onnxruntime/contrib_ops/cuda/llm/fpA_intB_gemm/launchers/fpA_intB_launcher_sm90.inl @@ -60,7 +60,7 @@ namespace cutlass_kernels { template -#ifdef COMPILE_HOPPER_TMA_GEMMS +#if defined(COMPILE_HOPPER_TMA_GEMMS) && defined(__CUDA_ARCH__) && (__CUDA_ARCH__ == 900) && defined(__NV_SASS_VERSION__) void sm90_generic_mixed_gemm_kernelLauncher( ActivationType const* A, WeightType const* B, ScaleZeroType const* weight_scales, ScaleZeroType const* weight_zero_points, BiasType const* biases, @@ -269,6 +269,7 @@ void sm90_generic_mixed_gemm_kernelLauncher( } } #else // COMPILE_HOPPER_TMA_GEMMS +// This stub is now used for ALL non-SASS or non-SM90A compilation passes, including the 90-virtual (PTX) pass. void sm90_generic_mixed_gemm_kernelLauncher(ActivationType const*, WeightType const*, ScaleZeroType const*, ScaleZeroType const*, BiasType const*, float const, OutputType*, int, int, int, int const, tkc::CutlassGemmConfig, diff --git a/onnxruntime/contrib_ops/cuda/llm/fpA_intB_gemm_profiler.cc b/onnxruntime/contrib_ops/cuda/llm/fpA_intB_gemm_profiler.cc index 925a6913a2890..e5b15856a6c05 100644 --- a/onnxruntime/contrib_ops/cuda/llm/fpA_intB_gemm_profiler.cc +++ b/onnxruntime/contrib_ops/cuda/llm/fpA_intB_gemm_profiler.cc @@ -14,6 +14,7 @@ * See the License for the specific language governing permissions and * limitations under the License.
*/ +#if USE_FPA_INTB_GEMM #include "contrib_ops/cuda/llm/fpA_intB_gemm_profiler.h" #include "contrib_ops/cuda/llm/common/workspace.h" @@ -97,3 +98,4 @@ bool WeightOnlyGroupwiseQuantGemmPluginProfiler::checkTactic(int m, int /*n*/, i } } // namespace onnxruntime::llm::kernels::weight_only +#endif diff --git a/onnxruntime/contrib_ops/webgpu/bert/flash_attention.cc b/onnxruntime/contrib_ops/webgpu/bert/flash_attention.cc index b5c1f73d1678d..a9bd4afc5cd09 100644 --- a/onnxruntime/contrib_ops/webgpu/bert/flash_attention.cc +++ b/onnxruntime/contrib_ops/webgpu/bert/flash_attention.cc @@ -31,6 +31,11 @@ Status CopyKVCacheProgram::GenerateShaderCode(ShaderHelper& shader) const { const auto& present_key = shader.AddOutput("present_key", ShaderUsage::UseUniform | ShaderUsage::UseIndicesTypeAlias); const auto& present_value = shader.AddOutput("present_value", ShaderUsage::UseUniform); const auto& copy_kv_shape = shader.AddIndices("copy_kv_shape"); + // If prepare_indirect_dispatch is enabled, add seqlen_k input and indirect_buffer output + if (prepare_indirect_dispatch_) { + shader.AddInput("seqlen_k", ShaderUsage::None); + shader.AddOutput("indirect_buffer", ShaderUsage::None); + } shader.MainFunctionBody() << shader.GuardAgainstOutOfBoundsWorkgroupSizes("uniforms.copy_size") << " let output_indices = " << copy_kv_shape.OffsetToIndices("global_idx") << ";\n" @@ -38,8 +43,26 @@ Status CopyKVCacheProgram::GenerateShaderCode(ShaderHelper& shader) const { " let sequence_id = output_indices[2];\n" " let num_head_id = output_indices[1];\n" " let batch = output_indices[0];\n"; + if (prepare_indirect_dispatch_) { + shader.MainFunctionBody() << " let total_seq_length = u32(seqlen_k[0u]) + 1u;\n"; + } else { + shader.MainFunctionBody() << " let total_seq_length = uniforms.total_sequence_length;\n"; + } + + // Add indirect dispatch logic for thread 0 + if (prepare_indirect_dispatch_) { + // TODO: Add NormalizeDispatchGroupSize logic here to avoid exceeding max dispatch size. + shader.MainFunctionBody() << " // Prepare indirect dispatch buffer for thread 0\n" + << " if (global_idx == 0u) {\n" + << " let num_total_seq_length_tile = (total_seq_length + uniforms.tile_size - 1u) / uniforms.tile_size;\n" + << " indirect_buffer[0] = num_total_seq_length_tile;\n" + << " indirect_buffer[1] = uniforms.num_heads;\n" + << " indirect_buffer[2] = 1u;\n" + << " }\n\n"; + } + if (has_past_) { - shader.MainFunctionBody() << "let past_sequence_length = uniforms.past_sequence_length;\n"; + shader.MainFunctionBody() << "let past_sequence_length = total_seq_length - uniforms.kv_sequence_length;\n"; if (past_present_share_buffer_) { shader.MainFunctionBody() << " let present_offset = " << present_key.IndicesToOffset("present_key_indices_t(batch, num_head_id, past_sequence_length + sequence_id, head_size_id)") << ";\n" << " let offset = " << key.IndicesToOffset(kv_BNSH_ ? 
"key_indices_t(batch, num_head_id, sequence_id, head_size_id)" : "key_indices_t(batch, sequence_id, num_head_id, head_size_id)") << ";\n" @@ -70,10 +93,12 @@ Status CopyKVCacheProgram::GenerateShaderCode(ShaderHelper& shader) const { Status CopyKVCache(onnxruntime::webgpu::ComputeContext& context, const WebgpuAttentionParameters& parameters, const Tensor* K, const Tensor* past_key, Tensor* present_key, - const Tensor* V, const Tensor* past_value, Tensor* present_value) { + const Tensor* V, const Tensor* past_value, Tensor* present_value, + uint32_t tile_size, const Tensor* seqlen_k, Tensor* indirect_buffer) { // CopyKVCache takes past key/value and current key/value and copies them to present key and value. // This makes it so that FlashAttention only needs to look at present key and value, and saves // number of input buffers in the shader, which we run out of (<=8) without this optimization. + // If indirect_buffer is provided, also prepare indirect dispatch buffer for flash attention. const int components = parameters.head_size_ % 4 == 0 ? 4 : (parameters.head_size_ % 2 == 0 ? 2 : 1); bool has_past = (parameters.total_sequence_length_ - parameters.kv_sequence_length_) > 0; // parameters.total_sequence_length_ is past_sequence_length + kv_sequence_length. @@ -83,7 +108,12 @@ Status CopyKVCache(onnxruntime::webgpu::ComputeContext& context, const WebgpuAtt int copy_sequence_length = has_past && parameters.past_present_share_buffer_ ? parameters.kv_sequence_length_ : parameters.total_sequence_length_; TensorShape copy_kv_shape{parameters.batch_size_, num_heads, copy_sequence_length, parameters.head_size_ / components}; int64_t copy_size = copy_kv_shape.Size(); - CopyKVCacheProgram program{"CopyKVCache", has_past, parameters.qkv_format_ == Q_K_V_BSNH_BNSH_BNSH, parameters.past_present_share_buffer_}; + + // Determine if we need to prepare indirect dispatch + bool prepare_indirect_dispatch = (indirect_buffer != nullptr); + + CopyKVCacheProgram program{"CopyKVCache", has_past, parameters.qkv_format_ == Q_K_V_BSNH_BNSH_BNSH, parameters.past_present_share_buffer_, + prepare_indirect_dispatch}; if (parameters.qkv_format_ == Q_K_V_BSNH_BNSH_BNSH) { program.AddInputs({{K, ProgramTensorMetadataDependency::TypeAndRank, components}, {V, ProgramTensorMetadataDependency::TypeAndRank, components}}); @@ -94,20 +124,31 @@ Status CopyKVCache(onnxruntime::webgpu::ComputeContext& context, const WebgpuAtt program.AddInputs({{K, ProgramTensorMetadataDependency::TypeAndRank, reshaped_KV_shape, components}, {V, ProgramTensorMetadataDependency::TypeAndRank, reshaped_KV_shape, components}}); } + + if (prepare_indirect_dispatch) { + program.AddInput({seqlen_k, ProgramTensorMetadataDependency::None}); + } + if (has_past && !parameters.past_present_share_buffer_) { program.AddInputs({{past_key, ProgramTensorMetadataDependency::TypeAndRank, components}, {past_value, ProgramTensorMetadataDependency::TypeAndRank, components}}); } program.AddOutputs({{present_key, ProgramTensorMetadataDependency::Rank, components}, - {present_value, ProgramTensorMetadataDependency::Rank, components}}) - .AddIndices(std::move(copy_kv_shape)); + {present_value, ProgramTensorMetadataDependency::Rank, components}}); + + if (prepare_indirect_dispatch) { + program.AddOutput({indirect_buffer, ProgramTensorMetadataDependency::None}); + } + + program.AddIndices(std::move(copy_kv_shape)); program.SetDispatchGroupSize(static_cast((copy_size + 63) / 64)) .SetWorkgroupSize(64) - .CacheHint(has_past, parameters.qkv_format_, 
parameters.past_present_share_buffer_) + .CacheHint(has_past, parameters.qkv_format_, parameters.past_present_share_buffer_, prepare_indirect_dispatch) .AddUniformVariables({{static_cast(copy_size)}, - // Note that when parameters.past_present_share_buffer_ is true, parameters.past_sequence_length_ will become to - // max_sequence_length. To get a valid past_sequence_length, we use total_sequence_length - kv_sequence_length. - {static_cast(parameters.total_sequence_length_ - parameters.kv_sequence_length_)}}); + {static_cast(parameters.total_sequence_length_)}, + {static_cast(parameters.kv_sequence_length_)}, + {tile_size}, + {static_cast(parameters.num_heads_)}}); return context.RunProgram(program); } @@ -147,6 +188,9 @@ Status FlashAttentionProgram::GenerateShaderCode(ShaderHelper& shader) const { Status FlashAttentionDecodeQKTProgram::GenerateShaderCode(ShaderHelper& shader) const { shader.AddInput("q", ShaderUsage::UseUniform | ShaderUsage::UseValueTypeAlias | ShaderUsage::UseElementTypeAlias); shader.AddInput("present_key", ShaderUsage::UseUniform | ShaderUsage::UseValueTypeAlias); + if (use_indirect_dispatch_) { + shader.AddInput("seqlens_k", ShaderUsage::None); + } if (has_attention_bias_) { shader.AddInput("attention_bias", ShaderUsage::UseUniform); } @@ -159,23 +203,25 @@ Status FlashAttentionDecodeQKTProgram::GenerateShaderCode(ShaderHelper& shader) WGSL_TEMPLATE_PARAMETER(has_attention_bias, has_attention_bias_), WGSL_TEMPLATE_PARAMETER(sub_tile_count, sub_tile_count), WGSL_TEMPLATE_PARAMETER(tile_size, tile_size_), - WGSL_TEMPLATE_PARAMETER(tile_size_k_vec, tile_size_k_vec)); + WGSL_TEMPLATE_PARAMETER(tile_size_k_vec, tile_size_k_vec), + WGSL_TEMPLATE_PARAMETER(use_indirect_dispatch, use_indirect_dispatch_)); } Status ComputeFlashAttentionDecodeQKT(onnxruntime::webgpu::ComputeContext& context, const Tensor* Q, - const Tensor* attention_bias, Tensor* output, Tensor* present_key, Tensor* metadata, - const WebgpuAttentionParameters& parameters, uint32_t num_total_seq_length_tile, - uint32_t num_present_sequence_length_tile, uint32_t tile_size, - uint32_t present_sequence_length) { + const Tensor* attention_bias, Tensor* output, Tensor* present_key, Tensor* metadata, const Tensor* seqlen_k, + const WebgpuAttentionParameters& parameters, const Tensor* indirect_buffer, uint32_t num_total_seq_length_tile, uint32_t num_present_sequence_length_tile, uint32_t tile_size, bool use_indirect_dispatch, uint32_t present_sequence_length) { const float alpha = parameters.scale_ == 0.0f ? 
1.f / sqrt(static_cast(parameters.head_size_)) : parameters.scale_; const bool has_attention_bias = attention_bias != nullptr; const int components = 4; - FlashAttentionDecodeQKTProgram program{"FlashAttentionDecodeQKT", has_attention_bias, tile_size}; + FlashAttentionDecodeQKTProgram program{"FlashAttentionDecodeQKT", has_attention_bias, tile_size, use_indirect_dispatch}; program.AddInputs({{Q, ProgramTensorMetadataDependency::TypeAndRank, components}, {present_key, ProgramTensorMetadataDependency::TypeAndRank, components}}); + if (use_indirect_dispatch) { + program.AddInput({seqlen_k, ProgramTensorMetadataDependency::None}); + } if (has_attention_bias) { program.AddInput({attention_bias, ProgramTensorMetadataDependency::TypeAndRank}); } @@ -183,15 +229,18 @@ Status ComputeFlashAttentionDecodeQKT(onnxruntime::webgpu::ComputeContext& conte {metadata, ProgramTensorMetadataDependency::Rank, 2}}); const uint32_t vectorized_head_size = parameters.head_size_ / components; - program.SetDispatchGroupSize(parameters.num_heads_ * num_total_seq_length_tile) - .SetWorkgroupSize(64) - .CacheHint(tile_size, has_attention_bias) + if (use_indirect_dispatch) { + program.SetIndirectDispatchTensor(indirect_buffer); + } else { + program.SetDispatchGroupSize(parameters.num_heads_ * num_total_seq_length_tile); + } + program.SetWorkgroupSize(64) + .CacheHint(tile_size, has_attention_bias, use_indirect_dispatch) .AddUniformVariables({{static_cast(vectorized_head_size)}, {static_cast(parameters.total_sequence_length_)}, {static_cast(alpha)}, present_sequence_length, {static_cast(parameters.n_reps)}, - {num_total_seq_length_tile}, {num_present_sequence_length_tile}, {static_cast(parameters.num_heads_)}}); @@ -202,6 +251,9 @@ Status FlashAttentionDecodeSplitVxProgram::GenerateShaderCode(ShaderHelper& shad shader.AddInput("metadata", ShaderUsage::UseUniform); shader.AddInput("qk", ShaderUsage::UseUniform | ShaderUsage::UseValueTypeAlias); shader.AddInput("present_value", ShaderUsage::UseUniform | ShaderUsage::UseValueTypeAlias | ShaderUsage::UseElementTypeAlias); + if (use_indirect_dispatch_) { + shader.AddInput("seqlens_k", ShaderUsage::None); + } shader.AddOutput("out_split_vx", ShaderUsage::UseUniform); const uint32_t tile_size_k_vec = 8u; @@ -210,7 +262,8 @@ Status FlashAttentionDecodeSplitVxProgram::GenerateShaderCode(ShaderHelper& shad WGSL_TEMPLATE_PARAMETER(head_size_vec, head_size_vec_), WGSL_TEMPLATE_PARAMETER(sub_tile_count, WorkgroupSizeX() / tile_size_k_vec), WGSL_TEMPLATE_PARAMETER(tile_size, tile_size_), - WGSL_TEMPLATE_PARAMETER(tile_size_k_vec, tile_size_k_vec)); + WGSL_TEMPLATE_PARAMETER(tile_size_k_vec, tile_size_k_vec), + WGSL_TEMPLATE_PARAMETER(use_indirect_dispatch, use_indirect_dispatch_)); } Status ComputeFlashAttentionDecodeSplitVxScore(onnxruntime::webgpu::ComputeContext& context, @@ -218,26 +271,33 @@ Status ComputeFlashAttentionDecodeSplitVxScore(onnxruntime::webgpu::ComputeConte const Tensor* qk, Tensor* out_split_vx, Tensor* present_value, + const Tensor* seqlen_k, const WebgpuAttentionParameters& parameters, + const Tensor* indirect_buffer, uint32_t num_total_seq_length_tile, uint32_t num_present_sequence_length_tile, uint32_t tile_size, + bool use_indirect_dispatch, uint32_t present_sequence_length) { const int components = 4; int head_size_vec = parameters.v_head_size_ / components; - FlashAttentionDecodeSplitVxProgram program{"FlashAttentionDecodeSplitVx", tile_size, head_size_vec}; + FlashAttentionDecodeSplitVxProgram program{"FlashAttentionDecodeSplitVx", tile_size, head_size_vec, 
use_indirect_dispatch}; program.AddInputs({{metadata, ProgramTensorMetadataDependency::TypeAndRank, 2}, {qk, ProgramTensorMetadataDependency::TypeAndRank}, {present_value, ProgramTensorMetadataDependency::TypeAndRank, components}}); program.AddOutputs({{out_split_vx, ProgramTensorMetadataDependency::TypeAndRank, components}}); // [B, N, split_k, head_size] - program.SetDispatchGroupSize(parameters.num_heads_ * num_total_seq_length_tile) - .CacheHint(tile_size, head_size_vec) + if (use_indirect_dispatch) { + program.AddInput({seqlen_k, ProgramTensorMetadataDependency::None}) + .SetIndirectDispatchTensor(indirect_buffer); + } else { + program.SetDispatchGroupSize(parameters.num_heads_ * num_total_seq_length_tile); + } + program.CacheHint(tile_size, head_size_vec, use_indirect_dispatch) .SetWorkgroupSize(64) .AddUniformVariables({{static_cast(parameters.total_sequence_length_)}, {static_cast(head_size_vec)}, present_sequence_length, {static_cast(parameters.n_reps)}, - num_total_seq_length_tile, num_present_sequence_length_tile, {static_cast(parameters.num_heads_)}}); @@ -246,27 +306,38 @@ Status ComputeFlashAttentionDecodeSplitVxScore(onnxruntime::webgpu::ComputeConte Status FlashAttentionDecodeVxReduceProgram::GenerateShaderCode(ShaderHelper& shader) const { shader.AddInput("input", ShaderUsage::UseUniform); + if (use_indirect_dispatch_) { + shader.AddInput("seqlens_k", ShaderUsage::None); + } shader.AddOutput("output", ShaderUsage::UseUniform | ShaderUsage::UseValueTypeAlias); return WGSL_TEMPLATE_APPLY(shader, "bert/flash_attention_decode_vx_reduce.wgsl.template", - WGSL_TEMPLATE_PARAMETER(tile_size, tile_size_)); + WGSL_TEMPLATE_PARAMETER(seq_tile_size, seq_tile_size_), + WGSL_TEMPLATE_PARAMETER(tile_size, tile_size_), + WGSL_TEMPLATE_PARAMETER(use_indirect_dispatch, use_indirect_dispatch_)); } Status ComputeFlashAttentionDecodeVxReduce(onnxruntime::webgpu::ComputeContext& context, const Tensor* out_split_vx, Tensor* output, + const Tensor* seqlen_k, const WebgpuAttentionParameters& parameters, uint32_t num_total_seq_length_tile, - uint32_t num_present_sequence_length_tile) { + uint32_t num_present_sequence_length_tile, + uint32_t seq_tile_size, + bool use_indirect_dispatch) { const int components = 4; constexpr int tile_size = 8; int tile_head_size = tile_size * components; - FlashAttentionDecodeVxReduceProgram program{"FlashAttentionDecodeVxReduce", tile_size}; + FlashAttentionDecodeVxReduceProgram program{"FlashAttentionDecodeVxReduce", tile_size, seq_tile_size, use_indirect_dispatch}; program.AddInputs({{out_split_vx, ProgramTensorMetadataDependency::TypeAndRank, components}}); + if (use_indirect_dispatch) { + program.AddInput({seqlen_k, ProgramTensorMetadataDependency::None}); + } program.AddOutputs({{output, ProgramTensorMetadataDependency::TypeAndRank, components}}); const uint32_t num_head_size_tile = static_cast((parameters.v_head_size_ + tile_head_size - 1) / tile_head_size); program.SetDispatchGroupSize(parameters.num_heads_ * num_head_size_tile) - .CacheHint(tile_size) + .CacheHint(tile_size, seq_tile_size, use_indirect_dispatch) .SetWorkgroupSize(tile_size * tile_size) .AddUniformVariables({{static_cast(parameters.v_head_size_ / components)}, num_total_seq_length_tile, @@ -279,14 +350,15 @@ Status ComputeFlashAttentionDecodeVxReduce(onnxruntime::webgpu::ComputeContext& Status ApplyFlashAttention(const Tensor* Q, const Tensor* K, const Tensor* V, const Tensor* attention_bias, Tensor* output, const Tensor* past_key, Tensor* present_key, const Tensor* past_value, Tensor* 
present_value, - const WebgpuAttentionParameters& parameters, onnxruntime::webgpu::ComputeContext& context) { - ORT_RETURN_IF_ERROR(CopyKVCache(context, parameters, K, past_key, present_key, V, past_value, present_value)); - + const WebgpuAttentionParameters& parameters, onnxruntime::webgpu::ComputeContext& context, const Tensor* seqlen_k) { // Extract present_sequence_length directly from present_key tensor shape: // (batch_size, num_heads, total_sequence_length/max_sequence_length, head_size) const uint32_t present_sequence_length = static_cast(present_key->Shape()[2]); + if (parameters.sequence_length_ > 1) { const uint32_t tile_size = 64; + // For encode path, use the original CopyKVCache without indirect dispatch preparation + ORT_RETURN_IF_ERROR(CopyKVCache(context, parameters, K, past_key, present_key, V, past_value, present_value, tile_size, seqlen_k, nullptr)); bool has_attention_bias = attention_bias != nullptr; bool is_qualcomm = context.AdapterInfo().vendor == std::string_view{"qualcomm"}; bool is_nvidia = context.AdapterInfo().vendor == std::string_view{"nvidia"}; @@ -323,7 +395,7 @@ Status ApplyFlashAttention(const Tensor* Q, const Tensor* K, const Tensor* V, co return context.RunProgram(program); } - // Use present_sequence_length instead of total_sequence_length to make sure the |qk| buffer is static when static qv cache is enabled. + // For decode path (sequence_length == 1) const TensorShapeVector qk_dims({parameters.batch_size_, parameters.num_heads_, parameters.sequence_length_, present_sequence_length}); const TensorShape qk_shape(qk_dims); @@ -331,21 +403,48 @@ Status ApplyFlashAttention(const Tensor* Q, const Tensor* K, const Tensor* V, co constexpr uint32_t tile_size = 64; const uint32_t num_total_seq_length_tile = (parameters.total_sequence_length_ + tile_size - 1) / tile_size; const uint32_t num_present_sequence_length_tile = (present_sequence_length + tile_size - 1) / tile_size; + + // Determine if we should use indirect dispatch + const bool use_indirect_dispatch = parameters.past_present_share_buffer_ && + seqlen_k != nullptr && + context.IsGraphCaptureEnabled(); + + // Create indirect dispatch buffer if using indirect dispatch + Tensor* indirect_buffer_ptr = nullptr; + Tensor indirect_buffer; + if (use_indirect_dispatch) { + const TensorShape indirect_buffer_shape{3}; // 3 uint32 values for dispatch dimensions + indirect_buffer = context.CreateGPUTensor(DataTypeImpl::GetType(), indirect_buffer_shape); + indirect_buffer_ptr = &indirect_buffer; + // Use the fused CopyKVCache that also prepares the indirect dispatch buffer + ORT_RETURN_IF_ERROR(CopyKVCache(context, parameters, K, past_key, present_key, V, past_value, present_value, tile_size, seqlen_k, indirect_buffer_ptr)); + } else { + // Use the original CopyKVCache without indirect dispatch preparation + ORT_RETURN_IF_ERROR(CopyKVCache(context, parameters, K, past_key, present_key, V, past_value, present_value, tile_size, seqlen_k, nullptr)); + } + // The metadata is used to store the max and sum of each tile. 
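// Layout per (batch, head): num_present_sequence_length_tile pairs of (tile max, tile sum of exp).
// Roughly, the decode path is a split-K online softmax (a sketch of the math, not the shader code):
//   QKT pass, per tile t:     m_t = max_i qk[t, i];   s_t = sum_i exp(qk[t, i] - m_t)
//   SplitVx pass, per tile t: M = max_t m_t;   S = sum_t s_t * exp(m_t - M);
//                             partial_t = sum_i (exp(qk[t, i] - M) / S) * v[t, i]
//   VxReduce pass:            output = sum_t partial_t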
const TensorShapeVector metadata_dims({parameters.batch_size_, parameters.num_heads_, num_present_sequence_length_tile, 2}); const TensorShape metadata_shape(metadata_dims); Tensor metadata = context.CreateGPUTensor(DataTypeImpl::GetType(), metadata_shape); - ORT_RETURN_IF_ERROR(ComputeFlashAttentionDecodeQKT(context, Q, attention_bias, &qk, present_key, &metadata, - parameters, num_total_seq_length_tile, num_present_sequence_length_tile, tile_size, + ORT_RETURN_IF_ERROR(ComputeFlashAttentionDecodeQKT(context, Q, attention_bias, &qk, present_key, &metadata, seqlen_k, + parameters, indirect_buffer_ptr, num_total_seq_length_tile, + num_present_sequence_length_tile, tile_size, use_indirect_dispatch, present_sequence_length)); - const TensorShapeVector out_split_vx_dims({parameters.batch_size_, parameters.num_heads_, num_present_sequence_length_tile, parameters.head_size_}); + const TensorShapeVector out_split_vx_dims({parameters.batch_size_, parameters.num_heads_, + num_present_sequence_length_tile, parameters.head_size_}); const TensorShape out_split_vx_shape(out_split_vx_dims); Tensor out_split_vx = context.CreateGPUTensor(Q->DataType(), out_split_vx_shape); - ORT_RETURN_IF_ERROR(ComputeFlashAttentionDecodeSplitVxScore(context, &metadata, &qk, &out_split_vx, present_value, parameters, - num_total_seq_length_tile, num_present_sequence_length_tile, tile_size, present_sequence_length)); - ORT_RETURN_IF_ERROR(ComputeFlashAttentionDecodeVxReduce(context, &out_split_vx, output, parameters, num_total_seq_length_tile, num_present_sequence_length_tile)); + ORT_RETURN_IF_ERROR(ComputeFlashAttentionDecodeSplitVxScore(context, &metadata, &qk, &out_split_vx, present_value, + seqlen_k, parameters, indirect_buffer_ptr, + num_total_seq_length_tile, + num_present_sequence_length_tile, tile_size, + use_indirect_dispatch, present_sequence_length)); + ORT_RETURN_IF_ERROR(ComputeFlashAttentionDecodeVxReduce(context, &out_split_vx, output, seqlen_k, parameters, + num_total_seq_length_tile, + num_present_sequence_length_tile, tile_size, use_indirect_dispatch)); return Status::OK(); } diff --git a/onnxruntime/contrib_ops/webgpu/bert/flash_attention.h b/onnxruntime/contrib_ops/webgpu/bert/flash_attention.h index c75494df253c1..7d71dc0f4d42d 100644 --- a/onnxruntime/contrib_ops/webgpu/bert/flash_attention.h +++ b/onnxruntime/contrib_ops/webgpu/bert/flash_attention.h @@ -17,19 +17,24 @@ using namespace onnxruntime::webgpu; class CopyKVCacheProgram final : public Program { public: - CopyKVCacheProgram(const std::string& kernel_name, bool has_past, bool kv_BNSH, bool past_present_share_buffer) - : Program{kernel_name}, has_past_(has_past), kv_BNSH_(kv_BNSH), past_present_share_buffer_(past_present_share_buffer) { + CopyKVCacheProgram(const std::string& kernel_name, bool has_past, bool kv_BNSH, bool past_present_share_buffer, + bool prepare_indirect_dispatch = false) + : Program{kernel_name}, has_past_(has_past), kv_BNSH_(kv_BNSH), past_present_share_buffer_(past_present_share_buffer), prepare_indirect_dispatch_(prepare_indirect_dispatch) { } Status GenerateShaderCode(ShaderHelper& sh) const override; WEBGPU_PROGRAM_DEFINE_UNIFORM_VARIABLES({"copy_size", ProgramUniformVariableDataType::Uint32}, - {"past_sequence_length", ProgramUniformVariableDataType::Uint32}); + {"total_sequence_length", ProgramUniformVariableDataType::Uint32}, + {"kv_sequence_length", ProgramUniformVariableDataType::Uint32}, + {"tile_size", ProgramUniformVariableDataType::Uint32}, + {"num_heads", ProgramUniformVariableDataType::Uint32}); private: 
bool has_past_; bool kv_BNSH_; bool past_present_share_buffer_; + bool prepare_indirect_dispatch_; }; class FlashAttentionProgram final : public Program { @@ -75,8 +80,8 @@ class FlashAttentionProgram final : public Program { class FlashAttentionDecodeQKTProgram final : public Program { public: FlashAttentionDecodeQKTProgram(const std::string& kernel_name, - bool has_attention_bias, uint32_t tile_size) - : Program{kernel_name}, has_attention_bias_(has_attention_bias), tile_size_(tile_size) { + bool has_attention_bias, uint32_t tile_size, bool use_indirect_dispatch) + : Program{kernel_name}, has_attention_bias_(has_attention_bias), tile_size_(tile_size), use_indirect_dispatch_(use_indirect_dispatch) { } Status GenerateShaderCode(ShaderHelper& sh) const override; @@ -86,19 +91,19 @@ class FlashAttentionDecodeQKTProgram final : public Program { public: - FlashAttentionDecodeSplitVxProgram(const std::string& kernel_name, uint32_t tile_size, int head_size_vec) - : Program{kernel_name}, tile_size_(tile_size), head_size_vec_(head_size_vec) { + FlashAttentionDecodeSplitVxProgram(const std::string& kernel_name, uint32_t tile_size, int head_size_vec, bool use_indirect_dispatch) + : Program{kernel_name}, tile_size_(tile_size), head_size_vec_(head_size_vec), use_indirect_dispatch_(use_indirect_dispatch) { } Status GenerateShaderCode(ShaderHelper& sh) const override; @@ -107,19 +112,19 @@ class FlashAttentionDecodeSplitVxProgram final : public Program { public: - FlashAttentionDecodeVxReduceProgram(const std::string& kernel_name, uint32_t tile_size) - : Program{kernel_name}, tile_size_(tile_size) { + FlashAttentionDecodeVxReduceProgram(const std::string& kernel_name, uint32_t tile_size, uint32_t seq_tile_size, bool use_indirect_dispatch) + : Program{kernel_name}, tile_size_(tile_size), seq_tile_size_(seq_tile_size), use_indirect_dispatch_(use_indirect_dispatch) { } Status GenerateShaderCode(ShaderHelper& sh) const override; @@ -132,11 +137,13 @@ class FlashAttentionDecodeVxReduceProgram final : public Program tile_qk: array; $MAIN { let local_row = u32(local_idx / tile_size_k_vec); let local_col = local_idx % tile_size_k_vec; - let total_seq_offset = (workgroup_idx % uniforms.num_total_seq_length_tile) * tile_size; - let head_idx = u32(workgroup_idx / uniforms.num_total_seq_length_tile); +#if use_indirect_dispatch + let total_sequence_length = u32(seqlens_k[0]) + 1u; +#else + let total_sequence_length = uniforms.total_sequence_length; +#endif + let num_total_seq_length_tile = (total_sequence_length + tile_size - 1) / tile_size; + let total_seq_offset = (workgroup_idx % num_total_seq_length_tile) * tile_size; + let head_idx = u32(workgroup_idx / num_total_seq_length_tile); let q_offset = head_idx * uniforms.head_size_vec; - var total_sequence_length = uniforms.total_sequence_length; let present_offset = u32(head_idx / uniforms.n_reps) * uniforms.present_sequence_length * uniforms.head_size_vec; for (var k: u32 = 0u; k < uniforms.head_size_vec; k += tile_size_k_vec) { if (local_idx < tile_size_k_vec && k + local_idx < uniforms.head_size_vec) { @@ -95,7 +101,7 @@ $MAIN { for (var i = 0u; i < tile_size && (total_seq_offset + i) < total_sequence_length; i++) { l_sum += exp(f32(tile_qk[i]) - l_max); } - let meta_offset = head_idx * uniforms.num_present_sequence_length_tile + workgroup_idx % uniforms.num_total_seq_length_tile; + let meta_offset = head_idx * uniforms.num_present_sequence_length_tile + workgroup_idx % num_total_seq_length_tile; metadata[meta_offset] = metadata_value_t(l_max, l_sum); } } diff 
--git a/onnxruntime/contrib_ops/webgpu/bert/flash_attention_decode_split_vx.wgsl.template b/onnxruntime/contrib_ops/webgpu/bert/flash_attention_decode_split_vx.wgsl.template index c7593af311ce2..37cf7e8f11b1f 100644 --- a/onnxruntime/contrib_ops/webgpu/bert/flash_attention_decode_split_vx.wgsl.template +++ b/onnxruntime/contrib_ops/webgpu/bert/flash_attention_decode_split_vx.wgsl.template @@ -5,6 +5,7 @@ #param head_size_vec #param tile_size_k_vec #param sub_tile_count +#param use_indirect_dispatch // Note that this shader adopts similar algorithm with dp4a generation shader. // @@ -40,9 +41,14 @@ var qkv_values: array, $MAIN { let local_row = u32(local_idx / tile_size_k_vec); let local_col = local_idx % tile_size_k_vec; - let total_seq_offset = (workgroup_idx % uniforms.num_total_seq_length_tile) * tile_size; - let head_idx = u32(workgroup_idx / uniforms.num_total_seq_length_tile); - var total_sequence_length = uniforms.total_sequence_length; + #if use_indirect_dispatch + let total_sequence_length = u32(seqlens_k[0]) + 1u; + #else + let total_sequence_length = uniforms.total_sequence_length; + #endif + let num_total_seq_length_tile = (total_sequence_length + tile_size - 1) / tile_size; + let total_seq_offset = (workgroup_idx % num_total_seq_length_tile) * tile_size; + let head_idx = u32(workgroup_idx / num_total_seq_length_tile); let present_offset = u32(head_idx / uniforms.n_reps) * head_size_vec * uniforms.present_sequence_length; // Calculate the global max and sum in qk. @@ -50,12 +56,12 @@ $MAIN { { var g_max = f32(-3.402823e+38f); var g_sum = f32(0); - for (var i = 0u; i < uniforms.num_total_seq_length_tile; i++) + for (var i = 0u; i < num_total_seq_length_tile; i++) { let meta_offset = head_idx * uniforms.num_present_sequence_length_tile + i; g_max = max(g_max, metadata[meta_offset].x); } - for (var i = 0u; i < uniforms.num_total_seq_length_tile; i++) + for (var i = 0u; i < num_total_seq_length_tile; i++) { let meta_offset = head_idx * uniforms.num_present_sequence_length_tile + i; let m_value = metadata[meta_offset]; @@ -95,7 +101,7 @@ $MAIN { } for (var i = local_idx; i < head_size_vec; i += workgroup_size_x) { - let out_offset = head_idx * uniforms.num_present_sequence_length_tile * head_size_vec + (workgroup_idx % uniforms.num_total_seq_length_tile) * head_size_vec + i; + let out_offset = head_idx * uniforms.num_present_sequence_length_tile * head_size_vec + (workgroup_idx % num_total_seq_length_tile) * head_size_vec + i; out_split_vx[out_offset] = tile_output[i]; } } diff --git a/onnxruntime/contrib_ops/webgpu/bert/flash_attention_decode_vx_reduce.wgsl.template b/onnxruntime/contrib_ops/webgpu/bert/flash_attention_decode_vx_reduce.wgsl.template index a4381baa638ce..22f18655307de 100644 --- a/onnxruntime/contrib_ops/webgpu/bert/flash_attention_decode_vx_reduce.wgsl.template +++ b/onnxruntime/contrib_ops/webgpu/bert/flash_attention_decode_vx_reduce.wgsl.template @@ -1,7 +1,9 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. +#param seq_tile_size #param tile_size +#param use_indirect_dispatch // Inputs are splits of the GQA output, split into num_total_seq_length_tiles // rows. 
This shader needs to add these splits across the row dimension to @@ -23,10 +25,16 @@ $MAIN { var value = output_value_t(0); let local_row = u32(local_idx / tile_size); let local_col = local_idx % tile_size; + #if use_indirect_dispatch + let total_sequence_length = u32(seqlens_k[0]) + 1u; + let num_total_seq_length_tile = (total_sequence_length + seq_tile_size - 1) / seq_tile_size; + #else + let num_total_seq_length_tile = uniforms.num_total_seq_length_tile; + #endif if (head_size_offset + local_col < uniforms.head_size_vec) { - for (var r = 0u; r < uniforms.num_total_seq_length_tile; r += tile_size) { - if (r + local_row < uniforms.num_total_seq_length_tile) { + for (var r = 0u; r < num_total_seq_length_tile; r += tile_size) { + if (r + local_row < num_total_seq_length_tile) { value += input[in_offset + (r + local_row) * uniforms.head_size_vec + head_size_offset + local_col]; } } diff --git a/onnxruntime/contrib_ops/webgpu/bert/group_query_attention.cc b/onnxruntime/contrib_ops/webgpu/bert/group_query_attention.cc index 8b7b257dd2852..49cc0209785c5 100644 --- a/onnxruntime/contrib_ops/webgpu/bert/group_query_attention.cc +++ b/onnxruntime/contrib_ops/webgpu/bert/group_query_attention.cc @@ -110,35 +110,87 @@ Status GeneratePositionIDs(onnxruntime::webgpu::ComputeContext& context, const W return context.RunProgram(program); } -Status RunRotaryEmbedding(onnxruntime::webgpu::ComputeContext& context, const WebgpuAttentionParameters& params, const Tensor* input, const Tensor* pos_ids, const Tensor* cos_cache, const Tensor* sin_cache, Tensor* output, bool is_query_input) { +// Fused Q/K rotary embedding +Status RunFusedQKRotaryEmbedding(onnxruntime::webgpu::ComputeContext& context, + const WebgpuAttentionParameters& params, + const Tensor* query_in, + const Tensor* key_in, + const Tensor* seqlen_k, + const Tensor* cos_cache, + const Tensor* sin_cache, + Tensor* query_out, + Tensor* key_out) { + Tensor pos_ids = context.CreateGPUTensor(DataTypeImpl::GetType(), + TensorShape({params.batch_size_, params.sequence_length_})); + ORT_RETURN_IF_ERROR(GeneratePositionIDs(context, params, seqlen_k, &pos_ids)); + const auto half_rotary_embedding_dim = gsl::narrow_cast(cos_cache->Shape()[1]); const auto head_size = params.head_size_; - const auto hidden_size = is_query_input ? 
params.hidden_size_ : params.kv_hidden_size_; - const TensorShape global_shape({params.batch_size_, params.sequence_length_, hidden_size / head_size, static_cast(head_size - half_rotary_embedding_dim)}); - const auto rank = global_shape.NumDimensions(); - std::vector global_dims(rank); - std::vector global_strides(rank); + + // Build Q domain + const auto hidden_size_q = params.hidden_size_; + const TensorShape q_global_shape({params.batch_size_, params.sequence_length_, + hidden_size_q / head_size, + static_cast(head_size - half_rotary_embedding_dim)}); + const auto rank = q_global_shape.NumDimensions(); + std::vector q_global_dims(rank); + std::vector q_global_strides(rank); for (size_t j = 0; j < rank; ++j) { - global_dims[j] = gsl::narrow_cast(global_shape[j]); - global_strides[j] = gsl::narrow_cast(global_shape.SizeFromDimension(j + 1)); + q_global_dims[j] = gsl::narrow_cast(q_global_shape[j]); + q_global_strides[j] = gsl::narrow_cast(q_global_shape.SizeFromDimension(j + 1)); } - const auto input_output_strides = std::vector({gsl::narrow_cast(input->Shape().SizeFromDimension(1)), gsl::narrow_cast(hidden_size), gsl::narrow_cast(head_size), 1}); - const auto output_size = gsl::narrow_cast(global_shape.Size()); - RotaryEmbeddingProgram program(params.rotary_interleaved_); + // Build K domain + const auto hidden_size_k = params.kv_hidden_size_; + const TensorShape k_global_shape({params.batch_size_, params.sequence_length_, + hidden_size_k / head_size, + static_cast(head_size - half_rotary_embedding_dim)}); + std::vector k_global_dims(rank); + for (size_t j = 0; j < rank; ++j) { + k_global_dims[j] = gsl::narrow_cast(k_global_shape[j]); + } + + const auto q_domain_size = gsl::narrow_cast(q_global_shape.Size()); + + const auto q_input_output_strides = std::vector( + {gsl::narrow_cast(query_in->Shape().SizeFromDimension(1)), + gsl::narrow_cast(hidden_size_q), + gsl::narrow_cast(head_size), + 1u}); + + const auto k_input_output_strides = std::vector( + {gsl::narrow_cast(key_in->Shape().SizeFromDimension(1)), + gsl::narrow_cast(hidden_size_k), + gsl::narrow_cast(head_size), + 1u}); + + // Dispatch computations only over the Q domain, and fuse K write operations using a head-index-based condition. 
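// In GQA num_heads_ >= kv_num_heads_, so the K rotation domain is a subset of the Q domain: each
// invocation rotates one Q element and, when its head index also exists in K
// (bsnh[2] < k_global_shape[2] in the shader), the matching K element, so a single dispatch
// replaces the two RunRotaryEmbedding dispatches used before. The rotation is the usual pairwise
// form (sketched here; j = i + half_rotary_dim, or i + 1 when interleaved):
//   q_out[i] = q[i] * cos - q[j] * sin;   q_out[j] = q[i] * sin + q[j] * cos;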
+ FusedQKRotaryEmbeddingProgram program(params.rotary_interleaved_); program .CacheHint(params.rotary_interleaved_) - .AddInputs({{input, ProgramTensorMetadataDependency::Rank}, - {pos_ids, ProgramTensorMetadataDependency::Rank}, - {cos_cache, ProgramTensorMetadataDependency::Rank}, - {sin_cache, ProgramTensorMetadataDependency::Rank}}) - .AddOutput(output) - .SetDispatchGroupSize((output_size + WORKGROUP_SIZE - 1) / WORKGROUP_SIZE) - .AddUniformVariables({{params.scale_}, - {gsl::make_span(global_dims)}, - {gsl::make_span(global_strides)}, - {gsl::make_span(input_output_strides)}}) + .AddInputs({ + {query_in, ProgramTensorMetadataDependency::Rank}, + {key_in, ProgramTensorMetadataDependency::Rank}, + {&pos_ids, ProgramTensorMetadataDependency::Rank}, + {cos_cache, ProgramTensorMetadataDependency::Rank}, + {sin_cache, ProgramTensorMetadataDependency::Rank}, + }) + .AddOutputs({ + {query_out, ProgramTensorMetadataDependency::Rank}, + {key_out, ProgramTensorMetadataDependency::Rank}, + }) + .SetDispatchGroupSize((q_domain_size + WORKGROUP_SIZE - 1) / WORKGROUP_SIZE) + .AddUniformVariables({ + {params.scale_}, + {gsl::make_span(q_global_dims)}, + {gsl::make_span(q_global_strides)}, + {gsl::make_span(q_input_output_strides)}, + {gsl::make_span(k_global_dims)}, + {gsl::make_span(k_input_output_strides)}, + {q_domain_size}, + }) .AddIndices(TensorShape{1, 1}); + return context.RunProgram(program); } @@ -199,15 +251,6 @@ Status GroupQueryAttention::ComputeInternal(onnxruntime::webgpu::ComputeContext& parameters.past_present_share_buffer_ = present_key != nullptr && present_value != nullptr && past_key != nullptr && past_value != nullptr && past_key->DataRaw() == present_key->DataRaw() && past_value->DataRaw() == present_value->DataRaw(); ORT_ENFORCE(parameters.total_sequence_length_ <= parameters.seqlen_present_kv_cache_, "Total sequence length cannot be greater than the existing KV cache length."); - // Use a sliding window if the total sequence exceeds the window's length. 
- bool use_sliding_window = (local_window_size_ != -1 && local_window_size_ < parameters.total_sequence_length_); - if (!do_rotary_ && - head_sink == nullptr && !use_smooth_softmax_ && - !use_sliding_window && - CanApplyFlashAttention(attention_bias, present_key, present_value, parameters, context)) { - return ApplyFlashAttention(query, key, value, attention_bias, output, past_key, present_key, past_value, - present_value, parameters, context); - } Tensor qSplit; Tensor kSplit; @@ -218,6 +261,7 @@ Status GroupQueryAttention::ComputeInternal(onnxruntime::webgpu::ComputeContext& vSplit = context.CreateGPUTensor(query->DataType(), TensorShape({parameters.batch_size_, parameters.sequence_length_, parameters.kv_hidden_size_})); ORT_RETURN_IF_ERROR(SplitPackedQKV(context, parameters, query, &qSplit, &kSplit, &vSplit)); parameters.is_packed_qkv_ = false; + parameters.qkv_format_ = Q_K_V_BSNH; query = &qSplit; key = &kSplit; value = &vSplit; @@ -228,15 +272,24 @@ Status GroupQueryAttention::ComputeInternal(onnxruntime::webgpu::ComputeContext& if (do_rotary_) { qRotary = context.CreateGPUTensor(query->DataType(), query->Shape()); kRotary = context.CreateGPUTensor(key->DataType(), key->Shape()); - auto pos_ids_shape = TensorShape({parameters.batch_size_, parameters.sequence_length_}); - Tensor pos_ids = context.CreateGPUTensor(DataTypeImpl::GetType(), pos_ids_shape); - ORT_RETURN_IF_ERROR(GeneratePositionIDs(context, parameters, seqlen_k, &pos_ids)); - ORT_RETURN_IF_ERROR(RunRotaryEmbedding(context, parameters, query, &pos_ids, cos_cache, sin_cache, &qRotary, /* is_query_input = */ true)); - ORT_RETURN_IF_ERROR(RunRotaryEmbedding(context, parameters, key, &pos_ids, cos_cache, sin_cache, &kRotary, /* is_query_input = */ false)); + ORT_RETURN_IF_ERROR(RunFusedQKRotaryEmbedding(context, parameters, + query, key, + seqlen_k, + cos_cache, sin_cache, + &qRotary, &kRotary)); query = &qRotary; key = &kRotary; } + // Use a sliding window if the total sequence exceeds the window's length. 
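// (The FlashAttention early-out now sits after the rotary block: Q/K are already rotated at this
// point, so the old !do_rotary_ guard is no longer needed to force the generic attention path.)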
+ bool use_sliding_window = (local_window_size_ != -1 && local_window_size_ < parameters.total_sequence_length_); + if (head_sink == nullptr && !use_smooth_softmax_ && + !use_sliding_window && + CanApplyFlashAttention(attention_bias, present_key, present_value, parameters, context)) { + return ApplyFlashAttention(query, key, value, attention_bias, output, past_key, present_key, past_value, + present_value, parameters, context); + } + TensorShapeVector q_new_dims({parameters.batch_size_, parameters.num_heads_, parameters.sequence_length_, parameters.head_size_}); TensorShape q_new_shape(q_new_dims); diff --git a/onnxruntime/contrib_ops/webgpu/bert/rotary_embedding.cc b/onnxruntime/contrib_ops/webgpu/bert/rotary_embedding.cc index 8f316cfae80e9..79c8f45fb7832 100644 --- a/onnxruntime/contrib_ops/webgpu/bert/rotary_embedding.cc +++ b/onnxruntime/contrib_ops/webgpu/bert/rotary_embedding.cc @@ -50,6 +50,57 @@ Status RotaryEmbeddingProgram::GenerateShaderCode(ShaderHelper& shader) const { return Status::OK(); } +Status FusedQKRotaryEmbeddingProgram::GenerateShaderCode(ShaderHelper& shader) const { + // Inputs + const auto& q_input = shader.AddInput("q_input", ShaderUsage::UseUniform); + const auto& k_input = shader.AddInput("k_input", ShaderUsage::UseUniform); + const auto& position_ids = shader.AddInput("position_ids", ShaderUsage::UseUniform); + const auto& cos_cache = shader.AddInput("cos_cache", ShaderUsage::UseUniform); + const auto& sin_cache = shader.AddInput("sin_cache", ShaderUsage::UseUniform); + // Outputs + const auto& q_output = shader.AddOutput("q_output", ShaderUsage::UseUniform); + const auto& k_output = shader.AddOutput("k_output", ShaderUsage::UseUniform); + // Indices helper + const auto& dummy_indices = shader.AddIndices("dummy_indices", ShaderUsage::None); + + const auto interleaved_str = interleaved_ ? 
"true" : "false"; + + shader.MainFunctionBody() + << " if (global_idx >= uniforms.q_domain_size) { return; }\n" + << " let half_rotary_dim = uniforms.cos_cache_shape[1];\n" + << " let bsnh = global_idx / uniforms.q_global_stride % uniforms.q_global_shape;\n" + << " if (bsnh[3] < half_rotary_dim) {\n" + << " let pos_ids_idx = " << position_ids.BroadcastedIndicesToOffset("bsnh.xy", dummy_indices) << ";\n" + << " let position_id = u32(" << position_ids.GetByOffset("pos_ids_idx") << ") + select(0u, bsnh[1], pos_ids_idx == 0u);\n" + << " let cos_v = " << cos_cache.GetByIndices("vec2(position_id, bsnh[3])") << ";\n" + << " let sin_v = " << sin_cache.GetByIndices("vec2(position_id, bsnh[3])") << ";\n" + << " let qi = dot(bsnh, uniforms.q_input_output_stride) + select(0u, bsnh[3], " << interleaved_str << ");\n" + << " let qj = qi + select(half_rotary_dim, 1u, " << interleaved_str << ");\n" + << " let q_re = " << q_input.GetByOffset("qi") << " * cos_v - " << q_input.GetByOffset("qj") << " * sin_v;\n" + << " " << q_output.SetByOffset("qi", "q_re") << "\n" + << " let q_im = " << q_input.GetByOffset("qi") << " * sin_v + " << q_input.GetByOffset("qj") << " * cos_v;\n" + << " " << q_output.SetByOffset("qj", "q_im") << "\n" + // Conditionally process Key (only for heads that exist in K domain) + << " if (bsnh[2] < uniforms.k_global_shape[2]) {\n" + << " let ki = dot(bsnh, uniforms.k_input_output_stride) + select(0u, bsnh[3], " << interleaved_str << ");\n" + << " let kj = ki + select(half_rotary_dim, 1u, " << interleaved_str << ");\n" + << " let k_re = " << k_input.GetByOffset("ki") << " * cos_v - " << k_input.GetByOffset("kj") << " * sin_v;\n" + << " " << k_output.SetByOffset("ki", "k_re") << "\n" + << " let k_im = " << k_input.GetByOffset("ki") << " * sin_v + " << k_input.GetByOffset("kj") << " * cos_v;\n" + << " " << k_output.SetByOffset("kj", "k_im") << "\n" + << " }\n" + << " } else {\n" + << " let qk = dot(bsnh, uniforms.q_input_output_stride) + half_rotary_dim;\n" + << " " << q_output.SetByOffset("qk", q_input.GetByOffset("qk")) << "\n" + // Conditionally process Key (only for heads that exist in K domain) + << " if (bsnh[2] < uniforms.k_global_shape[2]) {\n" + << " let kk = dot(bsnh, uniforms.k_input_output_stride) + half_rotary_dim;\n" + << " " << k_output.SetByOffset("kk", k_input.GetByOffset("kk")) << "\n" + << " }\n" + << " }\n"; + return Status::OK(); +} + RotaryEmbedding::RotaryEmbedding(const OpKernelInfo& info) : WebGpuKernel(info) { scale_ = info.GetAttrOrDefault("scale", 1.0); rotary_embedding_dim_ = static_cast(info.GetAttrOrDefault("rotary_embedding_dim", 0)); diff --git a/onnxruntime/contrib_ops/webgpu/bert/rotary_embedding.h b/onnxruntime/contrib_ops/webgpu/bert/rotary_embedding.h index 0d73b89fb62df..e3dc4468cb3ed 100644 --- a/onnxruntime/contrib_ops/webgpu/bert/rotary_embedding.h +++ b/onnxruntime/contrib_ops/webgpu/bert/rotary_embedding.h @@ -29,6 +29,27 @@ class RotaryEmbeddingProgram final : public Program { const bool interleaved_; }; +class FusedQKRotaryEmbeddingProgram final : public Program { + public: + FusedQKRotaryEmbeddingProgram(bool interleaved) : Program{"FusedQKRotaryEmbedding"}, interleaved_{interleaved} {} + + Status GenerateShaderCode(ShaderHelper& sh) const override; + + // q_* describes query rotation domain (same definition as existing program) + // k_* describes key rotation domain + WEBGPU_PROGRAM_DEFINE_UNIFORM_VARIABLES( + {"scale", ProgramUniformVariableDataType::Float32}, + {"q_global_shape", ProgramUniformVariableDataType::Uint32}, + {"q_global_stride", 
ProgramUniformVariableDataType::Uint32}, + {"q_input_output_stride", ProgramUniformVariableDataType::Uint32}, + {"k_global_shape", ProgramUniformVariableDataType::Uint32}, + {"k_input_output_stride", ProgramUniformVariableDataType::Uint32}, + {"q_domain_size", ProgramUniformVariableDataType::Uint32}); + + private: + const bool interleaved_; +}; + class RotaryEmbedding final : public WebGpuKernel { public: RotaryEmbedding(const OpKernelInfo& info); diff --git a/onnxruntime/contrib_ops/webgpu/quantization/dp4a_matmul.wgsl.template b/onnxruntime/contrib_ops/webgpu/quantization/dp4a_matmul.wgsl.template index ee6dde3788157..eebe329c104e7 100644 --- a/onnxruntime/contrib_ops/webgpu/quantization/dp4a_matmul.wgsl.template +++ b/onnxruntime/contrib_ops/webgpu/quantization/dp4a_matmul.wgsl.template @@ -6,6 +6,8 @@ #param has_zero_points #param is_qualcomm +#use .getByOffset .setByOffset + #include "quantization/dp4a_matmul_common.wgsl.template" // This shader implements co-operative matrix multiply. The key idea here is to @@ -57,11 +59,11 @@ fn loadSHMA(a_global_base:u32, kidx_v:u32, row: u32, col: u32) { return; } - tile_A[col][row] = input_a[a_global*uniforms.K16+kidx_v+col]; + tile_A[col][row] = a.getByOffset(a_global*uniforms.K16+kidx_v+col); if (col == 0) { // kidx_v - covers 16 values of k - scale_A[row] = scales_a[a_global*(uniforms.K/128) + kidx_v/8]; + scale_A[row] = scales_a.getByOffset(a_global*(uniforms.K/128) + kidx_v/8); } } @@ -74,14 +76,14 @@ fn loadSHMA(a_global_base:u32, kidx_v:u32, row: u32, col: u32) return; } - let b_value = input_b[b_global*uniforms.K16+kidx_v+col]; + let b_value = b.getByOffset(b_global*uniforms.K16+kidx_v+col); let block_idx = kidx_v/(block_size/16); let zero = mm_read_zero(b_global, block_idx, uniforms.N, uniforms.zero_blocks_per_col); tile_B[col][row] = DequantizedFrom4BitsTo8Bits(b_value, zero); if (col == 0) { // kidx_v - each kidx_v covers 16 values of k - scale_B[row] = scales_b[b_global*(uniforms.K/block_size) + block_idx]; + scale_B[row] = scales_b.getByOffset(b_global*(uniforms.K/block_size) + block_idx); } } #endif @@ -95,13 +97,13 @@ fn loadSHMA(a_global_base:u32, kidx_v:u32, row: u32, col: u32) return; } - let b_value = input_b[b_global*uniforms.K16+kidx_v+col]; + let b_value = b.getByOffset(b_global*uniforms.K16+kidx_v+col); tile_B[col][row] = AlignWithZeroPoint(b_value); if (col == 0) { // kidx_v - each kidx_v covers 16 values of k let block_idx = kidx_v/(block_size/16); - scale_B[row] = scales_b[b_global*(uniforms.K/block_size) + block_idx]; + scale_B[row] = scales_b.getByOffset(b_global*(uniforms.K/block_size) + block_idx); #if has_zero_points zeroes[row] = mm_read_zero(b_global, block_idx, uniforms.N, uniforms.zero_blocks_per_col); #endif @@ -117,10 +119,10 @@ fn loadSHMA(a_global_base:u32, kidx_v:u32, row: u32, col: u32) { return; } - let b_value = input_b[b_global*uniforms.K16+kidx_v+col]; + let b_value = b.getByOffset(b_global*uniforms.K16+kidx_v+col); tile_B[col][row] = DequantizedFrom2BitsTo8Bits(b_value); let block_idx = kidx_v/(block_size/16); - scale_B[row] = scales_b[b_global*(uniforms.K/block_size) + block_idx]; + scale_B[row] = scales_b.getByOffset(b_global*(uniforms.K/block_size) + block_idx); } #endif @@ -362,15 +364,15 @@ $MAIN { if (a_global < uniforms.M && b_global < uniforms.N) { #if is_qualcomm - output[output_idx] = vec4(lane_outputs[0], lane_outputs[1], lane_outputs[2], lane_outputs[3]); - output[output_idx+1] = vec4(lane_outputs[4], lane_outputs[5], lane_outputs[6], lane_outputs[7]); - output[output_idx+2] = 
vec4(lane_outputs[8], lane_outputs[9], lane_outputs[10], lane_outputs[11]); - output[output_idx+3] = vec4(lane_outputs[12], lane_outputs[13], lane_outputs[14], lane_outputs[15]); + output.setByOffset(output_idx, vec4(lane_outputs[0], lane_outputs[1], lane_outputs[2], lane_outputs[3])); + output.setByOffset(output_idx+1, vec4(lane_outputs[4], lane_outputs[5], lane_outputs[6], lane_outputs[7])); + output.setByOffset(output_idx+2, vec4(lane_outputs[8], lane_outputs[9], lane_outputs[10], lane_outputs[11])); + output.setByOffset(output_idx+3, vec4(lane_outputs[12], lane_outputs[13], lane_outputs[14], lane_outputs[15])); #else - output[output_idx] = lane_output1; - output[output_idx+1] = lane_output2; - output[output_idx+2] = lane_output3; - output[output_idx+3] = lane_output4; + output.setByOffset(output_idx, lane_output1); + output.setByOffset(output_idx+1, lane_output2); + output.setByOffset(output_idx+2, lane_output3); + output.setByOffset(output_idx+3, lane_output4); #endif } } // MAIN diff --git a/onnxruntime/contrib_ops/webgpu/quantization/dp4a_matmul_nbits.cc b/onnxruntime/contrib_ops/webgpu/quantization/dp4a_matmul_nbits.cc index 84954946fa6be..d6e15e56f193f 100644 --- a/onnxruntime/contrib_ops/webgpu/quantization/dp4a_matmul_nbits.cc +++ b/onnxruntime/contrib_ops/webgpu/quantization/dp4a_matmul_nbits.cc @@ -10,39 +10,47 @@ namespace contrib { namespace webgpu { Status DP4AMatMulQuantizeProgram::GenerateShaderCode(ShaderHelper& shader) const { - shader.AddInput("input_a", ShaderUsage::UseUniform | ShaderUsage::UseIndicesTypeAlias | ShaderUsage::UseValueTypeAlias | ShaderUsage::UseElementTypeAlias); - shader.AddOutput("output", ShaderUsage::UseUniform); - shader.AddOutput("scales", ShaderUsage::UseUniform); - return WGSL_TEMPLATE_APPLY(shader, "quantization/dp4a_quantize.wgsl.template"); + const auto& a = shader.AddInput("input_a", ShaderUsage::UseUniform | ShaderUsage::UseIndicesTypeAlias | ShaderUsage::UseValueTypeAlias | ShaderUsage::UseElementTypeAlias); + const auto& output = shader.AddOutput("output", ShaderUsage::UseUniform); + const auto& scales = shader.AddOutput("scales", ShaderUsage::UseUniform); + return WGSL_TEMPLATE_APPLY(shader, "quantization/dp4a_quantize.wgsl.template", + WGSL_TEMPLATE_VARIABLE(a, a), + WGSL_TEMPLATE_VARIABLE(output, output), + WGSL_TEMPLATE_VARIABLE(scales, scales)); } Status DP4AMatMulNBitsProgram::GenerateShaderCode(ShaderHelper& shader) const { - shader.AddInput("input_a", ShaderUsage::UseUniform | ShaderUsage::UseIndicesTypeAlias | ShaderUsage::UseValueTypeAlias); - shader.AddInput("scales_a", ShaderUsage::UseUniform); - shader.AddInput("input_b", ShaderUsage::UseUniform); - shader.AddInput("scales_b", ShaderUsage::UseUniform); + const auto& a = shader.AddInput("input_a", ShaderUsage::UseUniform | ShaderUsage::UseIndicesTypeAlias | ShaderUsage::UseValueTypeAlias); + const auto& scales_a = shader.AddInput("scales_a", ShaderUsage::UseUniform); + const auto& b = shader.AddInput("input_b", ShaderUsage::UseUniform); + const auto& scales_b = shader.AddInput("scales_b", ShaderUsage::UseUniform); if (has_zero_points_) { shader.AddInput("zero_points", ShaderUsage::UseUniform); } - shader.AddOutput("output", ShaderUsage::UseUniform | ShaderUsage::UseElementTypeAlias); + const auto& output = shader.AddOutput("output", ShaderUsage::UseUniform | ShaderUsage::UseElementTypeAlias); return WGSL_TEMPLATE_APPLY(shader, "quantization/dp4a_matmul.wgsl.template", WGSL_TEMPLATE_PARAMETER(block_size, block_size_), WGSL_TEMPLATE_PARAMETER(has_zero_points, 
has_zero_points_), WGSL_TEMPLATE_PARAMETER(is_qualcomm, is_qualcomm_), WGSL_TEMPLATE_PARAMETER(n_bits, nbits_), - WGSL_TEMPLATE_PARAMETER(output_type_i32, true)); + WGSL_TEMPLATE_PARAMETER(output_type_i32, true), + WGSL_TEMPLATE_VARIABLE(a, a), + WGSL_TEMPLATE_VARIABLE(b, b), + WGSL_TEMPLATE_VARIABLE(output, output), + WGSL_TEMPLATE_VARIABLE(scales_a, scales_a), + WGSL_TEMPLATE_VARIABLE(scales_b, scales_b)); } // scale_A components = 1, b components = 4, output components = 1 Status DP4AMatMulNBitsSmallMProgram::GenerateShaderCode(ShaderHelper& shader) const { - shader.AddInput("input_a", ShaderUsage::UseUniform); - shader.AddInput("scales_a", ShaderUsage::UseUniform); - shader.AddInput("input_b", ShaderUsage::UseUniform); - shader.AddInput("scales_b", ShaderUsage::UseUniform); + const auto& a = shader.AddInput("input_a", ShaderUsage::UseUniform); + const auto& scales_a = shader.AddInput("scales_a", ShaderUsage::UseUniform); + const auto& b = shader.AddInput("input_b", ShaderUsage::UseUniform); + const auto& scales_b = shader.AddInput("scales_b", ShaderUsage::UseUniform); if (has_zero_points_) { shader.AddInput("zero_points", ShaderUsage::UseUniform); } - shader.AddOutput("output", ShaderUsage::UseUniform | ShaderUsage::UseElementTypeAlias); + const auto& output = shader.AddOutput("output", ShaderUsage::UseUniform | ShaderUsage::UseElementTypeAlias); ORT_ENFORCE(WorkgroupSizeX() % tile_size_k_vec_ == 0 && tile_size_k_vec_ % 4 == 0, "tile_size_k_vec_ must evenly divide workgroup size X and be divisible by 4"); const uint32_t sub_tile_count = WorkgroupSizeX() / tile_size_k_vec_; @@ -55,7 +63,12 @@ Status DP4AMatMulNBitsSmallMProgram::GenerateShaderCode(ShaderHelper& shader) co WGSL_TEMPLATE_PARAMETER(single_scale_weights, single_scale_weights_), WGSL_TEMPLATE_PARAMETER(sub_tile_count, sub_tile_count), WGSL_TEMPLATE_PARAMETER(tile_size, tile_size_), - WGSL_TEMPLATE_PARAMETER(tile_size_k_vec, tile_size_k_vec_)); + WGSL_TEMPLATE_PARAMETER(tile_size_k_vec, tile_size_k_vec_), + WGSL_TEMPLATE_VARIABLE(a, a), + WGSL_TEMPLATE_VARIABLE(b, b), + WGSL_TEMPLATE_VARIABLE(output, output), + WGSL_TEMPLATE_VARIABLE(scales_a, scales_a), + WGSL_TEMPLATE_VARIABLE(scales_b, scales_b)); } Status ApplyDP4AMatrixMatMulNBits(const Tensor* a, const Tensor* b, const Tensor* scales, diff --git a/onnxruntime/contrib_ops/webgpu/quantization/dp4a_matmul_small_m.wgsl.template b/onnxruntime/contrib_ops/webgpu/quantization/dp4a_matmul_small_m.wgsl.template index 57e4903ad219f..dc4e244b1ad28 100644 --- a/onnxruntime/contrib_ops/webgpu/quantization/dp4a_matmul_small_m.wgsl.template +++ b/onnxruntime/contrib_ops/webgpu/quantization/dp4a_matmul_small_m.wgsl.template @@ -8,6 +8,9 @@ #param n_bits #param has_zero_points +#use .getByOffset .setByOffset + + #include "quantization/dp4a_matmul_common.wgsl.template" // This algorithm works to compute dot product of k in parallel, by processing k at each step amongst tile_size_k_vec threads, @@ -47,11 +50,11 @@ fn loadSHMA(a_global: u32, kidx_v: u32, col: u32) return; } - tile_A[col] = input_a[a_global*uniforms.K16+k_offset]; + tile_A[col] = a.getByOffset(a_global*uniforms.K16+k_offset); if (col < scale_a_size_in_tile_a) { // kidx_v - covers 16 values of k in input_a - scale_A[col] = scales_a[a_global*(uniforms.K/128) + kidx_v/8 + col]; + scale_A[col] = scales_a.getByOffset(a_global*(uniforms.K/128) + kidx_v/8 + col); } } @@ -70,7 +73,7 @@ $MAIN { #endif #if single_scale_weights let zero = mm_read_zero(0, 0, uniforms.N, uniforms.zero_blocks_per_col); - let own_scale_b = scales_b[0]; + 
let own_scale_b = scales_b.getByOffset(0); #endif for (var kidx_v:u32 = 0; kidx_v < uniforms.K32; kidx_v += tile_size_k_vec) @@ -95,16 +98,16 @@ $MAIN { let b_offset = b_global * uniforms.K32 + k_offset; #if !single_scale_weights let zero = mm_read_zero(b_global, block_idx, uniforms.N, uniforms.zero_blocks_per_col); - let own_scale_b = scales_b[b_global * uniforms.K / uniforms.block_size + block_idx]; + let own_scale_b = scales_b.getByOffset(b_global * uniforms.K / uniforms.block_size + block_idx); #endif #if n_bits == 4 - let b_value = input_b[b_offset]; + let b_value = b.getByOffset(b_offset); let own_b = DequantizedFrom4BitsTo8Bits(b_value.xy, zero); let own_b1 = DequantizedFrom4BitsTo8Bits(b_value.zw, zero); inter_results[row_offset + local_row][local_col] += SDP8AI(own_a, own_b, own_a1, own_b1, own_scale_a * own_scale_b); #elif n_bits == 8 - let own_b = AlignWithZeroPoint(input_b[b_offset * 2]); - let own_b1 = AlignWithZeroPoint(input_b[b_offset * 2 + 1]); + let own_b = AlignWithZeroPoint(b.getByOffset(b_offset * 2)); + let own_b1 = AlignWithZeroPoint(b.getByOffset(b_offset * 2 + 1)); #if has_zero_points inter_results[row_offset + local_row][local_col] += SDP8AI(own_a, own_b, own_a1, own_b1, own_scale_a * own_scale_b, zero); #else @@ -112,7 +115,7 @@ $MAIN { #endif #elif n_bits == 2 - let b_value = input_b[b_offset]; + let b_value = b.getByOffset(b_offset); let own_b = DequantizedFrom2BitsTo8Bits(b_value.x); let own_b1 = DequantizedFrom2BitsTo8Bits(b_value.y); inter_results[row_offset + local_row][local_col] += SDP8AI(own_a, own_b, own_a1, own_b1, own_scale_a * own_scale_b); @@ -131,7 +134,7 @@ $MAIN { let b_global = b_global_base + local_idx; let output_idx = a_global * uniforms.N + b_global; if (b_global < uniforms.N) { - output[output_idx] = output_value; + output.setByOffset(output_idx, output_value); } } } // MAIN diff --git a/onnxruntime/contrib_ops/webgpu/quantization/dp4a_quantize.wgsl.template b/onnxruntime/contrib_ops/webgpu/quantization/dp4a_quantize.wgsl.template index 8576dfeed7b82..09cbd78fd6ccd 100644 --- a/onnxruntime/contrib_ops/webgpu/quantization/dp4a_quantize.wgsl.template +++ b/onnxruntime/contrib_ops/webgpu/quantization/dp4a_quantize.wgsl.template @@ -5,6 +5,8 @@ // Quantizes input matrix A for DP4A computation // This shader quantizes float values to 8-bit signed integers using pack4x8snorm +#use .getByOffset .setByOffset + var a_values : array, 2>; var max_values : array; @@ -13,7 +15,7 @@ fn readInput(offset: u32) -> input_a_value_t if (offset >= uniforms.output_size) { return input_a_value_t(0); } - return input_a[offset]; + return a.getByOffset(offset); } $MAIN { @@ -26,11 +28,11 @@ $MAIN { let max_temp = max(max_val.xy, max_val.zw); let scale = max(max_temp[0], max_temp[1]); let norm_a = local_a/scale; - output[global_idx] = pack4x8snorm(vec4(norm_a)); + output.setByOffset(global_idx, pack4x8snorm(vec4(norm_a))); if (local_idx % 32 == 0) { // 127 is the max value of signed int8 [-127,127] used by pack4x8snorm for 1.0f. 
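// In other words, pack4x8snorm stores round(clamp(v, -1, 1) * 127) per component, so after the
// group is divided by its max-abs value `scale`, each stored int8 is roughly value * 127 / scale;
// persisting scale / 127 lets the matmul recover value ~= int8 * (scale / 127) per group.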
- scales[workgroup_idx * 2 + local_idx / 32] = scale/127; + scales.setByOffset(workgroup_idx * 2 + local_idx / 32, scale/127); } } else if (sg_size == 16) { let local_a = readInput(global_idx); @@ -53,11 +55,11 @@ $MAIN { let max_temp = max(max_val.xy, max_val.zw); let scale = max(max_temp[0], max_temp[1]); let norm_a = local_a/scale; - output[global_idx] = pack4x8snorm(vec4(norm_a)); + output.setByOffset(global_idx, pack4x8snorm(vec4(norm_a))); if (local_idx % 32 == 0) { // 127 is the max value of signed int8 [-127,127] used by pack4x8snorm for 1.0f. - scales[workgroup_idx * 2 + local_idx / 32] = scale/127; + scales.setByOffset(workgroup_idx * 2 + local_idx / 32, scale/127); } } else { let local_row = local_idx / 32u; @@ -78,11 +80,11 @@ $MAIN { let max_temp = max(max_val.xy, max_val.zw); let scale = max(max_temp[0], max_temp[1]); let norm_a = a_values[local_row][local_col]/scale; - output[global_idx] = pack4x8snorm(vec4(norm_a)); + output.setByOffset(global_idx, pack4x8snorm(vec4(norm_a))); if (local_col == 0u) { // 127 is the max value of signed int8 [-127,127] used by pack4x8snorm for 1.0f. - scales[workgroup_idx * 2 + local_row] = scale/127; + scales.setByOffset(workgroup_idx * 2 + local_row, scale/127); } } } diff --git a/onnxruntime/contrib_ops/webgpu/quantization/matmul_nbits.cc b/onnxruntime/contrib_ops/webgpu/quantization/matmul_nbits.cc index a78eef98ce1ad..f0480a2e3c886 100644 --- a/onnxruntime/contrib_ops/webgpu/quantization/matmul_nbits.cc +++ b/onnxruntime/contrib_ops/webgpu/quantization/matmul_nbits.cc @@ -42,13 +42,13 @@ ONNX_OPERATOR_KERNEL_EX( MatMulNBits); Status MatMulNBitsWideTileProgram::GenerateShaderCode(ShaderHelper& shader) const { - shader.AddInput("input_a", ShaderUsage::UseValueTypeAlias | ShaderUsage::UseElementTypeAlias); - shader.AddInput("input_b", ShaderUsage::UseValueTypeAlias | ShaderUsage::UseElementTypeAlias); - shader.AddInput("scales", ShaderUsage::UseValueTypeAlias | ShaderUsage::UseElementTypeAlias); + const auto& a = shader.AddInput("input_a", ShaderUsage::UseValueTypeAlias | ShaderUsage::UseElementTypeAlias); + const auto& b = shader.AddInput("input_b", ShaderUsage::UseValueTypeAlias | ShaderUsage::UseElementTypeAlias); + const auto& scales = shader.AddInput("scales", ShaderUsage::UseValueTypeAlias | ShaderUsage::UseElementTypeAlias); if (has_zero_points_) { shader.AddInput("zero_points", ShaderUsage::UseValueTypeAlias | ShaderUsage::UseElementTypeAlias); } - shader.AddOutput("output", ShaderUsage::UseValueTypeAlias | ShaderUsage::UseElementTypeAlias); + const auto& output = shader.AddOutput("output", ShaderUsage::UseValueTypeAlias | ShaderUsage::UseElementTypeAlias); const uint32_t workgroup_size = WorkgroupSizeX() * WorkgroupSizeY(); ORT_ENFORCE(tile_m_ == workgroup_size / 8, "tile_m must be workgroup_size / 8."); @@ -59,18 +59,22 @@ Status MatMulNBitsWideTileProgram::GenerateShaderCode(ShaderHelper& shader) cons WGSL_TEMPLATE_PARAMETER(has_zero_points, has_zero_points_), WGSL_TEMPLATE_PARAMETER(nbits, nbits_), WGSL_TEMPLATE_PARAMETER(tile_m, tile_m_), - WGSL_TEMPLATE_PARAMETER(tile_n, tile_n_)); + WGSL_TEMPLATE_PARAMETER(tile_n, tile_n_), + WGSL_TEMPLATE_VARIABLE(a, a), + WGSL_TEMPLATE_VARIABLE(b, b), + WGSL_TEMPLATE_VARIABLE(output, output), + WGSL_TEMPLATE_VARIABLE(scales, scales)); } // Apply similar idea with DP4AMatMulNBitsSmallMProgram algorithm. 
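// --- Illustrative aside (not part of this patch) ----------------------------
// The changes above stop indexing storage buffers directly (e.g. input_a[offset])
// and instead route every access through the shader variables that are now handed
// to the WGSL templates via WGSL_TEMPLATE_VARIABLE, which expand into
// getByOffset(...)/setByOffset(...) calls in the generated WGSL. The class below is
// a hypothetical, heavily simplified stand-in used only to illustrate that
// string-expansion idea; it is not onnxruntime's actual ShaderVariableHelper.
#include <string>
#include <utility>

class ExampleShaderVariable {
 public:
  explicit ExampleShaderVariable(std::string name) : name_(std::move(name)) {}

  // Produces a WGSL read expression, e.g. "a.getByOffset(row * uniforms.K + col)".
  std::string GetByOffset(const std::string& offset_expr) const {
    return name_ + ".getByOffset(" + offset_expr + ")";
  }

  // Produces a WGSL write statement, e.g. "output.setByOffset(idx, value)".
  std::string SetByOffset(const std::string& offset_expr, const std::string& value_expr) const {
    return name_ + ".setByOffset(" + offset_expr + ", " + value_expr + ")";
  }

 private:
  std::string name_;
};

// Example use when emitting shader source:
//   ExampleShaderVariable b("b");
//   code << "let b_value = " << b.GetByOffset("b_global * uniforms.K_of_b + k_offset") << ";";
// --- End of aside ------------------------------------------------------------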
Status MatMulNBitsProgram::GenerateShaderCode(ShaderHelper& shader) const { const auto& a = shader.AddInput("input_a", ShaderUsage::UseValueTypeAlias); const auto& b = shader.AddInput("input_b"); - shader.AddInput("scales_b"); + const auto& scales_b = shader.AddInput("scales_b"); if (has_zero_points_) { shader.AddInput("zero_points", ShaderUsage::UseUniform); } - shader.AddOutput("output", ShaderUsage::UseElementTypeAlias); + const auto& output = shader.AddOutput("output", ShaderUsage::UseElementTypeAlias); const uint32_t components_a = a.NumComponents(); const uint32_t components_b = b.NumComponents() / 4; // b is stored as uint32 which includes 4 uint8. @@ -92,7 +96,11 @@ Status MatMulNBitsProgram::GenerateShaderCode(ShaderHelper& shader) const { WGSL_TEMPLATE_PARAMETER(sub_tile_count, sub_tile_count), WGSL_TEMPLATE_PARAMETER(tile_size, tile_size_), WGSL_TEMPLATE_PARAMETER(tile_size_k, tile_size_k), - WGSL_TEMPLATE_PARAMETER(tile_size_k_vec, tile_size_k_vec)); + WGSL_TEMPLATE_PARAMETER(tile_size_k_vec, tile_size_k_vec), + WGSL_TEMPLATE_VARIABLE(a, a), + WGSL_TEMPLATE_VARIABLE(b, b), + WGSL_TEMPLATE_VARIABLE(output, output), + WGSL_TEMPLATE_VARIABLE(scales_b, scales_b)); } Status MatMulNBits::ComputeInternal(onnxruntime::webgpu::ComputeContext& context) const { diff --git a/onnxruntime/contrib_ops/webgpu/quantization/matmul_nbits.wgsl.template b/onnxruntime/contrib_ops/webgpu/quantization/matmul_nbits.wgsl.template index aba6e3d57c72a..0fe3ec92ef3de 100644 --- a/onnxruntime/contrib_ops/webgpu/quantization/matmul_nbits.wgsl.template +++ b/onnxruntime/contrib_ops/webgpu/quantization/matmul_nbits.wgsl.template @@ -12,6 +12,8 @@ #param tile_size_k #param tile_size +#use .getByOffset .setByOffset + #include "quantization/matmul_nbits_zero_pt.wgsl.template" // Shared memory @@ -22,7 +24,7 @@ fn loadSHMA(batch: u32, a_global: u32, kidx: u32, col: u32) { let k_offset = kidx / component_a + col; if (batch < uniforms.batch_count && k_offset < uniforms.K_of_a) { - tile_A[col] = input_a[batch * uniforms.M * uniforms.K_of_a + a_global * uniforms.K_of_a + k_offset]; + tile_A[col] = a.getByOffset(batch * uniforms.M * uniforms.K_of_a + a_global * uniforms.K_of_a + k_offset); } else { tile_A[col] = input_a_value_t(0); } @@ -38,7 +40,7 @@ $MAIN { #if single_scale_weights let block_idx = 0; - let scale_b = scales_b[0]; + let scale_b = scales_b.getByOffset(0); let zero = mm_read_zero(0, 0, uniforms.N, uniforms.zero_blocks_per_col); #endif @@ -58,10 +60,10 @@ $MAIN { { #if !single_scale_weights let block_idx = (kidx + idx * elements_in_value_b) / uniforms.block_size; - let scale_b = scales_b[b_global * uniforms.blocks_per_col + block_idx]; + let scale_b = scales_b.getByOffset(b_global * uniforms.blocks_per_col + block_idx); let zero = mm_read_zero(b_global, block_idx, uniforms.N, uniforms.zero_blocks_per_col); #endif - var b_value = input_b[b_global * uniforms.K_of_b + k_offset]; + var b_value = b.getByOffset(b_global * uniforms.K_of_b + k_offset); #if n_bits == 4 var sum = output_element_t(0); @@ -152,7 +154,7 @@ $MAIN { let b_global = b_global_base + local_idx; let output_idx = batch * uniforms.M * uniforms.N + a_global * uniforms.N + b_global; if (b_global < uniforms.N) { - output[output_idx] = output_value; + output.setByOffset(output_idx, output_value); } } } // MAIN diff --git a/onnxruntime/contrib_ops/webgpu/quantization/matmul_nbits_wide_tile.wgsl.template b/onnxruntime/contrib_ops/webgpu/quantization/matmul_nbits_wide_tile.wgsl.template index 462f9a340c1b8..7c2fca615a99b 100644 --- 
a/onnxruntime/contrib_ops/webgpu/quantization/matmul_nbits_wide_tile.wgsl.template +++ b/onnxruntime/contrib_ops/webgpu/quantization/matmul_nbits_wide_tile.wgsl.template @@ -6,6 +6,8 @@ #param tile_m #param tile_n +#use .getByOffset .setByOffset + // Only support Block32 at the moment. const KAVecSizeForBlock32 = 8u; @@ -58,7 +60,7 @@ fn load_zero(row : u32, col : u32, r_dim : u32, c_dim : u32) -> output_element_t fn load_a(batch : u32, row : u32, col : u32) -> input_a_value_t { if (batch < uniforms.Batch && row < uniforms.M && col < uniforms.K_of_a) { let offset = batch * uniforms.M * uniforms.K_of_a + row * uniforms.K_of_a + col; - return input_a[offset]; + return a.getByOffset(offset); } return input_a_value_t(); } @@ -66,7 +68,7 @@ fn load_a(batch : u32, row : u32, col : u32) -> input_a_value_t { fn load_scale(row : u32, block_idx : u32) -> output_element_t { if (row < uniforms.N && block_idx < uniforms.n_blocks_per_col) { let offset = row * uniforms.n_blocks_per_col + block_idx; - return scales[offset]; + return scales.getByOffset(offset); } return output_element_t(); } @@ -74,7 +76,7 @@ fn load_scale(row : u32, block_idx : u32) -> output_element_t { fn write_output(batch : u32, row : u32, col : u32, value : output_element_t) { if (batch < uniforms.Batch && row < uniforms.M && col < uniforms.N) { let offset = batch * uniforms.M * uniforms.N + row * uniforms.N + col; - output[offset] = value; + output.setByOffset(offset, value); } } @@ -82,7 +84,7 @@ fn write_output(batch : u32, row : u32, col : u32, value : output_element_t) { fn load_b(row : u32, block_idx : u32) -> vec4 { if (row < uniforms.N && block_idx < uniforms.K_of_b) { let offset = row * uniforms.K_of_b + block_idx; - return input_b[offset]; + return b.getByOffset(offset); } return vec4(); } @@ -112,10 +114,10 @@ fn load_b(row : u32, block_idx : u32) -> array, 4> { if (row < uniforms.N) { let offset = 2 * block_idx; let b_data_0 = select(input_b_value_t(), - input_b[row * uniforms.K_of_b + offset], + b.getByOffset(row * uniforms.K_of_b + offset), offset < uniforms.K_of_b); let b_data_1 = select(input_b_value_t(), - input_b[row * uniforms.K_of_b + offset + 1], + b.getByOffset(row * uniforms.K_of_b + offset + 1), offset + 1 < uniforms.K_of_b); let b_data = array, 4>( diff --git a/onnxruntime/contrib_ops/webgpu/quantization/subgroup_matrix_matmul_nbits.cc b/onnxruntime/contrib_ops/webgpu/quantization/subgroup_matrix_matmul_nbits.cc index c8781631fb19c..db1a6319b3247 100644 --- a/onnxruntime/contrib_ops/webgpu/quantization/subgroup_matrix_matmul_nbits.cc +++ b/onnxruntime/contrib_ops/webgpu/quantization/subgroup_matrix_matmul_nbits.cc @@ -132,7 +132,9 @@ Status PrepackProgram::GenerateShaderCode(ShaderHelper& shader) const { return Status::OK(); } -Status GenerateShaderCodeOnIntel(ShaderHelper& shader, uint32_t nbits, int32_t config_index, bool has_zero_points) { +Status GenerateShaderCodeOnIntel(ShaderHelper& shader, const ShaderVariableHelper& b, + const ShaderVariableHelper& scales_b, + uint32_t nbits, int32_t config_index, bool has_zero_points) { auto& config = intel_supported_subgroup_matrix_configs[config_index]; shader.AdditionalImplementation() << "alias component_type = " << ComponentTypeName[static_cast(std::get<2>(config))] << ";\n" << "alias result_component_type = " << ComponentTypeName[static_cast(std::get<3>(config))] << ";\n" @@ -150,7 +152,7 @@ Status GenerateShaderCodeOnIntel(ShaderHelper& shader, uint32_t nbits, int32_t c var tile_B: array; // 64 x 32 - RxC )ADDNL_FN" << GenerateZeroPointReadingCode(nbits, 
has_zero_points, "component_type"); if (nbits == 4) { - shader.AdditionalImplementation() << R"ADDNL_FN( + shader.AdditionalImplementation() << R"ADDNL_FN_PART( fn loadSHMB(tile_base: u32, k_idx: u32, row: u32, c_idx: u32) { let b_global = tile_base + row; if (b_global >= uniforms.N) { @@ -161,9 +163,14 @@ Status GenerateShaderCodeOnIntel(ShaderHelper& shader, uint32_t nbits, int32_t c // 256 threads need to load 64 x 32. 4 threads per row or 8 col per thread. // Stored in column major fashion. let b_idx = u32((b_global * uniforms.K + k_idx + col) / 8); - let scale = component_type(scales_b[(b_global * uniforms.K + k_idx + col) / quantization_block_size]); - let zero = mm_read_zero(b_global, (k_idx + col) / quantization_block_size, uniforms.N, uniforms.zero_blocks_per_col); - let b_value = input_b[b_idx]; + )ADDNL_FN_PART"; + shader.AdditionalImplementation() << "let scale = component_type(" + << scales_b.GetByOffset("(b_global * uniforms.K + k_idx + col) / quantization_block_size") + << ");" + << "let zero = mm_read_zero(b_global, (k_idx + col) / quantization_block_size, uniforms.N, uniforms.zero_blocks_per_col);" + << "let b_value = " + << b.GetByOffset("b_idx") << ';'; + shader.AdditionalImplementation() << R"ADDNL_FN_PART( let b_value_lower = (vec4(unpack4xU8(b_value & 0x0F0F0F0Fu)) - vec4(zero)) * scale; let b_value_upper = (vec4(unpack4xU8((b_value >> 4) & 0x0F0F0F0Fu)) - vec4(zero)) * scale; let tile_b_base = row * tile_k + col; @@ -176,10 +183,10 @@ Status GenerateShaderCodeOnIntel(ShaderHelper& shader, uint32_t nbits, int32_t c tile_B[tile_b_base + 6] = b_value_lower[3]; tile_B[tile_b_base + 7] = b_value_upper[3]; } - )ADDNL_FN"; + )ADDNL_FN_PART"; } else { ORT_ENFORCE(nbits == 8, "Only 4/8 bits are supported for webgpu matmulnbits"); - shader.AdditionalImplementation() << R"ADDNL_FN( + shader.AdditionalImplementation() << R"ADDNL_FN_PART( fn loadSHMB(tile_base: u32, k_idx: u32, row: u32, c_idx: u32) { let b_global = tile_base + row; if (b_global >= uniforms.N) { @@ -190,22 +197,28 @@ Status GenerateShaderCodeOnIntel(ShaderHelper& shader, uint32_t nbits, int32_t c // 256 threads need to load 64 x 32. 4 threads per row or 8 col per thread. // Stored in column major fashion. 
let b_idx = u32((b_global * uniforms.K + k_idx + col) / 8); - let scale = component_type(scales_b[(b_global * uniforms.K + k_idx + col) / quantization_block_size]); - let zero = mm_read_zero(b_global, (k_idx + col) / quantization_block_size, uniforms.N, uniforms.zero_blocks_per_col); - let b_value = input_b[b_idx]; - let b_value0 = (vec4(unpack4xU8(b_value[0])) - vec4(zero)) * scale; - let b_value1 = (vec4(unpack4xU8(b_value[1])) - vec4(zero)) * scale; - let tile_b_base = row * tile_k + col; - tile_B[tile_b_base] = b_value0[0]; - tile_B[tile_b_base + 1] = b_value0[1]; - tile_B[tile_b_base + 2] = b_value0[2]; - tile_B[tile_b_base + 3] = b_value0[3]; - tile_B[tile_b_base + 4] = b_value1[0]; - tile_B[tile_b_base + 5] = b_value1[1]; - tile_B[tile_b_base + 6] = b_value1[2]; - tile_B[tile_b_base + 7] = b_value1[3]; - } - )ADDNL_FN"; + )ADDNL_FN_PART"; + shader.AdditionalImplementation() << "let scale = component_type(" + << scales_b.GetByOffset("(b_global * uniforms.K + k_idx + col) / quantization_block_size") + << ");" + << " let zero = mm_read_zero(b_global, (k_idx + col) / quantization_block_size, uniforms.N, uniforms.zero_blocks_per_col);" + << "let b_value = " + << b.GetByOffset("b_idx") << ';'; + + shader.AdditionalImplementation() << + R"ADDNL_FN_PART(let b_value0 = (vec4(unpack4xU8(b_value[0])) - vec4(zero)) * scale; + let b_value1 = (vec4(unpack4xU8(b_value[1])) - vec4(zero)) * scale; + let tile_b_base = row * tile_k + col; + tile_B[tile_b_base] = b_value0[0]; + tile_B[tile_b_base + 1] = b_value0[1]; + tile_B[tile_b_base + 2] = b_value0[2]; + tile_B[tile_b_base + 3] = b_value0[3]; + tile_B[tile_b_base + 4] = b_value1[0]; + tile_B[tile_b_base + 5] = b_value1[1]; + tile_B[tile_b_base + 6] = b_value1[2]; + tile_B[tile_b_base + 7] = b_value1[3]; + } + )ADDNL_FN_PART"; } shader.MainFunctionBody() << R"MAIN_FN( @@ -266,10 +279,12 @@ Status GenerateShaderCodeOnIntel(ShaderHelper& shader, uint32_t nbits, int32_t c return Status::OK(); } -Status GenerateShaderCodeOnApple(ShaderHelper& shader, uint32_t nbits, bool has_zero_points) { +Status GenerateShaderCodeOnApple(ShaderHelper& shader, const ShaderVariableHelper& a, const ShaderVariableHelper& b, + const ShaderVariableHelper& scales_b, + const ShaderVariableHelper& output, uint32_t nbits, bool has_zero_points) { // tile/subtile sizes and work distribution are inspired from metal shaders in llama.cpp (kernel_mul_mm) // https://github.com/ggml-org/llama.cpp/blob/d04e7163c85a847bc61d58c22f2c503596db7aa8/ggml/src/ggml-metal/ggml-metal.metal#L6066 - shader.AdditionalImplementation() << R"ADDNL_FN( + shader.AdditionalImplementation() << R"ADDNL_FN_PART( const tile_cols = 64; const tile_rows = 32; const tile_k = 32; @@ -292,13 +307,17 @@ Status GenerateShaderCodeOnApple(ShaderHelper& shader, uint32_t nbits, bool has_ // 128 threads need to load 32 x 32. 4 threads per row or 8 col per thread. 
for (var col_offset:u32 = 0; col_offset < 8; col_offset++) { - tile_A[row * tile_k + col + col_offset] = compute_precision(input_a[a_global*uniforms.K + k_idx + col + col_offset]); + )ADDNL_FN_PART"; + shader.AdditionalImplementation() + << " tile_A[row * tile_k + col + col_offset] = compute_precision(" + << a.GetByOffset("a_global * uniforms.K + k_idx + col + col_offset") + << ");"; + shader.AdditionalImplementation() << R"ADDNL_FN_PART( } - } - )ADDNL_FN" - << GenerateZeroPointReadingCode(nbits, has_zero_points, "compute_precision"); + })ADDNL_FN_PART"; + shader.AdditionalImplementation() << GenerateZeroPointReadingCode(nbits, has_zero_points, "compute_precision"); if (nbits == 4) { - shader.AdditionalImplementation() << R"ADDNL_FN( + shader.AdditionalImplementation() << R"ADDNL_FN_PART( fn loadSHMB(tile_base: u32, k_idx: u32, row: u32, c_idx: u32) { let b_global = tile_base + row; if (b_global >= uniforms.N) { @@ -309,28 +328,35 @@ Status GenerateShaderCodeOnApple(ShaderHelper& shader, uint32_t nbits, bool has_ // 128 threads need to load 64 x 32. 2 threads per row or 16 col per thread. // Stored in column major fashion. let b_idx = u32((b_global*uniforms.K + k_idx + col)/8); - let scale = compute_precision(scales_b[(b_global*uniforms.K + k_idx + col)/quantization_block_size]); + )ADDNL_FN_PART"; + shader.AdditionalImplementation() << "let scale = compute_precision(" + << scales_b.GetByOffset("(b_global * uniforms.K + k_idx + col) / quantization_block_size") + << ");"; + shader.AdditionalImplementation() << R"ADDNL_FN_PART( let zero = mm_read_zero(b_global, (k_idx + col) / quantization_block_size, uniforms.N, uniforms.zero_blocks_per_col); for (var step:u32 = 0; step < 2; step++) { - var b_value = input_b[b_idx+step]; - var b_value_lower = (vec4(unpack4xU8(b_value & 0x0F0F0F0Fu)) - vec4(zero)) * scale; - var b_value_upper = (vec4(unpack4xU8((b_value >> 4) & 0x0F0F0F0Fu)) - vec4(zero)) * scale; - let tile_b_base = row * tile_k + col + step * 8; - tile_B[tile_b_base] = b_value_lower[0]; - tile_B[tile_b_base + 1] = b_value_upper[0]; - tile_B[tile_b_base + 2] = b_value_lower[1]; - tile_B[tile_b_base + 3] = b_value_upper[1]; - tile_B[tile_b_base + 4] = b_value_lower[2]; - tile_B[tile_b_base + 5] = b_value_upper[2]; - tile_B[tile_b_base + 6] = b_value_lower[3]; - tile_B[tile_b_base + 7] = b_value_upper[3]; - } - } - )ADDNL_FN"; + )ADDNL_FN_PART"; + shader.AdditionalImplementation() << "var b_value = " + << b.GetByOffset("b_idx+step") + << ';'; + shader.AdditionalImplementation() << R"ADDNL_FN_PART(var b_value_lower = (vec4(unpack4xU8(b_value & 0x0F0F0F0Fu)) - vec4(zero)) * scale; + var b_value_upper = (vec4(unpack4xU8((b_value >> 4) & 0x0F0F0F0Fu)) - vec4(zero)) * scale; + let tile_b_base = row * tile_k + col + step * 8; + tile_B[tile_b_base] = b_value_lower[0]; + tile_B[tile_b_base + 1] = b_value_upper[0]; + tile_B[tile_b_base + 2] = b_value_lower[1]; + tile_B[tile_b_base + 3] = b_value_upper[1]; + tile_B[tile_b_base + 4] = b_value_lower[2]; + tile_B[tile_b_base + 5] = b_value_upper[2]; + tile_B[tile_b_base + 6] = b_value_lower[3]; + tile_B[tile_b_base + 7] = b_value_upper[3]; + } +} + )ADDNL_FN_PART"; } else { ORT_ENFORCE(nbits == 8, "Only 4/8 bits are supported for webgpu matmulnbits"); - shader.AdditionalImplementation() << R"ADDNL_FN( + shader.AdditionalImplementation() << R"ADDNL_FN_PART( fn loadSHMB(tile_base: u32, k_idx: u32, row: u32, c_idx: u32) { let b_global = tile_base + row; if (b_global >= uniforms.N) { @@ -341,42 +367,49 @@ Status GenerateShaderCodeOnApple(ShaderHelper& 
shader, uint32_t nbits, bool has_ // 128 threads need to load 64 x 32. 2 threads per row or 16 col per thread. // Stored in column major fashion. let b_idx = u32((b_global*uniforms.K + k_idx + col)/8); - let scale = compute_precision(scales_b[(b_global*uniforms.K + k_idx + col)/quantization_block_size]); - let zero = mm_read_zero(b_global, (k_idx + col) / quantization_block_size, uniforms.N, uniforms.zero_blocks_per_col); - for (var step:u32 = 0; step < 2; step++) - { - var b_value = input_b[b_idx+step]; - var b_value0 = (vec4(unpack4xU8(b_value[0])) - vec4(zero)) * scale; - var b_value1 = (vec4(unpack4xU8(b_value[1])) - vec4(zero)) * scale; - let tile_b_base = row * tile_k + col + step * 8; - tile_B[tile_b_base] = b_value0[0]; - tile_B[tile_b_base + 1] = b_value0[1]; - tile_B[tile_b_base + 2] = b_value0[2]; - tile_B[tile_b_base + 3] = b_value0[3]; - tile_B[tile_b_base + 4] = b_value1[0]; - tile_B[tile_b_base + 5] = b_value1[1]; - tile_B[tile_b_base + 6] = b_value1[2]; - tile_B[tile_b_base + 7] = b_value1[3]; - } - } - )ADDNL_FN"; + )ADDNL_FN_PART"; + shader.AdditionalImplementation() << "let scale = compute_precision(" + << scales_b.GetByOffset("(b_global * uniforms.K + k_idx + col) / quantization_block_size") + << ");"; + shader.AdditionalImplementation() << R"ADDNL_FN_PART( + let zero = mm_read_zero(b_global, (k_idx + col) / quantization_block_size, uniforms.N, uniforms.zero_blocks_per_col); + for (var step : u32 = 0; step < 2; step++) { + )ADDNL_FN_PART"; + shader.AdditionalImplementation() << "var b_value = " + << b.GetByOffset("b_idx+step") + << ';'; + + shader.AdditionalImplementation() << R"ADDNL_FN_PART( + var b_value0 = (vec4(unpack4xU8(b_value[0])) - vec4(zero)) * scale; + var b_value1 = (vec4(unpack4xU8(b_value[1])) - vec4(zero)) * scale; + let tile_b_base = row * tile_k + col + step * 8; + tile_B[tile_b_base] = b_value0[0]; + tile_B[tile_b_base + 1] = b_value0[1]; + tile_B[tile_b_base + 2] = b_value0[2]; + tile_B[tile_b_base + 3] = b_value0[3]; + tile_B[tile_b_base + 4] = b_value1[0]; + tile_B[tile_b_base + 5] = b_value1[1]; + tile_B[tile_b_base + 6] = b_value1[2]; + tile_B[tile_b_base + 7] = b_value1[3]; } - shader.AdditionalImplementation() << R"ADDNL_FN( - fn storeOutput(offset:u32, row: u32, col:u32, src_slot:u32, row_limit:i32) { - if (row_limit > 0 && row < u32(row_limit)) - { - output[offset + row * uniforms.N + col] = output_element_t(scratch[src_slot][0][row * 8 + col]); - output[offset + row * uniforms.N + col + 8] = output_element_t(scratch[src_slot][1][row * 8 + col]); - output[offset + row * uniforms.N + col + 16] = output_element_t(scratch[src_slot][2][row * 8 + col]); - output[offset + row * uniforms.N + col + 24] = output_element_t(scratch[src_slot][3][row * 8 + col]); - let col2 = col + 1; - output[offset + row * uniforms.N + col2] = output_element_t(scratch[src_slot][0][row * 8 + col2]); - output[offset + row * uniforms.N + col2 + 8] = output_element_t(scratch[src_slot][1][row * 8 + col2]); - output[offset + row * uniforms.N + col2 + 16] = output_element_t(scratch[src_slot][2][row * 8 + col2]); - output[offset + row * uniforms.N + col2 + 24] = output_element_t(scratch[src_slot][3][row * 8 + col2]); - } - } - )ADDNL_FN"; +} + )ADDNL_FN_PART"; + } + shader.AdditionalImplementation() + << " fn storeOutput(offset:u32, row: u32, col:u32, src_slot:u32, row_limit:i32) {\n" + << " if (row_limit > 0 && row < u32(row_limit))\n" + << " {\n" + << " " << output.SetByOffset("offset + row * uniforms.N + col", "output_element_t(scratch[src_slot][0][row * 8 + col])") << ";\n" 
+ << " " << output.SetByOffset("offset + row * uniforms.N + col + 8", "output_element_t(scratch[src_slot][1][row * 8 + col])") << ";\n" + << " " << output.SetByOffset("offset + row * uniforms.N + col + 16", "output_element_t(scratch[src_slot][2][row * 8 + col])") << ";\n" + << " " << output.SetByOffset("offset + row * uniforms.N + col + 24", "output_element_t(scratch[src_slot][3][row * 8 + col])") << ";\n" + << " let col2 = col + 1;\n" + << " " << output.SetByOffset("offset + row * uniforms.N + col2", "output_element_t(scratch[src_slot][0][row * 8 + col2])") << ";\n" + << " " << output.SetByOffset("offset + row * uniforms.N + col2 + 8", "output_element_t(scratch[src_slot][1][row * 8 + col2])") << ";\n" + << " " << output.SetByOffset("offset + row * uniforms.N + col2 + 16", "output_element_t(scratch[src_slot][2][row * 8 + col2])") << ";\n" + << " " << output.SetByOffset("offset + row * uniforms.N + col2 + 24", "output_element_t(scratch[src_slot][3][row * 8 + col2])") << ";\n" + << " }\n" + << " }\n"; shader.MainFunctionBody() << R"MAIN_FN( let a_global_base = workgroup_id.y * tile_rows; @@ -463,18 +496,18 @@ Status GenerateShaderCodeOnApple(ShaderHelper& shader, uint32_t nbits, bool has_ } Status SubgroupMatrixMatMulNBitsProgram::GenerateShaderCode(ShaderHelper& shader) const { - shader.AddInput("input_a", ShaderUsage::UseUniform | ShaderUsage::UseIndicesTypeAlias | ShaderUsage::UseValueTypeAlias); - shader.AddInput("input_b", ShaderUsage::UseUniform); - shader.AddInput("scales_b", ShaderUsage::UseUniform); + const auto& a = shader.AddInput("input_a", ShaderUsage::UseUniform | ShaderUsage::UseIndicesTypeAlias | ShaderUsage::UseValueTypeAlias); + const auto& b = shader.AddInput("input_b", ShaderUsage::UseUniform); + const auto& scales_b = shader.AddInput("scales_b", ShaderUsage::UseUniform); if (has_zero_points_) { shader.AddInput("zero_points", ShaderUsage::UseUniform); } - shader.AddOutput("output", ShaderUsage::UseUniform | ShaderUsage::UseElementTypeAlias); + const auto& output = shader.AddOutput("output", ShaderUsage::UseUniform | ShaderUsage::UseElementTypeAlias); if (!vendor_.compare("apple")) { - return GenerateShaderCodeOnApple(shader, nbits_, has_zero_points_); + return GenerateShaderCodeOnApple(shader, a, b, scales_b, output, nbits_, has_zero_points_); } else if (!vendor_.compare("intel")) { - return GenerateShaderCodeOnIntel(shader, nbits_, config_index_, has_zero_points_); + return GenerateShaderCodeOnIntel(shader, b, scales_b, nbits_, config_index_, has_zero_points_); } else { return Status(onnxruntime::common::ONNXRUNTIME, onnxruntime::common::NOT_IMPLEMENTED, "onnxruntime does not support subgroup matrix on this verdor."); diff --git a/onnxruntime/core/graph/graph.cc b/onnxruntime/core/graph/graph.cc index 3f6443aa73d4c..8b599dc86d997 100644 --- a/onnxruntime/core/graph/graph.cc +++ b/onnxruntime/core/graph/graph.cc @@ -2678,6 +2678,27 @@ class InferenceContextImpl : public ONNX_NAMESPACE::InferenceContext { // only return data if it's for a constant initializer. checks for outer scope initializers // if this is a subgraph and the name isn't found locally. 
const TensorProto* initializer = graph_.GetConstantInitializer(def->Name(), true); + if (initializer != nullptr) { + // Check if this is in-memory external data (data stored in OrtValue) + // ONNX shape inference cannot handle external data, so we need to materialize it + if (utils::HasExternalDataInMemory(*initializer)) { + // Try to get the OrtValue for this initializer + OrtValue ort_value; + if (graph_.GetOrtValueInitializer(def->Name(), ort_value, true)) { + // Create a temporary TensorProto with the actual data from the OrtValue + // This allows ONNX shape inference to access the data + const Tensor& tensor = ort_value.Get(); + auto temp_tensor_proto = utils::TensorToTensorProto(tensor, initializer->name(), /*use_tensor_buffer=*/false); + // Store the temporary proto so it outlives this call, maintain pointers steady + temp_tensor_protos_.push_back(std::make_unique(std::move(temp_tensor_proto))); + return temp_tensor_protos_.back().get(); + } else { + // If we can't get the OrtValue, it is a bug + ORT_THROW("Initializer ", def->Name(), + " has in-memory external data but cannot get OrtValue during shape inference"); + } + } + } return initializer; } @@ -2717,6 +2738,11 @@ class InferenceContextImpl : public ONNX_NAMESPACE::InferenceContext { std::vector> graph_inferencers_; const Graph& graph_; const Graph::ResolveOptions& options_; + // Temporary TensorProtos created for in-memory external data during shape inference + // These need to outlive the shape inference call, so we store them here + // Inference is per node and the instance of this context is on the stack, + // so this is safe. + mutable InlinedVector> temp_tensor_protos_; }; Status Graph::InferAndVerifySubgraphTypes(const Node& node, Graph& subgraph, diff --git a/onnxruntime/core/mlas/lib/convolve.cpp b/onnxruntime/core/mlas/lib/convolve.cpp index bc1221475fd90..9518134631f2d 100644 --- a/onnxruntime/core/mlas/lib/convolve.cpp +++ b/onnxruntime/core/mlas/lib/convolve.cpp @@ -729,6 +729,82 @@ Return Value: } } +void +MlasConvExpandThenGemmSegmentedThreaded( + void* Context, + ptrdiff_t Index +) +/*++ + +Routine Description: + + This routine is invoked from a worker thread to execute a segment of a + convolution operation. + + If using this, the entire convolution operation is parallelized on the + (batch size * group count) parameter and this routine has logic to + perform a specific thread's shard of the entire Convolution operation. + +Arguments: + + Context - Supplies the pointer to the context for the threaded operation. + + Index - Supplies the current index of the threaded operation. + +Return Value: + + None. 
+ +--*/ + +{ + MLAS_CONV_WORK_BLOCK* WorkBlock = (MLAS_CONV_WORK_BLOCK*)Context; + + const MLAS_CONV_PARAMETERS* Parameters = WorkBlock->Parameters; + + const size_t GroupCount = Parameters->GroupCount; + const size_t BatchGroupCount = Parameters->BatchCount * GroupCount; + + const size_t TargetThreadCount = WorkBlock->TargetThreadCount; + + const size_t BatchGroupCountPerThread = BatchGroupCount / TargetThreadCount; + const size_t BatchGroupCountExtra = BatchGroupCount % TargetThreadCount; + + size_t BatchGroupStart; + size_t BatchGroupEnd; + + if (static_cast(Index) < BatchGroupCountExtra) { + BatchGroupStart = (BatchGroupCountPerThread + 1) * Index; + BatchGroupEnd = BatchGroupStart + BatchGroupCountPerThread + 1; + } else { + BatchGroupStart = BatchGroupCountPerThread * Index + BatchGroupCountExtra; + BatchGroupEnd = BatchGroupStart + BatchGroupCountPerThread; + } + + const size_t FilterCount = Parameters->FilterCount; + const size_t OutputSize = Parameters->OutputSize; + const size_t K = Parameters->K; + + const size_t InputGroupSize = Parameters->InputChannels * Parameters->InputSize; + const size_t OutputGroupSize = FilterCount * OutputSize; + const size_t FilterGroupSize = FilterCount * K; + + for (size_t bg = BatchGroupStart; bg < BatchGroupEnd; bg++) { + size_t group = bg % GroupCount; + + const float* input = WorkBlock->Input + bg * InputGroupSize; + const float* filter = WorkBlock->Filter + group * FilterGroupSize; + float* output = WorkBlock->Output + bg * OutputGroupSize; + const float* bias = WorkBlock->Bias; + if (bias != nullptr) { + bias += group * FilterCount; + } + float* ColumnBuffer = WorkBlock->WorkingBuffer + Index * OutputSize * K; + + MlasConvOperation(Parameters, input, filter, bias, ColumnBuffer, output, 0, OutputSize); + } +} + inline bool MlasConvTryMultithread( @@ -890,8 +966,8 @@ Return Value: ptrdiff_t TargetThreadCount = MlasGetMaximumThreadCount(ThreadPool); - if (size_t(TargetThreadCount) >= BatchGroupCount) { - TargetThreadCount = ptrdiff_t(BatchGroupCount); + if (static_cast(TargetThreadCount) >= BatchGroupCount) { + TargetThreadCount = static_cast(BatchGroupCount); } MLAS_CONV_WORK_BLOCK WorkBlock; @@ -919,6 +995,30 @@ Return Value: #endif + if (Algorithm == MlasConvAlgorithmExpandThenGemmSegmented && ((BatchCount > 1) || (GroupCount > 1))) { + const size_t BatchGroupCount = BatchCount * GroupCount; + + ptrdiff_t TargetThreadCount = MlasGetMaximumThreadCount(ThreadPool); + + if (static_cast(TargetThreadCount) >= BatchGroupCount) { + TargetThreadCount = static_cast(BatchGroupCount); + } + + MLAS_CONV_WORK_BLOCK WorkBlock; + + WorkBlock.Parameters = Parameters; + WorkBlock.Input = Input; + WorkBlock.Filter = Filter; + WorkBlock.Bias = Bias; + WorkBlock.WorkingBuffer = WorkingBuffer; + WorkBlock.Output = Output; + WorkBlock.TargetThreadCount = TargetThreadCount; + + MlasExecuteThreaded(MlasConvExpandThenGemmSegmentedThreaded, &WorkBlock, TargetThreadCount, ThreadPool); + + return; + } + // // Iterate over each batch and group. 
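// --- Illustrative aside (not part of this patch) ----------------------------
// The new MlasConvExpandThenGemmSegmentedThreaded worker above shards the
// (BatchCount * GroupCount) iteration space so that the first
// BatchGroupCountExtra threads each take one extra item. The standalone sketch
// below reproduces only that partitioning arithmetic; ShardRange and the sample
// counts (10 items, 4 threads) are illustrative names and values, not part of MLAS.
#include <cstddef>
#include <cstdio>

// Computes the half-open range [start, end) of items handled by thread `index`
// out of `threads` workers, matching the scheme used by the worker routine.
static void ShardRange(size_t total, size_t threads, size_t index,
                       size_t* start, size_t* end) {
  const size_t per_thread = total / threads;
  const size_t extra = total % threads;
  if (index < extra) {
    *start = (per_thread + 1) * index;
    *end = *start + per_thread + 1;
  } else {
    *start = per_thread * index + extra;
    *end = *start + per_thread;
  }
}

int main() {
  // 10 batch*group items across 4 threads -> shards of size 3, 3, 2, 2.
  for (size_t t = 0; t < 4; ++t) {
    size_t s = 0, e = 0;
    ShardRange(/*total=*/10, /*threads=*/4, t, &s, &e);
    std::printf("thread %zu handles [%zu, %zu)\n", t, s, e);
  }
  return 0;
}
// --- End of aside ------------------------------------------------------------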
// @@ -1308,6 +1408,18 @@ Return Value: Parameters->u.ExpandThenGemmSegmented.ThreadStrideN = StrideN; *WorkingBufferSize = TargetThreadCount * MLAS_CONV_WORKING_BUFFER_SIZE_PER_THREAD; + + if (Parameters->BatchCount > 1 || Parameters->GroupCount > 1) { + + size_t WorkingBufferSizePerThread = std::max({Parameters->OutputSize * Parameters->K, + Parameters->FilterCount * Parameters->OutputSize, + static_cast<size_t>(MLAS_CONV_WORKING_BUFFER_SIZE_PER_THREAD)}); + TargetThreadCount = MaximumThreadCount; + if (static_cast<size_t>(TargetThreadCount) >= Parameters->BatchCount * Parameters->GroupCount) { + TargetThreadCount = static_cast<ptrdiff_t>(Parameters->BatchCount * Parameters->GroupCount); + } + *WorkingBufferSize = TargetThreadCount * WorkingBufferSizePerThread; + } } } #if defined(_MSC_VER) && !defined(__clang__) diff --git a/onnxruntime/core/mlas/lib/kleidiai/mlasi_kleidiai.h b/onnxruntime/core/mlas/lib/kleidiai/mlasi_kleidiai.h index 5136061c4769d..2e9c4574fd057 100644 --- a/onnxruntime/core/mlas/lib/kleidiai/mlasi_kleidiai.h +++ b/onnxruntime/core/mlas/lib/kleidiai/mlasi_kleidiai.h @@ -115,3 +115,37 @@ MlasConv( MLAS_THREADPOOL* ThreadPool ); } + +/*++ + +Routine Description: + + This routine determines if a wraparound will occur when multiplying two size_t variables. + Uses __builtin_mul_overflow if available on the current system and if not falls back + to a default implementation to check this wraparound. + +Arguments: + + a - Supplies the first number to be multiplied. + + b - Supplies the second number to be multiplied. + + out - Supplies a pointer to a size_t which acts as the return value in success cases. + +Return Value: + + Returns false if the operation was successful. + Returns true if wraparound of size_t was detected. + +--*/ +inline bool mul_overflow_size_t_builtin(size_t a, size_t b, size_t* out) { +#if defined(__has_builtin) +# if __has_builtin(__builtin_mul_overflow) + return __builtin_mul_overflow(a, b, out); +# endif +#endif + // Fallback to manual check if builtin not available + if (b != 0 && a > SIZE_MAX / b) return true; + if (out) *out = a * b; + return false; +} diff --git a/onnxruntime/core/mlas/lib/kleidiai/sgemm_kleidiai.cpp b/onnxruntime/core/mlas/lib/kleidiai/sgemm_kleidiai.cpp index ea38f16205a7c..435ff1fb10017 100644 --- a/onnxruntime/core/mlas/lib/kleidiai/sgemm_kleidiai.cpp +++ b/onnxruntime/core/mlas/lib/kleidiai/sgemm_kleidiai.cpp @@ -14,6 +14,16 @@ #include "kai/ukernels/matmul/pack/kai_rhs_pack_nxk_f32p2vlx1biasf32_f32_f32_sme.h" #include "mlasi_kleidiai.h" + +// Thread-local reusable buffers to reduce allocation overhead across tiles. +struct KaiTlsBuffers { + std::vector<float> output_tile; + std::vector<float> bias_zero; + std::vector<std::byte> rhs_packed; + std::vector<std::byte> lhs_packed; +}; +static thread_local KaiTlsBuffers g_kai_tls; + size_t MLASCALL ArmKleidiAI::MlasGemmPackBSize( @@ -51,7 +61,6 @@ Return Value: // Compute the number of bytes required to hold the packed buffer. // size_t bytes = 0; - if (TransA == CblasNoTrans) { switch (TransB) { case CblasNoTrans: @@ -125,15 +134,15 @@ Return Value: const size_t sr = UseSME2 ? kai_get_sr_matmul_clamp_f32_f32p2vlx1_f32p2vlx1biasf32_sme2_mopa() : kai_get_sr_matmul_clamp_f32_f32p2vlx1_f32p2vlx1b_2vlx2vl_sme_mopa(); - // pass zeroed bias values - const std::vector<float> bias(N); + // Ensure size and zero the used span.
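// --- Illustrative aside (not part of this patch) ----------------------------
// mul_overflow_size_t_builtin, added above in mlasi_kleidiai.h, is used to guard
// the stride * batch-count products before the thread-local buffers are resized.
// The self-contained sketch below shows that usage pattern; the helper is
// re-declared here under the hypothetical name mul_overflow_size_t purely so the
// example compiles on its own, and the sizes are made-up sample values.
#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <vector>

static bool mul_overflow_size_t(size_t a, size_t b, size_t* out) {
#if defined(__has_builtin)
#  if __has_builtin(__builtin_mul_overflow)
  return __builtin_mul_overflow(a, b, out);
#  endif
#endif
  // Manual fallback: a * b wraps around iff a > SIZE_MAX / b (for b != 0).
  if (b != 0 && a > SIZE_MAX / b) return true;
  if (out) *out = a * b;
  return false;
}

int main() {
  const size_t packed_stride = 1024;  // bytes per batch (sample value)
  const size_t batch_count = 8;       // number of batches (sample value)
  size_t total_bytes = 0;
  if (mul_overflow_size_t(packed_stride, batch_count, &total_bytes)) {
    // The kernel falls back to the generic MLAS path at this point.
    std::printf("size_t overflow detected, falling back\n");
    return 0;
  }
  std::vector<std::byte> packed(total_bytes);  // safe to size the buffer now
  std::printf("allocated %zu bytes\n", packed.size());
  return 0;
}
// --- End of aside ------------------------------------------------------------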
+ g_kai_tls.bias_zero.resize(N, 0.0f); switch (TransB) { case CblasNoTrans: - kai_run_rhs_pack_kxn_f32p2vlx1biasf32_f32_f32_sme(1, N, K, nr, kr, sr, ldb * sizeof(float), B, bias.data(), nullptr, PackedB, 0, nullptr); + kai_run_rhs_pack_kxn_f32p2vlx1biasf32_f32_f32_sme(1, N, K, nr, kr, sr, ldb * sizeof(float), B, g_kai_tls.bias_zero.data(), nullptr, PackedB, 0, nullptr); break; case CblasTrans: - kai_run_rhs_pack_nxk_f32p2vlx1biasf32_f32_f32_sme(1, N, K, nr, kr, sr, ldb * sizeof(float), B, bias.data(), nullptr, PackedB, 0, nullptr); + kai_run_rhs_pack_nxk_f32p2vlx1biasf32_f32_f32_sme(1, N, K, nr, kr, sr, ldb * sizeof(float), B, g_kai_tls.bias_zero.data(), nullptr, PackedB, 0, nullptr); break; default: return false; @@ -225,22 +234,29 @@ Return Value: size_t n_step = UseSME2 ? kai_get_n_step_matmul_clamp_f32_f32p2vlx1_f32p2vlx1biasf32_sme2_mopa() : kai_get_n_step_matmul_clamp_f32_f32p2vlx1_f32p2vlx1b_2vlx2vl_sme_mopa(); - if (M < m_step && N < n_step && !Data->BIsPacked) { + if ((M < m_step || N < n_step) && !Data->BIsPacked) { // Fallback to MLAS return false; } - std::vector KaiPackedData; - KaiPackedData.resize(BatchSize); - size_t LhsPackedStride = 0; std::byte* LhsPackedData = nullptr; LhsPackedStride = kai_get_lhs_packed_size_lhs_pack_f32p2vlx1_f32_sme(M, K, mr, kr, sr); - auto LhsPacked = std::make_unique(LhsPackedStride * BatchSize); - LhsPackedData = LhsPacked.get(); - std::unique_ptr RhsPacked{nullptr}; + size_t lhs_resize = 0; + if(mul_overflow_size_t_builtin(LhsPackedStride, BatchSize, &lhs_resize)) + { + // size_t wraparound detected for LhsPackedStride, fallback to MLAS + return false; + } + + g_kai_tls.lhs_packed.resize(lhs_resize); + LhsPackedData = g_kai_tls.lhs_packed.data(); + + // RHS packed buffer: use TLS reusable vector to minimize allocations + size_t RhsPackedStride = 0; + std::byte* RhsPackedData = nullptr; // It is assumed all B batches require packing or not if (Data[0].BIsPacked) { @@ -248,36 +264,31 @@ Return Value: MlasTrySimpleParallel(ThreadPool, BatchSize, [&](ptrdiff_t batch_idx) { std::byte* LhsPackedPtr = &(LhsPackedData[LhsPackedStride * batch_idx]); kai_run_lhs_pack_f32p2vlx1_f32_sme(M, K, mr, kr, sr, 0, Data[batch_idx].A, Data[batch_idx].lda * sizeof(float), LhsPackedPtr); - KaiPackedData[batch_idx].A = reinterpret_cast(LhsPackedPtr); - KaiPackedData[batch_idx].B = Data[batch_idx].B; }); } else { // Multithread pack lhs and rhs - size_t RhsPackedStride = 0; - std::byte* RhsPackedData = nullptr; - RhsPackedStride = ArmKleidiAI::MlasGemmPackBSize(TransA, TransB, N, K); - RhsPacked = std::make_unique(RhsPackedStride * BatchSize); - RhsPackedData = RhsPacked.get(); + size_t rhs_resize = 0; + if (mul_overflow_size_t_builtin(RhsPackedStride, BatchSize, &rhs_resize)) + { + // size_t wraparound detected for RhsPackedStride, fallback to MLAS + return false; + } + + g_kai_tls.rhs_packed.resize(rhs_resize); + RhsPackedData = g_kai_tls.rhs_packed.data(); MlasTrySimpleParallel(ThreadPool, BatchSize * 2, [&](ptrdiff_t batch_idx) { - // lhs odd, rhs even if (batch_idx & 0x1) { batch_idx >>= 1; - std::byte* LhsPackedPtr = &(LhsPackedData[LhsPackedStride * batch_idx]); - kai_run_lhs_pack_f32p2vlx1_f32_sme(M, K, mr, kr, sr, 0, Data[batch_idx].A, Data[batch_idx].lda * sizeof(float), LhsPackedPtr); - - KaiPackedData[batch_idx].A = reinterpret_cast(LhsPackedPtr); } else { batch_idx >>= 1; - std::byte* RhsPackedPtr = &(RhsPackedData[RhsPackedStride * batch_idx]); - - ArmKleidiAI::MlasGemmPackB(TransA, TransB, N, K, reinterpret_cast(Data[batch_idx].B), Data[batch_idx].ldb, 
RhsPackedPtr); - - KaiPackedData[batch_idx].B = reinterpret_cast(RhsPackedPtr); + ArmKleidiAI::MlasGemmPackB(TransA, TransB, N, K, + reinterpret_cast(Data[batch_idx].B), + Data[batch_idx].ldb, RhsPackedPtr); } }); } @@ -303,6 +314,14 @@ Return Value: dim[1] = MlasDivRoundup(M, m_step); dim[2] = MlasDivRoundup(N, n_step); + // Pre-check maximum tile size to avoid per-iteration overflow inside the parallel loop. + // Any TileSizeM/TileSizeN used below will be <= m_step/n_step respectively. + size_t max_tile_elems = 0; + if (mul_overflow_size_t_builtin(m_step, n_step, &max_tile_elems)) { + // size_t wraparound detected for tile size, fallback to MLAS + return false; + } + MlasTrySimpleParallel(ThreadPool, static_cast(dim[0] * dim[1] * dim[2]), [=](ptrdiff_t tid) { // compute B,M,N index from iteration index ptrdiff_t BIdx = tid / (dim[1] * dim[2]); @@ -314,18 +333,18 @@ Return Value: UseSME2 ? kai_get_rhs_packed_offset_matmul_clamp_f32_f32p2vlx1_f32p2vlx1biasf32_sme2_mopa(NIdx * n_step, K) : kai_get_rhs_packed_offset_matmul_clamp_f32_f32p2vlx1_f32p2vlx1b_2vlx2vl_sme_mopa(NIdx * n_step, K); - auto BTile = reinterpret_cast( - reinterpret_cast(KaiPackedData[BIdx].B) + rhs_packed_offset - ); + const std::byte* B_base = Data[0].BIsPacked + ? reinterpret_cast(Data[BIdx].B) + : (RhsPackedData + RhsPackedStride * BIdx); + auto BTile = reinterpret_cast(B_base + rhs_packed_offset); // Get lhs tile, A const size_t lhs_packed_offset = UseSME2 ? kai_get_lhs_packed_offset_matmul_clamp_f32_f32p2vlx1_f32p2vlx1biasf32_sme2_mopa(MIdx * m_step, K) : kai_get_lhs_packed_offset_matmul_clamp_f32_f32p2vlx1_f32p2vlx1b_2vlx2vl_sme_mopa(MIdx * m_step, K); - auto ATile = reinterpret_cast( - reinterpret_cast(KaiPackedData[BIdx].A) + lhs_packed_offset - ); + const std::byte* A_base = LhsPackedData + LhsPackedStride * BIdx; + auto ATile = reinterpret_cast(A_base + lhs_packed_offset); auto TileSizeM = (MIdx + 1) * m_step > M ? (M - MIdx * m_step) : m_step; auto TileSizeN = (NIdx + 1) * n_step > N ? (N - NIdx * n_step) : n_step; @@ -336,9 +355,14 @@ Return Value: MIdx * m_step * Data[BIdx].ldc * sizeof(float) + NIdx * n_step * sizeof(float) ); - // Allocate temporary buffer for raw A*B result - std::vector OutputTile(TileSizeM * TileSizeN, 0.0f); - float* temp_tile = OutputTile.data(); + // Allocate temporary buffer for raw A*B result (TLS reusable buffer) + size_t tile_elems = TileSizeM * TileSizeN; + + // resize the tile to the required size + g_kai_tls.output_tile.resize(tile_elems); + + float* temp_tile = g_kai_tls.output_tile.data(); + std::fill_n(temp_tile, tile_elems, 0.0f); if (UseSME2) { kai_run_matmul_clamp_f32_f32p2vlx1_f32p2vlx1biasf32_sme2_mopa( diff --git a/onnxruntime/core/platform/linux/device_discovery.cc b/onnxruntime/core/platform/linux/device_discovery.cc index 6a02a1b46028f..e9c45a6966ef8 100644 --- a/onnxruntime/core/platform/linux/device_discovery.cc +++ b/onnxruntime/core/platform/linux/device_discovery.cc @@ -6,6 +6,7 @@ #include #include #include +#include #include #include "core/common/common.h" @@ -100,27 +101,44 @@ Status ReadValueFromFile(const fs::path& file_path, ValueType& value) { return ParseStringWithClassicLocale(file_text, value); } +std::optional IsGpuDiscrete(uint16_t vendor_id, uint16_t device_id) { + ORT_UNUSED_PARAMETER(device_id); + + // Currently, we only assume that all Nvidia GPUs are discrete. 
+ + constexpr auto kNvidiaPciId = 0x10de; + if (vendor_id == kNvidiaPciId) { + return true; + } + + return std::nullopt; +} + Status GetGpuDeviceFromSysfs(const GpuSysfsPathInfo& path_info, OrtHardwareDevice& gpu_device_out) { OrtHardwareDevice gpu_device{}; const auto& sysfs_path = path_info.path; // vendor id - { - const auto vendor_id_path = sysfs_path / "device" / "vendor"; - ORT_RETURN_IF_ERROR(ReadValueFromFile(vendor_id_path, gpu_device.vendor_id)); - } + uint16_t vendor_id{}; + const auto vendor_id_path = sysfs_path / "device" / "vendor"; + ORT_RETURN_IF_ERROR(ReadValueFromFile(vendor_id_path, vendor_id)); + gpu_device.vendor_id = vendor_id; // TODO vendor name // device id - { - const auto device_id_path = sysfs_path / "device" / "device"; - ORT_RETURN_IF_ERROR(ReadValueFromFile(device_id_path, gpu_device.device_id)); - } + uint16_t device_id{}; + const auto device_id_path = sysfs_path / "device" / "device"; + ORT_RETURN_IF_ERROR(ReadValueFromFile(device_id_path, device_id)); + gpu_device.device_id = device_id; // metadata gpu_device.metadata.Add("card_idx", MakeString(path_info.card_idx)); - // TODO is card discrete? + + if (const auto is_gpu_discrete = IsGpuDiscrete(vendor_id, device_id); + is_gpu_discrete.has_value()) { + gpu_device.metadata.Add("Discrete", (*is_gpu_discrete ? "1" : "0")); + } gpu_device.type = OrtHardwareDeviceType_GPU; diff --git a/onnxruntime/core/providers/cpu/generator/constant_of_shape_base.h b/onnxruntime/core/providers/cpu/generator/constant_of_shape_base.h index ffd954f13e568..f08f134d0c080 100644 --- a/onnxruntime/core/providers/cpu/generator/constant_of_shape_base.h +++ b/onnxruntime/core/providers/cpu/generator/constant_of_shape_base.h @@ -78,8 +78,9 @@ class ConstantOfShapeBase { auto* t_proto_p = t_proto.get(); #endif if (info.GetAttr("value", t_proto_p).IsOK()) { - ORT_ENFORCE(t_proto_p->dims_size() == 1, "Must have a single dimension"); - ORT_ENFORCE(t_proto_p->dims()[0] == 1, "Must have a single dimension of 1"); + for (auto dim : t_proto_p->dims()) { + ORT_ENFORCE(dim == 1, "The value attribute of ConstantOfShape must be a single-element tensor"); + } SetValueFromTensorProto(*t_proto_p); } else { float f_value = 0.f; diff --git a/onnxruntime/core/providers/cpu/ml/tree_ensemble_attribute.h b/onnxruntime/core/providers/cpu/ml/tree_ensemble_attribute.h index ca568e485da11..09db2e4c46245 100644 --- a/onnxruntime/core/providers/cpu/ml/tree_ensemble_attribute.h +++ b/onnxruntime/core/providers/cpu/ml/tree_ensemble_attribute.h @@ -134,11 +134,6 @@ struct TreeEnsembleAttributesV5 { for (auto i : nodes_modes_i) { nodes_modes.push_back(static_cast(i)); } -#else - // GetVectorAttrsOrDefault is not part of the minimal build. - // As a result, TreeEnsemble v5 cannot be available in this build. - ORT_THROW("TreeEnsemble(ai.onnx.ml==5) is not supported with the minimal build."); -#endif aggregate_function = info.GetAttrOrDefault("aggregate_function", 1); leaf_targetids = info.GetAttrsOrDefault("leaf_targetids"); @@ -151,6 +146,11 @@ struct TreeEnsembleAttributesV5 { nodes_truenodeids = info.GetAttrsOrDefault("nodes_truenodeids"); post_transform = info.GetAttrOrDefault("post_transform", 0); tree_roots = info.GetAttrsOrDefault("tree_roots"); +#else + // GetVectorAttrsOrDefault is not part of the minimal build. + // As a result, TreeEnsemble v5 cannot be available in this build. 
+ ORT_THROW("TreeEnsemble(ai.onnx.ml==5) is not supported with the minimal build."); +#endif } void convert_to_v3(TreeEnsembleAttributesV3& output) const { diff --git a/onnxruntime/core/providers/migraphx/migraphx_execution_provider.cc b/onnxruntime/core/providers/migraphx/migraphx_execution_provider.cc index a59347841be95..55f901164bdac 100644 --- a/onnxruntime/core/providers/migraphx/migraphx_execution_provider.cc +++ b/onnxruntime/core/providers/migraphx/migraphx_execution_provider.cc @@ -268,6 +268,7 @@ static bool IsTypeSupported(const NodeArg* node_arg) { case ONNX_NAMESPACE::TensorProto_DataType::TensorProto_DataType_FLOAT16: case ONNX_NAMESPACE::TensorProto_DataType::TensorProto_DataType_BFLOAT16: case ONNX_NAMESPACE::TensorProto_DataType::TensorProto_DataType_FLOAT: + case ONNX_NAMESPACE::TensorProto_DataType::TensorProto_DataType_FLOAT4E2M1: case ONNX_NAMESPACE::TensorProto_DataType::TensorProto_DataType_FLOAT8E4M3FN: case ONNX_NAMESPACE::TensorProto_DataType::TensorProto_DataType_FLOAT8E4M3FNUZ: case ONNX_NAMESPACE::TensorProto_DataType::TensorProto_DataType_FLOAT8E5M2: @@ -318,6 +319,9 @@ static bool getMIGraphXType(ONNXTensorElementDataType type, case ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT8E5M2FNUZ: mgx_type = migraphx_shape_fp8e5m2fnuz_type; break; + case ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT4E2M1: + mgx_type = migraphx_shape_fp4x2_type; + break; case ONNX_TENSOR_ELEMENT_DATA_TYPE_INT4: mgx_type = migraphx_shape_int8_type; break; @@ -949,6 +953,8 @@ GetUnsupportedNodeIndices(const GraphViewer& graph_viewer, "QLinearAdd", "QLinearConv", "QLinearMatMul", + "QLinearAveragePool", + "QLinearGlobalAveragePool", "QuantizeLinear", "QuickGelu", "DynamicQuantizeLinear", diff --git a/onnxruntime/core/providers/qnn/builder/qnn_node_group/qnn_node_group.cc b/onnxruntime/core/providers/qnn/builder/qnn_node_group/qnn_node_group.cc index a47ba7893d8fe..368caa518b7ba 100644 --- a/onnxruntime/core/providers/qnn/builder/qnn_node_group/qnn_node_group.cc +++ b/onnxruntime/core/providers/qnn/builder/qnn_node_group/qnn_node_group.cc @@ -21,6 +21,7 @@ #include "core/providers/qnn/builder/qnn_node_group/udo_fusion.h" #include "core/providers/qnn/builder/qnn_node_group/lpbqgemm_fusion.h" #include "core/providers/qnn/builder/qnn_node_group/lpbqmatmul_fusion.h" +#include "core/providers/qnn/builder/qnn_node_group/reshape_transpose_rank5.h" #include "core/providers/qnn/builder/qnn_utils.h" #include "core/providers/qnn/ort_api.h" @@ -82,6 +83,7 @@ static std::unordered_map> fusions = { {"Gemm", {LowPowerBlockQuantizedGemmFusion::TryFusion, ReshapeGemmFusion::TryFusion}}, {"Mul", {ScaleSoftmaxFusion::TryFusion}}, {"Cast", {CastLoneQFusion::TryFusion}}, + {"Reshape", {Rank6ToRank5Fusion::TryFusion}}, {"Transpose", {ChannelShuffleFusion::TryFusion}}}; void registerUDO(const std::string& node_type, const std::string& op_package) { @@ -117,8 +119,10 @@ static std::unique_ptr TryQnnFusions( const std::unordered_map& node_to_node_unit, const std::unordered_map& node_unit_to_qnn_node_group, const logging::Logger& logger) { - // For now, all fusions involve standalone node units (i.e., no wrapping DQ/Q nodes) except MatMul w/ LPBQ encodings - if (starting_node_unit.UnitType() != NodeUnit::Type::SingleNode && starting_node_unit.OpType() != "MatMul") { + // For now, all fusions involve standalone node units (i.e., no wrapping DQ/Q nodes) except MatMul w/ LPBQ encodings and Reshape + if (starting_node_unit.UnitType() != NodeUnit::Type::SingleNode && + starting_node_unit.OpType() != "MatMul" && + 
starting_node_unit.OpType() != "Reshape") { return nullptr; } diff --git a/onnxruntime/core/providers/qnn/builder/qnn_node_group/reshape_transpose_rank5.cc b/onnxruntime/core/providers/qnn/builder/qnn_node_group/reshape_transpose_rank5.cc new file mode 100644 index 0000000000000..3218e32cac097 --- /dev/null +++ b/onnxruntime/core/providers/qnn/builder/qnn_node_group/reshape_transpose_rank5.cc @@ -0,0 +1,459 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#include "core/providers/qnn/builder/qnn_node_group/reshape_transpose_rank5.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "core/common/inlined_containers.h" +#include "core/providers/qnn/builder/qnn_utils.h" +#include "core/providers/qnn/builder/op_builder_factory.h" +#include "core/providers/qnn/builder/qnn_node_group/utils.h" +#include "core/providers/qnn/builder/qnn_model_wrapper.h" +#include "core/providers/qnn/builder/opbuilder/base_op_builder.h" +#include "core/common/safeint.h" + +namespace onnxruntime { +namespace qnn { +namespace { + +constexpr size_t kRank6 = 6; +constexpr size_t kRank5 = 5; +constexpr const char* kOpTypeReshape = "Reshape"; +constexpr const char* kOpTypeTranspose = "Transpose"; +constexpr const char* kAttrTransposePerm = "perm"; + +using MapNodeToNodeUnit = std::unordered_map; +using MapNodeUnitToGroup = std::unordered_map; + +/// @brief Get the shape of a tensor from its NodeArg +std::optional GetTensorShape(const NodeArg* node_arg) { + if (node_arg == nullptr) { + return std::nullopt; + } + auto shape_proto = node_arg->Shape(); + if (shape_proto == nullptr) { + return std::nullopt; + } + return utils::GetTensorProtoShape(*shape_proto); +} + +/// @brief Get child NodeUnit of specified type, allowing QDQ-wrapped nodes +const NodeUnit* GetChildNodeUnit( + const GraphViewer& graph_viewer, + const NodeUnit& parent_node_unit, + const std::string& child_op_type, + const MapNodeToNodeUnit& node_to_node_unit, + const MapNodeUnitToGroup& node_unit_to_qnn_node_group, + const logging::Logger& logger) { + const Node& parent_node = parent_node_unit.GetNode(); + + ORT_UNUSED_PARAMETER(logger); + // For QDQ NodeUnits, we need to look at the Q node's output, not the target node's output + const Node* search_node = &parent_node; + if (parent_node_unit.UnitType() == NodeUnit::Type::QDQGroup) { + const auto& q_nodes = parent_node_unit.GetQNodes(); + if (!q_nodes.empty()) { + search_node = q_nodes[0]; // Use first Q node + } + } + + // Search node must have a single child (1 output edge) and must not produce a graph output + if (search_node->GetOutputEdgesCount() != 1 || graph_viewer.NodeProducesGraphOutput(*search_node)) { + return nullptr; + } + + // Get the child node from the search node's output edge + const Node* potential_child = &search_node->OutputEdgesBegin()->GetNode(); + if (graph_viewer.GetNode(potential_child->Index()) == nullptr) { + return nullptr; + } + + // If the child is a DequantizeLinear, skip it and look at its child (the target op of the next QDQ group) + if (potential_child->OpType() == "DequantizeLinear") { + if (potential_child->GetOutputEdgesCount() != 1) { + return nullptr; + } + potential_child = &potential_child->OutputEdgesBegin()->GetNode(); + if (graph_viewer.GetNode(potential_child->Index()) == nullptr) { + return nullptr; + } + } + + // Check if this node matches the target type + if (potential_child->OpType() != child_op_type) { + return nullptr; + } + + // Get the NodeUnit for 
the child + const auto child_node_unit_it = node_to_node_unit.find(potential_child); + if (child_node_unit_it == node_to_node_unit.end()) { + return nullptr; + } + + const NodeUnit* child_node_unit = child_node_unit_it->second; + + // Check if child node has already been handled + if (node_unit_to_qnn_node_group.count(child_node_unit) != 0) { + return nullptr; + } + + return child_node_unit; +} + +/// @brief Match the pattern: Reshape -> Transpose -> Reshape with rank-6 intermediate tensors +std::optional> MatchRank6ToRank5Pattern( + const GraphViewer& graph_viewer, + const NodeUnit* reshape1, + const MapNodeToNodeUnit& node_to_node_unit, + const MapNodeUnitToGroup& node_unit_to_qnn_node_group, + const logging::Logger& logger) { + LOGS(logger, VERBOSE) << "[Rank6ToRank5] MatchPattern: Checking node " << reshape1->Name() + << " OpType=" << reshape1->OpType() + << " UnitType=" << static_cast(reshape1->UnitType()); + + // Validate first Reshape in pattern - allow both SingleNode and QDQGroup + if (reshape1->OpType() != kOpTypeReshape) { + LOGS(logger, VERBOSE) << "[Rank6ToRank5] First node in pattern is not a Reshape op"; + return std::nullopt; + } + + // Get Transpose child (middle node in pattern) - allow both SingleNode and QDQGroup + const NodeUnit* transpose = GetChildNodeUnit( + graph_viewer, *reshape1, kOpTypeTranspose, node_to_node_unit, node_unit_to_qnn_node_group, logger); + if (transpose == nullptr) { + LOGS(logger, VERBOSE) << "[Rank6ToRank5] Transpose (middle node in pattern) not found after first Reshape"; + return std::nullopt; + } + + LOGS(logger, VERBOSE) << "[Rank6ToRank5] Found Transpose (middle node): " << transpose->Name(); + + // Get second Reshape child (last node in pattern) - allow both SingleNode and QDQGroup + const NodeUnit* reshape2 = GetChildNodeUnit( + graph_viewer, *transpose, kOpTypeReshape, node_to_node_unit, node_unit_to_qnn_node_group, logger); + if (reshape2 == nullptr) { + LOGS(logger, VERBOSE) << "[Rank6ToRank5] Second Reshape (last node in pattern) not found after Transpose"; + return std::nullopt; + } + + LOGS(logger, VERBOSE) << "[Rank6ToRank5] Found second Reshape (last node): " << reshape2->Name(); + LOGS(logger, INFO) << "[Rank6ToRank5] Pattern matched: Reshape -> Transpose -> Reshape"; + + return std::array{reshape1, transpose, reshape2}; +} + +/// @brief Validate the pattern conditions and find the unit dimension index +std::optional ValidatePatternConditions( + const NodeUnit* reshape1, + const NodeUnit* transpose, + const NodeUnit* reshape2, + const QnnModelWrapper& qnn_model_wrapper, + const logging::Logger& logger) { + // Check if reshape shape inputs are constants + const NodeArg* reshape1_shape_input = reshape1->GetNode().InputDefs()[1]; + const NodeArg* reshape2_shape_input = reshape2->GetNode().InputDefs()[1]; + + if (!qnn_model_wrapper.IsConstantInput(reshape1_shape_input->Name())) { + LOGS(logger, VERBOSE) << "[Rank6ToRank5] ValidateConditions: Reshape1 shape input is not constant"; + return std::nullopt; + } + + if (!qnn_model_wrapper.IsConstantInput(reshape2_shape_input->Name())) { + LOGS(logger, VERBOSE) << "[Rank6ToRank5] ValidateConditions: Reshape2 shape input is not constant"; + return std::nullopt; + } + + // Get tensor shapes + auto t0_shape = GetTensorShape(reshape1->GetNode().InputDefs()[0]); + auto t1_shape = GetTensorShape(reshape1->GetNode().OutputDefs()[0]); + auto t2_shape = GetTensorShape(transpose->GetNode().OutputDefs()[0]); + auto t3_shape = GetTensorShape(reshape2->GetNode().OutputDefs()[0]); + + if 
(!t0_shape.has_value() || !t1_shape.has_value() || + !t2_shape.has_value() || !t3_shape.has_value()) { + LOGS(logger, VERBOSE) << "[Rank6ToRank5] ValidateConditions: Failed to get tensor shapes"; + return std::nullopt; + } + + auto t1_dims = t1_shape->GetDims(); + auto t2_dims = t2_shape->GetDims(); + + // Condition 1: Rank(t1) == Rank(t2) == 6 + if (t1_shape->NumDimensions() != kRank6 || t2_shape->NumDimensions() != kRank6) { + LOGS(logger, VERBOSE) << "[Rank6ToRank5] ValidateConditions: Condition 1 failed - not rank-6: t1_rank=" + << t1_shape->NumDimensions() << " t2_rank=" << t2_shape->NumDimensions(); + return std::nullopt; + } + + if (t1_dims.empty() || t2_dims.empty()) { + LOGS(logger, VERBOSE) << "[Rank6ToRank5] ValidateConditions: Empty dims"; + return std::nullopt; + } + + // Condition 2: Find a dimension with value 1 that exists at the same index in both t1 and t2 + std::optional unit_dim_index; + for (size_t i = 0; i < kRank6; ++i) { + if (t1_dims[i] == 1 && t2_dims[i] == 1) { + unit_dim_index = i; + break; + } + } + + if (!unit_dim_index.has_value()) { + LOGS(logger, VERBOSE) << "[Rank6ToRank5] ValidateConditions: No common unit dimension found in t1 and t2"; + return std::nullopt; + } + + // Condition 3: Transpose must leave the unit dimension in place + NodeAttrHelper transpose_helper(transpose->GetNode()); + std::vector perm = transpose_helper.Get(kAttrTransposePerm, std::vector{}); + if (perm.size() != kRank6) { + LOGS(logger, VERBOSE) << "[Rank6ToRank5] ValidateConditions: Invalid permutation size: " << perm.size(); + return std::nullopt; + } + + if (perm[unit_dim_index.value()] != static_cast(unit_dim_index.value())) { + LOGS(logger, VERBOSE) << "[Rank6ToRank5] ValidateConditions: Transpose moves unit dimension from index " + << unit_dim_index.value() << " to " << perm[unit_dim_index.value()]; + return std::nullopt; + } + + LOGS(logger, INFO) << "[Rank6ToRank5] ValidateConditions: All conditions passed! 
Unit dimension at index " + << unit_dim_index.value(); + return unit_dim_index; +} + +/// @brief Create or validate the QNN nodes with rank-5 tensors +Status CreateOrValidateOnQnn( + QnnModelWrapper* qnn_model_wrapper, + gsl::span node_units, + size_t unit_dim_index, + bool validate, + const logging::Logger& logger) { + LOGS(logger, VERBOSE) << "[Rank6ToRank5] CreateOrValidateOnQnn: validate=" << validate + << " unit_dim_index=" << unit_dim_index; + + const NodeUnit* reshape1 = node_units[0]; + const NodeUnit* transpose = node_units[1]; + const NodeUnit* reshape2 = node_units[2]; + + // Get input and output definitions + const NodeUnitIODef& reshape1_input = reshape1->Inputs()[0]; + const NodeUnitIODef& reshape2_output = reshape2->Outputs()[0]; + + // Get original shapes + auto t1_shape = GetTensorShape(reshape1->GetNode().OutputDefs()[0]); + auto t2_shape = GetTensorShape(transpose->GetNode().OutputDefs()[0]); + + if (!t1_shape.has_value() || !t2_shape.has_value()) { + return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "Failed to get intermediate tensor shapes"); + } + + auto t1_dims = t1_shape->GetDims(); + auto t2_dims = t2_shape->GetDims(); + + // Create rank-5 shape for t1 (remove unit dimension at unit_dim_index) + std::vector t1_rank5_dims; + t1_rank5_dims.reserve(kRank5); + for (size_t i = 0; i < t1_dims.size(); ++i) { + if (i != unit_dim_index) { + t1_rank5_dims.push_back(static_cast(t1_dims[i])); + } + } + + // Create rank-5 shape for t2 (remove unit dimension at unit_dim_index) + std::vector t2_rank5_dims; + t2_rank5_dims.reserve(kRank5); + for (size_t i = 0; i < t2_dims.size(); ++i) { + if (i != unit_dim_index) { + t2_rank5_dims.push_back(static_cast(t2_dims[i])); + } + } + + // Get transpose permutation and adjust for rank-5 + NodeAttrHelper transpose_helper(transpose->GetNode()); + std::vector perm = transpose_helper.Get(kAttrTransposePerm, std::vector{}); + if (perm.size() != kRank6) { + return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "Expected rank-6 permutation, got rank-", perm.size()); + } + + // Remove unit dimension and adjust indices + std::vector perm_rank5; + perm_rank5.reserve(kRank5); + for (size_t i = 0; i < perm.size(); ++i) { + if (i != unit_dim_index) { + int64_t perm_val = perm[i]; + // Adjust index: if perm_val > unit_dim_index, subtract 1 + if (perm_val > static_cast(unit_dim_index)) { + perm_val--; + } + perm_rank5.push_back(static_cast(perm_val)); + } + } + + // Use original tensor names from ONNX + const std::string& t1_name = reshape1->GetNode().OutputDefs()[0]->Name(); + const std::string& t2_name = transpose->GetNode().OutputDefs()[0]->Name(); + + // Get data type from the NodeUnit's output (handles both quantized and float types) + const NodeUnitIODef& reshape1_output = reshape1->Outputs()[0]; + Qnn_DataType_t data_type; + ORT_RETURN_IF_ERROR(utils::GetQnnDataType(reshape1_output.quant_param.has_value(), + reshape1_output.node_arg.TypeAsProto(), + data_type)); + + // Get input shape for first Reshape + std::vector reshape1_input_shape; + ORT_RETURN_IF_NOT(qnn_model_wrapper->GetOnnxShape(reshape1_input.node_arg, reshape1_input_shape), + "Failed to get first Reshape input shape"); + + // Get quantization params for first Reshape input + QnnQuantParamsWrapper quant_param; + ORT_RETURN_IF_ERROR(quant_param.Init(*qnn_model_wrapper, reshape1_input)); + + // Create Reshape1 with rank-5 output using AddReshapeNode + ORT_RETURN_IF_ERROR(qnn_model_wrapper->AddReshapeNode( + reshape1_input.node_arg.Name(), + t1_name, + reshape1_input_shape, + t1_rank5_dims, + data_type, + 
quant_param, + validate, + false, // is_for_input + false // is_for_output + )); + + // Create Transpose with rank-5 input/output + { + // Get quantization params for transpose output + const NodeUnitIODef& transpose_output = transpose->Outputs()[0]; + QnnQuantParamsWrapper transpose_quant_param; + ORT_RETURN_IF_ERROR(transpose_quant_param.Init(*qnn_model_wrapper, transpose_output)); + + // Check if output tensor already exists + if (!qnn_model_wrapper->IsQnnTensorWrapperExist(t2_name)) { + // Create rank-5 output tensor for transpose with proper quantization params + QnnTensorWrapper t2_tensor(t2_name, QNN_TENSOR_TYPE_NATIVE, data_type, std::move(transpose_quant_param), + std::vector(t2_rank5_dims)); + ORT_RETURN_IF_NOT(qnn_model_wrapper->AddTensorWrapper(std::move(t2_tensor)), "Failed to add transpose output"); + } + + // Create perm parameter + std::vector perm_shape = {static_cast(perm_rank5.size())}; + QnnParamWrapper perm_param(transpose->Index(), transpose->Name(), QNN_OP_TRANSPOSE_PARAM_PERM, + std::move(perm_shape), std::move(perm_rank5)); + std::vector param_tensor_names = {perm_param.GetParamTensorName()}; + ORT_RETURN_IF_NOT(qnn_model_wrapper->AddParamWrapper(std::move(perm_param)), "Failed to add perm param"); + + std::vector transpose_input_names = {t1_name}; + std::vector transpose_output_names = {t2_name}; + + ORT_RETURN_IF_NOT(qnn_model_wrapper->CreateQnnNode( + utils::GetUniqueName(*transpose), + QNN_OP_PACKAGE_NAME_QTI_AISW, + QNN_OP_TRANSPOSE, + std::move(transpose_input_names), + std::move(transpose_output_names), + std::move(param_tensor_names), + validate), + "Failed to create rank-5 Transpose node"); + } + + // Get output shape for reshape2 + std::vector reshape2_output_shape; + ORT_RETURN_IF_NOT(qnn_model_wrapper->GetOnnxShape(reshape2_output.node_arg, reshape2_output_shape), + "Failed to get reshape2 output shape"); + + // Get quantization params for reshape2 + QnnQuantParamsWrapper quant_param2; + ORT_RETURN_IF_ERROR(quant_param2.Init(*qnn_model_wrapper, reshape2_output)); + + // Get data type from the NodeUnit's output (handles both quantized and float types) + ORT_RETURN_IF_ERROR(utils::GetQnnDataType(reshape2_output.quant_param.has_value(), + reshape2_output.node_arg.TypeAsProto(), + data_type)); + + // Create Reshape2 with rank-5 input using AddReshapeNode + ORT_RETURN_IF_ERROR(qnn_model_wrapper->AddReshapeNode( + t2_name, + reshape2_output.node_arg.Name(), + t2_rank5_dims, + reshape2_output_shape, + data_type, + quant_param2, + validate, + false, // is_for_input + false // is_for_output + )); + + return Status::OK(); +} + +} // namespace + +std::unique_ptr Rank6ToRank5Fusion::TryFusion( + QnnModelWrapper& qnn_model_wrapper, + const NodeUnit& reshape1_node_unit, + const MapNodeToNodeUnit& node_to_node_unit, + const MapNodeUnitToGroup& node_unit_to_qnn_node_group, + const logging::Logger& logger) { + LOGS(logger, VERBOSE) << "[Rank6ToRank5] TryFusion called for node: " << reshape1_node_unit.Name() + << " OpType: " << reshape1_node_unit.OpType(); + + const GraphViewer& graph_viewer = qnn_model_wrapper.GetGraphViewer(); + + // Match the pattern + std::optional> pattern = MatchRank6ToRank5Pattern( + graph_viewer, &reshape1_node_unit, node_to_node_unit, node_unit_to_qnn_node_group, logger); + + if (!pattern.has_value()) { + LOGS(logger, VERBOSE) << "[Rank6ToRank5] Pattern match failed for node: " << reshape1_node_unit.Name(); + return nullptr; + } + + const NodeUnit* reshape1 = pattern->at(0); + const NodeUnit* transpose = pattern->at(1); + const NodeUnit* 
reshape2 = pattern->at(2);
+
+  // Validate pattern conditions and get unit dimension index
+  auto unit_dim_index = ValidatePatternConditions(reshape1, transpose, reshape2, qnn_model_wrapper, logger);
+  if (!unit_dim_index.has_value()) {
+    LOGS(logger, VERBOSE) << "[Rank6ToRank5] Pattern condition validation failed";
+    return nullptr;
+  }
+
+  // Validate on QNN
+  if (CreateOrValidateOnQnn(&qnn_model_wrapper, pattern.value(), unit_dim_index.value(), /*validate=*/true, logger) != Status::OK()) {
+    LOGS(logger, VERBOSE) << "[Rank6ToRank5] QNN validation failed";
+    return nullptr;
+  }
+
+  LOGS(logger, INFO) << "[Rank6ToRank5] Fusion successful! Creating Rank6ToRank5Fusion node group";
+  return std::make_unique<Rank6ToRank5Fusion>(pattern.value(), unit_dim_index.value());
+}
+
+gsl::span<const NodeUnit* const> Rank6ToRank5Fusion::GetNodeUnits() const {
+  return gsl::span{node_units_.data(), node_units_.size()};
+}
+
+Status Rank6ToRank5Fusion::IsSupported(
+    QnnModelWrapper& qnn_model_wrapper, const logging::Logger& logger) const {
+  return CreateOrValidateOnQnn(&qnn_model_wrapper, GetNodeUnits(), unit_dim_index_, /*validate=*/true, logger);
+}
+
+Status Rank6ToRank5Fusion::AddToModelBuilder(
+    QnnModelWrapper& qnn_model_wrapper, const logging::Logger& logger) const {
+  return CreateOrValidateOnQnn(&qnn_model_wrapper, GetNodeUnits(), unit_dim_index_, /*validate=*/false, logger);
+}
+
+} // namespace qnn
+} // namespace onnxruntime
diff --git a/onnxruntime/core/providers/qnn/builder/qnn_node_group/reshape_transpose_rank5.h b/onnxruntime/core/providers/qnn/builder/qnn_node_group/reshape_transpose_rank5.h
new file mode 100644
index 0000000000000..cbce6933fc8d7
--- /dev/null
+++ b/onnxruntime/core/providers/qnn/builder/qnn_node_group/reshape_transpose_rank5.h
@@ -0,0 +1,65 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+#pragma once
+
+#include <array>
+#include <cstddef>
+#include <memory>
+#include <string_view>
+#include <unordered_map>
+
+#include "core/providers/qnn/builder/qnn_node_group/qnn_node_group.h"
+#include "core/providers/qnn/ort_api.h"
+
+namespace onnxruntime {
+namespace qnn {
+
+class QnnModelWrapper;
+
+///
+/// Represents a fusion of pattern: Reshape -> Transpose -> Reshape where intermediate tensors are rank-6.
+/// QNN doesn't support rank-6 Reshape and Transpose operators, so this fusion converts them to rank-5
+/// by removing a unit dimension (value of 1) from intermediate tensors.
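+/// For example (illustrative shapes, not taken from any particular model): t1 = [1, 8, 12, 64, 32, 2] and
+/// t2 = [1, 12, 8, 64, 32, 2] share a unit dimension at index 0 that the Transpose leaves in place, so the
+/// fused ops can use the rank-5 shapes [8, 12, 64, 32, 2] and [12, 8, 64, 32, 2], with the permutation
+/// adjusted from [0, 2, 1, 3, 4, 5] to [1, 0, 2, 3, 4].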
+/// Pattern: Tensor(t0) -> Reshape(R1) -> Tensor(t1) -> Transpose(T1) -> Tensor(t2) -> Reshape(R2) -> Tensor(t3) +/// Conditions: +/// - Rank(t0) == Rank(t3) AND Last dimension of t0 equals last dimension of t3 +/// - Rank(t1) == Rank(t2) == 6 +/// - There exists a dimension index where both t1 and t2 have value 1 +/// - Transpose must leave that unit dimension in place (perm[unit_dim_index] == unit_dim_index) +/// +class Rank6ToRank5Fusion : public IQnnNodeGroup { + public: + explicit Rank6ToRank5Fusion(gsl::span node_units, size_t unit_dim_index) + : unit_dim_index_(unit_dim_index) { + ORT_ENFORCE(node_units.size() == 3, "Pattern expects exactly 3 NodeUnits."); + node_units_[0] = node_units[0]; + node_units_[1] = node_units[1]; + node_units_[2] = node_units[2]; + } + ORT_DISALLOW_COPY_AND_ASSIGNMENT(Rank6ToRank5Fusion); + + Status IsSupported(QnnModelWrapper& qnn_model_wrapper, const logging::Logger& logger) const override; + Status AddToModelBuilder(QnnModelWrapper& qnn_model_wrapper, const logging::Logger& logger) const override; + gsl::span GetNodeUnits() const override; + const NodeUnit* GetTargetNodeUnit() const override { return node_units_[0]; } + std::string_view Type() const override { return "Rank6ToRank5Fusion"; } + + /// + /// Traverses graph to check if the given starting NodeUnit is part of a valid Reshape -> Transpose -> Reshape + /// pattern with rank-6 intermediate tensors. + /// + static std::unique_ptr TryFusion( + QnnModelWrapper& qnn_model_wrapper, + const NodeUnit& reshape1_node_unit, + const std::unordered_map& node_to_node_unit, + const std::unordered_map& node_unit_to_qnn_node_group, + const logging::Logger& logger); + + private: + std::array node_units_; // Reshape1, Transpose, Reshape2 + size_t unit_dim_index_; // Index of the unit dimension (value 1) to remove +}; + +} // namespace qnn +} // namespace onnxruntime diff --git a/onnxruntime/core/providers/shared_library/provider_wrappedtypes.h b/onnxruntime/core/providers/shared_library/provider_wrappedtypes.h index 1ab32e649ed40..cdbd0c074f443 100644 --- a/onnxruntime/core/providers/shared_library/provider_wrappedtypes.h +++ b/onnxruntime/core/providers/shared_library/provider_wrappedtypes.h @@ -78,6 +78,8 @@ struct int64s final { const int64_t* data() const { return g_host->int64s__data(this); } const int64_t& operator[](int index) const { return Get(index); } void Reserve(int size) { g_host->int64s__Reserve(this, size); } + const int64_t* begin() const { return data(); } + const int64_t* end() const { return data() + size(); } PROVIDER_DISALLOW_ALL(int64s) }; diff --git a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc index 508d932459bf9..cd0c0e4bffdb5 100644 --- a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc +++ b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc @@ -3976,6 +3976,10 @@ Status TensorrtExecutionProvider::CreateNodeComputeInfoFromGraph(const GraphView // Destroy the IExecutionContext objects before destroying an engine object, otherwise it will lead to undefined behavior. trt_state->context->reset(); trt_state->engine->reset(); + + // Clear dds output allocator map since the engine and context will be recreated. 
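+      // (Illustrative note) Entries in this map hold allocators created for the previous engine's
+      // data-dependent-shape outputs, so reusing them after the rebuild could bind stale buffers.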
+ dds_output_allocator_map.clear(); + auto trt_config = std::unique_ptr(trt_builder->createBuilderConfig()); if (max_workspace_size_ > 0) { trt_config->setMemoryPoolLimit(nvinfer1::MemoryPoolType::kWORKSPACE, max_workspace_size_); diff --git a/onnxruntime/core/providers/webgpu/compute_context.h b/onnxruntime/core/providers/webgpu/compute_context.h index fe95917e4e906..c4a88754deffe 100644 --- a/onnxruntime/core/providers/webgpu/compute_context.h +++ b/onnxruntime/core/providers/webgpu/compute_context.h @@ -8,6 +8,7 @@ #include #include "core/framework/execution_provider.h" +#include "core/providers/webgpu/webgpu_execution_provider.h" #include "core/providers/webgpu/program.h" #include "core/providers/webgpu/webgpu_context.h" @@ -16,7 +17,6 @@ namespace onnxruntime { class Tensor; -class WebGpuExecutionProvider; namespace webgpu { @@ -42,6 +42,9 @@ class ComputeContext { inline bool HasFeature(wgpu::FeatureName feature) const { return webgpu_context_.DeviceHasFeature(feature); } + inline bool IsGraphCaptureEnabled() const { + return ep_.IsGraphCaptureEnabled(); + } #if !defined(__wasm__) inline const wgpu::AdapterPropertiesSubgroupMatrixConfigs& SubgroupMatrixConfigs() const { return webgpu_context_.SubgroupMatrixConfigs(); @@ -120,7 +123,7 @@ class ComputeContext { // // Run a compute shader program. // - inline Status RunProgram(const ProgramBase& program) { + inline Status RunProgram(ProgramBase& program) { return webgpu_context_.Run(*this, program); } diff --git a/onnxruntime/core/providers/webgpu/nn/conv2d_mm_webgpu.cc b/onnxruntime/core/providers/webgpu/nn/conv2d_mm_webgpu.cc index ee7a36d17cf55..bf5208883508f 100644 --- a/onnxruntime/core/providers/webgpu/nn/conv2d_mm_webgpu.cc +++ b/onnxruntime/core/providers/webgpu/nn/conv2d_mm_webgpu.cc @@ -103,7 +103,7 @@ std::string Conv2dMMProgram::Conv2dCommonSnippet(const ShaderVariableHelper& x, } } else { sample_w << "let col = colIn * " << inner_element_size_w << ";\n" - << "if (row < i32(uniforms.dim_inner) && col < i32(uniforms.dim_b_outer)) {\n" + << "if (row < i32(uniforms.dim_inner) && col < i32(uniforms.dim_a_outer)) {\n" << " " << get_w_snippet(inner_element_size_w) << "\n" << "}\n" << "return " << TypeSnippet(inner_element_size_w, data_type) << "(0.0);\n"; diff --git a/onnxruntime/core/providers/webgpu/program.h b/onnxruntime/core/providers/webgpu/program.h index 80f6d831d0909..c8f50837cd8e5 100644 --- a/onnxruntime/core/providers/webgpu/program.h +++ b/onnxruntime/core/providers/webgpu/program.h @@ -226,6 +226,7 @@ struct ProgramInput { ProgramInput(const Tensor* tensor, ProgramTensorMetadataDependency dependency, const TensorShape& override_shape, int component); const Tensor* tensor; + uint32_t segments = 1; ProgramTensorMetadataDependency dependency; ProgramVariableDataType var_type; bool use_override_shape; @@ -245,6 +246,7 @@ struct ProgramOutput { ProgramOutput(Tensor* tensor, ProgramTensorMetadataDependency dependency, const TensorShape& override_shape, int component); Tensor* tensor; + uint32_t segments = 1; ProgramTensorMetadataDependency dependency; ProgramVariableDataType var_type; bool is_atomic; @@ -346,6 +348,18 @@ class ProgramBase { inline const ProgramMetadata& Metadata() const { return metadata_; } inline const std::string& CacheHint() const { return cache_hint_; } inline const std::vector& Inputs() const { return inputs_; } + inline void setSegmentsForInput(size_t index, uint32_t segments) { + if (index >= inputs_.size()) { + throw std::out_of_range("input index out of range"); + } + inputs_[index].segments = 
segments; + } + inline void setSegmentsForOutput(size_t index, uint32_t segments) { + if (index >= outputs_.size()) { + throw std::out_of_range("output index out of range"); + } + outputs_[index].segments = segments; + } inline const std::vector& Outputs() const { return outputs_; } inline const std::vector& Indices() const { return indices_; } inline uint32_t DispatchGroupSizeX() const { return dispatch_group_size_x_; } diff --git a/onnxruntime/core/providers/webgpu/program_cache_key.cc b/onnxruntime/core/providers/webgpu/program_cache_key.cc index a351cacc783cf..51c004fadaa8d 100644 --- a/onnxruntime/core/providers/webgpu/program_cache_key.cc +++ b/onnxruntime/core/providers/webgpu/program_cache_key.cc @@ -18,7 +18,7 @@ namespace webgpu { namespace { // append the info of an input or output to the cachekey void AppendTensorInfo(std::ostream& ss, const TensorShape& tensor_shape, ProgramVariableDataType var_type, ProgramTensorMetadataDependency dependency, - bool& first) { + bool& first, uint32_t segments = 1) { if (first) { first = false; } else { @@ -34,6 +34,8 @@ void AppendTensorInfo(std::ostream& ss, const TensorShape& tensor_shape, Program ss << ';'; } + ss D("Segs=") << segments << ';'; + if ((dependency & ProgramTensorMetadataDependency::Shape) == ProgramTensorMetadataDependency::Shape) { ss D("Dims=") << tensor_shape.ToString(); } else if ((dependency & ProgramTensorMetadataDependency::Rank) == ProgramTensorMetadataDependency::Rank) { @@ -97,13 +99,18 @@ std::string CalculateProgramCacheKey(const ProgramBase& program, bool is_1d_disp ss << ":" D("Inputs="); first = true; for (const auto& input : program.Inputs()) { - AppendTensorInfo(ss, input.use_override_shape ? input.override_shape : input.tensor->Shape(), input.var_type, input.dependency, first); + AppendTensorInfo(ss, input.use_override_shape ? input.override_shape : input.tensor->Shape(), input.var_type, input.dependency, first, input.segments); } ss << ":" D("Outputs="); first = true; for (const auto& output : program.Outputs()) { - AppendTensorInfo(ss, output.use_override_shape ? output.override_shape : output.tensor->Shape(), output.var_type, output.dependency, first); + AppendTensorInfo(ss, + output.use_override_shape ? 
output.override_shape : output.tensor->Shape(), + output.var_type, + output.dependency, + first, + output.segments); } if (!program.Indices().empty()) { diff --git a/onnxruntime/core/providers/webgpu/program_manager.cc b/onnxruntime/core/providers/webgpu/program_manager.cc index dcf89d8bb06a1..33c3514f8f6d3 100644 --- a/onnxruntime/core/providers/webgpu/program_manager.cc +++ b/onnxruntime/core/providers/webgpu/program_manager.cc @@ -38,6 +38,28 @@ Status ProgramManager::NormalizeDispatchGroupSize(uint32_t& x, uint32_t& y, uint return Status::OK(); } +Status ProgramManager::CalculateSegmentsForInputsAndOutputs(ProgramBase& program) { + const uint64_t maxStorageBufferBindingSize = limits_.maxStorageBufferBindingSize; + + // Inputs + for (size_t i = 0; i < program.Inputs().size(); ++i) { + const auto& input = program.Inputs()[i]; + if (input.tensor && input.tensor->SizeInBytes() > maxStorageBufferBindingSize) { + uint32_t segments = static_cast((input.tensor->SizeInBytes() + maxStorageBufferBindingSize - 1) / maxStorageBufferBindingSize); + program.setSegmentsForInput(i, segments); + } + } + // Outputs + for (size_t i = 0; i < program.Outputs().size(); ++i) { + const auto& output = program.Outputs()[i]; + if (output.tensor && output.tensor->SizeInBytes() > maxStorageBufferBindingSize) { + uint32_t segments = static_cast((output.tensor->SizeInBytes() + maxStorageBufferBindingSize - 1) / maxStorageBufferBindingSize); + program.setSegmentsForOutput(i, segments); + } + } + return Status::OK(); +} + Status ProgramManager::Build(const ProgramBase& program, const ProgramMetadata& program_metadata, #ifndef NDEBUG // if debug build diff --git a/onnxruntime/core/providers/webgpu/program_manager.h b/onnxruntime/core/providers/webgpu/program_manager.h index feeb703b95aa2..a473051593852 100644 --- a/onnxruntime/core/providers/webgpu/program_manager.h +++ b/onnxruntime/core/providers/webgpu/program_manager.h @@ -37,6 +37,7 @@ class ProgramManager { ProgramManager(const wgpu::Device& device, const wgpu::Limits& limits) : device_(device), limits_(limits) {} Status NormalizeDispatchGroupSize(uint32_t& x, uint32_t& y, uint32_t& z) const; + Status CalculateSegmentsForInputsAndOutputs(ProgramBase& program); Status Build(const ProgramBase& program, const ProgramMetadata& metadata, diff --git a/onnxruntime/core/providers/webgpu/shader_helper.cc b/onnxruntime/core/providers/webgpu/shader_helper.cc index bdeea726a2cf5..0e4a3e08e1c13 100644 --- a/onnxruntime/core/providers/webgpu/shader_helper.cc +++ b/onnxruntime/core/providers/webgpu/shader_helper.cc @@ -91,7 +91,7 @@ const ShaderVariableHelper& ShaderHelper::AddInput(const std::string& name, Shad const auto& dims = program_.Inputs()[input_index].use_override_shape ? program_.Inputs()[input_index].override_shape : program_.Inputs()[input_index].tensor->Shape(); - return AddVariableImpl(true, name, usage, dims); + return AddVariableImpl(true, name, usage, dims, program_.Inputs()[input_index].segments); } const ShaderVariableHelper& ShaderHelper::AddOutput(const std::string& name, ShaderUsage usage) { @@ -101,7 +101,7 @@ const ShaderVariableHelper& ShaderHelper::AddOutput(const std::string& name, Sha const auto& dims = program_.Outputs()[output_index].use_override_shape ? 
program_.Outputs()[output_index].override_shape : program_.Outputs()[output_index].tensor->Shape(); - return AddVariableImpl(false, name, usage, dims); + return AddVariableImpl(false, name, usage, dims, program_.Outputs()[output_index].segments); } const ShaderIndicesHelper& ShaderHelper::AddIndices(const std::string& name, ShaderUsage usage) { @@ -263,12 +263,16 @@ Status ShaderHelper::ValidateVariable(const ProgramOutput& output, const ShaderV #endif // NDEBUG -const ShaderVariableHelper& ShaderHelper::AddVariableImpl(bool is_input, - const std::string& name, - ShaderUsage usage, - const TensorShape& dims) { - ORT_ENFORCE(input_vars_.size() + output_vars_.size() < limits_.maxStorageBuffersPerShaderStage, - "Too many storage buffers in shader. Max is ", limits_.maxStorageBuffersPerShaderStage); +ShaderVariableHelper& ShaderHelper::AddVariableImpl(bool is_input, + const std::string& name, + ShaderUsage usage, + const TensorShape& dims, + uint32_t segments) { + // Add the segments for the new variable we're about to create + numbers_storage_buffers_ += segments; + ORT_ENFORCE(numbers_storage_buffers_ <= limits_.maxStorageBuffersPerShaderStage, + "Too many storage buffers in shader. Current: ", numbers_storage_buffers_, + ", Max is ", limits_.maxStorageBuffersPerShaderStage); ProgramVariableDataType type = ProgramVariableDataType::InvalidType; auto& vars = is_input ? input_vars_ : output_vars_; @@ -276,12 +280,18 @@ const ShaderVariableHelper& ShaderHelper::AddVariableImpl(bool is_input, if (is_input) { const auto& input = program_.Inputs()[vars.size()]; type = input.var_type; + if (segments > 1) { + usage |= ShaderUsage::UseGetByOffsetSegments; + } } else { const auto& output = program_.Outputs()[vars.size()]; type = output.var_type; + if (segments > 1) { + usage |= ShaderUsage::UseSetByOffsetSegments; + } } - const auto& var = vars.emplace_back(std::make_unique(name, type, usage, dims)); + const auto& var = vars.emplace_back(std::make_unique(name, type, usage, dims, segments, limits_.maxStorageBufferBindingSize)); return *var; } @@ -418,28 +428,49 @@ Status ShaderHelper::GenerateSourceCode(std::string& code, std::vector& sha // // Input/output variables // + size_t binding_index = 0; // running binding index accounting for segmented buffers + // inputs for (size_t i = 0; i < input_vars_.size(); ++i) { const auto& input = input_vars_[i]; - ss << "@group(0) @binding(" << i << ") var " << input->name_ << ": array<" << input->StorageType() << ">;\n"; + uint32_t segments = input->segments_; + for (uint32_t seg = 0; seg < segments; ++seg) { + ss << "@group(0) @binding(" << binding_index++ << ") var "; + if (seg == 0) { + ss << input->name_; + } else { + ss << input->name_ << seg; // naming convention matches ShaderVariableHelper::Impl usage (name + index) + } + ss << ": array<" << input->StorageType() << ">;\n"; + } } + // outputs for (size_t i = 0; i < output_vars_.size(); ++i) { const auto& output = output_vars_[i]; bool is_atomic = program_.Outputs()[i].is_atomic; - ss << "@group(0) @binding(" << input_vars_.size() + i << ") var " << output->name_ << ": array<"; - if (is_atomic) { - if (output->type_ == ProgramVariableDataType::Float32) { - ss << "atomic"; - } else if (output->type_ == ProgramVariableDataType::Uint32) { - ss << "atomic"; - } else if (output->type_ == ProgramVariableDataType::Int32) { - ss << "atomic"; + uint32_t segments = output->segments_; + for (uint32_t seg = 0; seg < segments; ++seg) { + ss << "@group(0) @binding(" << binding_index++ << ") var "; + if (seg == 0) { + ss 
<< output->name_; } else { - ORT_RETURN_IF(true, "Unsupported atomic type: ", int(output->type_)); + ss << output->name_ << seg; } - } else { - ss << output->StorageType(); + ss << ": array<"; + if (is_atomic) { + if (output->type_ == ProgramVariableDataType::Float32) { + ss << "atomic"; // emulate float atomic via i32 + } else if (output->type_ == ProgramVariableDataType::Uint32) { + ss << "atomic"; + } else if (output->type_ == ProgramVariableDataType::Int32) { + ss << "atomic"; + } else { + ORT_RETURN_IF(true, "Unsupported atomic type: ", int(output->type_)); + } + } else { + ss << output->StorageType(); + } + ss << ">;\n"; } - ss << ">;\n"; } // @@ -559,7 +590,7 @@ Status ShaderHelper::GenerateSourceCode(std::string& code, std::vector& sha ss << "\n};\n" "@group(0) @binding(" - << input_vars_.size() + output_vars_.size() << ") var uniforms: Uniforms;\n"; + << binding_index << ") var uniforms: Uniforms;\n"; } // diff --git a/onnxruntime/core/providers/webgpu/shader_helper.h b/onnxruntime/core/providers/webgpu/shader_helper.h index ea19a6ae9a875..6878f5236fddf 100644 --- a/onnxruntime/core/providers/webgpu/shader_helper.h +++ b/onnxruntime/core/providers/webgpu/shader_helper.h @@ -128,10 +128,11 @@ class ShaderHelper final { } } - const ShaderVariableHelper& AddVariableImpl(bool is_input, - const std::string& name, - ShaderUsage usage, - const TensorShape& dims); + ShaderVariableHelper& AddVariableImpl(bool is_input, + const std::string& name, + ShaderUsage usage, + const TensorShape& dims, + uint32_t segments); #ifndef NDEBUG // if debug build Status ValidateVariable(const ProgramInput& input, const ShaderVariableHelper& var) const; @@ -165,6 +166,8 @@ class ShaderHelper final { const ProgramBase& program_; const ProgramMetadata& program_metadata_; + uint32_t numbers_storage_buffers_ = 0; + std::vector> input_vars_; std::vector> output_vars_; std::vector> indices_vars_; diff --git a/onnxruntime/core/providers/webgpu/shader_variable.cc b/onnxruntime/core/providers/webgpu/shader_variable.cc index c197e227e2a8c..aa1f6c9a0ec0b 100644 --- a/onnxruntime/core/providers/webgpu/shader_variable.cc +++ b/onnxruntime/core/providers/webgpu/shader_variable.cc @@ -4,6 +4,7 @@ #include #include #include +#include #include "core/providers/webgpu/shader_variable.h" @@ -94,6 +95,33 @@ constexpr static const std::string_view ELEMENT_TYPE_ARRAY[] = { }; constexpr static const auto ELEMENT_TYPE = details::_to_std_array(ELEMENT_TYPE_ARRAY); +constexpr static const uint32_t BYTES_ARRAY[] = { + 4, // Float32 + 8, // Float32x2 + 16, // Float32x4 + 2, // Float16 + 4, // Float16x2 + 8, // Float16x4 + 4, // Int32 + 8, // Int32x2 + 16, // Int32x4 + 4, // Uint32 + 8, // Uint32x2 + 16, // Uint32x4 + 8, // Int64 (vec2) + 8, // Uint64 (vec2) + 4, // Boolx4 (packed in u32) + 4, // Uint8x4 (packed in u32) + 8, // Uint8x8 (vec2) + 16, // Uint8x16 (vec4) + 4, // Int8x4 (packed in u32) + 8, // Int8x8 (vec2) + 16, // Int8x16 (vec4) + 4, // Uint4x8 (packed in u32) + 4, // Int4x8 (packed in u32) +}; +constexpr static const auto BYTES = details::_to_std_array(BYTES_ARRAY); + inline std::string GetIndicesType(int rank) { return rank < 2 ? "u32" : (rank <= 4 ? 
MakeStringWithClassicLocale("vec", rank, "") @@ -114,8 +142,10 @@ ShaderIndicesHelper::ShaderIndicesHelper(std::string_view name, ProgramVariableD element_type_alias_{name_ + "_element_t"}, indices_type_alias_{name_ + "_indices_t"} {} -ShaderVariableHelper::ShaderVariableHelper(std::string_view name, ProgramVariableDataType type, ShaderUsage usage, const TensorShape& dims) - : ShaderIndicesHelper{name, type, usage, dims} { +ShaderVariableHelper::ShaderVariableHelper(std::string_view name, ProgramVariableDataType type, ShaderUsage usage, const TensorShape& dims, uint32_t segments, uint64_t maxStorageBufferBindingSize) + : ShaderIndicesHelper{name, type, usage, dims}, + segments_{segments}, + max_storage_buffer_binding_size_{maxStorageBufferBindingSize} { ORT_ENFORCE(type_ != ProgramVariableDataType::InvalidType, "Invalid type for variable ", name_); ORT_ENFORCE(num_components_ > 0, "Invalid number of components for variable ", name_); } @@ -273,11 +303,47 @@ void ShaderVariableHelper::Impl(std::ostream& ss) const { SS_APPEND(ss, "}\n"); } } + // Implementation of "fn get_{name}_by_offset" for multi-buffer segmented inputs + if (usage_ & ShaderUsage::UseGetByOffsetSegments) { + // Multi-buffer segmented input accessor. + // Compute which physical storage buffer chunk the global linear element offset belongs to. + SS_APPEND(ss, "fn get_", name_, "_by_offset(global_offset: u32) -> ", ValueType(), " {\n"); + SS_APPEND(ss, " const CHUNK_SIZE_IN_ELEMENTS: u32 = ", max_storage_buffer_binding_size_, "u / ", BYTES[static_cast(type_)], "u;\n"); + SS_APPEND(ss, " let buffer_index: u32 = global_offset / CHUNK_SIZE_IN_ELEMENTS;\n"); + SS_APPEND(ss, " let local_offset: u32 = global_offset % CHUNK_SIZE_IN_ELEMENTS;\n"); + SS_APPEND(ss, " switch(buffer_index) {\n"); + // case 0 (base buffer name_) + SS_APPEND(ss, " case 0u: { return ", name_, "[local_offset]; }\n"); + for (uint32_t i = 1; i < segments_; ++i) { + SS_APPEND(ss, " case ", i, "u: { return ", name_, i, "[local_offset]; }\n"); + } + SS_APPEND(ss, " default: { return ", name_, "[local_offset]; }\n"); + SS_APPEND(ss, " }\n"); + SS_APPEND(ss, "}\n"); + } + // Implementation of "fn set_{name}_by_offset" for multi-buffer segmented variables + if (usage_ & ShaderUsage::UseSetByOffsetSegments) { + SS_APPEND(ss, "fn set_", name_, "_by_offset(global_offset: u32, value: ", ValueType(), ") {\n"); + SS_APPEND(ss, " const CHUNK_SIZE_IN_ELEMENTS: u32 = ", max_storage_buffer_binding_size_, "u / ", BYTES[static_cast(type_)], "u;\n"); + SS_APPEND(ss, " let buffer_index: u32 = global_offset / CHUNK_SIZE_IN_ELEMENTS;\n"); + SS_APPEND(ss, " let local_offset: u32 = global_offset % CHUNK_SIZE_IN_ELEMENTS;\n"); + SS_APPEND(ss, " switch(buffer_index) {\n"); + SS_APPEND(ss, " case 0u: { ", name_, "[local_offset] = value; return; }\n"); + for (uint32_t i = 1; i < segments_; ++i) { + SS_APPEND(ss, " case ", i, "u: { ", name_, i, "[local_offset] = value; return; }\n"); + } + SS_APPEND(ss, " default: { ", name_, "[local_offset] = value; return; }\n"); + SS_APPEND(ss, " }\n"); + SS_APPEND(ss, "}\n"); + } } std::string ShaderVariableHelper::GetByOffsetImpl(std::string_view offset) const { SS(ss, kStringInitialSizeGetByOffsetImpl); + if (usage_ & ShaderUsage::UseGetByOffsetSegments) { + return MakeStringWithClassicLocale("get_", name_, "_by_offset(", offset, ")"); + } switch (type_) { case onnxruntime::webgpu::ProgramVariableDataType::InvalidType: ORT_THROW("Invalid type"); @@ -303,12 +369,16 @@ std::string ShaderVariableHelper::GetByOffsetImpl(std::string_view offset) const 
std::string ShaderVariableHelper::SetByOffsetImpl(std::string_view offset, std::string_view value) const { SS(ss, kStringInitialSizeSetByOffsetImpl); + if (usage_ & ShaderUsage::UseSetByOffsetSegments) { + return MakeStringWithClassicLocale("set_", name_, "_by_offset(", offset, ",", value, ");"); + } + switch (type_) { case onnxruntime::webgpu::ProgramVariableDataType::InvalidType: ORT_THROW("Invalid type"); break; case onnxruntime::webgpu::ProgramVariableDataType::Int64: - ss << name_ << "[" << offset << "]=vec2(u32(" << value << "), select(0u, 0xFFFFFFFFu, " << value << " < 0));"; + ss << name_ << "[" << offset << "]=vec2(u32(" << value << "), select(0u, 0xFFFFFFFFu, i32(" << value << ") < 0));"; break; case onnxruntime::webgpu::ProgramVariableDataType::Uint64: ss << name_ << "[" << offset << "]=vec2(u32(" << value << "), 0u);"; diff --git a/onnxruntime/core/providers/webgpu/shader_variable.h b/onnxruntime/core/providers/webgpu/shader_variable.h index 78c98ab26f5b8..8e921d6deafbb 100644 --- a/onnxruntime/core/providers/webgpu/shader_variable.h +++ b/onnxruntime/core/providers/webgpu/shader_variable.h @@ -69,6 +69,8 @@ struct ShaderUsage { UseSetByIndices = 512, // use implementation of fn set_{name}_by_indices UseGet = 1024, // use implementation of fn get_{name} UseGetByIndices = 2048, // use implementation of fn get_{name}_by_indices + UseGetByOffsetSegments = 4096, // use implementation of fn get_{name}_by_offset + UseSetByOffsetSegments = 8192, // use implementation of fn set_{name}_by_offset UseUniform = 32768, // use uniform for shape and stride } usage; @@ -157,7 +159,7 @@ class ShaderIndicesHelper { // A helper class to make it easier to generate shader code related to a variable setting/getting and its indices calculation. class ShaderVariableHelper : public ShaderIndicesHelper { public: - ShaderVariableHelper(std::string_view name, ProgramVariableDataType type, ShaderUsage usage, const TensorShape& dims); + ShaderVariableHelper(std::string_view name, ProgramVariableDataType type, ShaderUsage usage, const TensorShape& dims, uint32_t segments, uint64_t maxStorageBufferBindingSize); ShaderVariableHelper(ShaderVariableHelper&&) = default; ShaderVariableHelper& operator=(ShaderVariableHelper&&) = default; @@ -203,6 +205,9 @@ class ShaderVariableHelper : public ShaderIndicesHelper { std::string_view ValueType() const; std::string_view ElementType() const; + uint32_t segments_ = 1; + uint64_t max_storage_buffer_binding_size_ = 0; + friend class ShaderHelper; }; #if defined(__GNUC__) diff --git a/onnxruntime/core/providers/webgpu/tensor/cast.cc b/onnxruntime/core/providers/webgpu/tensor/cast.cc index 313a96ba25509..daf4aa323c12e 100644 --- a/onnxruntime/core/providers/webgpu/tensor/cast.cc +++ b/onnxruntime/core/providers/webgpu/tensor/cast.cc @@ -11,75 +11,29 @@ namespace onnxruntime { namespace webgpu { namespace { -const std::vector& CastOpTypeConstraints() { - // currently support boolean, integer and float types that explicitly allowed in WGSL: +const std::vector& CastOpTypeConstraints(bool enable_graph_capture) { + // Base types that are always supported - boolean, integer and float types that explicitly allowed in WGSL: // https://gpuweb.github.io/gpuweb/wgsl/#plain-types-section - // - static std::vector types{ + static std::vector base_types{ DataTypeImpl::GetTensorType(), DataTypeImpl::GetTensorType(), DataTypeImpl::GetTensorType(), DataTypeImpl::GetTensorType(), DataTypeImpl::GetTensorType()}; - return types; + + if (enable_graph_capture) { + static std::vector 
types_with_int64 = []() { + auto types = base_types; + types.push_back(DataTypeImpl::GetTensorType()); + return types; + }(); + return types_with_int64; + } else { + return base_types; + } } } // namespace -ONNX_OPERATOR_VERSIONED_KERNEL_EX( - Cast, - kOnnxDomain, - 6, 8, - kWebGpuExecutionProvider, - (*KernelDefBuilder::Create()) - .TypeConstraint("T1", CastOpTypeConstraints()) - .TypeConstraint("T2", CastOpTypeConstraints()), - Cast); -ONNX_OPERATOR_VERSIONED_KERNEL_EX( - Cast, - kOnnxDomain, - 9, 12, - kWebGpuExecutionProvider, - (*KernelDefBuilder::Create()) - .TypeConstraint("T1", CastOpTypeConstraints()) - .TypeConstraint("T2", CastOpTypeConstraints()), - Cast); -ONNX_OPERATOR_VERSIONED_KERNEL_EX( - Cast, - kOnnxDomain, - 13, 18, - kWebGpuExecutionProvider, - (*KernelDefBuilder::Create()) - .TypeConstraint("T1", CastOpTypeConstraints()) - .TypeConstraint("T2", CastOpTypeConstraints()), - Cast); -ONNX_OPERATOR_VERSIONED_KERNEL_EX( - Cast, - kOnnxDomain, - 19, 20, - kWebGpuExecutionProvider, - (*KernelDefBuilder::Create()) - .TypeConstraint("T1", CastOpTypeConstraints()) - .TypeConstraint("T2", CastOpTypeConstraints()), - Cast); -ONNX_OPERATOR_VERSIONED_KERNEL_EX( - Cast, - kOnnxDomain, - 21, 22, - kWebGpuExecutionProvider, - (*KernelDefBuilder::Create()) - .TypeConstraint("T1", CastOpTypeConstraints()) - .TypeConstraint("T2", CastOpTypeConstraints()), - Cast); -ONNX_OPERATOR_KERNEL_EX( - Cast, - kOnnxDomain, - 23, - kWebGpuExecutionProvider, - (*KernelDefBuilder::Create()) - .TypeConstraint("T1", CastOpTypeConstraints()) - .TypeConstraint("T2", CastOpTypeConstraints()), - Cast); - Status Cast::ComputeInternal(ComputeContext& context) const { const auto* input_tensor = context.Input(0); auto* output_tensor = context.Output(0, input_tensor->Shape()); @@ -87,12 +41,17 @@ Status Cast::ComputeInternal(ComputeContext& context) const { if (size == 0) { return Status::OK(); } + bool is_from_int64 = input_tensor->DataType() == DataTypeImpl::GetType(); + const int in_components = is_from_int64 ? 1 : 4; + const int out_components = to_ == ONNX_NAMESPACE::TensorProto_DataType_INT64 ? 1 : 4; uint32_t vec_size = onnxruntime::narrow((size + 3) / 4); + uint32_t in_vec_size = onnxruntime::narrow(in_components == 1 ? size : vec_size); + uint32_t out_vec_size = onnxruntime::narrow(out_components == 1 ? 
size : vec_size); - CastProgram program{to_}; + CastProgram program{to_, is_from_int64}; program - .AddInput({input_tensor, ProgramTensorMetadataDependency::Type, {vec_size}, 4}) - .AddOutput({output_tensor, ProgramTensorMetadataDependency::None, {vec_size}, 4}) + .AddInput({input_tensor, ProgramTensorMetadataDependency::Type, {in_vec_size}, in_components}) + .AddOutput({output_tensor, ProgramTensorMetadataDependency::None, {out_vec_size}, out_components}) .SetDispatchGroupSize((vec_size + WORKGROUP_SIZE - 1) / WORKGROUP_SIZE) .AddUniformVariables({ {static_cast(vec_size)}, @@ -121,15 +80,78 @@ Status CastProgram::GenerateShaderCode(ShaderHelper& sh) const { case ONNX_NAMESPACE::TensorProto_DataType_BOOL: expression = "vec4(a)"; break; + case ONNX_NAMESPACE::TensorProto_DataType_INT64: + expression = "int32(a)"; + break; default: ORT_NOT_IMPLEMENTED("Cast to type ", to_, " is not supported."); } - sh.MainFunctionBody() << sh.GuardAgainstOutOfBoundsWorkgroupSizes("uniforms.vec_size") - << " let a = " << input.GetByOffset("global_idx") << ";\n " - << output.SetByOffset("global_idx", expression); + + sh.MainFunctionBody() << sh.GuardAgainstOutOfBoundsWorkgroupSizes("uniforms.vec_size"); + if (is_from_int64_) { + sh.MainFunctionBody() << " let a0 = " << input.GetByOffset("global_idx * 4") << ";\n" + << " let a1 = " << input.GetByOffset("global_idx * 4 + 1") << ";\n" + << " let a2 = " << input.GetByOffset("global_idx * 4 + 2") << ";\n" + << " let a3 = " << input.GetByOffset("global_idx * 4 + 3") << ";\n" + << " let a = vec4(a0, a1, a2, a3);\n"; + } else { + sh.MainFunctionBody() << " let a = " << input.GetByOffset("global_idx") << ";\n"; + } + if (to_ == ONNX_NAMESPACE::TensorProto_DataType_INT64) { + sh.MainFunctionBody() << output.SetByOffset("global_idx * 4", "a.x") << "\n" + << output.SetByOffset("global_idx * 4 + 1", "a.y") << "\n" + << output.SetByOffset("global_idx * 4 + 2", "a.z") << "\n" + << output.SetByOffset("global_idx * 4 + 3", "a.w") << "\n"; + } else { + sh.MainFunctionBody() << output.SetByOffset("global_idx", expression); + } return Status::OK(); } +template +KernelCreateInfo CreateCastKernelInfo(bool enable_graph_capture) { + const auto& type_constraints = CastOpTypeConstraints(enable_graph_capture); + + KernelCreateFn kernel_create_fn = [](FuncManager&, const OpKernelInfo& info, std::unique_ptr& out) -> Status { + out = std::make_unique(info); + return Status::OK(); + }; + + if constexpr (StartVersion == EndVersion) { + // Non-versioned kernel + return { + KernelDefBuilder() + .SetName("Cast") + .SetDomain(kOnnxDomain) + .SinceVersion(StartVersion) + .Provider(kWebGpuExecutionProvider) + .TypeConstraint("T1", type_constraints) + .TypeConstraint("T2", type_constraints) + .Build(), + kernel_create_fn}; + } else { + // Versioned kernel + return { + KernelDefBuilder() + .SetName("Cast") + .SetDomain(kOnnxDomain) + .SinceVersion(StartVersion, EndVersion) + .Provider(kWebGpuExecutionProvider) + .TypeConstraint("T1", type_constraints) + .TypeConstraint("T2", type_constraints) + .Build(), + kernel_create_fn}; + } +} + +// Explicit template instantiations +template KernelCreateInfo CreateCastKernelInfo<6, 8>(bool); +template KernelCreateInfo CreateCastKernelInfo<9, 12>(bool); +template KernelCreateInfo CreateCastKernelInfo<13, 18>(bool); +template KernelCreateInfo CreateCastKernelInfo<19, 20>(bool); +template KernelCreateInfo CreateCastKernelInfo<21, 22>(bool); +template KernelCreateInfo CreateCastKernelInfo<23>(bool); + } // namespace webgpu } // namespace onnxruntime diff --git 
a/onnxruntime/core/providers/webgpu/tensor/cast.h b/onnxruntime/core/providers/webgpu/tensor/cast.h index 925cd200f0aba..7dfb50e3241c8 100644 --- a/onnxruntime/core/providers/webgpu/tensor/cast.h +++ b/onnxruntime/core/providers/webgpu/tensor/cast.h @@ -3,6 +3,8 @@ #pragma once +#include "core/framework/kernel_registry.h" +#include "core/framework/op_kernel.h" #include "core/providers/webgpu/webgpu_kernel.h" namespace onnxruntime { @@ -10,7 +12,7 @@ namespace webgpu { class CastProgram final : public Program { public: - CastProgram(int32_t to) : Program{"Cast"}, to_{to} {} + CastProgram(int32_t to, bool is_from_int64) : Program{"Cast"}, to_{to}, is_from_int64_{is_from_int64} {} Status GenerateShaderCode(ShaderHelper& sh) const override; @@ -18,6 +20,7 @@ class CastProgram final : public Program { private: int32_t to_; + bool is_from_int64_; }; class Cast final : public WebGpuKernel { @@ -37,5 +40,9 @@ class Cast final : public WebGpuKernel { int32_t to_; }; +// Create Cast kernel info with appropriate type constraints based on graph capture support +template +KernelCreateInfo CreateCastKernelInfo(bool enable_graph_capture); + } // namespace webgpu } // namespace onnxruntime diff --git a/onnxruntime/core/providers/webgpu/tensor/gather_nd.cc b/onnxruntime/core/providers/webgpu/tensor/gather_nd.cc index 7c3aced3f0295..cab1dc03848b9 100644 --- a/onnxruntime/core/providers/webgpu/tensor/gather_nd.cc +++ b/onnxruntime/core/providers/webgpu/tensor/gather_nd.cc @@ -43,7 +43,7 @@ Status GatherNDProgram::GenerateShaderCode(ShaderHelper& shader) const { data_dim += indices_innerest_dim_; for (uint32_t i = 0; i < static_cast(data.Rank() - data_dim); i++) { - shader.MainFunctionBody() << " " << data.IndicesSet("data_indices", data_dim, output.IndicesGet("output_indices", indices.Rank() - 1 + i)) << "\n"; + shader.MainFunctionBody() << " " << data.IndicesSet("data_indices", data_dim + i, output.IndicesGet("output_indices", indices.Rank() - 1 + i)) << "\n"; } shader.MainFunctionBody() << " " << output.SetByOffset("global_idx", data.GetByIndices("data_indices")); diff --git a/onnxruntime/core/providers/webgpu/webgpu_context.cc b/onnxruntime/core/providers/webgpu/webgpu_context.cc index 985fcd03f33ac..f48b78c9adb91 100644 --- a/onnxruntime/core/providers/webgpu/webgpu_context.cc +++ b/onnxruntime/core/providers/webgpu/webgpu_context.cc @@ -178,7 +178,7 @@ Status WebGpuContext::Wait(wgpu::Future f) { return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "Failed to wait for the operation:", uint32_t(status)); } -Status WebGpuContext::Run(ComputeContext& context, const ProgramBase& program) { +Status WebGpuContext::Run(ComputeContext& context, ProgramBase& program) { const auto& inputs = program.Inputs(); const auto& outputs = program.Outputs(); @@ -263,6 +263,7 @@ Status WebGpuContext::Run(ComputeContext& context, const ProgramBase& program) { ORT_ENFORCE(x == 0 && y == 0 && z == 0, "Only one of SetIndirectDispatchTensor and SetDispatchGroupSize should be called for program", program.Name()); } + ORT_RETURN_IF_ERROR(program_mgr_->CalculateSegmentsForInputsAndOutputs(program)); bool is_1d_dispatch = (y == 1 && z == 1); @@ -437,19 +438,26 @@ Status WebGpuContext::Run(ComputeContext& context, const ProgramBase& program) { WriteTimestamp(num_pending_dispatches_ * 2); + const size_t total_buffer_count = inputs.size() + outputs.size() + (uniform_buffer ? 1 : 0); + std::vector bind_buffers; - bind_buffers.reserve(inputs.size() + outputs.size() + (uniform_buffer ? 
1 : 0)); + std::vector bind_buffers_segments; + bind_buffers.reserve(total_buffer_count); + bind_buffers_segments.reserve(total_buffer_count); for (const auto& input : inputs) { bind_buffers.push_back(reinterpret_cast(const_cast(input.tensor->DataRaw()))); + bind_buffers_segments.push_back(input.segments); } for (const auto& output : outputs) { bind_buffers.push_back(reinterpret_cast(output.tensor->MutableDataRaw())); + bind_buffers_segments.push_back(output.segments); } if (uniform_buffer) { bind_buffers.push_back(uniform_buffer); + bind_buffers_segments.push_back(1); // uniform buffer defaults to 1 segment } - LaunchComputePipeline(compute_pass_encoder, bind_buffers, *program_artifact, x, y, z, program.IndirectDispatchTensor()); + LaunchComputePipeline(compute_pass_encoder, bind_buffers, bind_buffers_segments, *program_artifact, x, y, z, program.IndirectDispatchTensor()); if (uniform_buffer) { buffer_mgr.Release(uniform_buffer); } @@ -535,7 +543,15 @@ wgpu::Limits WebGpuContext::GetRequiredLimits(const wgpu::Adapter& adapter) cons required_limits.maxBindGroups = adapter_limits.maxBindGroups; required_limits.maxComputeWorkgroupStorageSize = adapter_limits.maxComputeWorkgroupStorageSize; required_limits.maxComputeWorkgroupsPerDimension = adapter_limits.maxComputeWorkgroupsPerDimension; - required_limits.maxStorageBufferBindingSize = adapter_limits.maxStorageBufferBindingSize; + required_limits.maxStorageBuffersPerShaderStage = adapter_limits.maxStorageBuffersPerShaderStage; + + if (small_storage_buffer_binding_size_for_testing_) { + // No matter how small it is set, the minimum storage buffer binding size in WebGPU is 128 MB. + required_limits.maxStorageBufferBindingSize = 134217728; + } else { + required_limits.maxStorageBufferBindingSize = adapter_limits.maxStorageBufferBindingSize; + } + required_limits.maxBufferSize = adapter_limits.maxBufferSize; required_limits.maxComputeInvocationsPerWorkgroup = adapter_limits.maxComputeInvocationsPerWorkgroup; required_limits.maxComputeWorkgroupSizeX = adapter_limits.maxComputeWorkgroupSizeX; @@ -728,15 +744,37 @@ void WebGpuContext::OnRunEnd() { void WebGpuContext::LaunchComputePipeline(const wgpu::ComputePassEncoder& compute_pass_encoder, const std::vector& bind_buffers, + const std::vector& bind_buffers_segments, const ProgramArtifact& program_artifact, uint32_t x, uint32_t y, uint32_t z, const Tensor* indirect_dispatch_tensor) { uint32_t entry_index = 0; std::vector bind_group_entries; - for (WGPUBuffer buffer : bind_buffers) { - bind_group_entries.push_back({nullptr, entry_index++, buffer, 0, WGPU_WHOLE_SIZE, nullptr, nullptr}); + + for (size_t buffer_idx = 0; buffer_idx < bind_buffers.size(); ++buffer_idx) { + WGPUBuffer buffer = bind_buffers[buffer_idx]; + uint64_t buffer_size = wgpuBufferGetSize(buffer); + const uint64_t kMaxBufferSize = device_limits_.maxStorageBufferBindingSize; + const uint32_t total_segments = bind_buffers_segments[buffer_idx]; + // `total_segments` we used is calculated by tensor size, not actual buffer size. Because for bucketed buffer, + // the actual buffer size may be larger than the tensor size, an extreme case is that tensor size = 127MB, buffer size = 256MB, + // maxStorageBufferBindingSize = 128MB, in this case we only need to bind 1 segment instead of 2 segments because + // there is no data for the second segment. 
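+    // Example (illustrative numbers): with maxStorageBufferBindingSize = 128 MB, a 300 MB tensor backed by a
+    // 300 MB buffer is bound as 3 segments of 128 MB, 128 MB and 44 MB, while a 127 MB tensor backed by a
+    // 256 MB bucketed buffer still gets a single binding because total_segments is derived from the tensor size.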
+ if (total_segments > 1) { + uint64_t offset = 0; + for (uint32_t segment = 0; segment < total_segments; ++segment) { + uint64_t segment_size = std::min(kMaxBufferSize, buffer_size - offset); + bind_group_entries.push_back({nullptr, entry_index++, buffer, offset, segment_size, nullptr, nullptr}); + offset += segment_size; + } + } else { + bind_group_entries.push_back({nullptr, entry_index++, buffer, 0, std::min(kMaxBufferSize, buffer_size), nullptr, nullptr}); + } } + ORT_ENFORCE(entry_index < device_limits_.maxBindingsPerBindGroup, "Number of bind group entries (", entry_index, + ") exceeds device limit (", device_limits_.maxBindingsPerBindGroup, ")."); + WGPUBindGroupLayout bind_group_layout = program_artifact.compute_pipeline.GetBindGroupLayout(0).MoveToCHandle(); WGPUBindGroupDescriptor bind_group_desc{}; bind_group_desc.layout = bind_group_layout; @@ -912,7 +950,7 @@ WebGpuContext& WebGpuContextFactory::CreateContext(const WebGpuContextConfig& co auto it = contexts_.find(context_id); if (it == contexts_.end()) { GSL_SUPPRESS(r.11) - auto context = std::unique_ptr(new WebGpuContext(instance, device, config.validation_mode, config.preserve_device)); + auto context = std::unique_ptr(new WebGpuContext(instance, device, config.validation_mode, config.preserve_device, config.small_storage_buffer_binding_size_for_testing)); it = contexts_.emplace(context_id, WebGpuContextFactory::WebGpuContextInfo{std::move(context), 0}).first; } else if (context_id != 0) { ORT_ENFORCE(it->second.context->instance_.Get() == instance && diff --git a/onnxruntime/core/providers/webgpu/webgpu_context.h b/onnxruntime/core/providers/webgpu/webgpu_context.h index 0c0d116cf9394..e21a0e577311f 100644 --- a/onnxruntime/core/providers/webgpu/webgpu_context.h +++ b/onnxruntime/core/providers/webgpu/webgpu_context.h @@ -40,6 +40,7 @@ struct WebGpuContextConfig { const void* dawn_proc_table; ValidationMode validation_mode; bool preserve_device; + bool small_storage_buffer_binding_size_for_testing; }; struct WebGpuBufferCacheConfig { @@ -166,7 +167,7 @@ class WebGpuContext final { // Status PopErrorScope(); - Status Run(ComputeContext& context, const ProgramBase& program); + Status Run(ComputeContext& context, ProgramBase& program); void OnRunEnd(); private: @@ -176,12 +177,13 @@ class WebGpuContext final { AtPasses }; - WebGpuContext(WGPUInstance instance, WGPUDevice device, webgpu::ValidationMode validation_mode, bool preserve_device) - : instance_{instance}, device_{device}, validation_mode_{validation_mode}, query_type_{TimestampQueryType::None}, preserve_device_{preserve_device} {} + WebGpuContext(WGPUInstance instance, WGPUDevice device, webgpu::ValidationMode validation_mode, bool preserve_device, bool small_storage_buffer_binding_size_for_testing = false) + : instance_{instance}, device_{device}, validation_mode_{validation_mode}, query_type_{TimestampQueryType::None}, preserve_device_{preserve_device}, small_storage_buffer_binding_size_for_testing_{small_storage_buffer_binding_size_for_testing} {} ORT_DISALLOW_COPY_ASSIGNMENT_AND_MOVE(WebGpuContext); void LaunchComputePipeline(const wgpu::ComputePassEncoder& compute_pass_encoder, const std::vector& bind_buffers, + const std::vector& bind_buffers_segments, const ProgramArtifact& program_artifact, uint32_t x, uint32_t y, uint32_t z, const Tensor* indirect_dispatch_tensor = nullptr); @@ -264,6 +266,7 @@ class WebGpuContext final { uint64_t gpu_timestamp_offset_ = 0; bool is_profiling_ = false; bool preserve_device_; + bool 
small_storage_buffer_binding_size_for_testing_; GraphCaptureState graph_capture_state_{GraphCaptureState::Default}; // External vector to store captured commands, owned by EP diff --git a/onnxruntime/core/providers/webgpu/webgpu_execution_provider.cc b/onnxruntime/core/providers/webgpu/webgpu_execution_provider.cc index bbb3fbdd221d3..0f7607ac1dbfe 100644 --- a/onnxruntime/core/providers/webgpu/webgpu_execution_provider.cc +++ b/onnxruntime/core/providers/webgpu/webgpu_execution_provider.cc @@ -28,6 +28,7 @@ #include "core/providers/webgpu/data_transfer.h" #include "core/providers/webgpu/external_data_loader.h" #include "core/providers/webgpu/webgpu_profiler.h" +#include "core/providers/webgpu/tensor/cast.h" namespace onnxruntime { @@ -417,7 +418,7 @@ class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxD class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 16, 17, ScatterND); class ONNX_OPERATOR_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 18, ScatterND); -std::unique_ptr RegisterKernels() { +std::unique_ptr RegisterKernels(bool enable_graph_capture = false) { auto kernel_registry = std::make_unique(); static const BuildKernelCreateInfoFn function_table[] = { @@ -464,13 +465,6 @@ std::unique_ptr RegisterKernels() { KERNEL_CREATE_INFO(13, Tanh), KERNEL_CREATE_INFO(1, Not), - KERNEL_CREATE_INFO_VERSIONED(6, 8, Cast), - KERNEL_CREATE_INFO_VERSIONED(9, 12, Cast), - KERNEL_CREATE_INFO_VERSIONED(13, 18, Cast), - KERNEL_CREATE_INFO_VERSIONED(19, 20, Cast), - KERNEL_CREATE_INFO_VERSIONED(21, 22, Cast), - KERNEL_CREATE_INFO(23, Cast), - // // activations BuildKernelCreateInfo, BuildKernelCreateInfo, @@ -771,6 +765,14 @@ std::unique_ptr RegisterKernels() { } } + // Register Cast kernels with conditional int64 support based on graph capture + ORT_THROW_IF_ERROR(kernel_registry->Register(CreateCastKernelInfo<6, 8>(enable_graph_capture))); + ORT_THROW_IF_ERROR(kernel_registry->Register(CreateCastKernelInfo<9, 12>(enable_graph_capture))); + ORT_THROW_IF_ERROR(kernel_registry->Register(CreateCastKernelInfo<13, 18>(enable_graph_capture))); + ORT_THROW_IF_ERROR(kernel_registry->Register(CreateCastKernelInfo<19, 20>(enable_graph_capture))); + ORT_THROW_IF_ERROR(kernel_registry->Register(CreateCastKernelInfo<21, 22>(enable_graph_capture))); + ORT_THROW_IF_ERROR(kernel_registry->Register(CreateCastKernelInfo<23>(enable_graph_capture))); + #ifndef DISABLE_CONTRIB_OPS Status status = ::onnxruntime::contrib::webgpu::RegisterWebGpuContribKernels(*kernel_registry); ORT_ENFORCE(status.IsOK(), "Failed to register WebGPU contrib kernels: " + status.ErrorMessage()); @@ -869,9 +871,13 @@ std::vector> WebGpuExecutionProvider::GetCapa } std::shared_ptr WebGpuExecutionProvider::GetKernelRegistry() const { - static std::shared_ptr registry = webgpu::RegisterKernels(); - - return registry; + if (enable_graph_capture_) { + static std::shared_ptr registry = webgpu::RegisterKernels(true); + return registry; + } else { + static std::shared_ptr registry = webgpu::RegisterKernels(false); + return registry; + } } std::unique_ptr WebGpuExecutionProvider::GetDataTransfer() const { diff --git a/onnxruntime/core/providers/webgpu/webgpu_provider_factory.cc b/onnxruntime/core/providers/webgpu/webgpu_provider_factory.cc index 80b3988215c6b..60934bef574fa 100644 --- a/onnxruntime/core/providers/webgpu/webgpu_provider_factory.cc +++ b/onnxruntime/core/providers/webgpu/webgpu_provider_factory.cc @@ -155,6 +155,19 @@ std::shared_ptr WebGpuProviderFactoryCreator::Create( 
} } + std::string small_storage_buffer_binding_size_for_testing_str; + bool small_storage_buffer_binding_size_for_testing = false; + if (config_options.TryGetConfigEntry(kSmallStorageBufferBindingSizeForTesting, small_storage_buffer_binding_size_for_testing_str)) { + if (small_storage_buffer_binding_size_for_testing_str == "1" || small_storage_buffer_binding_size_for_testing_str == "true") { + small_storage_buffer_binding_size_for_testing = true; + } else if (small_storage_buffer_binding_size_for_testing_str == "0" || small_storage_buffer_binding_size_for_testing_str == "false") { + small_storage_buffer_binding_size_for_testing = false; + } else { + ORT_THROW("Invalid small storage buffer binding size for testing: ", small_storage_buffer_binding_size_for_testing_str); + } + } + LOGS_DEFAULT(VERBOSE) << "WebGPU EP small storage buffer binding size for testing: " << small_storage_buffer_binding_size_for_testing; + webgpu::WebGpuContextConfig context_config{ context_id, reinterpret_cast(webgpu_instance), @@ -162,6 +175,7 @@ std::shared_ptr WebGpuProviderFactoryCreator::Create( reinterpret_cast(dawn_proc_table), validation_mode, preserve_device, + small_storage_buffer_binding_size_for_testing, }; LOGS_DEFAULT(VERBOSE) << "WebGPU EP Device ID: " << context_id; diff --git a/onnxruntime/core/providers/webgpu/webgpu_provider_options.h b/onnxruntime/core/providers/webgpu/webgpu_provider_options.h index a3b6cca4ceaf0..761ff0d85fc98 100644 --- a/onnxruntime/core/providers/webgpu/webgpu_provider_options.h +++ b/onnxruntime/core/providers/webgpu/webgpu_provider_options.h @@ -32,6 +32,8 @@ constexpr const char* kEnablePIXCapture = "ep.webgpuexecutionprovider.enablePIXC constexpr const char* kPreserveDevice = "ep.webgpuexecutionprovider.preserveDevice"; +constexpr const char* kSmallStorageBufferBindingSizeForTesting = "ep.webgpuexecutionprovider.smallStorageBufferBindingSizeForTesting"; + // The following are the possible values for the provider options. 
constexpr const char* kDawnBackendType_D3D12 = "D3D12"; diff --git a/onnxruntime/core/providers/webgpu/wgsl_templates/wgsl_gen.cc b/onnxruntime/core/providers/webgpu/wgsl_templates/wgsl_gen.cc index c239605733df8..7208cbd5a7d8a 100644 --- a/onnxruntime/core/providers/webgpu/wgsl_templates/wgsl_gen.cc +++ b/onnxruntime/core/providers/webgpu/wgsl_templates/wgsl_gen.cc @@ -258,6 +258,15 @@ duk_ret_t ShaderVariable_SetByOffset(duk_context* ctx) { return 1; } +/** @brief JavaScript binding for ShaderVariableHelper::GetByOffset */ +duk_ret_t ShaderVariable_GetByOffset(duk_context* ctx) { + const char* offset_expr = duk_require_string(ctx, 0); + const ShaderVariableHelper* helper = GetHelperFromFunction(ctx); + std::string result = helper->GetByOffset(offset_expr); + duk_push_string(ctx, result.c_str()); + return 1; +} + /** @brief JavaScript binding for ShaderVariableHelper::Rank */ duk_ret_t ShaderVariable_Rank(duk_context* ctx) { const ShaderVariableHelper* helper = GetHelperFromFunction(ctx); @@ -363,6 +372,7 @@ Status ApplyTemplateDynamic(ShaderHelper& shader_helper, CreateShaderVariableMethod(ctx, "OffsetToIndices", ShaderVariable_OffsetToIndices, 1, var_helper); CreateShaderVariableMethod(ctx, "SetByOffset", ShaderVariable_SetByOffset, 2, var_helper); + CreateShaderVariableMethod(ctx, "GetByOffset", ShaderVariable_GetByOffset, 1, var_helper); CreateShaderVariableMethod(ctx, "Rank", ShaderVariable_Rank, 0, var_helper); duk_put_prop_string(ctx, -2, arg.name.c_str()); } diff --git a/onnxruntime/core/providers/webnn/builders/helper.h b/onnxruntime/core/providers/webnn/builders/helper.h index baedb98a34c28..fbabc23504636 100644 --- a/onnxruntime/core/providers/webnn/builders/helper.h +++ b/onnxruntime/core/providers/webnn/builders/helper.h @@ -38,7 +38,7 @@ WebnnDeviceType DeviceTypeFromString(const std::string_view& device_type); // Collects all the initializer tensors in the subGraph and its ancestor graphs. InitializedTensorSet CollectAllInitializedTensors(const GraphViewer& graph_viewer); -inline std::vector HandleNegativeAxes(const std::vector& axes, size_t input_size) { +inline std::vector HandleNegativeAxes(const gsl::span axes, size_t input_size) { std::vector new_axes(axes.size()); for (size_t i = 0; i < axes.size(); ++i) { new_axes[i] = HandleNegativeAxis(axes[i], input_size); diff --git a/onnxruntime/core/providers/webnn/builders/impl/gemm_op_builder.cc b/onnxruntime/core/providers/webnn/builders/impl/gemm_op_builder.cc index 0ea927967d989..5a80f01c17236 100644 --- a/onnxruntime/core/providers/webnn/builders/impl/gemm_op_builder.cc +++ b/onnxruntime/core/providers/webnn/builders/impl/gemm_op_builder.cc @@ -250,29 +250,6 @@ bool GemmOpBuilder::IsOpSupportedImpl(const GraphViewer&, std::vector c_shape; if (!GetShape(*input_defs[c_idx], c_shape, logger)) return false; - - size_t c_dim = c_shape.size(); - - if (c_dim > 1) { - // TODO: Supports other shape of C. - // Currently WebNN implementation in Chromium only supports 1-D C. - return false; - } - if (c_dim == 0) { - LOGS(logger, VERBOSE) << "C of Gemm is a scalar"; - } else { - auto c_size = c_shape[c_dim - 1]; - NodeAttrHelper helper(node); - const auto transB = helper.Get("transB", 0); - if (c_size != (transB == 0 ? b_shape[1] : b_shape[0])) { - LOGS(logger, VERBOSE) << "C of Gemm must be a vector of b_shape[" - << (transB == 0 ? 
"1" : "0") << "]" - << " b_shape: [" << b_shape[0] << ", " << b_shape[1] << "]" - << " c_size: " << c_size; - - return false; - } - } } } diff --git a/onnxruntime/core/providers/webnn/builders/impl/reduction_op_builder.cc b/onnxruntime/core/providers/webnn/builders/impl/reduction_op_builder.cc index 6ea9b0a440d93..d07e636d578b1 100644 --- a/onnxruntime/core/providers/webnn/builders/impl/reduction_op_builder.cc +++ b/onnxruntime/core/providers/webnn/builders/impl/reduction_op_builder.cc @@ -19,6 +19,8 @@ namespace webnn { class ReductionOpBuilder : public BaseOpBuilder { // Add operator related. public: + // Allow axes potentially being empty inputs that are ignored during processing. + ReductionOpBuilder() : BaseOpBuilder(/*allow empty inputs*/ true) {} void AddInitializersToSkip(ModelBuilder& model_builder, const Node& node) const override; // Add operator related. @@ -37,6 +39,7 @@ void ReductionOpBuilder::AddInitializersToSkip(ModelBuilder& model_builder, cons const auto& input_defs = node.InputDefs(); if (input_defs.size() > 1) { model_builder.AddInitializerToSkip(input_defs[1]->Name()); // axes + model_builder.AddInputToSkip(input_defs[1]->Name()); // axes } } @@ -53,71 +56,50 @@ Status ReductionOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, NodeAttrHelper helper(node); const auto keep_dims = helper.Get("keepdims", 1); + emscripten::val options = emscripten::val::object(); options.set("label", node.Name()); options.set("keepDimensions", keep_dims == 1); - std::vector axes_data; - - emscripten::val output = emscripten::val::object(); + std::vector axes_data; const auto opset = node.SinceVersion(); const auto& op_type = node.OpType(); if (opset >= 18 || (op_type == "ReduceSum" && opset >= 13)) { // 'axes' is an optional input. - const auto noop_with_empty_axes = helper.Get("noop_with_empty_axes", 0); - if (!GetTensorName(input_defs, 1).empty()) { - // Optional input axes is provided, use axes initializer data. - const auto& initializers(model_builder.GetInitializerTensors()); - const auto& axes_tensor = *initializers.at(input_defs[1]->Name()); - Initializer axes_initializer(axes_tensor); - const auto axes_data_span = axes_initializer.DataAsSpan(); - std::transform( - axes_data_span.begin(), axes_data_span.end(), std::back_inserter(axes_data), - [input_rank](int64_t axis) -> int32_t { return SafeInt(HandleNegativeAxis(axis, input_rank)); }); - } else { - if (noop_with_empty_axes) { - // When axes is empty and this attribute is set to true, input tensor will not be reduced. - output = input; - model_builder.AddOperand(node.OutputDefs()[0]->Name(), std::move(output)); - return Status::OK(); + std::vector axes_shape; + if (TensorExists(input_defs, 1)) { + ORT_RETURN_IF_NOT(GetShape(*input_defs[1], axes_shape, logger), "Cannot get shape of input axes"); + if (axes_shape[0] != 0) { + // Optional input axes is provided and we already ensure it is an initializer. + // Use that initializer data. 
+ const auto& initializers(model_builder.GetInitializerTensors()); + const auto& axes_tensor = *initializers.at(input_defs[1]->Name()); + Initializer axes_initializer(axes_tensor); + const auto axes_data_span = axes_initializer.DataAsSpan(); + axes_data = HandleNegativeAxes(axes_data_span, input_rank); } } } else { if (helper.HasAttr("axes")) { - auto axes = helper.Get("axes", std::vector{}); - std::transform( - axes.begin(), axes.end(), std::back_inserter(axes_data), - [input_rank](int64_t axis) -> int32_t { return SafeInt(HandleNegativeAxis(axis, input_rank)); }); + axes_data = GetResolvedAxes(helper, input_rank); } } - if (axes_data.size() > 0) { - options.set("axes", emscripten::val::array(axes_data)); - } - if (op_type == "ReduceL1") { - output = model_builder.GetBuilder().call("reduceL1", input, options); - } else if (op_type == "ReduceL2") { - output = model_builder.GetBuilder().call("reduceL2", input, options); - } else if (op_type == "ReduceLogSum") { - output = model_builder.GetBuilder().call("reduceLogSum", input, options); - } else if (op_type == "ReduceLogSumExp") { - output = model_builder.GetBuilder().call("reduceLogSumExp", input, options); - } else if (op_type == "ReduceMax") { - output = model_builder.GetBuilder().call("reduceMax", input, options); - } else if (op_type == "ReduceMean") { - output = model_builder.GetBuilder().call("reduceMean", input, options); - } else if (op_type == "ReduceMin") { - output = model_builder.GetBuilder().call("reduceMin", input, options); - } else if (op_type == "ReduceProd") { - output = model_builder.GetBuilder().call("reduceProduct", input, options); - } else if (op_type == "ReduceSum") { - output = model_builder.GetBuilder().call("reduceSum", input, options); - } else if (op_type == "ReduceSumSquare") { - output = model_builder.GetBuilder().call("reduceSumSquare", input, options); - } else { - return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "ReductionOpBuilder, unknown op: ", op_type); + // When axes is not provided or is empty, check the 'noop_with_empty_axes' attribute: + // - If it is false, perform reduction over all dimensions. + // (In WebNN, this means the 'axes' option is not set.) + // - If it is true, no reduction is applied, but other operations are still performed. + // (In WebNN, this requires setting 'axes' to an empty array.) + if (!axes_data.empty() || helper.Get("noop_with_empty_axes", 0) == 1) { + options.set("axes", emscripten::val::array(GetNarrowedIntFromInt64(axes_data))); } + const std::string_view webnn_op_type = GetWebNNOpType(op_type); + ORT_RETURN_IF(webnn_op_type.empty(), "Cannot get WebNN op type"); + + emscripten::val output = model_builder.GetBuilder().call( + std::string(webnn_op_type).c_str(), input, options); + model_builder.AddOperand(node.OutputDefs()[0]->Name(), std::move(output)); return Status::OK(); } @@ -128,11 +110,25 @@ bool ReductionOpBuilder::IsOpSupportedImpl(const GraphViewer& graph_viewer, const WebnnDeviceType /* device_type */, const logging::Logger& logger) const { const auto& input_defs = node.InputDefs(); - const std::string axes_name = GetTensorName(input_defs, 1); - // If the optional input 'axes' is provided, it must be an initializer. 
- if (!axes_name.empty() && !graph_viewer.GetConstantInitializer(axes_name)) { - LOGS(logger, VERBOSE) << "Input axes of " << node.OpType() << " must be a constant"; - return false; + + if (TensorExists(input_defs, 1)) { + std::vector axes_shape; + if (!GetShape(*input_defs[1], axes_shape, logger)) { + LOGS(logger, VERBOSE) << "Cannot get shape of input axes"; + return false; + } + + if (axes_shape.size() != 1) { + LOGS(logger, VERBOSE) << "Input axes of " << node.OpType() << " must be 1D"; + return false; + } + + const std::string axes_name = GetTensorName(input_defs, 1); + // If the optional input 'axes' is provided and not empty, it must be an initializer. + if (axes_shape[0] != 0 && !graph_viewer.GetConstantInitializer(axes_name)) { + LOGS(logger, VERBOSE) << "Input axes of " << node.OpType() << " must be a constant"; + return false; + } } return true; diff --git a/onnxruntime/core/session/custom_ops.cc b/onnxruntime/core/session/custom_ops.cc index 00f5017a55db0..9bc6c8d0a96a1 100644 --- a/onnxruntime/core/session/custom_ops.cc +++ b/onnxruntime/core/session/custom_ops.cc @@ -71,16 +71,22 @@ struct OrtShapeInferContext { auto num_inputs = ctx_.getNumInputs(); for (size_t ith_input = 0; ith_input < num_inputs; ++ith_input) { const auto* input_type = ctx_.getInputType(ith_input); - const auto& value_case = input_type->value_case(); - ORT_ENFORCE(value_case == ONNX_NAMESPACE::TypeProto::kTensorType, - "shape inference not yet supported for non-tensor types"); - const auto& shape_proto = input_type->tensor_type().shape(); - const auto& type_proto = input_type->tensor_type(); - auto elem_type = ::onnxruntime::utils::CApiElementTypeFromProtoType(type_proto.elem_type()); - auto tensor_shape = ::onnxruntime::utils::GetTensorShapeFromTensorShapeProto(shape_proto); - auto symbolic_dims = GetSymbolicDims(shape_proto); - input_type_shapes_.emplace_back( - OrtTensorTypeAndShapeInfo::GetTensorShapeAndTypeHelper(elem_type, &tensor_shape, &symbolic_dims)); + if (input_type != nullptr) { + const auto& value_case = input_type->value_case(); + ORT_ENFORCE(value_case == ONNX_NAMESPACE::TypeProto::kTensorType, + "shape inference not yet supported for non-tensor types"); + const auto& shape_proto = input_type->tensor_type().shape(); + const auto& type_proto = input_type->tensor_type(); + auto elem_type = ::onnxruntime::utils::CApiElementTypeFromProtoType(type_proto.elem_type()); + auto tensor_shape = ::onnxruntime::utils::GetTensorShapeFromTensorShapeProto(shape_proto); + auto symbolic_dims = GetSymbolicDims(shape_proto); + input_type_shapes_.emplace_back( + OrtTensorTypeAndShapeInfo::GetTensorShapeAndTypeHelper(elem_type, &tensor_shape, &symbolic_dims)); + } else { + input_type_shapes_.emplace_back( + OrtTensorTypeAndShapeInfo::GetTensorShapeAndTypeHelper( + ONNX_TENSOR_ELEMENT_DATA_TYPE_UNDEFINED, nullptr, nullptr)); + } } } diff --git a/onnxruntime/core/session/onnxruntime_c_api.cc b/onnxruntime/core/session/onnxruntime_c_api.cc index 9b258d0983570..7603397ea9cad 100644 --- a/onnxruntime/core/session/onnxruntime_c_api.cc +++ b/onnxruntime/core/session/onnxruntime_c_api.cc @@ -3758,7 +3758,7 @@ Second example, if we wanted to add and remove some members, we'd do this: In GetApi we now make it return ort_api_3 for version 3. */ -static constexpr OrtApi ort_api_1_to_23 = { +static constexpr OrtApi ort_api_1_to_24 = { // NOTE: The ordering of these fields MUST not change after that version has shipped since existing binaries depend on this ordering. 
// Shipped as version 1 - DO NOT MODIFY (see above text for more information) @@ -4266,16 +4266,16 @@ static_assert(offsetof(OrtApi, SetEpDynamicOptions) / sizeof(void*) == 284, "Siz static_assert(offsetof(OrtApi, GetEpApi) / sizeof(void*) == 317, "Size of version 22 API cannot change"); // So that nobody forgets to finish an API version, this check will serve as a reminder: -static_assert(std::string_view(ORT_VERSION) == "1.23.0", +static_assert(std::string_view(ORT_VERSION) == "1.24.0", "ORT_Version change detected, please follow below steps to ensure OrtApi is updated properly"); // 1. Update the hardcoded version string in above static_assert to silence it -// 2. If there were any APIs added to ort_api_1_to_23 above: +// 2. If there were any APIs added to ort_api_1_to_24 above: // a. Add the 'End of version #' markers (pattern above should be obvious) // b. Add a static_assert in the directly above list of version sizes to ensure nobody adds any more functions to the just shipped API version ORT_API(const OrtApi*, OrtApis::GetApi, uint32_t version) { if (version >= 1 && version <= ORT_API_VERSION) - return &ort_api_1_to_23; + return &ort_api_1_to_24; fprintf(stderr, "The requested API version [%u] is not available, only API versions [1, %u] are supported in this build." diff --git a/onnxruntime/python/onnxruntime_validation.py b/onnxruntime/python/onnxruntime_validation.py index 4a72916d3e485..6912d19897d67 100644 --- a/onnxruntime/python/onnxruntime_validation.py +++ b/onnxruntime/python/onnxruntime_validation.py @@ -23,9 +23,9 @@ def check_distro_info(): __my_distro__ = __my_system__ __my_distro_ver__ = platform.release().lower() - if __my_distro_ver__ not in ["10", "11"]: + if __my_distro_ver__ not in ["10", "11", "2016server", "2019server", "2022server", "2025server"]: warnings.warn( - f"Unsupported Windows version ({__my_distro_ver__}). ONNX Runtime supports Windows 10 and above, only." + f"Unsupported Windows version ({__my_distro_ver__}). ONNX Runtime supports Windows 10 and above, or Windows Server 2016 and above." 
) elif __my_system__ == "linux": """Although the 'platform' python module for getting Distro information works well on standard OS images diff --git a/onnxruntime/python/tools/quantization/calibrate.py b/onnxruntime/python/tools/quantization/calibrate.py index 85ac77be2af31..d1612af3d75b1 100644 --- a/onnxruntime/python/tools/quantization/calibrate.py +++ b/onnxruntime/python/tools/quantization/calibrate.py @@ -417,7 +417,14 @@ def collect_data(self, data_reader: CalibrationDataReader): inputs = data_reader.get_next() if not inputs: break - self.intermediate_outputs.append(self.infer_session.run(None, inputs)) + self.intermediate_outputs.append( + [ + value if sess_o.name not in self.model_original_outputs else None + for sess_o, value in zip( + self.infer_session.get_outputs(), self.infer_session.run(None, inputs), strict=False + ) + ] + ) if ( self.max_intermediate_outputs is not None and len(self.intermediate_outputs) == self.max_intermediate_outputs diff --git a/onnxruntime/python/tools/tensorrt/perf/benchmark.py b/onnxruntime/python/tools/tensorrt/perf/benchmark.py index d6b39a6b2aeb4..66ab0c44f8814 100644 --- a/onnxruntime/python/tools/tensorrt/perf/benchmark.py +++ b/onnxruntime/python/tools/tensorrt/perf/benchmark.py @@ -613,7 +613,7 @@ def validate(all_ref_outputs, all_outputs, rtol, atol, percent_mismatch): for ref_o, o in zip(ref_output, output, strict=False): # abs(desired-actual) < rtol * abs(desired) + atol try: - np.testing.assert_allclose(ref_o, o, rtol, atol) + np.testing.assert_allclose(o, ref_o, rtol, atol) except Exception as e: if percentage_in_allowed_threshold(e, percent_mismatch): continue diff --git a/onnxruntime/test/contrib_ops/cuda_kernels/fpA_intB_gemm_kernel_test.cc b/onnxruntime/test/contrib_ops/cuda_kernels/fpA_intB_gemm_kernel_test.cc index 3e339d86c7943..1652d16f5cb66 100644 --- a/onnxruntime/test/contrib_ops/cuda_kernels/fpA_intB_gemm_kernel_test.cc +++ b/onnxruntime/test/contrib_ops/cuda_kernels/fpA_intB_gemm_kernel_test.cc @@ -3,7 +3,7 @@ // Test can be run like the following: // ./onnxruntime_provider_test --gtest_filter=CUDA_EP_Unittest.* - +#if USE_FPA_INTB_GEMM #include #include #include @@ -620,3 +620,4 @@ TEST_F(Bf16Int4GroupwiseTest, BF16_Int4_Gemm_CudaKernel) { } } } +#endif diff --git a/onnxruntime/test/contrib_ops/gather_block_quantized_op_test.cc b/onnxruntime/test/contrib_ops/gather_block_quantized_op_test.cc index 574ec49da67ea..3bf37ea193245 100644 --- a/onnxruntime/test/contrib_ops/gather_block_quantized_op_test.cc +++ b/onnxruntime/test/contrib_ops/gather_block_quantized_op_test.cc @@ -82,7 +82,7 @@ void CheckDataAndShape(const std::vector& data, const std::vector& s ORT_ENFORCE(static_cast(data.size()) == total_elements, "Data size does not match the shape", "Data size: ", data.size(), ", Expected size: ", total_elements, - ", Shape: ", VectorToString(shape), " Name:", name, " Type:", typeid(T).name()); + ", Shape: ", VectorToString(shape), " Name:", name); } // Combinations: types, gather_axis, quantize_axis, block_size, indices, scale shape vs data shape diff --git a/onnxruntime/test/contrib_ops/matmul_4bits_test.cc b/onnxruntime/test/contrib_ops/matmul_4bits_test.cc index 3a9bd02ef8d72..cc0e3207e6795 100644 --- a/onnxruntime/test/contrib_ops/matmul_4bits_test.cc +++ b/onnxruntime/test/contrib_ops/matmul_4bits_test.cc @@ -25,6 +25,7 @@ #include "core/session/onnxruntime_cxx_api.h" #include "core/session/ort_env.h" #include "core/util/qmath.h" +#include "core/providers/webgpu/webgpu_provider_options.h" extern std::unique_ptr ort_env; 
@@ -545,7 +546,11 @@ void RunTest(int64_t M, int64_t N, int64_t K, int64_t block_size, bool has_zerop #ifdef USE_ROCM execution_providers.push_back(DefaultRocmExecutionProvider()); #endif - +#ifdef USE_WEBGPU + ConfigOptions config_options{}; + ORT_ENFORCE(config_options.AddConfigEntry(webgpu::options::kSmallStorageBufferBindingSizeForTesting, "1").IsOK()); + execution_providers.push_back(WebGpuExecutionProviderWithOptions(config_options)); +#endif RunTest(opts, std::move(execution_providers)); } } @@ -599,6 +604,23 @@ TEST(MatMulNBits, Float16_Large) { } } +#ifdef USE_WEBGPU +// Similar to Float16_Large but for float32 and crafted so that the input_b and output buffer size exceeds +// maxStorageBufferBindingSize (128MB) so it must be split into 2 segments internally (~128.00006MB). +// +// input_b size(4-bits): N * K / 2 = 8388612 * 32 / 2 = 134217792 bytes > 134217728 bytes (128MB) +// output size(float32): M * N * 4 = 4 * 8388612 * 4 = 134217792 bytes > 134217728 bytes (128MB) +TEST(MatMulNBits, Float32_Large) { + // Keep tolerance similar to Float16_Large (float path typically equal or better numerically). + constexpr float abs_error = 0.1f; + constexpr bool zp_is_4bit = true; + constexpr bool has_zeropoint = false; + constexpr auto block_size = 16; + + RunTest(4 /*M*/, 8388612 /*N*/, 32 /*K*/, block_size, has_zeropoint, zp_is_4bit, abs_error); +} +#endif + #ifdef USE_CUDA TEST(MatMulNBits, Fp16_Int4_Int4ZeroPoint) { constexpr float abs_error = 0.1f; diff --git a/onnxruntime/test/framework/shape_inference_test.cc b/onnxruntime/test/framework/shape_inference_test.cc index f5258760eb20d..2d5c3a43ee8ed 100644 --- a/onnxruntime/test/framework/shape_inference_test.cc +++ b/onnxruntime/test/framework/shape_inference_test.cc @@ -129,6 +129,9 @@ const ORTCHAR_T* const OPTIONAL_INPUT_CUSTOM_OP_MODEL_URI_2 = ORT_TSTR("testdata // that inference proceeds for all of the outputs when absent optional inputs are present TEST(ShapeInferenceCustomOpTest, custom_op_optional_input_inference_test) { MyCustomOpWithOptionalInput custom_op{onnxruntime::kCpuExecutionProvider}; + custom_op.InferOutputShapeFn = [](const OrtCustomOp* /*op*/, OrtShapeInferContext* /*ctx*/) -> OrtStatusPtr { + return nullptr; + }; const auto& env = GetEnvironment(); diff --git a/onnxruntime/test/ir/graph_test.cc b/onnxruntime/test/ir/graph_test.cc index 4fd9830440846..7371ad5cf0ded 100644 --- a/onnxruntime/test/ir/graph_test.cc +++ b/onnxruntime/test/ir/graph_test.cc @@ -2,13 +2,17 @@ // Licensed under the MIT License. 
#include +#include #include "core/common/inlined_containers.h" #include "core/common/span_utils.h" #include "core/framework/tensorprotoutils.h" #include "core/graph/graph_viewer.h" #include "core/graph/model.h" #include "core/graph/op.h" +#include "core/session/inference_session.h" +#include "core/session/environment.h" #include "test/providers/provider_test_utils.h" +#include "test/test_environment.h" #include "gtest/gtest.h" #include "gmock/gmock.h" #include "onnx/defs/function.h" @@ -2573,5 +2577,259 @@ TEST_F(GraphTest, GraphConstruction_MemoryEfficientTopologicalSort_SubgraphGener #endif +// Test for shape inference with in-memory external data (issue #26261) +// This tests the fix for a regression where Constant nodes with large tensors (>127 bytes) +// stored as in-memory external data would cause shape inference to fail +TEST_F(GraphTest, ShapeInferenceWithInMemoryExternalData) { + // Create a model with a Constant node that produces a tensor larger than kSmallTensorExternalDataThreshold (127 bytes) + // This will trigger the in-memory externalization path + ModelProto model_proto; + model_proto.set_ir_version(ONNX_NAMESPACE::Version::IR_VERSION); + auto* opset = model_proto.add_opset_import(); + opset->set_version(17); + + auto* graph_proto = model_proto.mutable_graph(); + graph_proto->set_name("test_graph"); + + // Create a Constant node with a tensor of 16 INT64 values (128 bytes, just over the 127 threshold) + auto* constant_node = graph_proto->add_node(); + constant_node->set_op_type("Constant"); + constant_node->set_name("const_node"); + constant_node->add_output("const_output"); + + // Add the value attribute with a tensor + auto* attr = constant_node->add_attribute(); + attr->set_name("value"); + attr->set_type(ONNX_NAMESPACE::AttributeProto_AttributeType_TENSOR); + auto* tensor = attr->mutable_t(); + tensor->set_data_type(ONNX_NAMESPACE::TensorProto_DataType_INT64); + tensor->add_dims(16); // 16 elements * 8 bytes = 128 bytes + // Each split will be size 1, totaling 16 + for (int64_t i = 0; i < 16; ++i) { + tensor->add_int64_data(1); + } + + // Create a Split node that uses the constant as input + // Split requires constant input for the 'split' parameter, which triggers shape inference + auto* split_node = graph_proto->add_node(); + split_node->set_op_type("Split"); + split_node->set_name("split_node"); + split_node->add_input("input_data"); + split_node->add_input("const_output"); // Use constant as split sizes + for (int i = 0; i < 16; ++i) { + split_node->add_output("split_output_" + std::to_string(i)); + } + + // Add axis attribute + auto* axis_attr = split_node->add_attribute(); + axis_attr->set_name("axis"); + axis_attr->set_type(ONNX_NAMESPACE::AttributeProto_AttributeType_INT); + axis_attr->set_i(0); + + // Add graph input + auto* input = graph_proto->add_input(); + input->set_name("input_data"); + auto* input_type = input->mutable_type()->mutable_tensor_type(); + input_type->set_elem_type(ONNX_NAMESPACE::TensorProto_DataType_FLOAT); + input_type->mutable_shape()->add_dim()->set_dim_value(16); + input_type->mutable_shape()->add_dim()->set_dim_value(10); + + // Add graph outputs + for (int i = 0; i < 16; ++i) { + auto* output = graph_proto->add_output(); + output->set_name("split_output_" + std::to_string(i)); + } + + // Load the model - this should succeed with the fix + // Before the fix, this would fail with: + // "Cannot parse data from external tensors. 
Please load external data into raw data for tensor" + std::shared_ptr model; + ASSERT_STATUS_OK(Model::Load(std::move(model_proto), model, nullptr, *logger_)); + + // Verify the graph was properly constructed + Graph& graph = model->MainGraph(); + ASSERT_STATUS_OK(graph.Resolve()); + + // Verify the constant node was converted to an initializer + const ONNX_NAMESPACE::TensorProto* initializer = nullptr; + ASSERT_TRUE(graph.GetInitializedTensor("const_output", initializer)); + ASSERT_NE(initializer, nullptr); + + // Verify the Split node can access the constant data during shape inference + const Node* split_node_ptr = nullptr; + for (const auto& node : graph.Nodes()) { + if (node.Name() == "split_node") { + split_node_ptr = &node; + break; + } + } + ASSERT_NE(split_node_ptr, nullptr); + + // Verify outputs are properly shaped + ASSERT_EQ(split_node_ptr->OutputDefs().size(), 16u); +} + +// Test for shape inference with in-memory external data using InferenceSession +// This test more accurately reproduces the issue by going through the full session initialization +// which includes graph optimizations that trigger the in-memory externalization +TEST_F(GraphTest, ShapeInferenceWithInMemoryExternalDataViaSession) { + // Create the same model as above + ModelProto model_proto; + model_proto.set_ir_version(ONNX_NAMESPACE::Version::IR_VERSION); + auto* opset = model_proto.add_opset_import(); + opset->set_version(17); + + auto* graph_proto = model_proto.mutable_graph(); + graph_proto->set_name("test_graph"); + + // Create a Constant node with a tensor of 16 INT64 values (128 bytes) + auto* constant_node = graph_proto->add_node(); + constant_node->set_op_type("Constant"); + constant_node->set_name("const_node"); + constant_node->add_output("const_output"); + + auto* attr = constant_node->add_attribute(); + attr->set_name("value"); + attr->set_type(ONNX_NAMESPACE::AttributeProto_AttributeType_TENSOR); + auto* tensor = attr->mutable_t(); + tensor->set_data_type(ONNX_NAMESPACE::TensorProto_DataType_INT64); + tensor->add_dims(16); + for (int64_t i = 0; i < 16; ++i) { + tensor->add_int64_data(1); + } + + // Create a Split node + auto* split_node = graph_proto->add_node(); + split_node->set_op_type("Split"); + split_node->set_name("split_node"); + split_node->add_input("input_data"); + split_node->add_input("const_output"); + for (int i = 0; i < 16; ++i) { + split_node->add_output("split_output_" + std::to_string(i)); + } + + auto* axis_attr = split_node->add_attribute(); + axis_attr->set_name("axis"); + axis_attr->set_type(ONNX_NAMESPACE::AttributeProto_AttributeType_INT); + axis_attr->set_i(0); + + // Add graph input + auto* input = graph_proto->add_input(); + input->set_name("input_data"); + auto* input_type = input->mutable_type()->mutable_tensor_type(); + input_type->set_elem_type(ONNX_NAMESPACE::TensorProto_DataType_FLOAT); + input_type->mutable_shape()->add_dim()->set_dim_value(16); + input_type->mutable_shape()->add_dim()->set_dim_value(10); + + // Add graph outputs + for (int i = 0; i < 16; ++i) { + auto* output = graph_proto->add_output(); + output->set_name("split_output_" + std::to_string(i)); + } + + // Save to a temporary file + const std::string model_path = "test_in_memory_external_data.onnx"; + { + std::ofstream file(model_path, std::ios::binary); + ASSERT_TRUE(file.is_open()); + ASSERT_TRUE(model_proto.SerializeToOstream(&file)); + } + + // Test with ORT_DISABLE_ALL optimization which should trigger the bug without the fix + SessionOptions so; + so.graph_optimization_level = 
TransformerLevel::Default; // This triggers the issue + so.session_logid = "GraphTest.ShapeInferenceWithInMemoryExternalDataViaSession"; + + InferenceSession session_object{so, GetEnvironment()}; + + // This should succeed with the fix, fail without it + ASSERT_STATUS_OK(session_object.Load(model_path)); + ASSERT_STATUS_OK(session_object.Initialize()); + + // Clean up + std::remove(model_path.c_str()); +} + +// Test that explicitly triggers the in-memory externalization and then shape inference +// This test directly reproduces the bug scenario +TEST_F(GraphTest, ShapeInferenceAfterInitializerExternalization) { + // Create a model with a Split node that depends on a constant initializer + ModelProto model_proto; + model_proto.set_ir_version(ONNX_NAMESPACE::Version::IR_VERSION); + auto* opset = model_proto.add_opset_import(); + opset->set_version(17); + + auto* graph_proto = model_proto.mutable_graph(); + graph_proto->set_name("test_graph"); + + // Create initializer directly (not as Constant node) with 128 bytes + auto* initializer = graph_proto->add_initializer(); + initializer->set_name("split_sizes"); + initializer->set_data_type(ONNX_NAMESPACE::TensorProto_DataType_INT64); + initializer->add_dims(16); // 16 * 8 = 128 bytes + for (int64_t i = 0; i < 16; ++i) { + initializer->add_int64_data(1); + } + + // Create a Split node that uses this initializer + auto* split_node = graph_proto->add_node(); + split_node->set_op_type("Split"); + split_node->set_name("split_node"); + split_node->add_input("input_data"); + split_node->add_input("split_sizes"); // Uses the large initializer + for (int i = 0; i < 16; ++i) { + split_node->add_output("split_output_" + std::to_string(i)); + } + + auto* axis_attr = split_node->add_attribute(); + axis_attr->set_name("axis"); + axis_attr->set_type(ONNX_NAMESPACE::AttributeProto_AttributeType_INT); + axis_attr->set_i(0); + + // Add graph input + auto* input = graph_proto->add_input(); + input->set_name("input_data"); + auto* input_type = input->mutable_type()->mutable_tensor_type(); + input_type->set_elem_type(ONNX_NAMESPACE::TensorProto_DataType_FLOAT); + input_type->mutable_shape()->add_dim()->set_dim_value(16); + input_type->mutable_shape()->add_dim()->set_dim_value(10); + + // Add graph outputs + for (int i = 0; i < 16; ++i) { + auto* output = graph_proto->add_output(); + output->set_name("split_output_" + std::to_string(i)); + } + + // Load model + std::shared_ptr model; + ASSERT_STATUS_OK(Model::Load(std::move(model_proto), model, nullptr, *logger_)); + + Graph& graph = model->MainGraph(); + // First resolve should succeed + ASSERT_STATUS_OK(graph.Resolve()); + + // Now trigger the in-memory externalization + // This converts initializers > 127 bytes to OrtValues with external data references + Status convert_status = graph.ConvertInitializersIntoOrtValues(); + ASSERT_TRUE(convert_status.IsOK()) << "ConvertInitializersIntoOrtValues failed: " << convert_status.ErrorMessage(); + + // Check if the initializer was actually externalized + const ONNX_NAMESPACE::TensorProto* initializer_after = nullptr; + ASSERT_TRUE(graph.GetInitializedTensor("split_sizes", initializer_after)); + ASSERT_NE(initializer_after, nullptr); + // Debug: verify it was externalized + ASSERT_TRUE(utils::HasExternalDataInMemory(*initializer_after)) + << "Initializer was not externalized to in-memory external data"; + + // Mark the graph as needing resolve to force shape inference to run again + graph.SetGraphResolveNeeded(); + + // Resolve again - this should trigger shape inference with 
the externalized initializer + // Without the fix, this will fail with "Cannot parse data from external tensors" + // With the fix, getInputData() materializes the external data for shape inference + Status second_resolve = graph.Resolve(); + ASSERT_TRUE(second_resolve.IsOK()) << "Second resolve failed: " << second_resolve.ErrorMessage(); +} + } // namespace test } // namespace onnxruntime diff --git a/onnxruntime/test/mlas/bench/bench_sconv.cpp b/onnxruntime/test/mlas/bench/bench_sconv.cpp index 39d135236b89c..dc37980002978 100644 --- a/onnxruntime/test/mlas/bench/bench_sconv.cpp +++ b/onnxruntime/test/mlas/bench/bench_sconv.cpp @@ -3,6 +3,7 @@ #include "mlas.h" #include "bench_util.h" +#include "core/util/thread_utils.h" #include #include @@ -138,6 +139,113 @@ void SCONV_NCHW(benchmark::State& state, const char* /*dummy*/) { } } +static MLAS_THREADPOOL* GetMlasThreadPoolForConvBenchmark(void) { + static auto threadpool = std::make_unique( + &onnxruntime::Env::Default(), onnxruntime::ThreadOptions(), nullptr, 4, true); + return threadpool.get(); +} + +void SCONV_NCHW_THREADED(benchmark::State& state, const char* /*dummy*/) { + MLAS_THREADPOOL* tp = GetMlasThreadPoolForConvBenchmark(); + + const int64_t rank = state.range(0); // Rank + const int64_t batch_size = state.range(1); // N + const int64_t groups = state.range(2); // G + const int64_t input_channels_per_group = state.range(3); // Cpg + const int64_t output_channels_per_group = state.range(4); // Fpg + + if (rank <= 0) throw std::invalid_argument("Kernel rank must greater than 0!"); + if (batch_size <= 0) throw std::invalid_argument("Batch size must greater than 0!"); + if (groups <= 0) throw std::invalid_argument("Group count must greater than 0!"); + if (input_channels_per_group <= 0) throw std::invalid_argument("input_channels_per_group must greater than 0!"); + if (output_channels_per_group <= 0) throw std::invalid_argument("output_channels_per_group must greater than 0!"); + + size_t arg_position = 5; + const auto input_shape = BenchArgsVector(state, arg_position, rank); + const auto kernel_shape = BenchArgsVector(state, arg_position, rank); + const auto paddings = BenchArgsVector(state, arg_position, rank * 2); + const auto strides = BenchArgsVector(state, arg_position, rank); + const auto dilations = BenchArgsVector(state, arg_position, rank); + + // do not check the size of each vector as they are forced from args. 
+ if (std::any_of(input_shape.begin(), input_shape.end(), [](const int64_t& dim) { return dim <= 0; })) { + throw std::invalid_argument("all input image dim must > 0"); + } + + if (std::any_of(kernel_shape.begin(), kernel_shape.end(), [](const int64_t& dim) { return dim <= 0; })) { + throw std::invalid_argument("all kernel dim must > 0"); + } + + if (std::any_of(strides.begin(), strides.end(), [](const int64_t& dim) { return dim <= 0; })) { + throw std::invalid_argument("all strides dim must > 0"); + } + + if (std::any_of(dilations.begin(), dilations.end(), [](const int64_t& dim) { return dim <= 0; })) { + throw std::invalid_argument("all dilations dim must > 0"); + } + + const int64_t GC = groups * input_channels_per_group; + const int64_t GF = groups * output_channels_per_group; + std::vector x_shape = {batch_size, GC}; + x_shape.insert(x_shape.end(), input_shape.begin(), input_shape.end()); + std::vector f_shape = {GF, input_channels_per_group}; + f_shape.insert(f_shape.end(), kernel_shape.begin(), kernel_shape.end()); + + std::vector output_shape((size_t)rank); + for (int64_t i = 0; i < rank; ++i) { + auto km = 1 + dilations[i] * (kernel_shape[i] - 1); + output_shape[i] = (paddings[i] + paddings[i + rank] + input_shape[i] - km) / strides[i] + 1; + } + std::vector y_shape = {batch_size, GF}; + y_shape.insert(y_shape.end(), output_shape.begin(), output_shape.end()); + + MLAS_ACTIVATION activation; + activation.ActivationKind = MlasIdentityActivation; + MLAS_CONV_PARAMETERS Parameters; + size_t WorkingBufferSize = 0; + MlasConvPrepare(&Parameters, + static_cast(rank), + static_cast(batch_size), + static_cast(groups), + static_cast(input_channels_per_group), + input_shape.data(), + kernel_shape.data(), + dilations.data(), + paddings.data(), + strides.data(), + output_shape.data(), + static_cast(output_channels_per_group), + &activation, + &WorkingBufferSize, + 0.0f, + tp); + + auto X = RandomVectorUniform(x_shape, -2.0, 2.0); + auto F = RandomVectorUniform(f_shape, -1.0, 1.0); + int64_t y_size = std::accumulate(y_shape.begin(), y_shape.end(), 1LL, std::multiplies()); + std::vector Y(static_cast(y_size)); + std::vector working_buffer(WorkingBufferSize); + + // warm up first round. 
+ MlasConv(&Parameters, + X.data(), + F.data(), + nullptr, + working_buffer.data(), + Y.data(), + tp); + + for (auto _ : state) { + MlasConv(&Parameters, + X.data(), + F.data(), + nullptr, + working_buffer.data(), + Y.data(), + tp); + } +} + static void ResNet50(benchmark::internal::Benchmark* b) { b->ArgNames(ArgNamesForConv(2)); @@ -221,6 +329,7 @@ static void TeamsModel(benchmark::internal::Benchmark* b) { } BENCHMARK_CAPTURE(SCONV_NCHW, TeamsModel, "")->Apply(TeamsModel)->UseRealTime(); +BENCHMARK_CAPTURE(SCONV_NCHW_THREADED, TeamsModel, "")->Apply(TeamsModel)->UseRealTime(); static void General_Conv2d(benchmark::internal::Benchmark* b) { b->ArgNames(ArgNamesForConv(2)); diff --git a/onnxruntime/test/mlas/unittest/test_dynamic_qgemm.cpp b/onnxruntime/test/mlas/unittest/test_dynamic_qgemm.cpp index a048ded8349b8..6d05e93f517ae 100644 --- a/onnxruntime/test/mlas/unittest/test_dynamic_qgemm.cpp +++ b/onnxruntime/test/mlas/unittest/test_dynamic_qgemm.cpp @@ -4,10 +4,12 @@ // SPDX-License-Identifier: MIT // -#include "test_util.h" // Currently this test only applies to KleidiAI Guard against it running in any other situation #if defined(USE_KLEIDIAI) && !defined(_MSC_VER) +#include "test_util.h" +#include "core/mlas/lib/mlasi.h" // for MLAS_CPUIDINFO + class MlasDynamicQgemmTest { private: MatrixGuardBuffer buffer_a; @@ -18,6 +20,11 @@ class MlasDynamicQgemmTest { public: void Test(size_t M, size_t N, size_t K, size_t BatchSize) { + // Currently, MlasDynamicQGemmBatch() and associated functions require SME or else they are no-ops. + if (!MLAS_CPUIDINFO::GetCPUIDInfo().HasArm_SME()) { + GTEST_SKIP() << "MlasDynamicQGemmBatch() requires ARM64 SME but it was not detected. Skipping test."; + } + // Setup buffers for holding various data float* A = buffer_a.GetBuffer(M * K * BatchSize); diff --git a/onnxruntime/test/onnx/TestCase.cc b/onnxruntime/test/onnx/TestCase.cc index 6df98ff505fa1..cbb25bb9b629e 100644 --- a/onnxruntime/test/onnx/TestCase.cc +++ b/onnxruntime/test/onnx/TestCase.cc @@ -1435,9 +1435,22 @@ std::unique_ptr> GetBrokenTests(const std::string& provider broken_tests->insert({"scatter_elements_with_negative_indices", "unknown version"}); // Fails since ONNX==1.19.0 broken_tests->insert({"l2normalization_axis_0", "unknown version"}); + broken_tests->insert({"attention_3d_gqa", "unknown version"}); + broken_tests->insert({"attention_3d_gqa_attn_mask", "unknown version"}); + broken_tests->insert({"attention_3d_gqa_causal", "unknown version"}); + broken_tests->insert({"attention_3d_gqa_scaled", "unknown version"}); + broken_tests->insert({"attention_3d_gqa_softcap", "unknown version"}); + broken_tests->insert({"attention_3d_gqa_with_past_and_present", "unknown version"}); + broken_tests->insert({"attention_4d_gqa", "unknown version"}); + broken_tests->insert({"attention_4d_gqa_attn_mask", "unknown version"}); + broken_tests->insert({"attention_4d_gqa_causal", "unknown version"}); + broken_tests->insert({"attention_4d_gqa_scaled", "unknown version"}); + broken_tests->insert({"attention_4d_gqa_softcap", "unknown version"}); + broken_tests->insert({"attention_4d_gqa_with_past_and_present", "unknown version"}); + broken_tests->insert({"attention_4d_gqa_with_past_and_present_fp16", "unknown version"}); + broken_tests->insert({"attention_4d_with_past_and_present_qk_matmul_bias_3d_mask_causal", "unknown version"}); + broken_tests->insert({"attention_4d_with_past_and_present_qk_matmul_bias_4d_mask_causal", "unknown version"}); broken_tests->insert({"attention_4d_diff_heads_mask4d_padded_kv", 
"need nonpad_kv_seqlen "}); - broken_tests->insert({"attention_4d_with_past_and_present_qk_matmul_bias_3d_mask_causal", "attention op implementation is wrong"}); - broken_tests->insert({"attention_4d_with_past_and_present_qk_matmul_bias_4d_mask_causal", "attention op implementation is wrong"}); } #ifdef DISABLE_CONTRIB_OPS diff --git a/onnxruntime/test/onnx/main.cc b/onnxruntime/test/onnx/main.cc index b6f2cb2683677..463634b370d4c 100644 --- a/onnxruntime/test/onnx/main.cc +++ b/onnxruntime/test/onnx/main.cc @@ -795,24 +795,6 @@ select from 'TF8', 'TF16', 'UINT8', 'FLOAT', 'ITENSOR'. \n)"); // Please make no more changes to the list static const ORTCHAR_T* immutable_broken_tests[] = { - // pending ONNX update - ORT_TSTR("attention_3d_gqa"), - ORT_TSTR("attention_3d_gqa_attn_mask"), - ORT_TSTR("attention_3d_gqa_causal"), - ORT_TSTR("attention_3d_gqa_scaled"), - ORT_TSTR("attention_3d_gqa_softcap"), - ORT_TSTR("attention_3d_gqa_with_past_and_present"), - ORT_TSTR("attention_4d_gqa"), - ORT_TSTR("attention_4d_gqa_attn_mask"), - ORT_TSTR("attention_4d_gqa_causal"), - ORT_TSTR("attention_4d_gqa_scaled"), - ORT_TSTR("attention_4d_gqa_softcap"), - ORT_TSTR("attention_4d_gqa_with_past_and_present"), - ORT_TSTR("attention_4d_diff_heads_mask4d_padded_kv"), - ORT_TSTR("attention_4d_gqa_with_past_and_present_fp16"), - ORT_TSTR("attention_4d_with_past_and_present_qk_matmul_bias_3d_mask_causal"), - ORT_TSTR("attention_4d_with_past_and_present_qk_matmul_bias_4d_mask_causal"), - // unsupported case ORT_TSTR("AvgPool1d"), ORT_TSTR("AvgPool1d_stride"), ORT_TSTR("AvgPool2d"), diff --git a/onnxruntime/test/perftest/command_args_parser.cc b/onnxruntime/test/perftest/command_args_parser.cc index e84c1ea583250..59f5d8333657e 100644 --- a/onnxruntime/test/perftest/command_args_parser.cc +++ b/onnxruntime/test/perftest/command_args_parser.cc @@ -171,6 +171,10 @@ ABSL_FLAG(std::string, plugin_ep_options, "", "--plugin_ep_options \"ep_1_option_1_key|ep_1_option_1_value ...;;ep_3_option_1_key|ep_3_option_1_value ...;... \""); ABSL_FLAG(bool, list_ep_devices, false, "Prints all available device indices and their properties (including metadata). This option makes the program exit early without performing inference.\n"); ABSL_FLAG(std::string, select_ep_devices, "", "Specifies a semicolon-separated list of device indices to add to the session and run with."); +ABSL_FLAG(std::string, filter_ep_devices, "", + "Specifies EP or Device metadata entries as key-value pairs to filter ep devices passed to AppendExecutionProvider_V2.\n" + "[Usage]: --filter_ep_devices \"| |\" \n" + "Devices that match any of the key-value pair will be appended to the session. --select_ep_devices will take precedence over this option.\n"); ABSL_FLAG(bool, compile_ep_context, DefaultPerformanceTestConfig().run_config.compile_ep_context, "Generate an EP context model"); ABSL_FLAG(std::string, compile_model_path, "model_ctx.onnx", "The compiled model path for saving EP context model. 
Overwrites if already exists"); ABSL_FLAG(bool, compile_binary_embed, DefaultPerformanceTestConfig().run_config.compile_binary_embed, "Embed binary blob within EP context node"); @@ -490,6 +494,22 @@ bool CommandLineParser::ParseArguments(PerformanceTestConfig& test_config, int a if (!select_ep_devices.empty()) test_config.selected_ep_device_indices = select_ep_devices; } + // --filter_ep_devices + { + const auto& filter_ep_devices = absl::GetFlag(FLAGS_filter_ep_devices); + if (!filter_ep_devices.empty()) { + ORT_TRY { + ParseEpDeviceFilterKeyValuePairs(filter_ep_devices, test_config.filter_ep_device_kv_pairs); + } + ORT_CATCH(const std::exception& ex) { + ORT_HANDLE_EXCEPTION([&]() { + fprintf(stderr, "Error parsing filter_ep_devices: %s\n", ex.what()); + }); + return false; + } + } + } + // --compile_ep_context test_config.run_config.compile_ep_context = absl::GetFlag(FLAGS_compile_ep_context); diff --git a/onnxruntime/test/perftest/ort_test_session.cc b/onnxruntime/test/perftest/ort_test_session.cc index f2a54b0335fe1..fa1725d9003d7 100644 --- a/onnxruntime/test/perftest/ort_test_session.cc +++ b/onnxruntime/test/perftest/ort_test_session.cc @@ -105,7 +105,7 @@ OnnxRuntimeTestSession::OnnxRuntimeTestSession(Ort::Env& env, std::random_device if (added_ep_device_index_set.find(index) == added_ep_device_index_set.end()) { added_ep_devices[device.EpName()].push_back(device); added_ep_device_index_set.insert(index); - fprintf(stdout, "[Plugin EP] EP Device [Index: %d, Name: %s] has been added to session.\n", index, device.EpName()); + fprintf(stdout, "[Plugin EP] EP Device [Index: %d, Name: %s, Type: %d] has been added to session.\n", static_cast(index), device.EpName(), device.Device().Type()); } } else { std::string err_msg = "[Plugin EP] [WARNING] : The EP device index and its corresponding OrtEpDevice is not created from " + @@ -113,6 +113,28 @@ OnnxRuntimeTestSession::OnnxRuntimeTestSession(Ort::Env& env, std::random_device fprintf(stderr, "%s", err_msg.c_str()); } } + } else if (!performance_test_config.filter_ep_device_kv_pairs.empty()) { + // Find and select the OrtEpDevice associated with the EP in "--filter_ep_devices". + for (size_t index = 0; index < ep_devices.size(); ++index) { + auto device = ep_devices[index]; + if (ep_set.find(std::string(device.EpName())) == ep_set.end()) + continue; + + // Check both EP metadata and device metadata for a match + auto ep_metadata_kv_pairs = device.EpMetadata().GetKeyValuePairs(); + auto device_metadata_kv_pairs = device.Device().Metadata().GetKeyValuePairs(); + for (const auto& kv : performance_test_config.filter_ep_device_kv_pairs) { + auto ep_metadata_itr = ep_metadata_kv_pairs.find(kv.first); + auto device_metadata_itr = device_metadata_kv_pairs.find(kv.first); + + if ((ep_metadata_itr != ep_metadata_kv_pairs.end() && kv.second == ep_metadata_itr->second) || + (device_metadata_itr != device_metadata_kv_pairs.end() && kv.second == device_metadata_itr->second)) { + added_ep_devices[device.EpName()].push_back(device); + fprintf(stdout, "[Plugin EP] EP Device [Index: %d, Name: %s, Type: %d] has been added to session.\n", static_cast(index), device.EpName(), device.Device().Type()); + break; + } + } + } } else { // Find and select the OrtEpDevice associated with the EP in "--plugin_eps". 
for (size_t index = 0; index < ep_devices.size(); ++index) { diff --git a/onnxruntime/test/perftest/strings_helper.cc b/onnxruntime/test/perftest/strings_helper.cc index 5743346f8edf1..d9fd2a2a55c09 100644 --- a/onnxruntime/test/perftest/strings_helper.cc +++ b/onnxruntime/test/perftest/strings_helper.cc @@ -137,5 +137,22 @@ void ParseEpDeviceIndexList(const std::string& input, std::vector& result) } } } + +void ParseEpDeviceFilterKeyValuePairs(const std::string& input, std::vector<std::pair<std::string, std::string>>& result) { + std::stringstream ss(input); + std::string token; + + while (std::getline(ss, token, ' ')) { + if (!token.empty()) { + size_t delimiter_location = token.find("|"); + if (delimiter_location == std::string::npos || delimiter_location == 0 || delimiter_location == token.size() - 1) { + ORT_THROW("Use a '|' to separate the key and value for the device filter you are trying to use.\n"); + } + std::string key = token.substr(0, delimiter_location); + std::string value = token.substr(delimiter_location + 1); + result.emplace_back(std::make_pair(std::move(key), std::move(value))); + } + } +} } // namespace perftest } // namespace onnxruntime diff --git a/onnxruntime/test/perftest/strings_helper.h b/onnxruntime/test/perftest/strings_helper.h index a33b3d5089c9b..d6c6f6112ab6c 100644 --- a/onnxruntime/test/perftest/strings_helper.h +++ b/onnxruntime/test/perftest/strings_helper.h @@ -24,5 +24,7 @@ void ParseEpList(const std::string& input, std::vector& result); void ParseEpOptions(const std::string& input, std::vector>& result); void ParseEpDeviceIndexList(const std::string& input, std::vector& result); + +void ParseEpDeviceFilterKeyValuePairs(const std::string& input, std::vector<std::pair<std::string, std::string>>& result); } // namespace perftest } // namespace onnxruntime diff --git a/onnxruntime/test/perftest/test_configuration.h b/onnxruntime/test/perftest/test_configuration.h index 8d0b65d3158f5..1be09917e1a45 100644 --- a/onnxruntime/test/perftest/test_configuration.h +++ b/onnxruntime/test/perftest/test_configuration.h @@ -81,6 +81,7 @@ struct PerformanceTestConfig { std::basic_string plugin_ep_names_and_libs; std::vector registered_plugin_eps; std::string selected_ep_device_indices; + std::vector<std::pair<std::string, std::string>> filter_ep_device_kv_pairs; bool list_available_ep_devices = false; }; diff --git a/onnxruntime/test/platform/device_discovery_test.cc b/onnxruntime/test/platform/device_discovery_test.cc index 6b43ccbc8f670..bd0110748b098 100644 --- a/onnxruntime/test/platform/device_discovery_test.cc +++ b/onnxruntime/test/platform/device_discovery_test.cc @@ -5,8 +5,8 @@ #include "gtest/gtest.h" +#if !defined(ORT_MINIMAL_BUILD) && !defined(_GAMING_XBOX) namespace onnxruntime::test { - namespace { std::vector GetDevicesByType(OrtHardwareDeviceType device_type) { @@ -31,3 +31,4 @@ TEST(DeviceDiscoveryTest, HasCpuDevice) { } } // namespace onnxruntime::test +#endif // !defined(ORT_MINIMAL_BUILD) && !defined(_GAMING_XBOX) diff --git a/onnxruntime/test/platform/file_io_test.cc b/onnxruntime/test/platform/file_io_test.cc index a1a863d2442d1..924f9da41abef 100644 --- a/onnxruntime/test/platform/file_io_test.cc +++ b/onnxruntime/test/platform/file_io_test.cc @@ -19,6 +19,7 @@ #include "gtest/gtest.h" #include "core/common/span_utils.h" +#include "test/util/include/asserts.h" #include "test/util/include/file_util.h" namespace onnxruntime { diff --git a/onnxruntime/test/providers/cpu/model_tests.cc b/onnxruntime/test/providers/cpu/model_tests.cc index cf49601e6c671..ca1a3104e0bed 100644 --- a/onnxruntime/test/providers/cpu/model_tests.cc +++ 
b/onnxruntime/test/providers/cpu/model_tests.cc @@ -678,7 +678,14 @@ ::std::vector<::std::basic_string> GetParameterStrings() { ORT_TSTR("fp16_coreml_FNS-Candy"), ORT_TSTR("fp16_test_tiny_yolov2"), ORT_TSTR("fp16_test_shufflenet"), - ORT_TSTR("keras2coreml_SimpleRNN_ImageNet")}; + ORT_TSTR("keras2coreml_SimpleRNN_ImageNet"), + // models from model zoo. #26274: cuDNN frontend no valid engine + ORT_TSTR("YOLOv3"), + ORT_TSTR("YOLOv3-12"), + ORT_TSTR("YOLOv4"), + ORT_TSTR("SSD-MobilenetV1"), + ORT_TSTR("SSD-MobilenetV1-12")}; + // For ROCm EP, also disable the following tests due to flakiness, // mainly with precision issue and random memory access fault. static const ORTCHAR_T* rocm_disabled_tests[] = {ORT_TSTR("bvlc_alexnet"), diff --git a/onnxruntime/test/providers/cpu/nn/conv_op_test.cc b/onnxruntime/test/providers/cpu/nn/conv_op_test.cc index 0b8624ad6c67f..7c84aefa1c01f 100644 --- a/onnxruntime/test/providers/cpu/nn/conv_op_test.cc +++ b/onnxruntime/test/providers/cpu/nn/conv_op_test.cc @@ -339,6 +339,61 @@ TEST(ConvTest, Conv2D_2) { TestConvOp(attrs, {X, W}, {X_shape, W_shape}, expected_vals, Y_shape, true); } +TEST(ConvTest, Conv2D_3) { + ConvOpAndTestAttributes attrs = { + "", // auto_pad + vector{1, 1}, // dilations + 2, // group + vector{2, 2}, // kernel_shape + vector{0, 0, 0, 0}, // pads + vector{1, 1}, // strides + {} // excluded EPs + }; + + vector X_shape = {2, 2, 3, 3}; + vector X = {1.f, 2.f, 3.f, + 4.f, 5.f, 6.f, + 7.f, 8.f, 9.f, + + 10.f, 11.f, 12.f, + 13.f, 14.f, 15.f, + 16.f, 17.f, 18.f, + + 1.f, 2.f, 3.f, + 7.f, 8.f, 9.f, + 4.f, 5.f, 6.f, + + 13.f, 14.f, 15.f, + 10.f, 11.f, 12.f, + 16.f, 17.f, 18.f}; + + vector W_shape = {2, 1, 2, 2}; + vector W = {1.f, 2.f, 3.f, 4.f, 2.f, 4.f, 6.f, 8.f}; + + vector Y_shape = {2, 2, 2, 2}; + auto Y = { + 37.f, + 47.f, + 67.f, + 77.f, + 254.f, + 274.f, + 314.f, + 334.f, + 58.f, + 68.f, + 55.f, + 65.f, + 230.f, + 250.f, + 296.f, + 316.f, + }; + + TestConvOp(attrs, {X, W}, {X_shape, W_shape}, Y, Y_shape); + TestConvOp(attrs, {X, W}, {X_shape, W_shape}, Y, Y_shape, true); +} + TEST(ConvTest, Conv2D_Bias_1) { ConvOpAndTestAttributes attrs = { "", // auto_pad diff --git a/onnxruntime/test/providers/cpu/tensor/cast_op_test.cc b/onnxruntime/test/providers/cpu/tensor/cast_op_test.cc index 8f4c4ff0896ba..289e94397fb39 100644 --- a/onnxruntime/test/providers/cpu/tensor/cast_op_test.cc +++ b/onnxruntime/test/providers/cpu/tensor/cast_op_test.cc @@ -1477,7 +1477,7 @@ template void CastOpTestFloatFloat4(std::vector shape, std::vector float_data, bool is_fp4_input = false) { - size_t num_pairs = float_data.size() / 2; + int num_pairs = static_cast(float_data.size()) / 2; int num_fp4_elements = static_cast((float_data.size() + 1) / 2); bool is_odd_count = (float_data.size() % 2 != 0); diff --git a/onnxruntime/test/providers/provider_test_utils.h b/onnxruntime/test/providers/provider_test_utils.h index 1d8a50dc2fa04..5bd9ee2ceb826 100644 --- a/onnxruntime/test/providers/provider_test_utils.h +++ b/onnxruntime/test/providers/provider_test_utils.h @@ -5,6 +5,10 @@ #include "test/unittest_util/checkers.h" #include "test/unittest_util/conversion.h" + +#if !defined(ORT_MINIMAL_BUILD) #include "test/unittest_util/model_tester.h" #include "test/unittest_util/op_tester.h" +#endif // !defined(ORT_MINIMAL_BUILD) + #include "test/unittest_util/run_options_config_keys.h" diff --git a/onnxruntime/test/providers/qnn/qnn_node_group/reshape_transpose_rank5_test.cc b/onnxruntime/test/providers/qnn/qnn_node_group/reshape_transpose_rank5_test.cc new file mode 100644 index 
0000000000000..d167898e6a3b9 --- /dev/null +++ b/onnxruntime/test/providers/qnn/qnn_node_group/reshape_transpose_rank5_test.cc @@ -0,0 +1,73 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#if !defined(ORT_MINIMAL_BUILD) + +#include "core/graph/graph.h" +#include "core/graph/node_attr_utils.h" + +#include "test/providers/qnn/qnn_test_utils.h" +#include "test/unittest_util/qdq_test_utils.h" +#include "gtest/gtest.h" + +namespace onnxruntime { +namespace test { + +namespace { + +// Build float test: Add -> Reshape(rank-6) -> Transpose -> Reshape -> Add +// Uses smaller dimensions for testing +GetTestModelFn BuildRank6ToRank5FloatTestCase() { + return [](ModelTestBuilder& builder) -> void { + auto input_def = TestInputDef({256, 64}, false, -10.0f, 10.0f); + NodeArg* input = MakeTestInput(builder, input_def); + + NodeArg* add_const1 = builder.MakeScalarInitializer(1.0f); + NodeArg* add1_out = builder.MakeIntermediate(); + builder.AddNode("Add", {input, add_const1}, {add1_out}); + + // Reshape: (256, 64) -> (1, 4, 4, 4, 4, 64) + NodeArg* reshape1_shape = builder.Make1DInitializer({1, 4, 4, 4, 4, 64}); + NodeArg* reshape1_out = builder.MakeIntermediate(); + builder.AddNode("Reshape", {add1_out, reshape1_shape}, {reshape1_out}); + + // Transpose: perm [0, 2, 1, 3, 4, 5] + NodeArg* transpose_out = builder.MakeIntermediate(); + Node& transpose = builder.AddNode("Transpose", {reshape1_out}, {transpose_out}); + transpose.AddAttribute("perm", std::vector{0, 2, 1, 3, 4, 5}); + + // Reshape: (1, 4, 4, 4, 4, 64) -> (1, 256, 64) + NodeArg* reshape2_shape = builder.Make1DInitializer({1, 256, 64}); + NodeArg* reshape2_out = builder.MakeIntermediate(); + builder.AddNode("Reshape", {transpose_out, reshape2_shape}, {reshape2_out}); + + NodeArg* add_const2 = builder.MakeScalarInitializer(1.0f); + NodeArg* output = builder.MakeOutput(); + builder.AddNode("Add", {reshape2_out, add_const2}, {output}); + }; +} + +ProviderOptions GetProviderOptions() { + ProviderOptions provider_options; + provider_options["backend_type"] = "htp"; + return provider_options; +} + +} // namespace + +#if defined(__aarch64__) || defined(_M_ARM64) || defined(__linux__) + +TEST_F(QnnHTPBackendTests, Rank6ToRank5Fusion_Float) { + RunQnnModelTest(BuildRank6ToRank5FloatTestCase(), + GetProviderOptions(), + 13, + ExpectedEPNodeAssignment::All, + 1e-2f); +} + +#endif // defined(__aarch64__) || defined(_M_ARM64) || defined(__linux__) + +} // namespace test +} // namespace onnxruntime + +#endif // !defined(ORT_MINIMAL_BUILD) diff --git a/onnxruntime/test/providers/tensorrt/tensorrt_basic_test.cc b/onnxruntime/test/providers/tensorrt/tensorrt_basic_test.cc index 327dfab96c2d1..a746493d779f8 100644 --- a/onnxruntime/test/providers/tensorrt/tensorrt_basic_test.cc +++ b/onnxruntime/test/providers/tensorrt/tensorrt_basic_test.cc @@ -713,6 +713,52 @@ TEST(TensorrtExecutionProviderTest, TRTPluginsCustomOpTest) { ASSERT_TRUE(status.IsOK()); } +TEST(TensorrtExecutionProviderTest, DDSOutputTest) { + PathString model_name = ORT_TSTR("testdata/ort_github_issue_26272_dds.onnx"); + SessionOptions so; + so.session_logid = "TensorrtExecutionProviderRunWithDDSOutput"; + RunOptions run_options; + run_options.run_tag = so.session_logid; + InferenceSession session_object{so, GetEnvironment()}; + auto cuda_provider = DefaultCudaExecutionProvider(); + auto cuda_allocator = cuda_provider->CreatePreferredAllocators()[1]; + std::vector dims_op_x = {3, 4}; + std::vector values_op_x(12, 0.f); // 12=3*4 + OrtValue 
ml_value_x; + CreateMLValue<float>(cuda_allocator, dims_op_x, values_op_x, &ml_value_x); + + NameMLValMap feeds; + feeds.insert(std::make_pair("data", ml_value_x)); + + // prepare outputs + std::vector<std::string> output_names; + output_names.push_back("output"); + std::vector<OrtValue> fetches; + + OrtTensorRTProviderOptionsV2 params; + std::unique_ptr<IExecutionProvider> execution_provider = TensorrtExecutionProviderWithOptions(&params); + EXPECT_TRUE(session_object.RegisterExecutionProvider(std::move(execution_provider)).IsOK()); + auto status = session_object.Load(model_name); + ASSERT_TRUE(status.IsOK()); + status = session_object.Initialize(); + ASSERT_TRUE(status.IsOK()); + + // First pass run + status = session_object.Run(run_options, feeds, output_names, &fetches); + ASSERT_TRUE(status.IsOK()); + + // Second pass run with new shape + dims_op_x = {6, 4}; + values_op_x.resize(24, 0.f); // 24=6*4 + CreateMLValue<float>(cuda_allocator, dims_op_x, values_op_x, &ml_value_x); + feeds.clear(); + + feeds.insert(std::make_pair("data", ml_value_x)); + + status = session_object.Run(run_options, feeds, output_names, &fetches); + ASSERT_TRUE(status.IsOK()); +} + TEST_P(TensorrtExecutionProviderCacheTest, Run) { // GetParam() returns the parameter of following format: // ##cache type##_##input shape type## diff --git a/onnxruntime/test/python/onnx_backend_test_series.py b/onnxruntime/test/python/onnx_backend_test_series.py index 72c6a5664f395..d2e9557f633b0 100644 --- a/onnxruntime/test/python/onnx_backend_test_series.py +++ b/onnxruntime/test/python/onnx_backend_test_series.py @@ -43,13 +43,13 @@ def assert_similar_outputs(cls, ref_outputs, outputs, rtol, atol, model_dir=None """ def assert_similar_array(ref_output, output): - np.testing.assert_equal(ref_output.dtype, output.dtype) + np.testing.assert_equal(output.dtype, ref_output.dtype) if ref_output.dtype == object: - np.testing.assert_array_equal(ref_output, output) + np.testing.assert_array_equal(output, ref_output) else: - np.testing.assert_allclose(ref_output, output, rtol=rtol, atol=atol) + np.testing.assert_allclose(output, ref_output, rtol=rtol, atol=atol) - np.testing.assert_equal(len(ref_outputs), len(outputs)) + np.testing.assert_equal(len(outputs), len(ref_outputs)) for i in range(len(outputs)): # pylint: disable=consider-using-enumerate if isinstance(outputs[i], list): for j in range(len(outputs[i])): diff --git a/onnxruntime/test/python/onnxruntime_test_python.py b/onnxruntime/test/python/onnxruntime_test_python.py index e44adcdb9827f..7f003453add89 100644 --- a/onnxruntime/test/python/onnxruntime_test_python.py +++ b/onnxruntime/test/python/onnxruntime_test_python.py @@ -54,7 +54,7 @@ def run_model(self, session_object, run_options): input_name = session_object.get_inputs()[0].name res = session_object.run([], {input_name: x}, run_options=run_options) output_expected = np.array([[1.0, 4.0], [9.0, 16.0], [25.0, 36.0]], dtype=np.float32) - np.testing.assert_allclose(output_expected, res[0], rtol=1e-05, atol=1e-08) + np.testing.assert_allclose(res[0], output_expected, rtol=1e-05, atol=1e-08) def run_model_with_input(self, session_object, input_name, input_value, iter_num, queue): for _ in range(iter_num): @@ -714,7 +714,7 @@ def test_run_model(self): res = sess.run([outputs[0].name], {inputs[0].name: x}) output_expected = np.array([[1.0, 4.0], [9.0, 16.0], [25.0, 36.0]], dtype=np.float32) - np.testing.assert_allclose(output_expected, res[0], rtol=1e-05, atol=1e-08) + np.testing.assert_allclose(res[0], output_expected, rtol=1e-05, atol=1e-08) def test_run_async(self): event = 
threading.Event() @@ -733,7 +733,7 @@ def callback(res: np.ndarray, data: MyData, err: str) -> None: self.assertEqual(len(err), 0) self.assertEqual(len(res), 1) self.assertEqual(data.get_id(), 123456) - np.testing.assert_allclose(output_expected, res[0], rtol=1e-05, atol=1e-08) + np.testing.assert_allclose(res[0], output_expected, rtol=1e-05, atol=1e-08) event.set() so = onnxrt.SessionOptions() @@ -762,7 +762,7 @@ def test_run_model_from_bytes(self): self.assertEqual(output_shape, [3, 2]) res = sess.run([output_name], {input_name: x}) output_expected = np.array([[1.0, 4.0], [9.0, 16.0], [25.0, 36.0]], dtype=np.float32) - np.testing.assert_allclose(output_expected, res[0], rtol=1e-05, atol=1e-08) + np.testing.assert_allclose(res[0], output_expected, rtol=1e-05, atol=1e-08) def test_run_model2(self): sess = onnxrt.InferenceSession(get_name("matmul_1.onnx"), providers=onnxrt.get_available_providers()) @@ -777,7 +777,7 @@ def test_run_model2(self): self.assertEqual(output_shape, [3, 1]) res = sess.run([output_name], {input_name: x}) output_expected = np.array([[5.0], [11.0], [17.0]], dtype=np.float32) - np.testing.assert_allclose(output_expected, res[0], rtol=1e-05, atol=1e-08) + np.testing.assert_allclose(res[0], output_expected, rtol=1e-05, atol=1e-08) def test_run_model2_contiguous(self): sess = onnxrt.InferenceSession(get_name("matmul_1.onnx"), providers=onnxrt.get_available_providers()) @@ -792,10 +792,10 @@ def test_run_model2_contiguous(self): self.assertEqual(output_shape, [3, 1]) res = sess.run([output_name], {input_name: x}) output_expected = np.array([[5.0], [11.0], [17.0]], dtype=np.float32) - np.testing.assert_allclose(output_expected, res[0], rtol=1e-05, atol=1e-08) + np.testing.assert_allclose(res[0], output_expected, rtol=1e-05, atol=1e-08) xcontiguous = np.ascontiguousarray(x) rescontiguous = sess.run([output_name], {input_name: xcontiguous}) - np.testing.assert_allclose(output_expected, rescontiguous[0], rtol=1e-05, atol=1e-08) + np.testing.assert_allclose(rescontiguous[0], output_expected, rtol=1e-05, atol=1e-08) def test_run_model_multiple_threads(self): # Skip this test for a "pure" DML onnxruntime python wheel. 
@@ -860,14 +860,14 @@ def test_list_as_input(self): input_name = sess.get_inputs()[0].name res = sess.run([], {input_name: x.tolist()}) output_expected = np.array([[1.0, 4.0], [9.0, 16.0], [25.0, 36.0]], dtype=np.float32) - np.testing.assert_allclose(output_expected, res[0], rtol=1e-05, atol=1e-08) + np.testing.assert_allclose(res[0], output_expected, rtol=1e-05, atol=1e-08) def test_string_list_as_input(self): sess = onnxrt.InferenceSession(get_name("identity_string.onnx"), providers=available_providers_without_tvm) x = np.array(["this", "is", "identity", "test"], dtype=str).reshape((2, 2)) x_name = sess.get_inputs()[0].name res = sess.run([], {x_name: x.tolist()}) - np.testing.assert_equal(x, res[0]) + np.testing.assert_equal(res[0], x) def test_run_device(self): device = onnxrt.get_device() @@ -888,7 +888,7 @@ def test_run_model_symbolic_input(self): self.assertEqual(output_shape, ["None", 1]) res = sess.run([output_name], {input_name: x}) output_expected = np.array([[5.0], [11.0], [17.0]], dtype=np.float32) - np.testing.assert_allclose(output_expected, res[0], rtol=1e-05, atol=1e-08) + np.testing.assert_allclose(res[0], output_expected, rtol=1e-05, atol=1e-08) def test_boolean_inputs(self): sess = onnxrt.InferenceSession(get_name("logicaland.onnx"), providers=available_providers) @@ -920,7 +920,7 @@ def test_boolean_inputs(self): output_expected = np.array([[True, False], [False, False]], dtype=bool) res = sess.run([output_name], {a_name: a, b_name: b}) - np.testing.assert_equal(output_expected, res[0]) + np.testing.assert_equal(res[0], output_expected) def test_string_input1(self): sess = onnxrt.InferenceSession(get_name("identity_string.onnx"), providers=available_providers_without_tvm) @@ -941,7 +941,7 @@ def test_string_input1(self): self.assertEqual(output_type, "tensor(string)") res = sess.run([output_name], {x_name: x}) - np.testing.assert_equal(x, res[0]) + np.testing.assert_equal(res[0], x) def test_string_input2(self): sess = onnxrt.InferenceSession(get_name("identity_string.onnx"), providers=available_providers_without_tvm) @@ -962,7 +962,7 @@ def test_string_input2(self): self.assertEqual(output_type, "tensor(string)") res = sess.run([output_name], {x_name: x}) - np.testing.assert_equal(x, res[0]) + np.testing.assert_equal(res[0], x) def test_input_bytes(self): sess = onnxrt.InferenceSession(get_name("identity_string.onnx"), providers=available_providers_without_tvm) @@ -983,7 +983,7 @@ def test_input_bytes(self): self.assertEqual(output_type, "tensor(string)") res = sess.run([output_name], {x_name: x}) - np.testing.assert_equal(x, res[0].astype("|S8")) + np.testing.assert_equal(res[0].astype("|S8"), x) def test_input_object(self): sess = onnxrt.InferenceSession(get_name("identity_string.onnx"), providers=available_providers_without_tvm) @@ -1004,7 +1004,7 @@ def test_input_object(self): self.assertEqual(output_type, "tensor(string)") res = sess.run([output_name], {x_name: x}) - np.testing.assert_equal(x, res[0]) + np.testing.assert_equal(res[0], x) def test_input_void(self): sess = onnxrt.InferenceSession(get_name("identity_string.onnx"), providers=available_providers_without_tvm) @@ -1029,7 +1029,7 @@ def test_input_void(self): res = sess.run([output_name], {x_name: x}) expr = np.array([["must", "have"], ["same", "size"]], dtype=object) - np.testing.assert_equal(expr, res[0]) + np.testing.assert_equal(res[0], expr) def test_raise_wrong_num_inputs(self): with self.assertRaises(ValueError) as context: @@ -1164,7 +1164,7 @@ def test_sequence_construct(self): }, ) - 
np.testing.assert_array_equal(output_expected, res[0]) + np.testing.assert_array_equal(res[0], output_expected) def test_sequence_insert(self): opt = onnxrt.SessionOptions() @@ -1194,7 +1194,7 @@ def test_sequence_insert(self): "input_seq": [], }, ) - np.testing.assert_array_equal(output_expected, res[0]) + np.testing.assert_array_equal(res[0], output_expected) def test_ort_execution_mode(self): opt = onnxrt.SessionOptions() @@ -1375,7 +1375,7 @@ def test_register_custom_ops_library(self): input_1 = np.zeros((3, 5)).astype(np.float32) res = sess1.run([output_name], {input_name_0: input_0, input_name_1: input_1}) output_expected = np.ones((3, 5)).astype(np.float32) - np.testing.assert_allclose(output_expected, res[0], rtol=1e-05, atol=1e-08) + np.testing.assert_allclose(res[0], output_expected, rtol=1e-05, atol=1e-08) # Create an alias of SessionOptions instance # We will use this alias to construct another InferenceSession @@ -1969,7 +1969,7 @@ def test_adater_export_read(self): self.assertTrue(value.is_tensor()) self.assertEqual(expected_val.element_type(), value.element_type()) self.assertEqual(expected_val.shape(), value.shape()) - np.testing.assert_allclose(expected_val.numpy(), value.numpy()) + np.testing.assert_allclose(value.numpy(), expected_val.numpy()) def test_run_with_adapter(self): model_path = get_name("lora/two_params_lora_model.onnx") diff --git a/onnxruntime/test/python/onnxruntime_test_python_autoep.py b/onnxruntime/test/python/onnxruntime_test_python_autoep.py index d66951bd66f3d..a24269a312e9b 100644 --- a/onnxruntime/test/python/onnxruntime_test_python_autoep.py +++ b/onnxruntime/test/python/onnxruntime_test_python_autoep.py @@ -66,7 +66,7 @@ def test_cuda_ep_register_and_inference(self): input_name = sess.get_inputs()[0].name res = sess.run([], {input_name: x}) output_expected = np.array([[1.0, 4.0], [9.0, 16.0], [25.0, 36.0]], dtype=np.float32) - np.testing.assert_allclose(output_expected, res[0], rtol=1e-05, atol=1e-08) + np.testing.assert_allclose(res[0], output_expected, rtol=1e-05, atol=1e-08) del sess # Delete session before unregistering library self.unregister_execution_provider_library(ep_name) @@ -98,7 +98,7 @@ def test_cuda_prefer_gpu_and_inference(self): input_name = sess.get_inputs()[0].name res = sess.run([], {input_name: x}) output_expected = np.array([[1.0, 4.0], [9.0, 16.0], [25.0, 36.0]], dtype=np.float32) - np.testing.assert_allclose(output_expected, res[0], rtol=1e-05, atol=1e-08) + np.testing.assert_allclose(res[0], output_expected, rtol=1e-05, atol=1e-08) del sess # Delete session before unregistering library self.unregister_execution_provider_library(ep_name) @@ -146,7 +146,7 @@ def my_delegate( input_name = sess.get_inputs()[0].name res = sess.run([], {input_name: x}) output_expected = np.array([[1.0, 4.0], [9.0, 16.0], [25.0, 36.0]], dtype=np.float32) - np.testing.assert_allclose(output_expected, res[0], rtol=1e-05, atol=1e-08) + np.testing.assert_allclose(res[0], output_expected, rtol=1e-05, atol=1e-08) del sess # Delete session before unregistering library self.unregister_execution_provider_library(ep_name) @@ -249,7 +249,7 @@ def test_example_plugin_ep_devices(self): input_name = sess.get_inputs()[0].name res = sess.run([], {input_name: x}) output_expected = np.array([[1.0, 4.0], [9.0, 16.0], [25.0, 36.0]], dtype=np.float32) - np.testing.assert_allclose(output_expected, res[0], rtol=1e-05, atol=1e-08) + np.testing.assert_allclose(res[0], output_expected, rtol=1e-05, atol=1e-08) del sess # Delete session before unregistering library 
self.unregister_execution_provider_library(ep_name) @@ -282,11 +282,11 @@ def test_example_plugin_ep_data_transfer(self): gpu_value = onnxrt.OrtValue.ortvalue_from_numpy(data, "gpu", 0, 0xBE57) # copy back to CPU cpu_data = gpu_value.numpy() - np.testing.assert_equal(data, cpu_data) + np.testing.assert_equal(cpu_data, data) gpu_value.update_inplace(data2) # update the fake GPU data cpu_data_2 = gpu_value.numpy() # copy back to CPU - np.testing.assert_equal(data2, cpu_data_2) + np.testing.assert_equal(cpu_data_2, data2) gpu_value = None # Delete OrtValue before unregistering library as the allocator will be destroyed. @@ -336,8 +336,8 @@ def test_copy_tensors(self): del b_device # Verify the contents - np.testing.assert_array_equal(a, a_cpu_copy.numpy()) - np.testing.assert_array_equal(b, b_cpu_copy.numpy()) + np.testing.assert_array_equal(a_cpu_copy.numpy(), a) + np.testing.assert_array_equal(b_cpu_copy.numpy(), b) self.unregister_execution_provider_library(ep_name) diff --git a/onnxruntime/test/python/onnxruntime_test_python_backend.py b/onnxruntime/test/python/onnxruntime_test_python_backend.py index 6ed7dfe59b1f6..416d9b6edecd1 100644 --- a/onnxruntime/test/python/onnxruntime_test_python_backend.py +++ b/onnxruntime/test/python/onnxruntime_test_python_backend.py @@ -19,7 +19,7 @@ def test_run_model(self): x = np.array([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]], dtype=np.float32) res = rep.run(x) output_expected = np.array([[1.0, 4.0], [9.0, 16.0], [25.0, 36.0]], dtype=np.float32) - np.testing.assert_allclose(output_expected, res[0], rtol=1e-05, atol=1e-08) + np.testing.assert_allclose(res[0], output_expected, rtol=1e-05, atol=1e-08) def test_allocation_plan_works_with_only_execute_path_to_fetches_option(self): """ diff --git a/onnxruntime/test/python/onnxruntime_test_python_backend_mlops.py b/onnxruntime/test/python/onnxruntime_test_python_backend_mlops.py index c245699e211d4..9e3c1acbc923b 100644 --- a/onnxruntime/test/python/onnxruntime_test_python_backend_mlops.py +++ b/onnxruntime/test/python/onnxruntime_test_python_backend_mlops.py @@ -23,8 +23,8 @@ def check_list_of_map_to_float(testcase, expected_rows, actual_rows): for i in range(num_rows): # use np.testing.assert_allclose so we can specify the tolerance np.testing.assert_allclose( - [expected_rows[i][key] for key in sorted_keys], [actual_rows[i][key] for key in sorted_keys], + [expected_rows[i][key] for key in sorted_keys], rtol=1e-05, atol=1e-07, ) @@ -37,7 +37,7 @@ def test_run_model_non_tensor(self): x = {0: 25.0, 1: 5.13, 2: 0.0, 3: 0.453, 4: 5.966} res = rep.run(x) output_expected = np.array([[49.752754]], dtype=np.float32) - np.testing.assert_allclose(output_expected, res[0], rtol=1e-05, atol=1e-08) + np.testing.assert_allclose(res[0], output_expected, rtol=1e-05, atol=1e-08) def test_run_model_proto(self): name = datasets.get_example("logreg_iris.onnx") @@ -47,7 +47,7 @@ def test_run_model_proto(self): x = np.array([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]], dtype=np.float32) res = rep.run(x) output_expected = np.array([0, 0, 0], dtype=np.float32) - np.testing.assert_allclose(output_expected, res[0], rtol=1e-05, atol=1e-08) + np.testing.assert_allclose(res[0], output_expected, rtol=1e-05, atol=1e-08) output_expected = [ {0: 0.950599730014801, 1: 0.027834169566631317, 2: 0.02156602405011654}, { @@ -72,7 +72,7 @@ def test_run_model_proto_api(self): outputs = ort_backend.run_model(model, inputs) output_expected = np.array([0, 0, 0], dtype=np.float32) - np.testing.assert_allclose(output_expected, outputs[0], rtol=1e-05, atol=1e-08) + 
np.testing.assert_allclose(outputs[0], output_expected, rtol=1e-05, atol=1e-08) output_expected = [ {0: 0.950599730014801, 1: 0.027834169566631317, 2: 0.02156602405011654}, { diff --git a/onnxruntime/test/python/onnxruntime_test_python_cudagraph.py b/onnxruntime/test/python/onnxruntime_test_python_cudagraph.py index 5ab2fe8939f6a..d6c1dd9cff3f3 100644 --- a/onnxruntime/test/python/onnxruntime_test_python_cudagraph.py +++ b/onnxruntime/test/python/onnxruntime_test_python_cudagraph.py @@ -63,18 +63,18 @@ class TestInferenceSessionWithCudaGraph(unittest.TestCase): def test_ort_value_update_in_place(self): x0 = np.array([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]], dtype=np.float32) ortvalue_cpu = onnxrt.OrtValue.ortvalue_from_numpy(x0) - np.testing.assert_allclose(x0, ortvalue_cpu.numpy()) + np.testing.assert_allclose(ortvalue_cpu.numpy(), x0) x1 = np.array([[10.0, 20.0], [30.0, 40.0], [50.0, 60.0]], dtype=np.float32) ortvalue_cpu.update_inplace(x1) - np.testing.assert_allclose(x1, ortvalue_cpu.numpy()) + np.testing.assert_allclose(ortvalue_cpu.numpy(), x1) if "CUDAExecutionProvider" in onnxrt.get_available_providers(): ortvalue_gpu = onnxrt.OrtValue.ortvalue_from_numpy(x0, "cuda", 0) - np.testing.assert_allclose(x0, ortvalue_gpu.numpy()) + np.testing.assert_allclose(ortvalue_gpu.numpy(), x0) ortvalue_gpu.update_inplace(x1) - np.testing.assert_allclose(x1, ortvalue_gpu.numpy()) + np.testing.assert_allclose(ortvalue_gpu.numpy(), x1) def test_select_ep_to_run_cuda_graph(self): if "TensorrtExecutionProvider" in onnxrt.get_available_providers(): @@ -105,11 +105,11 @@ def run_model_with_cuda_graph(self, providers): # One regular run for the necessary memory allocation and cuda graph capturing session.run_with_iobinding(io_binding, ro) expected_y = np.array([[5.0], [11.0], [17.0]] * INPUT_SIZE, dtype=np.float32) - np.testing.assert_allclose(expected_y, y_ortvalue.numpy(), rtol=1e-05, atol=1e-05) + np.testing.assert_allclose(y_ortvalue.numpy(), expected_y, rtol=1e-05, atol=1e-05) # After capturing, CUDA graph replay happens from this Run onwards session.run_with_iobinding(io_binding, ro) - np.testing.assert_allclose(expected_y, y_ortvalue.numpy(), rtol=1e-05, atol=1e-05) + np.testing.assert_allclose(y_ortvalue.numpy(), expected_y, rtol=1e-05, atol=1e-05) # Update input and then replay CUDA graph x_ortvalue.update_inplace( @@ -120,8 +120,8 @@ def run_model_with_cuda_graph(self, providers): ) session.run_with_iobinding(io_binding, ro) np.testing.assert_allclose( - np.array([[50.0], [110.0], [170.0]] * INPUT_SIZE, dtype=np.float32), y_ortvalue.numpy(), + np.array([[50.0], [110.0], [170.0]] * INPUT_SIZE, dtype=np.float32), rtol=1e-05, atol=1e-05, ) @@ -162,7 +162,7 @@ def run_model_with_cuda_graph_annotation(self, providers): session.run_with_iobinding(io_bindings[i], ro) io_bindings[i].synchronize_outputs() expected_y = np.array(expected_y_base[: i + 1][:] * INPUT_SIZE, dtype=np.float32) - np.testing.assert_allclose(expected_y, y_ortvalues[i].numpy(), rtol=1e-05, atol=1e-05) + np.testing.assert_allclose(y_ortvalues[i].numpy(), expected_y, rtol=1e-05, atol=1e-05) del ro ro = onnxrt.RunOptions() @@ -176,7 +176,7 @@ def run_model_with_cuda_graph_annotation(self, providers): session.run_with_iobinding(io_bindings[i], ro) io_bindings[i].synchronize_outputs() expected_y = np.array(expected_y_base_mul_10[: i + 1][:] * INPUT_SIZE, dtype=np.float32) - np.testing.assert_allclose(expected_y, y_ortvalues[i].numpy(), rtol=1e-05, atol=1e-05) + np.testing.assert_allclose(y_ortvalues[i].numpy(), expected_y, rtol=1e-05, 
atol=1e-05) def test_arena_with_cuda_graph(self): if "CUDAExecutionProvider" in onnxrt.get_available_providers(): @@ -214,7 +214,7 @@ def test_arena_with_cuda_graph(self): session.run_with_iobinding(io_binding) output = cuda_graph_helper.get_output("softmaxout_1") - np.testing.assert_allclose(expected_output, output, rtol=1e-02, atol=1e-02) + np.testing.assert_allclose(output, expected_output, rtol=1e-02, atol=1e-02) if __name__ == "__main__": diff --git a/onnxruntime/test/python/onnxruntime_test_python_dmlgraph.py b/onnxruntime/test/python/onnxruntime_test_python_dmlgraph.py index 033eae1cb4c8d..4a6aa7b63d9c3 100644 --- a/onnxruntime/test/python/onnxruntime_test_python_dmlgraph.py +++ b/onnxruntime/test/python/onnxruntime_test_python_dmlgraph.py @@ -63,18 +63,18 @@ class TestInferenceSessionWithDmlGraph(unittest.TestCase): def test_ort_value_update_in_place(self): x0 = np.array([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]], dtype=np.float32) ortvalue_cpu = onnxrt.OrtValue.ortvalue_from_numpy(x0) - np.testing.assert_allclose(x0, ortvalue_cpu.numpy()) + np.testing.assert_allclose(ortvalue_cpu.numpy(), x0) x1 = np.array([[10.0, 20.0], [30.0, 40.0], [50.0, 60.0]], dtype=np.float32) ortvalue_cpu.update_inplace(x1) - np.testing.assert_allclose(x1, ortvalue_cpu.numpy()) + np.testing.assert_allclose(ortvalue_cpu.numpy(), x1) if "DmlExecutionProvider" in onnxrt.get_available_providers(): ortvalue_gpu = onnxrt.OrtValue.ortvalue_from_numpy(x0, "dml", 0) - np.testing.assert_allclose(x0, ortvalue_gpu.numpy()) + np.testing.assert_allclose(ortvalue_gpu.numpy(), x0) ortvalue_gpu.update_inplace(x1) - np.testing.assert_allclose(x1, ortvalue_gpu.numpy()) + np.testing.assert_allclose(ortvalue_gpu.numpy(), x1) def test_select_ep_to_run_dml_graph(self): if "DmlExecutionProvider" in onnxrt.get_available_providers(): @@ -104,11 +104,11 @@ def run_model_with_dml_graph(self, providers): # One regular run for the necessary memory allocation and dml graph capturing session.run_with_iobinding(io_binding, ro) expected_y = np.array([[5.0], [11.0], [17.0]] * INPUT_SIZE, dtype=np.float32) - np.testing.assert_allclose(expected_y, y_ortvalue.numpy(), rtol=1e-05, atol=1e-05) + np.testing.assert_allclose(y_ortvalue.numpy(), expected_y, rtol=1e-05, atol=1e-05) # After capturing, DML graph replay happens from this Run onwards session.run_with_iobinding(io_binding, ro) - np.testing.assert_allclose(expected_y, y_ortvalue.numpy(), rtol=1e-05, atol=1e-05) + np.testing.assert_allclose(y_ortvalue.numpy(), expected_y, rtol=1e-05, atol=1e-05) # Update input and then replay DML graph x_ortvalue.update_inplace( @@ -119,8 +119,8 @@ def run_model_with_dml_graph(self, providers): ) session.run_with_iobinding(io_binding, ro) np.testing.assert_allclose( - np.array([[50.0], [110.0], [170.0]] * INPUT_SIZE, dtype=np.float32), y_ortvalue.numpy(), + np.array([[50.0], [110.0], [170.0]] * INPUT_SIZE, dtype=np.float32), rtol=1e-05, atol=1e-05, ) @@ -163,7 +163,7 @@ def run_model_with_dml_graph_annotation(self, providers): session.run_with_iobinding(io_bindings[i], ro) io_bindings[i].synchronize_outputs() expected_y = np.array(expected_y_base[: i + 1][:] * INPUT_SIZE, dtype=np.float32) - np.testing.assert_allclose(expected_y, y_ortvalues[i].numpy(), rtol=1e-05, atol=1e-05) + np.testing.assert_allclose(y_ortvalues[i].numpy(), expected_y, rtol=1e-05, atol=1e-05) del ro ro = onnxrt.RunOptions() @@ -177,7 +177,7 @@ def run_model_with_dml_graph_annotation(self, providers): session.run_with_iobinding(io_bindings[i], ro) io_bindings[i].synchronize_outputs() 
expected_y = np.array(expected_y_base_mul_10[: i + 1][:] * INPUT_SIZE, dtype=np.float32) - np.testing.assert_allclose(expected_y, y_ortvalues[i].numpy(), rtol=1e-05, atol=1e-05) + np.testing.assert_allclose(y_ortvalues[i].numpy(), expected_y, rtol=1e-05, atol=1e-05) if __name__ == "__main__": diff --git a/onnxruntime/test/python/onnxruntime_test_python_mlops.py b/onnxruntime/test/python/onnxruntime_test_python_mlops.py index 8b6b029c57752..70b8c0fc0b980 100644 --- a/onnxruntime/test/python/onnxruntime_test_python_mlops.py +++ b/onnxruntime/test/python/onnxruntime_test_python_mlops.py @@ -80,7 +80,7 @@ def test_dict_vectorizer(self): x = {0: 25.0, 1: 5.13, 2: 0.0, 3: 0.453, 4: 5.966} res = sess.run([output_name], {input_name: x}) output_expected = np.array([[49.752754]], dtype=np.float32) - np.testing.assert_allclose(output_expected, res[0], rtol=1e-05, atol=1e-08) + np.testing.assert_allclose(res[0], output_expected, rtol=1e-05, atol=1e-08) xwrong = x.copy() xwrong["a"] = 5.6 @@ -96,17 +96,17 @@ def test_dict_vectorizer(self): x = {np.int64(k): np.float32(v) for k, v in x.items()} res = sess.run([output_name], {input_name: x}) output_expected = np.array([[49.752754]], dtype=np.float32) - np.testing.assert_allclose(output_expected, res[0], rtol=1e-05, atol=1e-08) + np.testing.assert_allclose(res[0], output_expected, rtol=1e-05, atol=1e-08) x = {np.int64(k): np.float64(v) for k, v in x.items()} res = sess.run([output_name], {input_name: x}) output_expected = np.array([[49.752754]], dtype=np.float32) - np.testing.assert_allclose(output_expected, res[0], rtol=1e-05, atol=1e-08) + np.testing.assert_allclose(res[0], output_expected, rtol=1e-05, atol=1e-08) x = {np.int32(k): np.float64(v) for k, v in x.items()} res = sess.run([output_name], {input_name: x}) output_expected = np.array([[49.752754]], dtype=np.float32) - np.testing.assert_allclose(output_expected, res[0], rtol=1e-05, atol=1e-08) + np.testing.assert_allclose(res[0], output_expected, rtol=1e-05, atol=1e-08) def test_label_encoder(self): sess = onnxrt.InferenceSession(get_name("LabelEncoder.onnx"), providers=onnxrt.get_available_providers()) @@ -127,18 +127,18 @@ def test_label_encoder(self): x = np.array([["4"]]) res = sess.run([output_name], {input_name: x}) output_expected = np.array([[3]], dtype=np.int64) - np.testing.assert_allclose(output_expected, res[0], rtol=1e-05, atol=1e-08) + np.testing.assert_allclose(res[0], output_expected, rtol=1e-05, atol=1e-08) # Python type x = np.array(["4"], ndmin=2) res = sess.run([output_name], {input_name: x}) output_expected = np.array([3], ndmin=2, dtype=np.int64) - np.testing.assert_allclose(output_expected, res[0], rtol=1e-05, atol=1e-08) + np.testing.assert_allclose(res[0], output_expected, rtol=1e-05, atol=1e-08) x = np.array(["4"], ndmin=2, dtype=object) res = sess.run([output_name], {input_name: x}) output_expected = np.array([3], ndmin=2, dtype=np.int64) - np.testing.assert_allclose(output_expected, res[0], rtol=1e-05, atol=1e-08) + np.testing.assert_allclose(res[0], output_expected, rtol=1e-05, atol=1e-08) def test_run_model_mlnet(self): available_providers = onnxrt.get_available_providers() diff --git a/onnxruntime/test/python/onnxruntime_test_python_nv_tensorrt_rtx_ep_tests.py b/onnxruntime/test/python/onnxruntime_test_python_nv_tensorrt_rtx_ep_tests.py index d5c80a4a1f4ba..034f0288e2508 100644 --- a/onnxruntime/test/python/onnxruntime_test_python_nv_tensorrt_rtx_ep_tests.py +++ b/onnxruntime/test/python/onnxruntime_test_python_nv_tensorrt_rtx_ep_tests.py @@ -99,7 +99,7 @@ def 
test_nv_tensorrt_rtx_ep_register_and_inference(self): input_name = sess.get_inputs()[0].name res = sess.run([], {input_name: x}) output_expected = np.array([[1.0, 4.0], [9.0, 16.0], [25.0, 36.0]], dtype=np.float32) - np.testing.assert_allclose(output_expected, res[0], rtol=1e-05, atol=1e-08) + np.testing.assert_allclose(res[0], output_expected, rtol=1e-05, atol=1e-08) def test_nv_tensorrt_rtx_ep_prefer_gpu_and_inference(self): """ @@ -117,7 +117,7 @@ def test_nv_tensorrt_rtx_ep_prefer_gpu_and_inference(self): input_name = sess.get_inputs()[0].name res = sess.run([], {input_name: x}) output_expected = np.array([[1.0, 4.0], [9.0, 16.0], [25.0, 36.0]], dtype=np.float32) - np.testing.assert_allclose(output_expected, res[0], rtol=1e-05, atol=1e-08) + np.testing.assert_allclose(res[0], output_expected, rtol=1e-05, atol=1e-08) def test_nv_tensorrt_rtx_ep_selection_delegate_and_inference(self): """ @@ -152,7 +152,7 @@ def my_delegate( input_name = sess.get_inputs()[0].name res = sess.run([], {input_name: x}) output_expected = np.array([[1.0, 4.0], [9.0, 16.0], [25.0, 36.0]], dtype=np.float32) - np.testing.assert_allclose(output_expected, res[0], rtol=1e-05, atol=1e-08) + np.testing.assert_allclose(res[0], output_expected, rtol=1e-05, atol=1e-08) def test_bind_input_only(self): """ diff --git a/onnxruntime/test/python/quantization/test_fusions.py b/onnxruntime/test/python/quantization/test_fusions.py index bea110e566fb9..f02f4da4eb0fb 100644 --- a/onnxruntime/test/python/quantization/test_fusions.py +++ b/onnxruntime/test/python/quantization/test_fusions.py @@ -34,8 +34,8 @@ def check_fused_model_correctness(self, orig_model, fused_model, inputs, rtol=1e for idx, expected_output in enumerate(orig_results): actual_output = fused_results[idx] np.testing.assert_allclose( - expected_output, actual_output, + expected_output, rtol=rtol, atol=atol, err_msg=f"Fused model output {idx} differs", diff --git a/onnxruntime/test/python/quantization/test_qdq_loss_debug.py b/onnxruntime/test/python/quantization/test_qdq_loss_debug.py index 5d70641547eae..20b40fc157c16 100644 --- a/onnxruntime/test/python/quantization/test_qdq_loss_debug.py +++ b/onnxruntime/test/python/quantization/test_qdq_loss_debug.py @@ -156,7 +156,7 @@ def test_saved_tensors_match_internal_tensors(self): for expected, actual in zip(model_outputs, test_outputs, strict=False): exp = expected.reshape(-1) act = actual.reshape(-1) - np.testing.assert_equal(exp, act) + np.testing.assert_equal(act, exp) def test_create_activation_matching_present(self): float_model_path = str(Path(self._tmp_model_dir.name) / "float_model2.onnx") diff --git a/onnxruntime/test/python/quantization/test_quantizeblockwise_bnb4.py b/onnxruntime/test/python/quantization/test_quantizeblockwise_bnb4.py index a8f7591186766..906bf7aab8698 100644 --- a/onnxruntime/test/python/quantization/test_quantizeblockwise_bnb4.py +++ b/onnxruntime/test/python/quantization/test_quantizeblockwise_bnb4.py @@ -131,8 +131,8 @@ def test_quantize_blockwise_bnb4(self): matrix_float = np.random.uniform(-1, 1, (k, n)).astype(type) quant_value_ref, absmax_ref = quantize_blockwise_bnb4_ref(matrix_float, block_size, quant_type) quant_value, absmax = quantize_blockwise_bnb4_target(matrix_float, block_size, quant_type) - np.testing.assert_allclose(quant_value_ref, quant_value) - np.testing.assert_allclose(absmax_ref, absmax) + np.testing.assert_allclose(quant_value, quant_value_ref) + np.testing.assert_allclose(absmax, absmax_ref) if __name__ == "__main__": diff --git 
a/onnxruntime/test/testdata/custom_op_local_function/custom_op_test_local_function.py b/onnxruntime/test/testdata/custom_op_local_function/custom_op_test_local_function.py index 7916d93c3e531..1dedc475c9962 100644 --- a/onnxruntime/test/testdata/custom_op_local_function/custom_op_test_local_function.py +++ b/onnxruntime/test/testdata/custom_op_local_function/custom_op_test_local_function.py @@ -40,7 +40,7 @@ def test_basic_all(self): x = np.arange(2**2).reshape((2,) * 2).astype(np.float32) t = np.arange(8).reshape((2, 4)).astype(np.float32) got = sess.run(None, {"X": x})[0] - np.testing.assert_allclose(t, got, atol=1e-5) + np.testing.assert_allclose(got, t, atol=1e-5) if __name__ == "__main__": diff --git a/onnxruntime/test/testdata/onnx_backend_test_series_filters.jsonc b/onnxruntime/test/testdata/onnx_backend_test_series_filters.jsonc index f5f6a3ae3bc39..0558d008a2275 100644 --- a/onnxruntime/test/testdata/onnx_backend_test_series_filters.jsonc +++ b/onnxruntime/test/testdata/onnx_backend_test_series_filters.jsonc @@ -31,23 +31,12 @@ "current_failing_tests": [ "^test_adagrad", "^test_adagrad_multiple", - "^test_attention_4d_diff_heads_mask4d_padded_kv*", // pending onnx update - "^test_attention_3d_gqa*", // pending onnx update - "^test_attention_3d_gqa_causal", // pending onnx update - "^test_attention_3d_gqa_scaled", // pending onnx update - "^test_attention_3d_gqa_softcap", // pending onnx update - "^test_attention_3d_gqa_with_past_and_present", // pending onnx update - "^test_attention_4d_gqa*", // pending onnx update - "^test_attention_4d_gqa_causal", // pending onnx update - "^test_attention_4d_gqa_scaled", // pending onnx update - "^test_attention_4d_gqa_softcap", // pending onnx update - "^test_attention_4d_gqa_with_past_and_present", // pending onnx update - "^test_attention_*causal*", // pending onnx update - "^test_attention_4d_with_past_and_present_qk_matmul_bias_3d_mask_causal*", // pending onnx update - "^test_attention_4d_with_past_and_present_qk_matmul_bias_4d_mask_causal*", // pending onnx update - "^test_attention_4d_attn_mask_3d_causal_expanded*", // pending onnx update "^test_attention_4d_fp16*", // precision issue: 1 / 192 mismatched elements "^test_attention_4d_fp16_expanded*", // precision issue: 3 / 192 mismatched elements + "^test_attention_4d_gqa_with_past_and_present_fp16_expanded*", // webgpu mismatched elements 38 / 576 + "^test_attention_4d_with_past_and_present_qk_matmul_bias_3d_mask_causal_expanded*", // webgpu + "^test_attention_4d_attn_mask_3d_causal_expanded*", // webgpu + "^test_attention_4d_diff_heads_mask4d_padded_kv*", // Need nonpad_kv_seqlen "^test_l2normalization*", // LpNormalization(22) not implemented "^test_l1normalization*", // LpNormalization(22) not implemented "^test_lpnormalization*", // LpNormalization(22) not implemented @@ -123,13 +112,9 @@ "^test_if_opt", "^test_loop16_seq_none", "^test_identity_opt", - // rotary dim should be fixed in onnx==1.19.1 - "^test_rotary_embedding_no_position_ids_rotary_dim", - "^test_rotary_embedding_with_interleaved_rotary_dim", - "^test_rotary_embedding_with_rotary_dim", - "^test_rotary_embedding_3d_input_expanded", - "^test_rotary_embedding_interleaved_expanded", - "^test_rotary_embedding_no_position_ids_interleaved_expanded", + "^test_rotary_embedding_3d_input_expanded", // win cuda fail + "^test_rotary_embedding_interleaved_expanded", // win cuda fail + "^test_rotary_embedding_no_position_ids_interleaved_expanded", // win cuda fail "^test_rotary_embedding_expanded", //webgpu 
"^test_rotary_embedding_no_position_ids_expanded", //webgpu // Following tests are for opset 16 ops and are not yet implemented in ORT diff --git a/onnxruntime/test/testdata/ort_github_issue_26272.py b/onnxruntime/test/testdata/ort_github_issue_26272.py new file mode 100644 index 0000000000000..fa381e5df1094 --- /dev/null +++ b/onnxruntime/test/testdata/ort_github_issue_26272.py @@ -0,0 +1,26 @@ +import onnx +from onnx import TensorProto, helper + +# Create a simple ONNX model with DDS output +input = helper.make_tensor_value_info("data", TensorProto.FLOAT, ["d1", "d2"]) +output = helper.make_tensor_value_info("output", TensorProto.FLOAT, ["nzr"]) + +nonzeros_node = helper.make_node("NonZero", ["data"], ["nonzeros"], "nonzeros_node") +transpose_node = helper.make_node("Transpose", ["nonzeros"], ["nonzeros_t"], "transpose_node") +gathernd_node = helper.make_node("GatherND", ["data", "nonzeros_t"], ["output"], "gathernd_node") + +value_info = [ + helper.make_tensor_value_info("nonzeros", TensorProto.INT64, [2, "nzr"]), + helper.make_tensor_value_info("nonzeros_t", TensorProto.INT64, ["nzr", 2]), +] + +graph = helper.make_graph( + [nonzeros_node, transpose_node, gathernd_node], + "test_graph", + [input], + [output], + value_info=value_info, +) + +model = helper.make_model(graph) +onnx.save(model, "ort_github_issue_26272_dds.onnx") diff --git a/onnxruntime/test/testdata/ort_github_issue_26272_dds.onnx b/onnxruntime/test/testdata/ort_github_issue_26272_dds.onnx new file mode 100644 index 0000000000000..371f99c537898 --- /dev/null +++ b/onnxruntime/test/testdata/ort_github_issue_26272_dds.onnx @@ -0,0 +1,28 @@ + +:“ +( +datanonzeros nonzeros_node"NonZero +1 +nonzeros +nonzeros_ttranspose_node" Transpose +3 +data + +nonzeros_toutput gathernd_node"GatherND +test_graphZ +data + +d1 +d2b +output +  +nzrj +nonzeros + + +nzrj + +nonzeros_t + +nzr +B \ No newline at end of file diff --git a/tools/ci_build/build.py b/tools/ci_build/build.py index 327caf83c7850..591be538ac873 100644 --- a/tools/ci_build/build.py +++ b/tools/ci_build/build.py @@ -1015,6 +1015,9 @@ def generate_build_tree( if path_to_protoc_exe: cmake_args += [f"-DONNX_CUSTOM_PROTOC_EXECUTABLE={path_to_protoc_exe}"] + if args.cmake_deps_mirror_dir: + cmake_args += [f"-Donnxruntime_CMAKE_DEPS_MIRROR_DIR={args.cmake_deps_mirror_dir}"] + if args.fuzz_testing: if not ( args.build_shared_lib @@ -1330,7 +1333,7 @@ def build_targets(args, cmake_path, build_dir, configs, num_parallel_jobs, targe cmd_args.extend(["--target", *targets]) build_tool_args = [] - if num_parallel_jobs != 1: + if num_parallel_jobs != 0: if is_windows() and args.cmake_generator != "Ninja" and not args.build_wasm: # https://github.com/Microsoft/checkedc-clang/wiki/Parallel-builds-of-clang-on-Windows suggests # not maxing out CL_MPCount @@ -1748,7 +1751,7 @@ def run_onnxruntime_tests(args, source_dir, ctest_path, build_dir, configs): # Install cpu only version of torch when cuda is not enabled in Linux. 
extra = [] if args.use_cuda and is_linux() else ["--index-url", "https://download.pytorch.org/whl/cpu"] run_subprocess( - [sys.executable, "-m", "pip", "install", "torch", *extra], + [sys.executable, "-m", "pip", "install", "torch==2.8.0", "torchvision==0.23.0", *extra], cwd=cwd, dll_path=dll_path, python_path=python_path, diff --git a/tools/ci_build/build_args.py b/tools/ci_build/build_args.py index c5454903474d1..05d5052067b2e 100644 --- a/tools/ci_build/build_args.py +++ b/tools/ci_build/build_args.py @@ -204,6 +204,7 @@ def add_testing_args(parser: argparse.ArgumentParser) -> None: help="Run onnx_test_runner against test data. Only used in ONNX Runtime's CI pipelines", ) parser.add_argument("--path_to_protoc_exe", help="Path to protoc executable.") + parser.add_argument("--cmake_deps_mirror_dir", help="Path to the local mirror of cmake dependencies.") parser.add_argument("--fuzz_testing", action="store_true", help="Enable Fuzz testing.") parser.add_argument( "--enable_symbolic_shape_infer_tests", diff --git a/tools/ci_build/github/azure-pipelines/build-perf-test-binaries-pipeline.yml b/tools/ci_build/github/azure-pipelines/build-perf-test-binaries-pipeline.yml index 53b62762319ba..e54216fe4ef4e 100644 --- a/tools/ci_build/github/azure-pipelines/build-perf-test-binaries-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/build-perf-test-binaries-pipeline.yml @@ -31,5 +31,5 @@ stages: machine_pool: 'onnxruntime-Ubuntu2404-AMD-CPU' extra_build_arg: '' cmake_build_type: Release - cuda_version: 12.2 + cuda_version: 12.8 docker_base_image: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda12_x64_almalinux8_gcc12:20250714.2 \ No newline at end of file diff --git a/tools/ci_build/github/azure-pipelines/c-api-noopenmp-packaging-pipelines.yml b/tools/ci_build/github/azure-pipelines/c-api-noopenmp-packaging-pipelines.yml index 91736752e22d4..086d65c93062b 100644 --- a/tools/ci_build/github/azure-pipelines/c-api-noopenmp-packaging-pipelines.yml +++ b/tools/ci_build/github/azure-pipelines/c-api-noopenmp-packaging-pipelines.yml @@ -73,12 +73,12 @@ variables: - name: ReleaseVersionSuffix value: '' - name: win_trt_version - value: 12.2 + value: 12.8 - name: win_trt_home value: $(Agent.TempDirectory)\${{ variables.win_trt_folder_cuda12 }} - name: win_cuda_home - value: $(Agent.TempDirectory)\v12.2 + value: $(Agent.TempDirectory)\v12.8 extends: # The pipeline extends the 1ES PT which will inject different SDL and compliance tasks. # For non-production pipelines, use "Unofficial" as defined below. 
@@ -142,7 +142,7 @@ extends: - template: stages/nuget-combine-cuda-stage.yml parameters: - CudaVersion: 12.2 + CudaVersion: 12.8 RunOnnxRuntimeTests: ${{ parameters.RunOnnxRuntimeTests }} UseIncreasedTimeoutForTests: ${{ parameters.UseIncreasedTimeoutForTests }} win_trt_home: ${{ variables.win_trt_home }} diff --git a/tools/ci_build/github/azure-pipelines/c-api-noopenmp-test-pipelines.yml b/tools/ci_build/github/azure-pipelines/c-api-noopenmp-test-pipelines.yml index 46363c07b3e3e..7e107c33ed8c0 100644 --- a/tools/ci_build/github/azure-pipelines/c-api-noopenmp-test-pipelines.yml +++ b/tools/ci_build/github/azure-pipelines/c-api-noopenmp-test-pipelines.yml @@ -127,7 +127,7 @@ stages: NugetPackageName: 'Microsoft.ML.OnnxRuntime.Gpu' ArtifactSuffix: 'GPU' StageSuffix: 'GPU' - CudaVersion: 12.2 + CudaVersion: 12.8 - template: nuget/templates/test_win.yml parameters: @@ -136,7 +136,7 @@ stages: ArtifactSuffix: 'GPU' StageSuffix: 'GPU' MoreSuffix: '_Windows' - CudaVersion: 12.2 + CudaVersion: 12.8 - template: nuget/templates/test_linux.yml parameters: @@ -144,7 +144,7 @@ stages: ArtifactSuffix: 'GPU' StageSuffix: 'GPU' NugetPackageName: 'Microsoft.ML.OnnxRuntime.Gpu' - CudaVersion: 12.2 + CudaVersion: 12.8 - template: nuget/templates/test_linux.yml parameters: @@ -153,7 +153,7 @@ stages: StageSuffix: 'GPU' MoreSuffix: '_Linux' NugetPackageName: 'Microsoft.ML.OnnxRuntime.Gpu.Linux' - CudaVersion: 12.2 + CudaVersion: 12.8 @@ -202,7 +202,7 @@ stages: - template: templates/jobs/download_win_gpu_library.yml parameters: - CudaVersion: 12.2 + CudaVersion: 12.8 DownloadCUDA: true DownloadTRT: true @@ -257,7 +257,7 @@ stages: - template: templates/jobs/download_win_gpu_library.yml parameters: - CudaVersion: 12.2 + CudaVersion: 12.8 DownloadCUDA: true DownloadTRT: true diff --git a/tools/ci_build/github/azure-pipelines/cuda-packaging-pipeline.yml b/tools/ci_build/github/azure-pipelines/cuda-packaging-pipeline.yml index 5535d7b4f264d..d7fc0efbf45ea 100644 --- a/tools/ci_build/github/azure-pipelines/cuda-packaging-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/cuda-packaging-pipeline.yml @@ -48,9 +48,9 @@ parameters: - name: CudaVersion displayName: CUDA version type: string - default: '12.2' + default: '12.8' values: - - 12.2 + - 12.8 variables: - template: templates/common-variables.yml @@ -59,13 +59,13 @@ variables: - name: win_trt_home ${{ if eq(parameters.CudaVersion, '11.8') }}: value: $(Agent.TempDirectory)\${{ variables.win_trt_folder_cuda11 }} - ${{ if eq(parameters.CudaVersion, '12.2') }}: + ${{ if eq(parameters.CudaVersion, '12.8') }}: value: $(Agent.TempDirectory)\${{ variables.win_trt_folder_cuda12 }} - name: win_cuda_home ${{ if eq(parameters.CudaVersion, '11.8') }}: value: $(Agent.TempDirectory)\v11.8 - ${{ if eq(parameters.CudaVersion, '12.2') }}: - value: $(Agent.TempDirectory)\v12.2 + ${{ if eq(parameters.CudaVersion, '12.8') }}: + value: $(Agent.TempDirectory)\v12.8 resources: repositories: diff --git a/tools/ci_build/github/azure-pipelines/custom-nuget-packaging-pipeline.yml b/tools/ci_build/github/azure-pipelines/custom-nuget-packaging-pipeline.yml index 1ad6f411d9848..5ce6ec278b1e7 100644 --- a/tools/ci_build/github/azure-pipelines/custom-nuget-packaging-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/custom-nuget-packaging-pipeline.yml @@ -1,7 +1,7 @@ parameters: - name: CudaVersion type: string - default: '12.2' + default: '12.8' - name: QnnSdk displayName: QNN SDK Version @@ -40,8 +40,8 @@ variables: - name: win_cuda_home ${{ if eq(parameters.CudaVersion, '11.8') 
}}: value: $(Agent.TempDirectory)\v11.8 - ${{ if eq(parameters.CudaVersion, '12.2') }}: - value: $(Agent.TempDirectory)\v12.2 + ${{ if eq(parameters.CudaVersion, '12.8') }}: + value: $(Agent.TempDirectory)\v12.8 resources: repositories: @@ -178,9 +178,6 @@ extends: inputs: targetType: 'inline' script: | - mkdir -p $(Build.BinariesDirectory)/osx-x64 - Move-Item -Path $(Build.BinariesDirectory)/osx/onnxruntime-osx-x86_64* -Destination $(Build.BinariesDirectory)/osx-x64 - mkdir -p $(Build.BinariesDirectory)/osx-arm64 Move-Item -Path $(Build.BinariesDirectory)/osx/onnxruntime-osx-arm64* -Destination $(Build.BinariesDirectory)/osx-arm64 @@ -200,12 +197,6 @@ extends: foreach ($dir in $dirs) { Write-Host "Directory: $($dir.FullName)" } - $osx_x64_archive = Get-ChildItem -Path $(Build.BinariesDirectory)/osx-x64 -Filter onnxruntime-osx-x86_64* - if ($osx_x64_archive.Count -eq 0) { - Write-Host "No osx-x64 archive found." - } else { - Write-Host "osx-x64 archive found: $($osx_x64_archive[0].FullName)" - } $osx_arm64_archive = Get-ChildItem -Path $(Build.BinariesDirectory)/osx-arm64 -Filter onnxruntime-osx-arm64* if ($osx_arm64_archive.Count -eq 0) { Write-Host "No osx-arm64 archive found." @@ -233,13 +224,10 @@ extends: script: | Expand-Archive -Path $(Build.BinariesDirectory)/win-x64/onnxruntime-win-x64-cuda*.zip -DestinationPath $(Build.BinariesDirectory)/win-x64 Expand-Archive -Path $(Build.BinariesDirectory)/win-arm64/onnxruntime-win-arm64x-qnn*.zip -DestinationPath $(Build.BinariesDirectory)/win-arm64 - $osx_x64_archive = (Get-ChildItem -Path $(Build.BinariesDirectory)/osx-x64 -Filter onnxruntime-osx-x86_64*)[0].FullName $osx_arm64_archive = (Get-ChildItem -Path $(Build.BinariesDirectory)/osx-arm64 -Filter onnxruntime-osx-arm64*)[0].FullName - tar -xzf $osx_x64_archive -C $(Build.BinariesDirectory)/osx-x64 2>$null tar -xzf $osx_arm64_archive -C $(Build.BinariesDirectory)/osx-arm64 2>$null $win_x64 = (Get-ChildItem -Path $(Build.BinariesDirectory)/win-x64 -Filter onnxruntime-win-x64-cuda*)[0].FullName $win_arm64 = (Get-ChildItem -Path $(Build.BinariesDirectory)/win-arm64 -Filter onnxruntime-win-arm64x-qnn*)[0].FullName - $osx_x64 = (Get-ChildItem -Path $(Build.BinariesDirectory)/osx-x64 -Filter onnxruntime-osx-x86_64*)[0].FullName $osx_arm64 = (Get-ChildItem -Path $(Build.BinariesDirectory)/osx-arm64 -Filter onnxruntime-osx-arm64*)[0].FullName Write-Host "##vso[task.setvariable variable=win_x64;]$win_x64" Write-Host "##vso[task.setvariable variable=win_arm64;]$win_arm64" diff --git a/tools/ci_build/github/azure-pipelines/jar_package_testing.yml b/tools/ci_build/github/azure-pipelines/jar_package_testing.yml index d387c07d6dc6e..463c02203e21a 100644 --- a/tools/ci_build/github/azure-pipelines/jar_package_testing.yml +++ b/tools/ci_build/github/azure-pipelines/jar_package_testing.yml @@ -40,7 +40,7 @@ stages: - template: templates/jobs/download_win_gpu_library.yml parameters: - CudaVersion: 12.2 + CudaVersion: 12.8 DownloadCUDA: true DownloadTRT: true @@ -105,7 +105,7 @@ stages: - name: runCodesignValidationInjection value: false - name: docker_base_image - value: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda12_x64_almalinux8_gcc12:20250724.1 + value: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda12_x64_almalinux8_gcc12:20251008.2 timeoutInMinutes: 60 steps: - checkout: self diff --git a/tools/ci_build/github/azure-pipelines/linux-gpu-tensorrt-cuda-minimal-ci-pipeline.yml 
b/tools/ci_build/github/azure-pipelines/linux-gpu-tensorrt-cuda-minimal-ci-pipeline.yml index 0410001d77d13..5e6671e3797ce 100644 --- a/tools/ci_build/github/azure-pipelines/linux-gpu-tensorrt-cuda-minimal-ci-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/linux-gpu-tensorrt-cuda-minimal-ci-pipeline.yml @@ -31,21 +31,21 @@ parameters: - name: CudaVersion displayName: CUDA version type: string - default: '12.2' + default: '12.8' values: - - 12.2 + - 12.8 variables: - template: templates/common-variables.yml - name: docker_base_image ${{ if eq(parameters.CudaVersion, '11.8') }}: - value: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda11_x64_almalinux8_gcc11:20250724.1 - ${{ if eq(parameters.CudaVersion, '12.2') }}: - value: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda12_x64_almalinux8_gcc12:20250724.1 + value: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda11_x64_almalinux8_gcc11:20251008.2 + ${{ if eq(parameters.CudaVersion, '12.8') }}: + value: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda12_x64_almalinux8_gcc12:20251008.2 - name: linux_trt_version ${{ if eq(parameters.CudaVersion, '11.8') }}: value: ${{ variables.linux_trt_version_cuda11 }} - ${{ if eq(parameters.CudaVersion, '12.2') }}: + ${{ if eq(parameters.CudaVersion, '12.8') }}: value: ${{ variables.linux_trt_version_cuda12 }} jobs: diff --git a/tools/ci_build/github/azure-pipelines/nuget/templates/test_linux.yml b/tools/ci_build/github/azure-pipelines/nuget/templates/test_linux.yml index 89ce3f3c86727..b60ef7576184e 100644 --- a/tools/ci_build/github/azure-pipelines/nuget/templates/test_linux.yml +++ b/tools/ci_build/github/azure-pipelines/nuget/templates/test_linux.yml @@ -58,9 +58,9 @@ stages: parameters: Dockerfile: tools/ci_build/github/linux/docker/Dockerfile.package_ubuntu_2004_gpu Context: tools/ci_build/github/linux/docker/ - ${{ if eq(parameters.CudaVersion, '12.2') }}: + ${{ if eq(parameters.CudaVersion, '12.8') }}: DockerBuildArgs: " - --build-arg BASEIMAGE=nvidia/cuda:12.2.2-devel-ubuntu20.04 + --build-arg BASEIMAGE=nvidia/cuda:12.8.1-cudnn-devel-ubuntu20.04 --build-arg TRT_VERSION=${{ replace(variables.linux_trt_version_cuda12, '-1.', '-1+') }} --build-arg BUILD_UID=$( id -u ) " @@ -107,4 +107,4 @@ stages: DisableContribOps: $(DisableContribOps) DisableMlOps: $(DisableMlOps) IsReleaseBuild: $(IsReleaseBuild) - PACKAGENAME: ${{ parameters.NugetPackageName }} \ No newline at end of file + PACKAGENAME: ${{ parameters.NugetPackageName }} diff --git a/tools/ci_build/github/azure-pipelines/post-merge-jobs.yml b/tools/ci_build/github/azure-pipelines/post-merge-jobs.yml index deb8b84bf19b8..fdfafd4d9a179 100644 --- a/tools/ci_build/github/azure-pipelines/post-merge-jobs.yml +++ b/tools/ci_build/github/azure-pipelines/post-merge-jobs.yml @@ -2,16 +2,16 @@ parameters: - name: CudaVersion displayName: CUDA version type: string - default: '12.2' + default: '12.8' values: - - 12.2 + - 12.8 variables: - template: templates/common-variables.yml - name: win_trt_folder ${{ if eq(parameters.CudaVersion, '11.8') }}: value: ${{ variables.win_trt_folder_cuda11 }} - ${{ if eq(parameters.CudaVersion, '12.2') }}: + ${{ if eq(parameters.CudaVersion, '12.8') }}: value: ${{ variables.win_trt_folder_cuda12 }} stages: diff --git a/tools/ci_build/github/azure-pipelines/py-cuda-package-test-pipeline.yml b/tools/ci_build/github/azure-pipelines/py-cuda-package-test-pipeline.yml index c2c89686a077e..02b6a6df76611 100644 --- 
a/tools/ci_build/github/azure-pipelines/py-cuda-package-test-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/py-cuda-package-test-pipeline.yml @@ -18,8 +18,8 @@ stages: machine_pool: 'Onnxruntime-Linux-GPU' python_wheel_suffix: '_gpu' timeout: 480 - docker_base_image: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda12_x64_almalinux8_gcc12:20250724.1 - cuda_version: '12.2' + docker_base_image: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda12_x64_almalinux8_gcc12:20251008.2 + cuda_version: '12.8' - stage: Republish_Wheels dependsOn: diff --git a/tools/ci_build/github/azure-pipelines/py-cuda-packaging-pipeline.yml b/tools/ci_build/github/azure-pipelines/py-cuda-packaging-pipeline.yml index 4c536bad45368..290af4a3e4449 100644 --- a/tools/ci_build/github/azure-pipelines/py-cuda-packaging-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/py-cuda-packaging-pipeline.yml @@ -49,4 +49,4 @@ extends: - template: stages/py-gpu-packaging-stage.yml parameters: cmake_build_type: ${{ parameters.cmake_build_type }} - cuda_version: '12.2' + cuda_version: '12.8' diff --git a/tools/ci_build/github/azure-pipelines/stages/jobs/py-linux-cuda-package-test-job.yml b/tools/ci_build/github/azure-pipelines/stages/jobs/py-linux-cuda-package-test-job.yml index 858de4d173484..b53aee639372d 100644 --- a/tools/ci_build/github/azure-pipelines/stages/jobs/py-linux-cuda-package-test-job.yml +++ b/tools/ci_build/github/azure-pipelines/stages/jobs/py-linux-cuda-package-test-job.yml @@ -2,9 +2,9 @@ parameters: - name: CudaVersion displayName: 'CUDA version' type: string - default: '12.2' + default: '12.8' values: - - 12.2 + - 12.8 - name: machine_pool type: string @@ -44,13 +44,13 @@ jobs: - template: ../../templates/common-variables.yml - name: docker_base_image ${{ if eq(parameters.CudaVersion, '11.8') }}: - value: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda11_x64_almalinux8_gcc11:20250724.1 - ${{ if eq(parameters.CudaVersion, '12.2') }}: - value: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda12_x64_almalinux8_gcc12:20250724.1 + value: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda11_x64_almalinux8_gcc11:20251008.2 + ${{ if eq(parameters.CudaVersion, '12.8') }}: + value: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda12_x64_almalinux8_gcc12:20251008.2 - name: linux_trt_version ${{ if eq(parameters.CudaVersion, '11.8') }}: value: ${{ variables.linux_trt_version_cuda11 }} - ${{ if eq(parameters.CudaVersion, '12.2') }}: + ${{ if eq(parameters.CudaVersion, '12.8') }}: value: ${{ variables.linux_trt_version_cuda12 }} pool: ${{ parameters.machine_pool }} steps: @@ -105,4 +105,4 @@ jobs: inputs: targetType: filePath filePath: tools/ci_build/github/linux/run_python_dockertest.sh - arguments: -d GPU -c ${{parameters.cmake_build_type}} -i onnxruntimecuda${{ replace(parameters.CudaVersion, '.', '') }}xtrt86buildx86_64 -u 12.2 + arguments: -d GPU -c ${{parameters.cmake_build_type}} -i onnxruntimecuda${{ replace(parameters.CudaVersion, '.', '') }}xtrt86buildx86_64 -u 12.8 diff --git a/tools/ci_build/github/azure-pipelines/stages/jobs/react-natvie-andriod-e2e-test-job.yml b/tools/ci_build/github/azure-pipelines/stages/jobs/react-natvie-andriod-e2e-test-job.yml index 0a11ba80fb5df..7b120fa06190b 100644 --- a/tools/ci_build/github/azure-pipelines/stages/jobs/react-natvie-andriod-e2e-test-job.yml +++ 
b/tools/ci_build/github/azure-pipelines/stages/jobs/react-natvie-andriod-e2e-test-job.yml @@ -12,9 +12,7 @@ parameters: displayName: 'NPM packages publish configuration' type: string default: 'dev' -- name: is1ES - type: boolean - default: false + jobs: - job: ReactNative_CI_Android pool: @@ -153,30 +151,16 @@ jobs: targetFolder: $(Build.ArtifactStagingDirectory) displayName: Create Artifacts onnxruntime-react-native - - ${{ if eq(parameters.is1ES, true) }}: - - task: 1ES.PublishPipelineArtifact@1 - inputs: - artifact: android_e2e_test_logs_$(Build.BuildId)_$(Build.BuildNumber)_$(System.JobAttempt) - targetPath: '$(Build.SourcesDirectory)/js/react_native/e2e/artifacts' - condition: succeededOrFailed() - displayName: Publish React Native Detox E2E test logs - - task: 1ES.PublishPipelineArtifact@1 - inputs: - artifactName: '${{parameters.PackageName}}' - targetPath: '$(Build.ArtifactStagingDirectory)' - displayName: Publish Pipeline Artifact - - - ${{ if eq(parameters.is1ES, false) }}: - - task: PublishPipelineArtifact@1 - inputs: - artifact: android_e2e_test_logs_$(Build.BuildId)_$(Build.BuildNumber)_$(System.JobAttempt) - targetPath: '$(Build.SourcesDirectory)/js/react_native/e2e/artifacts' - condition: succeededOrFailed() - displayName: Publish React Native Detox E2E test logs - - task: PublishPipelineArtifact@1 - inputs: - artifactName: '${{parameters.PackageName}}' - targetPath: '$(Build.ArtifactStagingDirectory)' - displayName: Publish Pipeline Artifact + - task: 1ES.PublishPipelineArtifact@1 + inputs: + artifact: android_e2e_test_logs_$(Build.BuildId)_$(Build.BuildNumber)_$(System.JobAttempt) + targetPath: '$(Build.SourcesDirectory)/js/react_native/e2e/artifacts' + condition: succeededOrFailed() + displayName: Publish React Native Detox E2E test logs + - task: 1ES.PublishPipelineArtifact@1 + inputs: + artifactName: '${{parameters.PackageName}}' + targetPath: '$(Build.ArtifactStagingDirectory)' + displayName: Publish Pipeline Artifact - template: ../../templates/explicitly-defined-final-tasks.yml \ No newline at end of file diff --git a/tools/ci_build/github/azure-pipelines/stages/nodejs-linux-packaging-stage.yml b/tools/ci_build/github/azure-pipelines/stages/nodejs-linux-packaging-stage.yml index bca95a4a2fd02..8cbb81ba89c12 100644 --- a/tools/ci_build/github/azure-pipelines/stages/nodejs-linux-packaging-stage.yml +++ b/tools/ci_build/github/azure-pipelines/stages/nodejs-linux-packaging-stage.yml @@ -1,7 +1,7 @@ parameters: - name: CudaVersion type: string - default: '12.2' + default: '12.8' stages: - stage: Linux_Nodejs_Packaging_x64 @@ -20,14 +20,14 @@ stages: - name: CUDA_VERSION_MAJOR ${{ if eq(parameters.CudaVersion, '11.8') }}: value: '11' - ${{ if eq(parameters.CudaVersion, '12.2') }}: + ${{ if eq(parameters.CudaVersion, '12.8') }}: value: '12' - name: CUDA_VERSION value: ${{ parameters.CudaVersion }} - name: linux_trt_version ${{ if eq(parameters.CudaVersion, '11.8') }}: value: ${{ variables.linux_trt_version_cuda11 }} - ${{ if eq(parameters.CudaVersion, '12.2') }}: + ${{ if eq(parameters.CudaVersion, '12.8') }}: value: ${{ variables.linux_trt_version_cuda12 }} steps: - checkout: self diff --git a/tools/ci_build/github/azure-pipelines/stages/nuget-linux-cuda-packaging-stage.yml b/tools/ci_build/github/azure-pipelines/stages/nuget-linux-cuda-packaging-stage.yml index 121e80fca1021..b1e5f541b90e0 100644 --- a/tools/ci_build/github/azure-pipelines/stages/nuget-linux-cuda-packaging-stage.yml +++ b/tools/ci_build/github/azure-pipelines/stages/nuget-linux-cuda-packaging-stage.yml @@ 
-1,7 +1,7 @@ parameters: - name: CudaVersion type: string - default: '12.2' + default: '12.8' - name: buildJava type: boolean - name: buildNodejs @@ -22,7 +22,7 @@ stages: - name: CUDA_VERSION_MAJOR ${{ if eq(parameters.CudaVersion, '11.8') }}: value: '11' - ${{ if eq(parameters.CudaVersion, '12.2') }}: + ${{ if eq(parameters.CudaVersion, '12.8') }}: value: '12' - name: CUDA_VERSION value: ${{ parameters.CudaVersion }} @@ -74,14 +74,14 @@ stages: - name: CUDA_VERSION_MAJOR ${{ if eq(parameters.CudaVersion, '11.8') }}: value: '11' - ${{ if eq(parameters.CudaVersion, '12.2') }}: + ${{ if eq(parameters.CudaVersion, '12.8') }}: value: '12' - name: CUDA_VERSION value: ${{ parameters.CudaVersion }} - name: linux_trt_version ${{ if eq(parameters.CudaVersion, '11.8') }}: value: ${{ variables.linux_trt_version_cuda11 }} - ${{ if eq(parameters.CudaVersion, '12.2') }}: + ${{ if eq(parameters.CudaVersion, '12.8') }}: value: ${{ variables.linux_trt_version_cuda12 }} steps: - checkout: self @@ -140,12 +140,12 @@ stages: - name: CUDA_VERSION_MAJOR ${{ if eq(parameters.CudaVersion, '11.8') }}: value: '11' - ${{ if eq(parameters.CudaVersion, '12.2') }}: + ${{ if eq(parameters.CudaVersion, '12.8') }}: value: '12' - name: linux_trt_version ${{ if eq(parameters.CudaVersion, '11.8') }}: value: ${{ variables.linux_trt_version_cuda11 }} - ${{ if eq(parameters.CudaVersion, '12.2') }}: + ${{ if eq(parameters.CudaVersion, '12.8') }}: value: ${{ variables.linux_trt_version_cuda12 }} steps: - checkout: self # due to checkout multiple repos, the root directory is $(Build.SourcesDirectory)/onnxruntime diff --git a/tools/ci_build/github/azure-pipelines/stages/nuget-win-cuda-packaging-stage.yml b/tools/ci_build/github/azure-pipelines/stages/nuget-win-cuda-packaging-stage.yml index 61afeba2d302b..e7e541205ba0a 100644 --- a/tools/ci_build/github/azure-pipelines/stages/nuget-win-cuda-packaging-stage.yml +++ b/tools/ci_build/github/azure-pipelines/stages/nuget-win-cuda-packaging-stage.yml @@ -60,7 +60,7 @@ stages: msbuildPlatform: x64 packageName: x64-cuda CudaVersion: ${{ parameters.CudaVersion }} - buildparameter: --use_cuda --cuda_home=${{ parameters.win_cuda_home }} --enable_onnx_tests --enable_wcos --cmake_extra_defines "CMAKE_CUDA_ARCHITECTURES=75-real;86-real;89-real;90a-virtual" + buildparameter: --use_cuda --cuda_home=${{ parameters.win_cuda_home }} --enable_onnx_tests --enable_wcos --cmake_extra_defines "CMAKE_CUDA_ARCHITECTURES=75-real;86-real;89-real;90-virtual" runTests: ${{ parameters.RunOnnxRuntimeTests }} buildJava: ${{ parameters.buildJava }} java_artifact_id: onnxruntime_gpu @@ -80,7 +80,7 @@ stages: msbuildPlatform: x64 CudaVersion: ${{ parameters.CudaVersion }} packageName: x64-tensorrt - buildparameter: --use_tensorrt --tensorrt_home=${{ parameters.win_trt_home }} --cuda_home=${{ parameters.win_cuda_home }} --enable_onnx_tests --enable_wcos --cmake_extra_defines "CMAKE_CUDA_ARCHITECTURES=75-real;86-real;89-real;90a-virtual" + buildparameter: --use_tensorrt --tensorrt_home=${{ parameters.win_trt_home }} --cuda_home=${{ parameters.win_cuda_home }} --enable_onnx_tests --enable_wcos --cmake_extra_defines "CMAKE_CUDA_ARCHITECTURES=75-real;86-real;89-real;90-virtual" runTests: ${{ parameters.RunOnnxRuntimeTests }} buildJava: ${{ parameters.buildJava }} java_artifact_id: onnxruntime_gpu diff --git a/tools/ci_build/github/azure-pipelines/stages/py-gpu-packaging-stage.yml b/tools/ci_build/github/azure-pipelines/stages/py-gpu-packaging-stage.yml index d8bb51b5ef79d..3c5cf591039e0 100644 --- 
a/tools/ci_build/github/azure-pipelines/stages/py-gpu-packaging-stage.yml +++ b/tools/ci_build/github/azure-pipelines/stages/py-gpu-packaging-stage.yml @@ -19,9 +19,9 @@ parameters: - name: cuda_version type: string displayName: 'CUDA version. Windows Only.' - default: '12.2' + default: '12.8' values: - - 12.2 + - 12.8 - name: PythonVersions type: object @@ -38,7 +38,7 @@ stages: PYTHON_VERSION: ${{ python_version }} EP_NAME: gpu CudaVersion: ${{ parameters.cuda_version }} - EP_BUILD_FLAGS: --enable_lto --use_cuda --cuda_home=$(Agent.TempDirectory)\v${{ parameters.cuda_version }} --cmake_extra_defines "CMAKE_CUDA_ARCHITECTURES=52-real;61-real;75-real;86-real;89-real;90a-virtual" + EP_BUILD_FLAGS: --enable_lto --use_cuda --cuda_home=$(Agent.TempDirectory)\v${{ parameters.cuda_version }} --cmake_extra_defines "CMAKE_CUDA_ARCHITECTURES=52-real;61-real;75-real;86-real;89-real;90-virtual" use_tensorrt: True - template: py-linux-gpu-stage.yml @@ -48,4 +48,4 @@ stages: extra_build_arg: ${{ parameters.build_py_parameters }} cmake_build_type: ${{ parameters.cmake_build_type }} cuda_version: ${{ parameters.cuda_version }} - docker_base_image: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda12_x64_almalinux8_gcc12:20250724.1 + docker_base_image: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda12_x64_almalinux8_gcc12:20251008.2 diff --git a/tools/ci_build/github/azure-pipelines/stages/py-linux-gpu-stage.yml b/tools/ci_build/github/azure-pipelines/stages/py-linux-gpu-stage.yml index 715470eb9f012..ab1fb919af413 100644 --- a/tools/ci_build/github/azure-pipelines/stages/py-linux-gpu-stage.yml +++ b/tools/ci_build/github/azure-pipelines/stages/py-linux-gpu-stage.yml @@ -22,9 +22,9 @@ parameters: - name: cuda_version type: string - default: '12.2' + default: '12.8' values: - - 12.2 + - 12.8 stages: - stage: Linux_py_GPU_Wheels_${{ parameters.arch }} @@ -55,7 +55,7 @@ stages: - name: trt_version ${{ if eq(parameters.cuda_version, '11.8') }}: value: ${{ variables.linux_trt_version_cuda11 }} - ${{ if eq(parameters.cuda_version, '12.2') }}: + ${{ if eq(parameters.cuda_version, '12.8') }}: value: ${{ variables.linux_trt_version_cuda12 }} steps: - checkout: self diff --git a/tools/ci_build/github/azure-pipelines/stages/py-win-gpu-stage.yml b/tools/ci_build/github/azure-pipelines/stages/py-win-gpu-stage.yml index e2683c04f21f2..c3957fc8341de 100644 --- a/tools/ci_build/github/azure-pipelines/stages/py-win-gpu-stage.yml +++ b/tools/ci_build/github/azure-pipelines/stages/py-win-gpu-stage.yml @@ -20,9 +20,9 @@ parameters: default: '' - name: CudaVersion type: string - default: '12.2' + default: '12.8' values: - - 12.2 + - 12.8 - name: cmake_build_type type: string @@ -47,7 +47,7 @@ stages: workspace: clean: all pool: - name: onnxruntime-Win-CPU-2022 + name: onnxruntime-Win-CPU-VS2022-Latest os: windows templateContext: sdl: @@ -76,7 +76,7 @@ stages: - name: win_trt_folder ${{ if eq(parameters.CudaVersion, '11.8') }}: value: ${{ variables.win_trt_folder_cuda11 }} - ${{ if eq(parameters.CudaVersion, '12.2') }}: + ${{ if eq(parameters.CudaVersion, '12.8') }}: value: ${{ variables.win_trt_folder_cuda12 }} - name: trt_build_flag ${{ if eq(parameters.use_tensorrt, true) }}: @@ -119,7 +119,7 @@ stages: --cmake_generator "$(VSGenerator)" --enable_pybind --enable_onnx_tests - --parallel 8 --use_vcpkg --use_vcpkg_ms_internal_asset_cache --use_binskim_compliant_compile_flags --update --build --msvc_toolset 14.40 + --parallel 8 --use_vcpkg --use_vcpkg_ms_internal_asset_cache 
--use_binskim_compliant_compile_flags --update --build $(TelemetryOption) ${{ parameters.BUILD_PY_PARAMETERS }} ${{ parameters.EP_BUILD_FLAGS }} ${{ variables.trt_build_flag }} workingDirectory: '$(Build.BinariesDirectory)' diff --git a/tools/ci_build/github/azure-pipelines/templates/c-api-cpu.yml b/tools/ci_build/github/azure-pipelines/templates/c-api-cpu.yml index 2a6f8461d773c..338789a8da9e3 100644 --- a/tools/ci_build/github/azure-pipelines/templates/c-api-cpu.yml +++ b/tools/ci_build/github/azure-pipelines/templates/c-api-cpu.yml @@ -107,20 +107,15 @@ stages: workspace: clean: all pool: - name: 'Azure Pipelines' - image: 'macOS-15' - os: 'macOS' + name: AcesShared + os: macOS + demands: + - ImageOverride -equals ACES_VM_SharedPool_Sequoia timeoutInMinutes: 300 steps: - template: set-version-number-variables-step.yml - - task: JavaToolInstaller@0 - inputs: - versionSpec: "17" - jdkArchitectureOption: "x64" - jdkSourceOption: 'PreInstalled' - - template: use-xcode-version.yml parameters: xcodeVersion: 16.4 @@ -143,7 +138,6 @@ stages: - script: | python3 tools/ci_build/github/apple/test_apple_packages.py \ - --fail_if_cocoapods_missing \ --framework_info_file "$(Build.BinariesDirectory)/ios_framework/xcframework_info.json" \ --c_framework_dir "$(Build.BinariesDirectory)/ios_framework/framework_out" \ --skip_macos_test \ @@ -209,13 +203,6 @@ stages: - input: pipelineArtifact artifactName: drop-onnxruntime-java-linux-aarch64 targetPath: '$(Build.BinariesDirectory)\java-artifact\onnxruntime-java-linux-aarch64' - - - input: pipelineArtifact - artifactName: drop-onnxruntime-java-osx-x86_64 - targetPath: '$(Build.BinariesDirectory)\java-artifact\onnxruntime-java-osx-x86_64' - - input: pipelineArtifact - artifactName: drop-onnxruntime-java-osx-arm64 - targetPath: '$(Build.BinariesDirectory)\java-artifact\onnxruntime-java-osx-arm64' outputs: - output: pipelineArtifact targetPath: $(Build.BinariesDirectory)\java-artifact\onnxruntime-java-win-x64 @@ -448,7 +435,6 @@ stages: # - Windows arm64 (CPU, DML, WebGPU) # - Linux x64 (CPU, CUDA, TensorRT, WebGPU) # - Linux arm64 (CPU only) - # - macOS x64 (CPU, CoreML, WebGPU) # - macOS arm64 (CPU, CoreML, WebGPU) # # File manifest: @@ -485,12 +471,6 @@ stages: # - onnxruntime_binding.node # - libonnxruntime.so.1 # - # - macOS x64 (CPU, CoreML, WebGPU): - # dependency: MacOS_C_API_Packaging_CPU_x86_64 (drop-onnxruntime-nodejs-osx-x86_64) - # files: - # - onnxruntime_binding.node - # - libonnxruntime.{version}.dylib - # # - macOS arm64 (CPU, CoreML, WebGPU): # dependency: MacOS_C_API_Packaging_CPU_arm64 (drop-onnxruntime-nodejs-osx-arm64) # files: @@ -518,12 +498,6 @@ stages: artifactName: 'drop-onnxruntime-nodejs-win-arm64' targetPath: '$(Build.BinariesDirectory)/nodejs-artifacts/win32/arm64/' - - task: DownloadPipelineArtifact@0 - displayName: 'Download Pipeline Artifact - Nodejs (macOS x86_64)' - inputs: - artifactName: 'drop-onnxruntime-nodejs-osx-x86_64' - targetPath: '$(Build.BinariesDirectory)/nodejs-artifacts/darwin/x64/' - - task: DownloadPipelineArtifact@0 displayName: 'Download Pipeline Artifact - Nodejs (macOS arm64)' inputs: @@ -600,16 +574,6 @@ stages: *.node TargetFolder: '$(Build.SourcesDirectory)\js\node\bin\napi-v6\linux\arm64' - # Node.js binding darwin/x64 - - task: CopyFiles@2 - displayName: 'Copy nodejs binaries to: $(Build.SourcesDirectory)\js\node\bin\napi-v6\darwin\x64\' - inputs: - SourceFolder: '$(Build.BinariesDirectory)\nodejs-artifacts\darwin\x64' - Contents: | - libonnxruntime.*.dylib - *.node - TargetFolder: 
'$(Build.SourcesDirectory)\js\node\bin\napi-v6\darwin\x64' - # Node.js binding darwin/arm64 - task: CopyFiles@2 displayName: 'Copy nodejs binaries to: $(Build.SourcesDirectory)\js\node\bin\napi-v6\darwin\arm64\' diff --git a/tools/ci_build/github/azure-pipelines/templates/jobs/download_win_gpu_library.yml b/tools/ci_build/github/azure-pipelines/templates/jobs/download_win_gpu_library.yml index 681138a5ab3d1..be213337091e8 100644 --- a/tools/ci_build/github/azure-pipelines/templates/jobs/download_win_gpu_library.yml +++ b/tools/ci_build/github/azure-pipelines/templates/jobs/download_win_gpu_library.yml @@ -7,10 +7,10 @@ parameters: default: false - name: CudaVersion type: string - default: '12.2' + default: '12.8' values: - 11.8 - - 12.2 + - 12.8 - name: TrtVersion type: string default: '10.9.0.34' @@ -46,11 +46,11 @@ steps: - powershell: | Write-Host "##vso[task.setvariable variable=trtCudaVersion;]11.8" displayName: Set trtCudaVersion - - ${{ if and(eq(parameters.CudaVersion, '12.2'), eq(parameters.TrtVersion, '8.6.1.6')) }}: + - ${{ if and(eq(parameters.CudaVersion, '12.8'), eq(parameters.TrtVersion, '8.6.1.6')) }}: - powershell: | Write-Host "##vso[task.setvariable variable=trtCudaVersion;]12.0" displayName: Set trtCudaVersion - - ${{ if and(eq(parameters.CudaVersion, '12.2'), eq(parameters.TrtVersion, '10.9.0.34')) }}: + - ${{ if and(eq(parameters.CudaVersion, '12.8'), eq(parameters.TrtVersion, '10.9.0.34')) }}: - powershell: | Write-Host "##vso[task.setvariable variable=trtCudaVersion;]12.8" displayName: Set trtCudaVersion diff --git a/tools/ci_build/github/azure-pipelines/templates/jobs/set-winenv.yml b/tools/ci_build/github/azure-pipelines/templates/jobs/set-winenv.yml index 96436883fb8b8..d7c940cda30f4 100644 --- a/tools/ci_build/github/azure-pipelines/templates/jobs/set-winenv.yml +++ b/tools/ci_build/github/azure-pipelines/templates/jobs/set-winenv.yml @@ -9,7 +9,7 @@ parameters: default: false - name: PrimaryCUDAVersion type: string - default: '12.2' + default: '12.8' # - name: SecondaryCUDAVersion # type: string # default: '11.8' diff --git a/tools/ci_build/github/azure-pipelines/templates/mac-cpu-packaging-pipeline.yml b/tools/ci_build/github/azure-pipelines/templates/mac-cpu-packaging-pipeline.yml index 56cc84a90dc68..907563cb77242 100644 --- a/tools/ci_build/github/azure-pipelines/templates/mac-cpu-packaging-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/templates/mac-cpu-packaging-pipeline.yml @@ -26,13 +26,6 @@ stages: jobs: - template: mac-cpu-packing-jobs.yml parameters: - MacosArch: 'x86_64' - AllowReleasedOpsetOnly: ${{ parameters.AllowReleasedOpsetOnly }} - AdditionalBuildFlags: ${{ parameters.AdditionalBuildFlags }} - - - template: mac-cpu-packing-jobs.yml - parameters: - MacosArch: 'arm64' AllowReleasedOpsetOnly: ${{ parameters.AllowReleasedOpsetOnly }} AdditionalBuildFlags: ${{ parameters.AdditionalBuildFlags }} @@ -41,14 +34,12 @@ stages: jobs: - job: MacOS_C_API_Package_Publish pool: - name: 'Azure Pipelines' - image: 'macOS-14' - os: 'macOS' + name: AcesShared + os: macOS + demands: + - ImageOverride -equals ACES_VM_SharedPool_Sequoia templateContext: inputs: - - input: pipelineArtifact - artifactName: onnxruntime-osx-x86_64 # The files in this artifact are not signed - targetPath: $(Build.ArtifactStagingDirectory) - input: pipelineArtifact artifactName: onnxruntime-osx-arm64 # The files in this artifact are not signed targetPath: $(Build.ArtifactStagingDirectory) @@ -64,12 +55,16 @@ stages: versionSpec: '3.13' addToPath: true - - task: PythonScript@0 
- displayName: 'Prepare, Create Universal Binary, and Zip with Python' - inputs: - scriptSource: 'filePath' - scriptPath: 'tools/ci_build/prepare_macos_package.py' - arguments: '--staging_dir $(Build.ArtifactStagingDirectory)' + - script: | + set -ex + cd $(Build.ArtifactStagingDirectory) + # Find and extract the arm64 tarball + find . -name 'onnxruntime-osx-arm64*.tgz' -exec tar -xzf {} \; + # Remove _manifest directories if they exist + find . -type d -name '_manifest' -exec rm -rf {} + || true + # Find the extracted directory and zip it + find . -maxdepth 1 -type d -name 'onnxruntime-osx-arm64*' -exec zip -FSr --symlinks {}.zip {} \; + displayName: 'Prepare ARM64 Package for Signing' - template: mac-esrp-dylib.yml parameters: diff --git a/tools/ci_build/github/azure-pipelines/templates/mac-cpu-packaging-steps.yml b/tools/ci_build/github/azure-pipelines/templates/mac-cpu-packaging-steps.yml index c43bfe2886f22..8e454f2137ce8 100644 --- a/tools/ci_build/github/azure-pipelines/templates/mac-cpu-packaging-steps.yml +++ b/tools/ci_build/github/azure-pipelines/templates/mac-cpu-packaging-steps.yml @@ -1,9 +1,7 @@ parameters: - name: MacosArch type: string - values: - - 'x86_64' - - 'arm64' + default: 'arm64' - name: AdditionalBuildFlags displayName: Additional build flags for build.py @@ -21,11 +19,6 @@ steps: make install DESTDIR=$(Build.BinariesDirectory)/installed displayName: 'Build ${{ parameters.MacosArch }}' -- ${{ if eq(parameters.MacosArch, 'x86_64') }}: - - script: | - python3 $(Build.SourcesDirectory)/tools/ci_build/build.py --test ${{ parameters.AdditionalBuildFlags }} --build_dir $(Build.BinariesDirectory) --skip_submodule_sync --parallel --use_binskim_compliant_compile_flags --build_shared_lib --config Release --use_vcpkg --use_vcpkg_ms_internal_asset_cache - displayName: 'Running Tests' - - task: ShellScript@2 displayName: 'Copy build artifacts for zipping' inputs: @@ -42,31 +35,13 @@ steps: archiveFile: '$(Build.ArtifactStagingDirectory)/onnxruntime-osx-${{ parameters.MacosArch }}-$(OnnxRuntimeVersion).tgz' replaceExistingArchive: true -- script: | - set -e -x - mkdir -p $(Build.ArtifactStagingDirectory)/testdata - cp $(Build.BinariesDirectory)/Release/libcustom_op_library.dylib $(Build.ArtifactStagingDirectory)/testdata - displayName: 'Copy libcustom_op_library.dylib to ArtifactStagingDirectory' - condition: and(succeeded(), eq('${{ parameters.MacosArch }}', 'x86_64')) - - task: 1ES.PublishPipelineArtifact@1 inputs: targetPath: '$(Build.ArtifactStagingDirectory)' artifactName: 'onnxruntime-osx-${{ parameters.MacosArch }}' -- template: java-api-artifacts-package-and-publish-steps-posix.yml - parameters: - arch: 'osx-${{ parameters.MacosArch }}' - buildConfig: 'Release' - artifactName: 'onnxruntime-java-osx-${{ parameters.MacosArch }}' - libraryName: 'libonnxruntime.dylib' - nativeLibraryName: 'libonnxruntime4j_jni.dylib' - - template: nodejs-artifacts-package-and-publish-steps-posix.yml parameters: - ${{ if eq(parameters.MacosArch, 'x86_64') }}: - arch: x64 - ${{ if eq(parameters.MacosArch, 'arm64') }}: - arch: arm64 + arch: arm64 os: 'darwin' artifactName: 'drop-onnxruntime-nodejs-osx-${{ parameters.MacosArch }}' diff --git a/tools/ci_build/github/azure-pipelines/templates/mac-cpu-packing-jobs.yml b/tools/ci_build/github/azure-pipelines/templates/mac-cpu-packing-jobs.yml index 3ae07ebffdb8c..bfccaef1c9852 100644 --- a/tools/ci_build/github/azure-pipelines/templates/mac-cpu-packing-jobs.yml +++ b/tools/ci_build/github/azure-pipelines/templates/mac-cpu-packing-jobs.yml @@ -1,10 
+1,4 @@ parameters: -- name: MacosArch - type: string - values: - - 'x86_64' - - 'arm64' - - name: AdditionalBuildFlags displayName: Additional build flags for build.py type: string @@ -20,35 +14,30 @@ parameters: - 0 jobs: -- job: MacOS_C_API_Packaging_CPU_${{ parameters.MacosArch }} +- job: MacOS_C_API_Packaging_CPU_arm64 workspace: clean: all variables: MACOSX_DEPLOYMENT_TARGET: '14.0' ALLOW_RELEASED_ONNX_OPSET_ONLY: ${{ parameters.AllowReleasedOpsetOnly }} pool: - name: "Azure Pipelines" - image: 'macOS-15' - os: macOS + name: AcesShared + os: macOS + demands: + - ImageOverride -equals ACES_VM_SharedPool_Sequoia timeoutInMinutes: 300 steps: - checkout: self clean: true submodules: none - - task: JavaToolInstaller@0 - inputs: - versionSpec: "17" - jdkArchitectureOption: "x64" - jdkSourceOption: 'PreInstalled' - - template: use-xcode-version.yml parameters: xcodeVersion: 16.4 - template: setup-build-tools.yml parameters: - host_cpu_arch: ${{ parameters.MacosArch }} + host_cpu_arch: arm64 - template: set-version-number-variables-step.yml @@ -58,14 +47,7 @@ jobs: export CMAKE_ARGS="-DONNX_GEN_PB_TYPE_STUBS=ON -DONNX_WERROR=OFF" python3 -m pip install -r '$(Build.SourcesDirectory)/tools/ci_build/github/linux/docker/scripts/requirements.txt' - - ${{ if eq(parameters.MacosArch, 'arm64') }}: - - template: mac-cpu-packaging-steps.yml - parameters: - MacosArch: ${{ parameters.MacosArch }} - AdditionalBuildFlags: ${{ parameters.AdditionalBuildFlags }} --build_nodejs --build_java --use_coreml --use_webgpu --cmake_extra_defines CMAKE_OSX_ARCHITECTURES=arm64 - - - ${{ if eq(parameters.MacosArch, 'x86_64') }}: - - template: mac-cpu-packaging-steps.yml - parameters: - MacosArch: ${{ parameters.MacosArch }} - AdditionalBuildFlags: ${{ parameters.AdditionalBuildFlags }} --build_nodejs --build_java --use_coreml --use_webgpu --cmake_extra_defines CMAKE_OSX_ARCHITECTURES=x86_64 \ No newline at end of file + - template: mac-cpu-packaging-steps.yml + parameters: + MacosArch: arm64 + AdditionalBuildFlags: ${{ parameters.AdditionalBuildFlags }} --build_nodejs --use_coreml --use_webgpu --cmake_extra_defines CMAKE_OSX_ARCHITECTURES=arm64 diff --git a/tools/ci_build/github/azure-pipelines/templates/py-macos.yml b/tools/ci_build/github/azure-pipelines/templates/py-macos.yml index 6fb560d7fec7e..b59de879e2984 100644 --- a/tools/ci_build/github/azure-pipelines/templates/py-macos.yml +++ b/tools/ci_build/github/azure-pipelines/templates/py-macos.yml @@ -24,9 +24,10 @@ jobs: workspace: clean: all pool: - name: "Azure Pipelines" - image: "macOS-15" + name: AcesShared os: macOS + demands: + - ImageOverride -equals ACES_VM_SharedPool_Sequoia templateContext: outputs: - output: pipelineArtifact @@ -44,7 +45,7 @@ jobs: - template: use-xcode-version.yml parameters: - xcodeVersion: '16.4.0' + xcodeVersion: '16.4' - template: setup-build-tools.yml diff --git a/tools/ci_build/github/azure-pipelines/templates/py-packaging-linux-test-cuda.yml b/tools/ci_build/github/azure-pipelines/templates/py-packaging-linux-test-cuda.yml index 1415586521f30..263f73a9e29b0 100644 --- a/tools/ci_build/github/azure-pipelines/templates/py-packaging-linux-test-cuda.yml +++ b/tools/ci_build/github/azure-pipelines/templates/py-packaging-linux-test-cuda.yml @@ -18,9 +18,9 @@ parameters: - name: cuda_version type: string - default: '12.2' + default: '12.8' values: - - 12.2 + - 12.8 # TODO: Ideally it should fetch information from the build that triggers it - name: cmake_build_type @@ -46,7 +46,7 @@ jobs: - name: trt_version ${{ if 
eq(parameters.cuda_version, '11.8') }}: value: ${{ variables.linux_trt_version_cuda11 }} - ${{ if eq(parameters.cuda_version, '12.2') }}: + ${{ if eq(parameters.cuda_version, '12.8') }}: value: ${{ variables.linux_trt_version_cuda12 }} workspace: clean: all diff --git a/tools/ci_build/github/azure-pipelines/templates/react-native-ci.yml b/tools/ci_build/github/azure-pipelines/templates/react-native-ci.yml index f20172e1c70a6..8018da41fbc2d 100644 --- a/tools/ci_build/github/azure-pipelines/templates/react-native-ci.yml +++ b/tools/ci_build/github/azure-pipelines/templates/react-native-ci.yml @@ -26,9 +26,6 @@ parameters: - name: enable_code_sign displayName: Use GPG to sign the jars type: boolean -- name: is1ES - type: boolean - default: false stages: - stage: Build_Android_Packages @@ -44,7 +41,7 @@ stages: enable_code_sign: '${{parameters.enable_code_sign}}' pool_name: '${{parameters.PoolName}}' packageName: 'onnxruntime-android' - is1ES: '${{parameters.is1ES}}' + is1ES: true - stage: ReactNative_CI_Android displayName: ReactNative_CI_Android @@ -55,34 +52,21 @@ stages: PackageName: '${{parameters.PackageName}}' ArtifactName: 'onnxruntime-android-full-aar' NpmPackagingMode: '${{parameters.NpmPackagingMode}}' - is1ES: '${{parameters.is1ES}}' - stage: ReactNative_CI_iOS displayName: ReactNative_CI_iOS dependsOn: '${{parameters.InitialStageDependsOn}}' variables: - - name: publishPipelineArtifactTask - ${{ if eq(parameters.is1ES, true) }}: - value: 1ES.PublishPipelineArtifact@1 - ${{ else }}: - value: PublishPipelineArtifact@1 jobs: - job: ReactNative_CI_iOS_build - - ${{ if eq(parameters.is1ES, false) }}: - pool: - vmImage: 'macOS-14' - ${{ if eq(parameters.is1ES, true) }}: - pool: - name: 'Azure Pipelines' - image: 'macOS-14' - os: 'macOS' - + pool: + name: AcesShared + os: macOS + demands: + - ImageOverride -equals ACES_VM_SharedPool_Sequoia timeoutInMinutes: 120 - variables: runCodesignValidationInjection: false - steps: - template: use-xcode-version.yml @@ -106,7 +90,7 @@ stages: --build-settings-file $(Build.SourcesDirectory)/tools/ci_build/github/js/react_native_e2e_full_ios_framework_build_settings.json displayName: Build iOS package and assemble pods - - task: ${{ variables.publishPipelineArtifactTask }} + - task: 1ES.PublishPipelineArtifact@1 inputs: artifactName: 'ios_pod' targetPath: '$(Build.BinariesDirectory)/ios_pod' @@ -114,16 +98,11 @@ stages: - job: ReactNative_CI_iOS_unit_tests dependsOn: 'ReactNative_CI_iOS_build' - - ${{ if eq(parameters.is1ES, false) }}: - pool: - vmImage: 'macOS-14' - ${{ if eq(parameters.is1ES, true) }}: - pool: - name: 'Azure Pipelines' - image: 'macOS-14' - os: 'macOS' - + pool: + name: AcesShared + os: macOS + demands: + - ImageOverride -equals ACES_VM_SharedPool_Sequoia timeoutInMinutes: 90 steps: diff --git a/tools/ci_build/github/azure-pipelines/templates/setup-build-tools.yml b/tools/ci_build/github/azure-pipelines/templates/setup-build-tools.yml index df7fea537ce6f..548ff8a54a854 100644 --- a/tools/ci_build/github/azure-pipelines/templates/setup-build-tools.yml +++ b/tools/ci_build/github/azure-pipelines/templates/setup-build-tools.yml @@ -16,13 +16,12 @@ parameters: steps: - template: telemetry-steps.yml -# Currently all ADO macOS machines are x64 machines - task: UsePythonVersion@0 displayName: 'Use Python ${{ parameters.host_cpu_arch }} (macOS)' condition: and(succeeded(), eq(variables['Agent.OS'], 'Darwin')) inputs: versionSpec: ${{ parameters.python_version }} - architecture: 'x64' + architecture: ${{ parameters.host_cpu_arch }} - task: 
UsePythonVersion@0 displayName: 'Use Python ${{ parameters.host_cpu_arch }} (non-macOS)' diff --git a/tools/ci_build/github/azure-pipelines/templates/stages/mac-ios-packaging-build-stage.yml b/tools/ci_build/github/azure-pipelines/templates/stages/mac-ios-packaging-build-stage.yml index e0b58e68e24cb..e9f170ff60301 100644 --- a/tools/ci_build/github/azure-pipelines/templates/stages/mac-ios-packaging-build-stage.yml +++ b/tools/ci_build/github/azure-pipelines/templates/stages/mac-ios-packaging-build-stage.yml @@ -17,8 +17,8 @@ stages: # Note: Keep the Xcode version and iOS simulator version compatible. # Check the table here to see what iOS simulator versions are supported by a particular Xcode version: # https://developer.apple.com/support/xcode/ - xcodeVersion: "15.3.0" - iosSimulatorRuntimeVersion: "17.4" + xcodeVersion: "16.4" + iosSimulatorRuntimeVersion: "18.5" buildSettingsFile: "tools/ci_build/github/apple/default_full_apple_framework_build_settings.json" cPodName: onnxruntime-c objcPodName: onnxruntime-objc diff --git a/tools/ci_build/github/azure-pipelines/templates/win-ci.yml b/tools/ci_build/github/azure-pipelines/templates/win-ci.yml index 0310735d94b2e..ca698123a04e7 100644 --- a/tools/ci_build/github/azure-pipelines/templates/win-ci.yml +++ b/tools/ci_build/github/azure-pipelines/templates/win-ci.yml @@ -78,7 +78,7 @@ parameters: default: '11.8' values: - 11.8 - - 12.2 + - 12.8 - name: SpecificArtifact displayName: Use Specific Artifact @@ -136,7 +136,7 @@ stages: ${{ if contains(parameters.ort_build_pool_name, 'GPU') }}: pool: - name: onnxruntime-Win-CPU-2022 + name: onnxruntime-Win-CPU-VS2022-Latest os: windows ${{ else }}: pool: diff --git a/tools/ci_build/github/azure-pipelines/win-gpu-doc-gen-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/win-gpu-doc-gen-ci-pipeline.yml index c20f4a2c1bd19..8b320b0ceb4ac 100644 --- a/tools/ci_build/github/azure-pipelines/win-gpu-doc-gen-ci-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/win-gpu-doc-gen-ci-pipeline.yml @@ -32,10 +32,10 @@ parameters: - name: CudaVersion displayName: CUDA version type: string - default: '12.2' + default: '12.8' values: - 11.8 - - 12.2 + - 12.8 stages: - stage: kernelDocumentation diff --git a/tools/ci_build/github/azure-pipelines/win-gpu-tensorrt-cuda-minimal-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/win-gpu-tensorrt-cuda-minimal-ci-pipeline.yml index c12bb3552920c..08953749f6527 100644 --- a/tools/ci_build/github/azure-pipelines/win-gpu-tensorrt-cuda-minimal-ci-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/win-gpu-tensorrt-cuda-minimal-ci-pipeline.yml @@ -31,16 +31,16 @@ parameters: - name: CudaVersion displayName: CUDA version type: string - default: '12.2' + default: '12.8' values: - - 12.2 + - 12.8 variables: - template: templates/common-variables.yml - name: win_trt_folder ${{ if eq(parameters.CudaVersion, '11.8') }}: value: ${{ variables.win_trt_folder_cuda11 }} - ${{ if eq(parameters.CudaVersion, '12.2') }}: + ${{ if eq(parameters.CudaVersion, '12.8') }}: value: ${{ variables.win_trt_folder_cuda12 }} jobs: diff --git a/tools/ci_build/github/linux/build_cuda_c_api_package.sh b/tools/ci_build/github/linux/build_cuda_c_api_package.sh index 9cc140f41cf91..2f3ac991aee9c 100755 --- a/tools/ci_build/github/linux/build_cuda_c_api_package.sh +++ b/tools/ci_build/github/linux/build_cuda_c_api_package.sh @@ -2,4 +2,4 @@ set -e -x docker run -e SYSTEM_COLLECTIONURI --rm --volume \ $BUILD_SOURCESDIRECTORY:/onnxruntime_src --volume $BUILD_BINARIESDIRECTORY:/build -e 
NIGHTLY_BUILD onnxruntimecuda${CUDA_VERSION_MAJOR}build \ -/bin/bash -c "/usr/bin/python3 /onnxruntime_src/tools/ci_build/build.py --enable_lto --build_java --build_nodejs --build_dir /build --config Release --skip_submodule_sync --parallel --use_binskim_compliant_compile_flags --build_shared_lib --use_cuda --cuda_version=$CUDA_VERSION --cuda_home=/usr/local/cuda-$CUDA_VERSION --cudnn_home=/usr/local/cuda-$CUDA_VERSION --skip_tests --use_vcpkg --use_vcpkg_ms_internal_asset_cache --cmake_extra_defines 'CMAKE_CUDA_ARCHITECTURES=60-real;70-real;75-real;80-real;90a-real;90a-virtual' 'onnxruntime_USE_FPA_INTB_GEMM=OFF' && cd /build/Release && make install DESTDIR=/build/installed" +/bin/bash -c "/usr/bin/python3 /onnxruntime_src/tools/ci_build/build.py --enable_lto --build_java --build_nodejs --build_dir /build --config Release --skip_submodule_sync --parallel --use_binskim_compliant_compile_flags --build_shared_lib --use_cuda --cuda_version=$CUDA_VERSION --cuda_home=/usr/local/cuda-$CUDA_VERSION --cudnn_home=/usr/local/cuda-$CUDA_VERSION --skip_tests --use_vcpkg --use_vcpkg_ms_internal_asset_cache --cmake_extra_defines 'CMAKE_CUDA_ARCHITECTURES=60-real;70-real;75-real;80-real;90a-real;90-virtual' 'onnxruntime_USE_FPA_INTB_GEMM=OFF' && cd /build/Release && make install DESTDIR=/build/installed" diff --git a/tools/ci_build/github/linux/build_linux_python_package.sh b/tools/ci_build/github/linux/build_linux_python_package.sh index 65be0c7b60ead..62bf8b4a245bb 100755 --- a/tools/ci_build/github/linux/build_linux_python_package.sh +++ b/tools/ci_build/github/linux/build_linux_python_package.sh @@ -69,7 +69,7 @@ fi if [ "$BUILD_DEVICE" == "GPU" ]; then SHORT_CUDA_VERSION=$(echo $CUDA_VERSION | sed 's/\([[:digit:]]\+\.[[:digit:]]\+\)\.[[:digit:]]\+/\1/') #Enable CUDA and TRT EPs. 
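The sed expression above trims a full CUDA version string such as 12.8.1 down to the major.minor form (12.8) that the cuda_home and cudnn_home paths expect, while a value that is already major.minor passes through unchanged. A rough, illustrative Python equivalent of that pattern (not part of the patch):

import re

def short_cuda_version(cuda_version: str) -> str:
    # Keep the leading "major.minor" and drop a trailing ".patch" component if present.
    return re.sub(r"^(\d+\.\d+)\.\d+$", r"\1", cuda_version)

assert short_cuda_version("12.8.1") == "12.8"   # full version is shortened
assert short_cuda_version("11.8") == "11.8"     # already short, left as-is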
- BUILD_ARGS+=("--use_cuda" "--use_tensorrt" "--cuda_version=$SHORT_CUDA_VERSION" "--tensorrt_home=/usr" "--cuda_home=/usr/local/cuda-$SHORT_CUDA_VERSION" "--cudnn_home=/usr/local/cuda-$SHORT_CUDA_VERSION" "--nvcc_threads=1" "--cmake_extra_defines" "CMAKE_CUDA_ARCHITECTURES=60-real;70-real;75-real;80-real;86-real;90a-real;90a-virtual" "onnxruntime_USE_FPA_INTB_GEMM=OFF") + BUILD_ARGS+=("--use_cuda" "--use_tensorrt" "--cuda_version=$SHORT_CUDA_VERSION" "--tensorrt_home=/usr" "--cuda_home=/usr/local/cuda-$SHORT_CUDA_VERSION" "--cudnn_home=/usr/local/cuda-$SHORT_CUDA_VERSION" "--nvcc_threads=1" "--cmake_extra_defines" "CMAKE_CUDA_ARCHITECTURES=60-real;70-real;75-real;80-real;86-real;90a-real;90-virtual" "onnxruntime_USE_FPA_INTB_GEMM=OFF") fi if [ "$BUILD_DEVICE" == "NPU" ]; then diff --git a/tools/ci_build/github/linux/build_nodejs_package.sh b/tools/ci_build/github/linux/build_nodejs_package.sh index cc6443cc7fab6..ff5c504376d1d 100755 --- a/tools/ci_build/github/linux/build_nodejs_package.sh +++ b/tools/ci_build/github/linux/build_nodejs_package.sh @@ -3,4 +3,4 @@ set -e -x mkdir -p $HOME/.onnx docker run -e SYSTEM_COLLECTIONURI --rm --volume /data/onnx:/data/onnx:ro --volume $BUILD_SOURCESDIRECTORY:/onnxruntime_src --volume $BUILD_BINARIESDIRECTORY:/build \ --volume /data/models:/build/models:ro --volume $HOME/.onnx:/home/onnxruntimedev/.onnx -e NIGHTLY_BUILD onnxruntimecuda${CUDA_VERSION_MAJOR}xtrt86build \ -/bin/bash -c "/usr/bin/python3 /onnxruntime_src/tools/ci_build/build.py --build_dir /build --config Release --skip_tests --skip_submodule_sync --parallel --use_binskim_compliant_compile_flags --build_shared_lib --build_nodejs --use_webgpu --use_tensorrt --cuda_version=$CUDA_VERSION --cuda_home=/usr/local/cuda-$CUDA_VERSION --cudnn_home=/usr --tensorrt_home=/usr --cmake_extra_defines 'CMAKE_CUDA_ARCHITECTURES=60-real;70-real;75-real;80-real;90a-real;90a-virtual' --use_vcpkg --use_vcpkg_ms_internal_asset_cache && cd /build/Release && make install DESTDIR=/build/installed" +/bin/bash -c "/usr/bin/python3 /onnxruntime_src/tools/ci_build/build.py --build_dir /build --config Release --skip_tests --skip_submodule_sync --parallel --use_binskim_compliant_compile_flags --build_shared_lib --build_nodejs --use_webgpu --use_tensorrt --cuda_version=$CUDA_VERSION --cuda_home=/usr/local/cuda-$CUDA_VERSION --cudnn_home=/usr --tensorrt_home=/usr --cmake_extra_defines 'CMAKE_CUDA_ARCHITECTURES=60-real;70-real;75-real;80-real;90a-real;90-virtual' --use_vcpkg --use_vcpkg_ms_internal_asset_cache && cd /build/Release && make install DESTDIR=/build/installed" diff --git a/tools/ci_build/github/linux/build_tensorrt_c_api_package.sh b/tools/ci_build/github/linux/build_tensorrt_c_api_package.sh index b8d968c82d002..c0849bf0ace73 100755 --- a/tools/ci_build/github/linux/build_tensorrt_c_api_package.sh +++ b/tools/ci_build/github/linux/build_tensorrt_c_api_package.sh @@ -3,4 +3,4 @@ set -e -x mkdir -p $HOME/.onnx docker run -e SYSTEM_COLLECTIONURI --rm --volume /data/onnx:/data/onnx:ro --volume $BUILD_SOURCESDIRECTORY:/onnxruntime_src --volume $BUILD_BINARIESDIRECTORY:/build \ --volume /data/models:/build/models:ro --volume $HOME/.onnx:/home/onnxruntimedev/.onnx -e NIGHTLY_BUILD onnxruntimecuda${CUDA_VERSION_MAJOR}xtrt86build \ -/bin/bash -c "/usr/bin/python3 /onnxruntime_src/tools/ci_build/build.py --build_dir /build --config Release --skip_tests --skip_submodule_sync --parallel --use_binskim_compliant_compile_flags --build_shared_lib --build_java --build_nodejs --use_tensorrt --cuda_version=$CUDA_VERSION 
--cuda_home=/usr/local/cuda-$CUDA_VERSION --cudnn_home=/usr --tensorrt_home=/usr --cmake_extra_defines 'CMAKE_CUDA_ARCHITECTURES=60-real;70-real;75-real;80-real;90a-real;90a-virtual' 'onnxruntime_USE_FPA_INTB_GEMM=OFF' --use_vcpkg --use_vcpkg_ms_internal_asset_cache && cd /build/Release && make install DESTDIR=/build/installed" +/bin/bash -c "/usr/bin/python3 /onnxruntime_src/tools/ci_build/build.py --build_dir /build --config Release --skip_tests --skip_submodule_sync --parallel --use_binskim_compliant_compile_flags --build_shared_lib --build_java --build_nodejs --use_tensorrt --cuda_version=$CUDA_VERSION --cuda_home=/usr/local/cuda-$CUDA_VERSION --cudnn_home=/usr --tensorrt_home=/usr --cmake_extra_defines 'CMAKE_CUDA_ARCHITECTURES=60-real;70-real;75-real;80-real;90a-real;90-virtual' 'onnxruntime_USE_FPA_INTB_GEMM=OFF' --use_vcpkg --use_vcpkg_ms_internal_asset_cache && cd /build/Release && make install DESTDIR=/build/installed" diff --git a/tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_cpu b/tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_cpu index 2a65e7c26b20b..a277286866e41 100644 --- a/tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_cpu +++ b/tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_cpu @@ -1,4 +1,4 @@ -ARG BASEIMAGE=onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cpu_x64_almalinux8_gcc14:20250724.1 +ARG BASEIMAGE=onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cpu_x64_almalinux8_gcc14:20251008.2 FROM $BASEIMAGE ENV JAVA_HOME=/usr/lib/jvm/msopenjdk-17 diff --git a/tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_rocm b/tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_rocm index 3337af3be6074..5410bd64036ce 100644 --- a/tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_rocm +++ b/tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_rocm @@ -1,4 +1,4 @@ -ARG BASEIMAGE=onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cpu_x64_almalinux8_gcc14:20250724.1 +ARG BASEIMAGE=onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cpu_x64_almalinux8_gcc14:20251008.2 FROM $BASEIMAGE ARG ROCM_VERSION=6.2.3 diff --git a/tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_webgpu b/tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_webgpu index 0007a4e06f7c0..07ad8e933baf0 100644 --- a/tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_webgpu +++ b/tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_webgpu @@ -1,4 +1,4 @@ -ARG BASEIMAGE=onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cpu_x64_almalinux8_gcc14:20250724.1 +ARG BASEIMAGE=onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cpu_x64_almalinux8_gcc14:20251008.2 FROM $BASEIMAGE ENV JAVA_HOME=/usr/lib/jvm/msopenjdk-17 diff --git a/tools/ci_build/github/linux/docker/Dockerfile.package_ubuntu_2004_gpu b/tools/ci_build/github/linux/docker/Dockerfile.package_ubuntu_2004_gpu index 8a84b9b940306..5d98c25b535af 100644 --- a/tools/ci_build/github/linux/docker/Dockerfile.package_ubuntu_2004_gpu +++ b/tools/ci_build/github/linux/docker/Dockerfile.package_ubuntu_2004_gpu @@ -5,7 +5,7 @@ # Dockerfile to run ONNXRuntime with TensorRT integration # Build base image with required system packages -ARG BASEIMAGE=nvidia/cuda:12.2.2-cudnn8-devel-ubuntu20.04 +ARG BASEIMAGE=nvidia/cuda:12.8.1-cudnn-devel-ubuntu20.04 ARG TRT_VERSION=10.9.0.34-1+cuda12.8 ARG LD_LIBRARY_PATH_ARG=/usr/local/lib64:/usr/local/cuda/lib64 FROM $BASEIMAGE AS 
base diff --git a/tools/ci_build/github/linux/docker/inference/aarch64/default/cpu/Dockerfile b/tools/ci_build/github/linux/docker/inference/aarch64/default/cpu/Dockerfile index 8b2083c2ccfc1..cef2d11780969 100644 --- a/tools/ci_build/github/linux/docker/inference/aarch64/default/cpu/Dockerfile +++ b/tools/ci_build/github/linux/docker/inference/aarch64/default/cpu/Dockerfile @@ -2,7 +2,7 @@ # Licensed under the MIT License. # This file is used by Zip-Nuget Packaging NoContribOps Pipeline,Zip-Nuget-Java Packaging Pipeline -ARG BASEIMAGE=onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cpu_aarch64_almalinux8_gcc14_dotnet:20250724.1 +ARG BASEIMAGE=onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cpu_aarch64_almalinux8_gcc14_dotnet:20251008.2 FROM $BASEIMAGE ENV LANG=en_US.UTF-8 diff --git a/tools/ci_build/github/linux/docker/inference/aarch64/python/cpu/Dockerfile b/tools/ci_build/github/linux/docker/inference/aarch64/python/cpu/Dockerfile index f5143d5ac9ab9..79d99d08dcc4e 100644 --- a/tools/ci_build/github/linux/docker/inference/aarch64/python/cpu/Dockerfile +++ b/tools/ci_build/github/linux/docker/inference/aarch64/python/cpu/Dockerfile @@ -1,4 +1,4 @@ -ARG BASEIMAGE=onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cpu_aarch64_almalinux8_gcc14:20250724.1 +ARG BASEIMAGE=onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cpu_aarch64_almalinux8_gcc14:20251008.2 FROM $BASEIMAGE ADD scripts /tmp/scripts diff --git a/tools/ci_build/github/linux/docker/inference/aarch64/python/cpu/scripts/requirements.txt b/tools/ci_build/github/linux/docker/inference/aarch64/python/cpu/scripts/requirements.txt index bae6f4cb51816..1b1dadeaf8db2 100644 --- a/tools/ci_build/github/linux/docker/inference/aarch64/python/cpu/scripts/requirements.txt +++ b/tools/ci_build/github/linux/docker/inference/aarch64/python/cpu/scripts/requirements.txt @@ -3,7 +3,7 @@ mypy pytest setuptools>=68.2.2 wheel -onnx==1.19.0 +onnx==1.19.1 protobuf==4.25.8 sympy==1.14 flatbuffers diff --git a/tools/ci_build/github/linux/docker/inference/x86_64/default/cpu/Dockerfile b/tools/ci_build/github/linux/docker/inference/x86_64/default/cpu/Dockerfile index cfc2ce7079148..72d98206f9205 100644 --- a/tools/ci_build/github/linux/docker/inference/x86_64/default/cpu/Dockerfile +++ b/tools/ci_build/github/linux/docker/inference/x86_64/default/cpu/Dockerfile @@ -2,7 +2,7 @@ # Licensed under the MIT License. # This file is used by Zip-Nuget Packaging NoContribOps Pipeline,Zip-Nuget-Java Packaging Pipeline -ARG BASEIMAGE=onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cpu_x64_almalinux8_gcc14_dotnet:20250724.1 +ARG BASEIMAGE=onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cpu_x64_almalinux8_gcc14_dotnet:20251008.2 FROM $BASEIMAGE ENV LANG=en_US.UTF-8 diff --git a/tools/ci_build/github/linux/docker/inference/x86_64/default/cuda12/Dockerfile b/tools/ci_build/github/linux/docker/inference/x86_64/default/cuda12/Dockerfile index 8401393a661b1..85f4a074e30bf 100644 --- a/tools/ci_build/github/linux/docker/inference/x86_64/default/cuda12/Dockerfile +++ b/tools/ci_build/github/linux/docker/inference/x86_64/default/cuda12/Dockerfile @@ -2,7 +2,7 @@ # Licensed under the MIT License. 
# This file is used by Zip-Nuget Packaging NoContribOps Pipeline,Zip-Nuget-Java Packaging Pipeline -ARG BASEIMAGE=onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda12_x64_almalinux8_gcc12_dotnet:20250724.1 +ARG BASEIMAGE=onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda12_x64_almalinux8_gcc12_dotnet:20251008.2 FROM $BASEIMAGE ARG TRT_VERSION diff --git a/tools/ci_build/github/linux/docker/inference/x86_64/python/cpu/Dockerfile b/tools/ci_build/github/linux/docker/inference/x86_64/python/cpu/Dockerfile index b923febc1227f..81ba47f397f91 100644 --- a/tools/ci_build/github/linux/docker/inference/x86_64/python/cpu/Dockerfile +++ b/tools/ci_build/github/linux/docker/inference/x86_64/python/cpu/Dockerfile @@ -1,4 +1,4 @@ -ARG BASEIMAGE=onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cpu_x64_almalinux8_gcc14:20250724.1 +ARG BASEIMAGE=onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cpu_x64_almalinux8_gcc14:20251008.2 FROM $BASEIMAGE ADD scripts /tmp/scripts diff --git a/tools/ci_build/github/linux/docker/inference/x86_64/python/openvino/Dockerfile b/tools/ci_build/github/linux/docker/inference/x86_64/python/openvino/Dockerfile index f3341f32a768d..5ad1023bfb5b2 100644 --- a/tools/ci_build/github/linux/docker/inference/x86_64/python/openvino/Dockerfile +++ b/tools/ci_build/github/linux/docker/inference/x86_64/python/openvino/Dockerfile @@ -1,5 +1,5 @@ # Use the specified UBI8 base image with GCC 14 -ARG BASEIMAGE="onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cpu_x64_almalinux8_gcc14:20250724.1" +ARG BASEIMAGE="onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cpu_x64_almalinux8_gcc14:20251008.2" FROM ${BASEIMAGE} ARG BUILD_UID=1000 diff --git a/tools/ci_build/github/linux/docker/scripts/lort/requirements.txt b/tools/ci_build/github/linux/docker/scripts/lort/requirements.txt index 2871f5cab2ea2..dc394ff50f4f9 100644 --- a/tools/ci_build/github/linux/docker/scripts/lort/requirements.txt +++ b/tools/ci_build/github/linux/docker/scripts/lort/requirements.txt @@ -3,7 +3,7 @@ beartype==0.15.0 flatbuffers cerberus h5py -onnx==1.19.0 +onnx==1.19.1 # Python dependencies required for pytorch development astunparse expecttest!=0.2.0 diff --git a/tools/ci_build/github/linux/docker/scripts/manylinux/requirements.txt b/tools/ci_build/github/linux/docker/scripts/manylinux/requirements.txt index 381d42831e715..2d89aece56340 100644 --- a/tools/ci_build/github/linux/docker/scripts/manylinux/requirements.txt +++ b/tools/ci_build/github/linux/docker/scripts/manylinux/requirements.txt @@ -3,7 +3,7 @@ mypy pytest setuptools>=68.2.2 wheel -onnx==1.19.0 +onnx==1.19.1 protobuf==4.25.1 sympy==1.14 flatbuffers diff --git a/tools/ci_build/github/linux/docker/scripts/requirements.txt b/tools/ci_build/github/linux/docker/scripts/requirements.txt index 4cc94f9148656..c19c0170291e6 100644 --- a/tools/ci_build/github/linux/docker/scripts/requirements.txt +++ b/tools/ci_build/github/linux/docker/scripts/requirements.txt @@ -4,11 +4,11 @@ mypy pytest setuptools==78.1.1 wheel==0.45.1 -onnx==1.19.0 +onnx==1.19.1 argparse sympy==1.14 flatbuffers protobuf==4.25.1 packaging -onnxscript==0.3.2 -onnx-ir +onnxscript==0.5.3 +onnx-ir==0.1.10 diff --git a/tools/ci_build/github/linux/python/requirements.txt b/tools/ci_build/github/linux/python/requirements.txt index d48fb66194f2a..3ddce9cc0ec31 100644 --- a/tools/ci_build/github/linux/python/requirements.txt +++ b/tools/ci_build/github/linux/python/requirements.txt @@ 
-3,12 +3,12 @@ mypy pytest setuptools>=68.2.2 wheel -onnx==1.19.0 +onnx==1.19.1 protobuf==4.25.1 sympy==1.14 flatbuffers psutil -onnxscript==0.3.2 -onnx-ir +onnxscript==0.5.3 +onnx-ir==0.1.10 jinja2 markupsafe diff --git a/tools/ci_build/github/windows/jar_packaging.py b/tools/ci_build/github/windows/jar_packaging.py index 2354363610251..b399782e9410f 100644 --- a/tools/ci_build/github/windows/jar_packaging.py +++ b/tools/ci_build/github/windows/jar_packaging.py @@ -33,7 +33,12 @@ def find_7z_executable(): if seven_zip_exe: return seven_zip_exe - # 2. Check the default installation directory under Program Files + # 2. Check if '7za' is in the PATH (common on Linux systems) + seven_zip_exe = shutil.which("7za") + if seven_zip_exe: + return seven_zip_exe + + # 3. Check the default installation directory under Program Files program_files = os.environ.get("ProgramFiles") # noqa: SIM112 if program_files: default_path = Path(program_files) / "7-Zip" / "7z.exe" @@ -226,9 +231,7 @@ def run_packaging(package_type: str, build_dir: str): "cpu": { "platforms": [ {"path": "onnxruntime-java-linux-x64", "lib": "libcustom_op_library.so", "archive_lib": True}, - {"path": "onnxruntime-java-osx-x86_64", "lib": "libcustom_op_library.dylib", "archive_lib": True}, {"path": "onnxruntime-java-linux-aarch64", "lib": "libcustom_op_library.so", "archive_lib": False}, - {"path": "onnxruntime-java-osx-arm64", "lib": "libcustom_op_library.dylib", "archive_lib": False}, ] }, "gpu": { diff --git a/tools/ci_build/github/windows/jar_packaging_test.py b/tools/ci_build/github/windows/jar_packaging_test.py index 91b68728dad15..2dd61cf9c3088 100644 --- a/tools/ci_build/github/windows/jar_packaging_test.py +++ b/tools/ci_build/github/windows/jar_packaging_test.py @@ -31,7 +31,6 @@ def _setup_test_directory(package_type: str, version_string: str): java_artifact_dir = tmp_path / "java-artifact" win_dir = java_artifact_dir / "onnxruntime-java-win-x64" linux_dir = java_artifact_dir / "onnxruntime-java-linux-x64" - osx_dir = java_artifact_dir / "onnxruntime-java-osx-x86_64" # --- Main artifact directory (Windows) --- win_dir.mkdir(parents=True, exist_ok=True) @@ -53,26 +52,14 @@ def _setup_test_directory(package_type: str, version_string: str): create_empty_file(linux_native_dir / "libonnxruntime_providers_cuda.so") (linux_dir / "_manifest" / "spdx_2.2").mkdir(parents=True, exist_ok=True) - # --- macOS and other platforms (for CPU test) --- + # --- Additional platforms (for CPU test) --- if package_type == "cpu": - osx_native_dir = osx_dir / "ai" / "onnxruntime" / "native" / "osx-x86_64" - osx_native_dir.mkdir(parents=True, exist_ok=True) - create_empty_file(osx_dir / "libcustom_op_library.dylib") - create_empty_file(osx_native_dir / "libonnxruntime.dylib") - create_empty_file(osx_native_dir / "libonnxruntime4j_jni.dylib") - (osx_dir / "_manifest" / "spdx_2.2").mkdir(parents=True, exist_ok=True) - - # Add linux-aarch64 and osx-arm64 for CPU test + # Add linux-aarch64 for CPU test linux_aarch64_dir = java_artifact_dir / "onnxruntime-java-linux-aarch64" linux_aarch64_native_dir = linux_aarch64_dir / "ai" / "onnxruntime" / "native" / "linux-aarch64" linux_aarch64_native_dir.mkdir(parents=True, exist_ok=True) create_empty_file(linux_aarch64_dir / "libcustom_op_library.so") - osx_arm64_dir = java_artifact_dir / "onnxruntime-java-osx-arm64" - osx_arm64_native_dir = osx_arm64_dir / "ai" / "onnxruntime" / "native" / "osx-arm64" - osx_arm64_native_dir.mkdir(parents=True, exist_ok=True) - create_empty_file(osx_arm64_dir / 
"libcustom_op_library.dylib") - return tmp_path return _setup_test_directory @@ -134,9 +121,6 @@ def test_cpu_packaging(directory_setup_factory, version_string): # Linux libs assert "ai/onnxruntime/native/linux-x64/libonnxruntime.so" in jar_contents assert "ai/onnxruntime/native/linux-x64/libonnxruntime4j_jni.so" in jar_contents - # macOS libs - assert "ai/onnxruntime/native/osx-x86_64/libonnxruntime.dylib" in jar_contents - assert "ai/onnxruntime/native/osx-x86_64/libonnxruntime4j_jni.dylib" in jar_contents # GPU libs should NOT be present assert "ai/onnxruntime/native/linux-x64/libonnxruntime_providers_cuda.so" not in jar_contents @@ -144,14 +128,9 @@ def test_cpu_packaging(directory_setup_factory, version_string): with zipfile.ZipFile(testing_jar_path, "r") as zf: jar_contents = zf.namelist() assert "libcustom_op_library.so" in jar_contents - assert "libcustom_op_library.dylib" in jar_contents # 3. Verify the custom op libraries were removed from the source directories linux_dir = temp_build_dir / "java-artifact" / "onnxruntime-java-linux-x64" - osx_dir = temp_build_dir / "java-artifact" / "onnxruntime-java-osx-x86_64" linux_aarch64_dir = temp_build_dir / "java-artifact" / "onnxruntime-java-linux-aarch64" - osx_arm64_dir = temp_build_dir / "java-artifact" / "onnxruntime-java-osx-arm64" assert not (linux_dir / "libcustom_op_library.so").exists() - assert not (osx_dir / "libcustom_op_library.dylib").exists() assert not (linux_aarch64_dir / "libcustom_op_library.so").exists() - assert not (osx_arm64_dir / "libcustom_op_library.dylib").exists() diff --git a/tools/ci_build/github/windows/python/requirements.txt b/tools/ci_build/github/windows/python/requirements.txt index 6ab2ab2b7b61f..bb307a20d7f18 100644 --- a/tools/ci_build/github/windows/python/requirements.txt +++ b/tools/ci_build/github/windows/python/requirements.txt @@ -3,13 +3,13 @@ mypy pytest setuptools>=68.2.2 wheel -onnx==1.19.0 +onnx==1.19.1 protobuf==4.25.1 sympy==1.14 flatbuffers psutil -onnxscript==0.3.2 -onnx-ir +onnxscript==0.5.3 +onnx-ir==0.1.10 jinja2 markupsafe semver diff --git a/tools/ci_build/github/windows/setup_env_cuda.bat b/tools/ci_build/github/windows/setup_env_cuda.bat index f93938e2a9009..f095f58f9920e 100644 --- a/tools/ci_build/github/windows/setup_env_cuda.bat +++ b/tools/ci_build/github/windows/setup_env_cuda.bat @@ -1,13 +1,13 @@ REM Copyright (c) Microsoft Corporation. All rights reserved. REM Licensed under the MIT License. 
-if exist PATH=%AGENT_TEMPDIRECTORY%\v12.2\ ( -set PATH=%AGENT_TEMPDIRECTORY%\v12.2\bin;%AGENT_TEMPDIRECTORY%\v12.2\extras\CUPTI\lib64;%PATH% +if exist PATH=%AGENT_TEMPDIRECTORY%\v12.8\ ( + set PATH=%AGENT_TEMPDIRECTORY%\v12.8\bin;%AGENT_TEMPDIRECTORY%\v12.8\extras\CUPTI\lib64;%PATH% ) else ( - set PATH=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.2\bin;C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.2\extras\CUPTI\lib64;%PATH% + set PATH=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.8\bin;C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.8\extras\CUPTI\lib64;%PATH% ) -@REM The default version is still cuda v12.2, because set cuda v11.8 after it +@REM The default version is still cuda v12.8, because set cuda v11.8 after it if exist PATH=%AGENT_TEMPDIRECTORY%\v11.8\ ( set PATH=%PATH%;%AGENT_TEMPDIRECTORY%\v11.8\bin;%AGENT_TEMPDIRECTORY%\v11.8\extras\CUPTI\lib64 ) else ( diff --git a/tools/ci_build/github/windows/setup_env_gpu.bat b/tools/ci_build/github/windows/setup_env_gpu.bat index ecadab5d3f8a3..115a19b6f3a01 100644 --- a/tools/ci_build/github/windows/setup_env_gpu.bat +++ b/tools/ci_build/github/windows/setup_env_gpu.bat @@ -1,14 +1,14 @@ REM Copyright (c) Microsoft Corporation. All rights reserved. REM Licensed under the MIT License. -if exist PATH=%AGENT_TEMPDIRECTORY%\v12.2\ ( - set PATH=%AGENT_TEMPDIRECTORY%\v12.2\bin;%AGENT_TEMPDIRECTORY%\v12.2\extras\CUPTI\lib64;%PATH% +if exist PATH=%AGENT_TEMPDIRECTORY%\v12.8\ ( + set PATH=%AGENT_TEMPDIRECTORY%\v12.8\bin;%AGENT_TEMPDIRECTORY%\v12.8\extras\CUPTI\lib64;%PATH% ) else ( - set PATH=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.2\bin;C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.2\extras\CUPTI\lib64;%PATH% + set PATH=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.8\bin;C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.8\extras\CUPTI\lib64;%PATH% ) set PATH=%AGENT_TEMPDIRECTORY%\TensorRT-10.9.0.34.Windows10.x86_64.cuda-12.8\lib;%PATH% -@REM The default version is still cuda v12.2, because set cuda v11.8 after it +@REM The default version is still cuda v12.8, because set cuda v11.8 after it set PATH=%PATH%;%AGENT_TEMPDIRECTORY%\TensorRT-10.9.0.34.Windows10.x86_64.cuda-11.8\lib if exist PATH=%AGENT_TEMPDIRECTORY%\v11.8\ ( set PATH=%PATH%;%AGENT_TEMPDIRECTORY%\v11.8\bin;%AGENT_TEMPDIRECTORY%\v11.8\extras\CUPTI\lib64 diff --git a/tools/ci_build/github/windows/setup_env_trt.bat b/tools/ci_build/github/windows/setup_env_trt.bat index 45e0d970fb541..6110249a9cde6 100644 --- a/tools/ci_build/github/windows/setup_env_trt.bat +++ b/tools/ci_build/github/windows/setup_env_trt.bat @@ -1,10 +1,10 @@ REM Copyright (c) Microsoft Corporation. All rights reserved. REM Licensed under the MIT License. 
-if exist PATH=%AGENT_TEMPDIRECTORY%\v12.2\ ( - set PATH=%PATH%;%AGENT_TEMPDIRECTORY%\v12.2\bin;%AGENT_TEMPDIRECTORY%\v12.2\extras\CUPTI\lib64 +if exist PATH=%AGENT_TEMPDIRECTORY%\v12.8\ ( + set PATH=%PATH%;%AGENT_TEMPDIRECTORY%\v12.8\bin;%AGENT_TEMPDIRECTORY%\v12.8\extras\CUPTI\lib64 ) else ( - set PATH=%PATH%;C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.2\bin;C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.2\extras\CUPTI\lib64 + set PATH=%PATH%;C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.8\bin;C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.8\extras\CUPTI\lib64 ) set PATH=%AGENT_TEMPDIRECTORY%\TensorRT-10.9.0.34.Windows10.x86_64.cuda-12.8\lib;%PATH% set GRADLE_OPTS=-Dorg.gradle.daemon=false diff --git a/tools/ci_build/requirements/transformers-test/requirements.txt b/tools/ci_build/requirements/transformers-test/requirements.txt index bcd5a434c58e8..21894c2ba003d 100644 --- a/tools/ci_build/requirements/transformers-test/requirements.txt +++ b/tools/ci_build/requirements/transformers-test/requirements.txt @@ -3,12 +3,13 @@ packaging # protobuf and numpy is same as tools/ci_build/github/linux/docker/scripts/manylinux/requirements.txt protobuf==4.25.1 numpy==2.2.6 -torch>=2.6.0 +torch==2.8.0 +torchvision==0.23.0 coloredlogs==15.0 transformers==4.52.1 parameterized>=0.8.1 sentencepiece psutil einops -onnxscript==0.3.2 -onnx-ir +onnxscript==0.5.3 +onnx-ir==0.1.10 diff --git a/tools/python/update_version.py b/tools/python/update_version.py index 6d040ea90947f..7807441285d4c 100755 --- a/tools/python/update_version.py +++ b/tools/python/update_version.py @@ -1,122 +1,217 @@ -import os +#!/usr/bin/env python3 +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. + +import re +import shutil +import sys +from pathlib import Path + +# --- Helper Functions for Updating Files --- + + +def update_versioning_md(file_path: Path, new_version: str): + """Updates the version table in Versioning.md.""" + print(f"Checking '{file_path.name}' for version updates...") + if not file_path.exists(): + print(f"Warning: File not found at '{file_path}'. Skipping.") + return + content = file_path.read_text() + + # Find the first version number in the markdown table + match = re.search(r"^\| ([\d.]+) \|", content, re.MULTILINE) + if not match: + print(f"Warning: Could not find current version in '{file_path.name}'. Skipping.") + return + + current_version = match.group(1) + print(f"Found current version: {current_version}") + + if new_version != current_version: + print(f"Updating version in '{file_path.name}' to {new_version}...") + # Prepare the new row by duplicating the header separator line's structure + header_separator_match = re.search(r"(\r\n?|\n)(\|---\|.*)", content) + if not header_separator_match: + print(f"Warning: Could not find table header separator in '{file_path.name}'. 
Skipping.") + return + + header_separator = header_separator_match.group(2) + # Create a new row based on the separator, replacing dashes with spaces and adding the version + new_row_parts = [" " + part.replace("-", " ") + " " for part in header_separator.split("|")] + new_row_parts[1] = f" {new_version} " # Set the new version + new_row = "|".join(new_row_parts) + + # Insert the new row right after the header separator line + insertion_point = header_separator_match.end(0) + new_content = content[:insertion_point] + "\n" + new_row + content[insertion_point:] + file_path.write_text(new_content) + print("Update complete.") + else: + print("Version is already up to date.") + + +def update_readme_rst(file_path: Path, new_version: str): + """Updates the release history in the Python README.rst.""" + print(f"Checking '{file_path.name}' for version updates...") + if not file_path.exists(): + print(f"Warning: File not found at '{file_path}'. Skipping.") + return + content = file_path.read_text() + + # Find the first version header in the file + match = re.search(r"^([\d.]+)", content, re.MULTILINE) + if not match: + print(f"Warning: Could not find current version in '{file_path.name}'. Skipping.") + return + + current_version = match.group(1) + print(f"Found current version: {current_version}") + + if new_version != current_version: + print(f"Updating version in '{file_path.name}' to {new_version}...") + new_header = f"{new_version}\n{'^' * len(new_version)}" + release_notes = f"Release Notes : https://github.com/Microsoft/onnxruntime/releases/tag/v{new_version}" + new_section = f"{new_header}\n\n{release_notes}\n\n" + + # Insert the new section before the first version header found + insertion_point = match.start(0) + new_content = content[:insertion_point] + new_section + content[insertion_point:] + file_path.write_text(new_content) + print("Update complete.") + else: + print("Version is already up to date.") + + +def update_init_py(file_path: Path, new_version: str): + """Updates the __version__ variable in the project's __init__.py.""" + print(f"Checking '{file_path.name}' for version updates...") + if not file_path.exists(): + print(f"Warning: File not found at '{file_path}'. Skipping.") + return + content = file_path.read_text() + + # Find the __version__ line + match = re.search(r"__version__\s*=\s*[\"']([\d.]+)[\"']", content) + if not match: + print(f"Warning: Could not find __version__ in '{file_path.name}'. Skipping.") + return + + current_version = match.group(1) + print(f"Found current version: {current_version}") + + if new_version != current_version: + print(f"Updating version in '{file_path.name}' to {new_version}...") + new_content = re.sub(r"__version__\s*=\s*[\"'][\d.]+[\"']", f'__version__ = "{new_version}"', content) + file_path.write_text(new_content) + print("Update complete.") + else: + print("Version is already up to date.") + + +def update_npm_packages(js_root: Path, new_version: str): + """Updates versions for all NPM packages in the js directory.""" + print("\nUpdating NPM package versions...") + + # This script assumes a 'util' module is available in the search path. 
+ try: + from util import is_windows # noqa: PLC0415 + from util import run as run_command # noqa: PLC0415 + except ImportError: + print("Error: Could not import 'is_windows' and 'run' from a 'util' module.", file=sys.stderr) + print("Please ensure the 'util' module is in Python's search path.", file=sys.stderr) + return + + command_prefix = [] + # Check if node and npm are directly available in the system's PATH. + if shutil.which("node") and shutil.which("npm"): + print("Found node and npm in PATH.") + # If not, and if on Linux, check if 'fnm' is available. + elif shutil.which("fnm"): + print("node/npm not in PATH. Found 'fnm' on Linux, will use it to run commands.") + nvmrc_path = js_root / ".nvmrc" + # Check for .nvmrc file. + if not nvmrc_path.exists(): + print(f"Error: 'fnm' is being used, but the version file '{nvmrc_path}' was not found.", file=sys.stderr) + print( + "Please create a .nvmrc file in the 'js' directory with the desired Node.js version.", file=sys.stderr + ) + return + + node_version = nvmrc_path.read_text().strip() + print(f"Found node version '{node_version}' in .nvmrc.") + + # Ensure the required node version is installed by fnm. + print(f"Ensuring Node.js version '{node_version}' is installed via fnm...") + run_command("fnm", "install", node_version, cwd=js_root) + + print(f"Using Node.js version '{node_version}' with fnm.") + command_prefix = ["fnm", "exec", f"--using={node_version}", "--"] + # If neither is available, skip the NPM updates. + else: + print("Error: Could not find 'node' and 'npm' in your PATH.", file=sys.stderr) + if sys.platform.startswith("linux"): + print("Hint: Install 'fnm' (Fast Node Manager) to manage Node.js versions.", file=sys.stderr) + print("Skipping NPM package updates.", file=sys.stderr) + return + + def run_npm(args, cwd): + """Helper to run npm commands, prepending fnm if necessary.""" + full_command = command_prefix + list(args) + print(full_command) + run_command(*full_command, cwd=cwd) + + npm_exe = "npm.cmd" if is_windows() else "npm" + packages = ["common", "node", "web", "react_native"] + + for package in packages: + print(f"\n--- Updating package: {package} ---") + # Use npm's --prefix argument and run from js_root. + # --allow-same-version prevents an error if the version is already correct. 
+        run_npm([npm_exe, "--prefix", package, "version", new_version, "--allow-same-version"], cwd=js_root)
+        run_npm([npm_exe, "--prefix", package, "install", "--package-lock-only", "--ignore-scripts"], cwd=js_root)
+
+    print("\n--- Finalizing JS versions and formatting ---")
+    run_npm([npm_exe, "ci"], cwd=js_root)
+    for package in packages:
+        run_npm([npm_exe, "run", "update-version", package], cwd=js_root)
+
+    run_npm([npm_exe, "run", "format"], cwd=js_root)
+    print("NPM package updates complete.")
+
+
+# Define repository root relative to the script's location
+SCRIPT_DIR = Path(__file__).resolve().parent
+REPO_DIR = SCRIPT_DIR.parent.parent
 
 
 def update_version():
-    version = ""
-    cwd = os.path.dirname(os.path.realpath(__file__))
-    with open(os.path.join(cwd, "..", "..", "VERSION_NUMBER")) as f:
-        version = f.readline().strip()
-    lines = []
-    current_version = ""
-    file_path = os.path.join(cwd, "..", "..", "docs", "Versioning.md")
-    with open(file_path) as f:
-        lines = f.readlines()
-        for line in lines:
-            if line.startswith("|"):
-                sections = line.split("|")
-                if len(sections) == 8 and sections[1].strip()[0].isdigit():
-                    current_version = sections[1].strip()
-                    break
-    print("Current version of ORT seems to be: " + current_version)
-    if version != current_version:
-        with open(file_path, "w") as f:
-            for i, line in enumerate(lines):
-                f.write(line)
-                if line.startswith("|--"):
-                    sections = lines[i + 1].split("|")
-                    # Make sure there are no 'False Positive' version additions
-                    # by making sure the line we are building a new line from
-                    # contains the current_version
-                    if len(sections) > 1 and sections[1].strip() == current_version:
-                        sections[1] = " " + version + " "
-                        new_line = "|".join(sections)
-                        f.write(new_line)
-    lines = []
-    current_version = ""
-    file_path = os.path.join(cwd, "..", "..", "docs", "python", "README.rst")
-    with open(file_path) as f:
-        lines = f.readlines()
-        for line in lines:
-            sections = line.strip().split(".")
-            if len(sections) == 3 and sections[0].isdigit() and sections[1].isdigit() and sections[2].isdigit():
-                current_version = line.strip()
-                break
-    if version != current_version:
-        inserted = False
-        with open(file_path, "w") as f:
-            for line in lines:
-                sections = line.strip().split(".")
-                if (
-                    inserted is False
-                    and len(sections) == 3
-                    and sections[0].isdigit()
-                    and sections[1].isdigit()
-                    and sections[2].isdigit()
-                ):
-                    f.write(version + "\n")
-                    f.write("^" * len(version) + "\n\n")
-                    f.write(
-                        "Release Notes : https://github.com/Microsoft/onnxruntime/releases/tag/v"
-                        + version.strip()
-                        + "\n\n"
-                    )
-                    inserted = True
-                f.write(line)
-    lines = []
-    current_version = ""
-    file_path = os.path.join(cwd, "..", "..", "onnxruntime", "__init__.py")
-    with open(file_path) as f:
-        lines = f.readlines()
-        for line in lines:
-            if line.startswith("__version__"):
-                current_version = line.split("=")[1].strip()[1:-1]
-                break
-    if version != current_version:
-        with open(file_path, "w") as f:
-            for line in lines:
-                if line.startswith("__version__"):
-                    f.write('__version__ = "' + version + '"\n')
-                    continue
-                f.write(line)
-
-    # update version for NPM packages
-    current_version = ""
-    js_root = os.path.join(cwd, "..", "..", "js")
-
-    def run(args, cwd):
-        from util import is_windows, run  # noqa: PLC0415
-
-        if is_windows():
-            args = ["cmd", "/c", *args]
-        run(*args, cwd=cwd)
-
-    # check if node and npm are installed
-    run(["node", "--version"], cwd=js_root)
-    run(["npm", "--version"], cwd=js_root)
-
-    # upgrade version for onnxruntime-common
-    run(["npm", "version", version], cwd=os.path.join(js_root, "common"))
-    run(["npm", "install", "--package-lock-only", "--ignore-scripts"], cwd=os.path.join(js_root, "common"))
-
-    # upgrade version for onnxruntime-node
-    run(["npm", "version", version], cwd=os.path.join(js_root, "node"))
-    run(["npm", "install", "--package-lock-only", "--ignore-scripts"], cwd=os.path.join(js_root, "node"))
-
-    # upgrade version for onnxruntime-web
-    run(["npm", "version", version], cwd=os.path.join(js_root, "web"))
-    run(["npm", "install", "--package-lock-only", "--ignore-scripts"], cwd=os.path.join(js_root, "web"))
-
-    # upgrade version for onnxruntime-react-native
-    run(["npm", "version", version], cwd=os.path.join(js_root, "react_native"))
-    run(["npm", "install", "--package-lock-only", "--ignore-scripts"], cwd=os.path.join(js_root, "react_native"))
-
-    # upgrade version.ts in each package
-    run(["npm", "ci"], cwd=js_root)
-    run(["npm", "run", "update-version", "common"], cwd=js_root)
-    run(["npm", "run", "update-version", "node"], cwd=js_root)
-    run(["npm", "run", "update-version", "web"], cwd=js_root)
-    run(["npm", "run", "update-version", "react_native"], cwd=js_root)
-    run(["npm", "run", "format"], cwd=js_root)
+    """Main function to read the new version and orchestrate updates across the project."""
+    # Read and validate the new version from VERSION_NUMBER
+    version_file = REPO_DIR / "VERSION_NUMBER"
+    print(f"Reading new version from '{version_file}'...")
+    try:
+        new_version = version_file.read_text().strip()
+    except FileNotFoundError:
+        print(f"Error: '{version_file}' not found.", file=sys.stderr)
+        sys.exit(1)
+
+    # Validate that the version is in x.y.z format
+    if not re.fullmatch(r"\d+\.\d+\.\d+", new_version):
+        print(
+            f"Error: Version '{new_version}' from '{version_file.name}' is not a valid x.y.z semantic version.",
+            file=sys.stderr,
+        )
+        sys.exit(1)
+
+    print(f"Target version to set: {new_version}\n")
+
+    # Update files using absolute paths from REPO_DIR
+    update_versioning_md(REPO_DIR / "docs" / "Versioning.md", new_version)
+    update_readme_rst(REPO_DIR / "docs" / "python" / "README.rst", new_version)
+    update_init_py(REPO_DIR / "onnxruntime" / "__init__.py", new_version)
+
+    # Update all NPM packages
+    update_npm_packages(REPO_DIR / "js", new_version)
 
 
 if __name__ == "__main__":
diff --git a/tools/python/util/android/android.py b/tools/python/util/android/android.py
index e8dda5cc592b9..c7fbde2aac584 100644
--- a/tools/python/util/android/android.py
+++ b/tools/python/util/android/android.py
@@ -130,7 +130,7 @@ def start_emulator(
     extra_args: typing.Sequence[str] | None = None,
     timeout_minutes: int = 20,
 ) -> subprocess.Popen:
-    if check_emulator_running_using_avd_name(avd_name=avd_name):
+    if check_emulator_running_using_avd_name(sdk_tool_paths=sdk_tool_paths, avd_name=avd_name):
         raise RuntimeError(
             f"An emulator with avd_name{avd_name} is already running. Please close it before starting a new one."
         )
@@ -234,12 +234,12 @@ def start_emulator(
         time.sleep(sleep_interval_seconds)
 
     # Verify if the emulator is now running
-    if not check_emulator_running_using_avd_name(avd_name=avd_name):
+    if not check_emulator_running_using_avd_name(sdk_tool_paths=sdk_tool_paths, avd_name=avd_name):
         raise RuntimeError("Emulator failed to start.")
     return emulator_process
 
 
-def check_emulator_running_using_avd_name(avd_name: str) -> bool:
+def check_emulator_running_using_avd_name(sdk_tool_paths: SdkToolPaths, avd_name: str) -> bool:
     """
     Check if an emulator is running based on the provided AVD name.
     :param avd_name: Name of the Android Virtual Device (AVD) to check.
@@ -247,7 +247,7 @@ def check_emulator_running_using_avd_name(avd_name: str) -> bool:
     """
     try:
         # Step 1: List running devices
-        result = subprocess.check_output(["adb", "devices"], text=True).strip()
+        result = subprocess.check_output([sdk_tool_paths.adb, "devices"], text=True).strip()
         _log.info(f"adb devices output:\n{result}")
 
         running_emulators = [line.split("\t")[0] for line in result.splitlines()[1:] if "emulator" in line]
@@ -259,7 +259,7 @@ def check_emulator_running_using_avd_name(avd_name: str) -> bool:
         for emulator in running_emulators:
             try:
                 avd_info = (
-                    subprocess.check_output(["adb", "-s", emulator, "emu", "avd", "name"], text=True)
+                    subprocess.check_output([sdk_tool_paths.adb, "-s", emulator, "emu", "avd", "name"], text=True)
                     .strip()
                     .split("\n")[0]
                )
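
For reference, a minimal standalone sketch of the pattern the android.py hunks above adopt: call a concrete adb executable (the role played by sdk_tool_paths.adb in the real helper) instead of relying on a bare "adb" being resolvable from PATH, then map running emulator serials to their AVD names. This is not the repository helper; the adb_path argument and the example path are hypothetical stand-ins, assumed only for illustration.

# --- Illustrative sketch only; not part of the patch above. ---
import subprocess


def list_running_avd_names(adb_path: str) -> dict[str, str]:
    """Map running emulator serials (e.g. 'emulator-5554') to their AVD names."""
    # 'adb devices' lists one device per line after a header line.
    devices_output = subprocess.check_output([adb_path, "devices"], text=True).strip()
    serials = [line.split("\t")[0] for line in devices_output.splitlines()[1:] if "emulator" in line]

    avd_names = {}
    for serial in serials:
        # 'adb -s <serial> emu avd name' prints the AVD name on the first line.
        raw = subprocess.check_output([adb_path, "-s", serial, "emu", "avd", "name"], text=True)
        avd_names[serial] = raw.strip().split("\n")[0]
    return avd_names


if __name__ == "__main__":
    # Hypothetical SDK location; the real helper obtains this from sdk_tool_paths.adb.
    print(list_running_avd_names("/opt/android-sdk/platform-tools/adb"))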