diff --git a/.github/workflows/gradle-wrapper-validation.yml b/.github/workflows/gradle-wrapper-validation.yml index 0e5ea60f61402..04177b11e9c30 100644 --- a/.github/workflows/gradle-wrapper-validation.yml +++ b/.github/workflows/gradle-wrapper-validation.yml @@ -16,7 +16,7 @@ jobs: runs-on: ["self-hosted", "1ES.Pool=onnxruntime-github-Ubuntu2204-AMD-CPU"] steps: - uses: actions/checkout@v5 - - uses: gradle/actions/wrapper-validation@v4 + - uses: gradle/actions/wrapper-validation@v5 concurrency: group: ${{ github.workflow }}-${{ github.event_name == 'pull_request' && github.ref || github.sha }} cancel-in-progress: true diff --git a/.github/workflows/publish-csharp-apidocs.yml b/.github/workflows/publish-csharp-apidocs.yml index 42d1bdc295785..683c5594e82f2 100644 --- a/.github/workflows/publish-csharp-apidocs.yml +++ b/.github/workflows/publish-csharp-apidocs.yml @@ -20,7 +20,7 @@ permissions: jobs: build: - runs-on: ["self-hosted", "1ES.Pool=onnxruntime-github-vs2022-mms"] + runs-on: ["self-hosted", "1ES.Pool=onnxruntime-github-vs2022-latest"] env: DOCFXVERSION: 2.62.2 steps: diff --git a/.github/workflows/windows_cuda.yml b/.github/workflows/windows_cuda.yml index 437fc0e2c6334..3d24d4b6b75b6 100644 --- a/.github/workflows/windows_cuda.yml +++ b/.github/workflows/windows_cuda.yml @@ -19,7 +19,7 @@ concurrency: jobs: build: name: Windows GPU CUDA CI Pipeline - runs-on: ["self-hosted", "1ES.Pool=onnxruntime-github-vs2022-mms"] + runs-on: ["self-hosted", "1ES.Pool=onnxruntime-github-vs2022-latest"] steps: - uses: actions/checkout@v5 with: @@ -41,10 +41,10 @@ jobs: working-directory: ${{ github.workspace }} shell: cmd - - name: Download CUDA SDK v12.2 + - name: Download CUDA SDK v12.8 working-directory: ${{ runner.temp }} run: | - azcopy.exe cp --recursive "https://lotusscus.blob.core.windows.net/models/cuda_sdk/v12.2" . + azcopy.exe cp --recursive "https://lotusscus.blob.core.windows.net/models/cuda_sdk/v12.8" . 
dir shell: pwsh @@ -52,9 +52,9 @@ jobs: shell: powershell run: | Write-Host "Adding CUDA to PATH" - Write-Host "CUDA Path: $env:RUNNER_TEMP\v12.2\bin" - Add-Content -Path $env:GITHUB_PATH -Value "$env:RUNNER_TEMP\v12.2\bin" - Add-Content -Path $env:GITHUB_PATH -Value "$env:RUNNER_TEMP\v12.2\extras\CUPTI\lib64" + Write-Host "CUDA Path: $env:RUNNER_TEMP\v12.8\bin" + Add-Content -Path $env:GITHUB_PATH -Value "$env:RUNNER_TEMP\v12.8\bin" + Add-Content -Path $env:GITHUB_PATH -Value "$env:RUNNER_TEMP\v12.8\extras\CUPTI\lib64" - uses: actions/setup-node@v5 with: @@ -111,7 +111,7 @@ jobs: exit $lastExitCode } # Execute the build process - python.exe ${{ github.workspace }}\tools\ci_build\build.py --update --build --config RelWithDebInfo --build_dir build --skip_submodule_sync --build_csharp --parallel --use_binskim_compliant_compile_flags --cmake_generator "Visual Studio 17 2022" --build_shared_lib --build_wheel --build_java --use_cuda --cuda_home="$env:RUNNER_TEMP\v12.2" --enable_cuda_profiling --use_vcpkg --use_vcpkg_ms_internal_asset_cache --enable_transformers_tool_test --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86 --cmake_extra_defines onnxruntime_ENABLE_CUDA_EP_INTERNAL_TESTS=ON + python.exe ${{ github.workspace }}\tools\ci_build\build.py --update --build --config RelWithDebInfo --build_dir build --skip_submodule_sync --build_csharp --parallel --use_binskim_compliant_compile_flags --cmake_generator "Visual Studio 17 2022" --build_shared_lib --build_wheel --build_java --use_cuda --cuda_home="$env:RUNNER_TEMP\v12.8" --enable_cuda_profiling --use_vcpkg --use_vcpkg_ms_internal_asset_cache --enable_transformers_tool_test --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86 --cmake_extra_defines onnxruntime_ENABLE_CUDA_EP_INTERNAL_TESTS=ON if ($lastExitCode -ne 0) { exit $lastExitCode } @@ -188,10 +188,10 @@ jobs: working-directory: ${{ github.workspace }} shell: cmd - - name: Download CUDA SDK v12.2 + - name: Download CUDA SDK v12.8 working-directory: ${{ runner.temp }} run: | - azcopy.exe cp --recursive "https://lotusscus.blob.core.windows.net/models/cuda_sdk/v12.2" . + azcopy.exe cp --recursive "https://lotusscus.blob.core.windows.net/models/cuda_sdk/v12.8" . 
dir shell: pwsh @@ -199,9 +199,9 @@ jobs: shell: powershell run: | Write-Host "Adding CUDA to PATH" - Write-Host "CUDA Path: $env:RUNNER_TEMP\v12.2\bin" - Add-Content -Path $env:GITHUB_PATH -Value "$env:RUNNER_TEMP\v12.2\bin" - Add-Content -Path $env:GITHUB_PATH -Value "$env:RUNNER_TEMP\v12.2\extras\CUPTI\lib64" + Write-Host "CUDA Path: $env:RUNNER_TEMP\v12.8\bin" + Add-Content -Path $env:GITHUB_PATH -Value "$env:RUNNER_TEMP\v12.8\bin" + Add-Content -Path $env:GITHUB_PATH -Value "$env:RUNNER_TEMP\v12.8\extras\CUPTI\lib64" - name: Set OnnxRuntimeBuildDirectory shell: pwsh @@ -227,7 +227,7 @@ jobs: exit $lastExitCode } - python.exe ${{ github.workspace }}\tools\ci_build\build.py --test --config RelWithDebInfo --build_dir build --skip_submodule_sync --build_csharp --parallel --use_binskim_compliant_compile_flags --cmake_generator "Visual Studio 17 2022" --build_shared_lib --build_wheel --build_java --use_cuda --cuda_home="$env:RUNNER_TEMP\v12.2" --enable_cuda_profiling --use_vcpkg --use_vcpkg_ms_internal_asset_cache --enable_transformers_tool_test --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86 --cmake_extra_defines onnxruntime_ENABLE_CUDA_EP_INTERNAL_TESTS=ON + python.exe ${{ github.workspace }}\tools\ci_build\build.py --test --config RelWithDebInfo --build_dir build --skip_submodule_sync --build_csharp --parallel --use_binskim_compliant_compile_flags --cmake_generator "Visual Studio 17 2022" --build_shared_lib --build_wheel --build_java --use_cuda --cuda_home="$env:RUNNER_TEMP\v12.8" --enable_cuda_profiling --use_vcpkg --use_vcpkg_ms_internal_asset_cache --enable_transformers_tool_test --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86 --cmake_extra_defines onnxruntime_ENABLE_CUDA_EP_INTERNAL_TESTS=ON if ($lastExitCode -ne 0) { exit $lastExitCode } diff --git a/.github/workflows/windows_openvino.yml b/.github/workflows/windows_openvino.yml index 395ccfbe70244..b608c0879aa45 100644 --- a/.github/workflows/windows_openvino.yml +++ b/.github/workflows/windows_openvino.yml @@ -18,7 +18,7 @@ concurrency: jobs: BUILD_OPENVINO_EP: name: Windows OpenVINO CI Pipeline - runs-on: ["self-hosted", "1ES.Pool=onnxruntime-github-vs2022-mms"] + runs-on: ["self-hosted", "1ES.Pool=onnxruntime-github-vs2022-latest"] timeout-minutes: 240 env: AZCOPY_AUTO_LOGIN_TYPE: MSI diff --git a/.github/workflows/windows_qnn_x64.yml b/.github/workflows/windows_qnn_x64.yml index 9788792b94fa8..1906fcb18c841 100644 --- a/.github/workflows/windows_qnn_x64.yml +++ b/.github/workflows/windows_qnn_x64.yml @@ -18,7 +18,7 @@ concurrency: jobs: build_test_qnn_ep: name: Windows x64 QNN CI Pipeline (${{ matrix.QnnLibKind }}) - runs-on: ["self-hosted", "1ES.Pool=onnxruntime-github-vs2022-mms"] + runs-on: ["self-hosted", "1ES.Pool=onnxruntime-github-vs2022-latest"] timeout-minutes: 120 strategy: matrix: diff --git a/.github/workflows/windows_tensorrt.yml b/.github/workflows/windows_tensorrt.yml index 5f3dcb9607a47..2a1fe97d9b7b7 100644 --- a/.github/workflows/windows_tensorrt.yml +++ b/.github/workflows/windows_tensorrt.yml @@ -19,7 +19,7 @@ concurrency: jobs: build: name: Windows GPU TensorRT CI Pipeline - runs-on: ["self-hosted", "1ES.Pool=onnxruntime-github-vs2022-mms"] + runs-on: ["self-hosted", "1ES.Pool=onnxruntime-github-vs2022-latest"] steps: - uses: actions/checkout@v5 with: @@ -41,10 +41,10 @@ jobs: working-directory: ${{ github.workspace }} shell: cmd - - name: Download CUDA SDK v12.2 + - name: Download CUDA SDK v12.8 working-directory: ${{ runner.temp }} run: | - azcopy.exe cp --recursive 
"https://lotusscus.blob.core.windows.net/models/cuda_sdk/v12.2" . + azcopy.exe cp --recursive "https://lotusscus.blob.core.windows.net/models/cuda_sdk/v12.8" . dir shell: pwsh @@ -56,9 +56,9 @@ jobs: shell: powershell run: | Write-Host "Adding CUDA to PATH" - Write-Host "CUDA Path: $env:RUNNER_TEMP\v12.2\bin" - Add-Content -Path $env:GITHUB_PATH -Value "$env:RUNNER_TEMP\v12.2\bin" - Add-Content -Path $env:GITHUB_PATH -Value "$env:RUNNER_TEMP\v12.2\extras\CUPTI\lib64" + Write-Host "CUDA Path: $env:RUNNER_TEMP\v12.8\bin" + Add-Content -Path $env:GITHUB_PATH -Value "$env:RUNNER_TEMP\v12.8\bin" + Add-Content -Path $env:GITHUB_PATH -Value "$env:RUNNER_TEMP\v12.8\extras\CUPTI\lib64" Add-Content -Path $env:GITHUB_PATH -Value "$env:RUNNER_TEMP\TensorRT-10.9.0.34.Windows10.x86_64.cuda-12.8\lib" - uses: actions/setup-node@v5 @@ -116,7 +116,7 @@ jobs: exit $lastExitCode } # Execute the build process - python ${{ github.workspace }}\tools\ci_build\build.py --config RelWithDebInfo --parallel --use_binskim_compliant_compile_flags --build_dir build --skip_submodule_sync --build_shared_lib --build --update --cmake_generator "Visual Studio 17 2022" --build_wheel --enable_onnx_tests --use_tensorrt --tensorrt_home="${{ runner.temp }}\TensorRT-10.9.0.34.Windows10.x86_64.cuda-12.8" --cuda_home="${{ runner.temp }}\v12.2" --use_vcpkg --use_vcpkg_ms_internal_asset_cache --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86 + python ${{ github.workspace }}\tools\ci_build\build.py --config RelWithDebInfo --parallel --use_binskim_compliant_compile_flags --build_dir build --skip_submodule_sync --build_shared_lib --build --update --cmake_generator "Visual Studio 17 2022" --build_wheel --enable_onnx_tests --use_tensorrt --tensorrt_home="${{ runner.temp }}\TensorRT-10.9.0.34.Windows10.x86_64.cuda-12.8" --cuda_home="${{ runner.temp }}\v12.8" --use_vcpkg --use_vcpkg_ms_internal_asset_cache --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86 if ($lastExitCode -ne 0) { exit $lastExitCode } @@ -193,10 +193,10 @@ jobs: working-directory: ${{ github.workspace }} shell: cmd - - name: Download CUDA SDK v12.2 + - name: Download CUDA SDK v12.8 working-directory: ${{ runner.temp }} run: | - azcopy.exe cp --recursive "https://lotusscus.blob.core.windows.net/models/cuda_sdk/v12.2" . + azcopy.exe cp --recursive "https://lotusscus.blob.core.windows.net/models/cuda_sdk/v12.8" . 
dir shell: pwsh @@ -208,9 +208,9 @@ jobs: shell: powershell run: | Write-Host "Adding CUDA to PATH" - Write-Host "CUDA Path: $env:RUNNER_TEMP\v12.2\bin" - Add-Content -Path $env:GITHUB_PATH -Value "$env:RUNNER_TEMP\v12.2\bin" - Add-Content -Path $env:GITHUB_PATH -Value "$env:RUNNER_TEMP\v12.2\extras\CUPTI\lib64" + Write-Host "CUDA Path: $env:RUNNER_TEMP\v12.8\bin" + Add-Content -Path $env:GITHUB_PATH -Value "$env:RUNNER_TEMP\v12.8\bin" + Add-Content -Path $env:GITHUB_PATH -Value "$env:RUNNER_TEMP\v12.8\extras\CUPTI\lib64" Add-Content -Path $env:GITHUB_PATH -Value "$env:RUNNER_TEMP\TensorRT-10.9.0.34.Windows10.x86_64.cuda-12.8\lib" - name: Set OnnxRuntimeBuildDirectory @@ -237,7 +237,7 @@ jobs: exit $lastExitCode } - python ${{ github.workspace }}\tools\ci_build\build.py --config RelWithDebInfo --parallel --use_binskim_compliant_compile_flags --build_dir build --skip_submodule_sync --build_shared_lib --test --cmake_generator "Visual Studio 17 2022" --build_wheel --enable_onnx_tests --use_tensorrt --tensorrt_home="${{ runner.temp }}\TensorRT-10.9.0.34.Windows10.x86_64.cuda-12.8" --cuda_home="${{ runner.temp }}\v12.2" --use_vcpkg --use_vcpkg_ms_internal_asset_cache --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86 + python ${{ github.workspace }}\tools\ci_build\build.py --config RelWithDebInfo --parallel --use_binskim_compliant_compile_flags --build_dir build --skip_submodule_sync --build_shared_lib --test --cmake_generator "Visual Studio 17 2022" --build_wheel --enable_onnx_tests --use_tensorrt --tensorrt_home="${{ runner.temp }}\TensorRT-10.9.0.34.Windows10.x86_64.cuda-12.8" --cuda_home="${{ runner.temp }}\v12.8" --use_vcpkg --use_vcpkg_ms_internal_asset_cache --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86 if ($lastExitCode -ne 0) { exit $lastExitCode } diff --git a/.github/workflows/windows_x64_debug_build_x64_debug.yml b/.github/workflows/windows_x64_debug_build_x64_debug.yml index 6165375e7a54a..6a1b43e54ed89 100644 --- a/.github/workflows/windows_x64_debug_build_x64_debug.yml +++ b/.github/workflows/windows_x64_debug_build_x64_debug.yml @@ -13,7 +13,7 @@ concurrency: jobs: build_x64_debug: - runs-on: ["self-hosted", "1ES.Pool=onnxruntime-github-vs2022-mms"] + runs-on: ["self-hosted", "1ES.Pool=onnxruntime-github-vs2022-latest"] timeout-minutes: 300 steps: diff --git a/.github/workflows/windows_x64_release_build_x64_release.yml b/.github/workflows/windows_x64_release_build_x64_release.yml index f9d7b0d9e9e04..0bcd282e8dc50 100644 --- a/.github/workflows/windows_x64_release_build_x64_release.yml +++ b/.github/workflows/windows_x64_release_build_x64_release.yml @@ -13,7 +13,7 @@ concurrency: jobs: build_x64_release: - runs-on: ["self-hosted", "1ES.Pool=onnxruntime-github-vs2022-mms"] + runs-on: ["self-hosted", "1ES.Pool=onnxruntime-github-vs2022-latest"] timeout-minutes: 300 steps: diff --git a/.github/workflows/windows_x64_release_ep_generic_interface_build_x64_release_ep_generic_interface.yml b/.github/workflows/windows_x64_release_ep_generic_interface_build_x64_release_ep_generic_interface.yml index 54c13e1e04b0a..3934047266f59 100644 --- a/.github/workflows/windows_x64_release_ep_generic_interface_build_x64_release_ep_generic_interface.yml +++ b/.github/workflows/windows_x64_release_ep_generic_interface_build_x64_release_ep_generic_interface.yml @@ -13,7 +13,7 @@ concurrency: jobs: build_x64_release_ep_generic_interface: - runs-on: ["self-hosted", "1ES.Pool=onnxruntime-github-vs2022-mms"] + runs-on: ["self-hosted", "1ES.Pool=onnxruntime-github-vs2022-latest"] timeout-minutes: 300 
steps: diff --git a/.github/workflows/windows_x64_release_vitisai_build_x64_release.yml b/.github/workflows/windows_x64_release_vitisai_build_x64_release.yml index 06230962b39be..1c38d8e58970c 100644 --- a/.github/workflows/windows_x64_release_vitisai_build_x64_release.yml +++ b/.github/workflows/windows_x64_release_vitisai_build_x64_release.yml @@ -13,7 +13,7 @@ concurrency: jobs: build_x64_release_vitisai: - runs-on: ["self-hosted", "1ES.Pool=onnxruntime-github-vs2022-mms"] + runs-on: ["self-hosted", "1ES.Pool=onnxruntime-github-vs2022-latest"] timeout-minutes: 300 steps: diff --git a/.github/workflows/windows_x64_release_xnnpack.yml b/.github/workflows/windows_x64_release_xnnpack.yml index 21033ef4cbe3c..6eb9f00d3997d 100644 --- a/.github/workflows/windows_x64_release_xnnpack.yml +++ b/.github/workflows/windows_x64_release_xnnpack.yml @@ -13,7 +13,7 @@ concurrency: jobs: build_x64_release_xnnpack: - runs-on: ["self-hosted", "1ES.Pool=onnxruntime-github-vs2022-mms"] + runs-on: ["self-hosted", "1ES.Pool=onnxruntime-github-vs2022-latest"] timeout-minutes: 300 steps: diff --git a/.github/workflows/windows_x86.yml b/.github/workflows/windows_x86.yml index fa1e9362e2f34..597c1c7f4b6cf 100644 --- a/.github/workflows/windows_x86.yml +++ b/.github/workflows/windows_x86.yml @@ -13,7 +13,7 @@ concurrency: jobs: build_x86_release: - runs-on: ["self-hosted", "1ES.Pool=onnxruntime-github-vs2022-mms"] + runs-on: ["self-hosted", "1ES.Pool=onnxruntime-github-vs2022-latest"] timeout-minutes: 300 steps: diff --git a/VERSION_NUMBER b/VERSION_NUMBER index a6c2798a482eb..53cc1a6f9292c 100644 --- a/VERSION_NUMBER +++ b/VERSION_NUMBER @@ -1 +1 @@ -1.23.0 +1.24.0 diff --git a/cmake/CMakeLists.txt b/cmake/CMakeLists.txt index 793207f5b6d76..8186da507a442 100644 --- a/cmake/CMakeLists.txt +++ b/cmake/CMakeLists.txt @@ -101,7 +101,7 @@ option(onnxruntime_USE_VSINPU "Build with VSINPU support" OFF) cmake_dependent_option(onnxruntime_USE_FLASH_ATTENTION "Build flash attention kernel for scaled dot product attention" ON "onnxruntime_USE_CUDA" OFF) option(onnxruntime_USE_LEAN_ATTENTION "Build lean attention kernel for scaled dot product attention" OFF) cmake_dependent_option(onnxruntime_USE_MEMORY_EFFICIENT_ATTENTION "Build memory efficient attention kernel for scaled dot product attention" ON "onnxruntime_USE_CUDA" OFF) -cmake_dependent_option(onnxruntime_USE_FPA_INTB_GEMM "Build FpA IntB gemm cuda kernels" ON "onnxruntime_USE_CUDA" OFF) +option(onnxruntime_USE_FPA_INTB_GEMM "Build FpA IntB gemm cuda kernels" OFF) option(onnxruntime_BUILD_FOR_NATIVE_MACHINE "Enable this option for turning on optimization specific to this machine" OFF) option(onnxruntime_USE_AVX "Use AVX instructions" OFF) @@ -287,9 +287,13 @@ if (onnxruntime_ENABLE_TRAINING_APIS) endif() -# Single output director for all binaries +# Single output directory for all binaries set(RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin CACHE PATH "Single output directory for all binaries.") +# Local mirror directory of cmake dependencies +set(REPO_ROOT ${PROJECT_SOURCE_DIR}/..) +set(onnxruntime_CMAKE_DEPS_MIRROR_DIR ${REPO_ROOT}/mirror CACHE PATH "Path to the local mirror of cmake dependencies") + include(FetchContent) @@ -425,7 +429,6 @@ if (onnxruntime_EXTENDED_MINIMAL_BUILD AND NOT onnxruntime_MINIMAL_BUILD) set(onnxruntime_MINIMAL_BUILD ON) endif() -set(REPO_ROOT ${PROJECT_SOURCE_DIR}/..) 
set(ONNXRUNTIME_ROOT ${PROJECT_SOURCE_DIR}/../onnxruntime) set(ORTTRAINING_ROOT ${PROJECT_SOURCE_DIR}/../orttraining) set(ORTTRAINING_SOURCE_DIR ${ORTTRAINING_ROOT}/orttraining) diff --git a/cmake/deps.txt b/cmake/deps.txt index 7b243ff15cd80..bf76753c1b3c0 100644 --- a/cmake/deps.txt +++ b/cmake/deps.txt @@ -34,7 +34,7 @@ microsoft_gsl;https://github.com/microsoft/GSL/archive/refs/tags/v4.0.0.zip;cf36 microsoft_wil;https://github.com/microsoft/wil/archive/refs/tags/v1.0.230629.1.zip;e4a542a323c070376f7c2d1973d0f7ddbc1d2fa5 mimalloc;https://github.com/microsoft/mimalloc/archive/refs/tags/v2.1.1.zip;d5ee7d34223d0567892db5179849939c8769dc41 mp11;https://github.com/boostorg/mp11/archive/refs/tags/boost-1.82.0.zip;9bc9e01dffb64d9e0773b2e44d2f22c51aace063 -onnx;https://github.com/onnx/onnx/archive/refs/tags/v1.19.0.zip;4c798b73e131438c196e6dcb9f3393968a8936f1 +onnx;https://github.com/onnx/onnx/archive/refs/tags/v1.19.1.zip;c5215b5697dcdfd71799f001b8c4054a6bba6b09 # Use the latest commit of 10.9-GA onnx_tensorrt;https://github.com/onnx/onnx-tensorrt/archive/d5dce67db7c2e64b07e055571f5ec06f7f254de2.zip;01114d3b67650857281fa50faa2e412130a63b69 protobuf;https://github.com/protocolbuffers/protobuf/archive/refs/tags/v21.12.zip;7cf2733949036c7d52fda017badcab093fe73bfa diff --git a/cmake/external/emsdk b/cmake/external/emsdk index d49219d03a41c..419021fa04042 160000 --- a/cmake/external/emsdk +++ b/cmake/external/emsdk @@ -1 +1 @@ -Subproject commit d49219d03a41cd12f95a33ba84273c20d41fd350 +Subproject commit 419021fa040428bc69ef1559b325addb8e10211f diff --git a/cmake/external/helper_functions.cmake b/cmake/external/helper_functions.cmake index 55059b9500a8e..e8044411e4201 100644 --- a/cmake/external/helper_functions.cmake +++ b/cmake/external/helper_functions.cmake @@ -4,11 +4,11 @@ # 2. Set the cmake property COMPILE_WARNING_AS_ERROR to OFF for these external projects. 
function(onnxruntime_fetchcontent_declare contentName) + cmake_parse_arguments(PARSE_ARGV 1 ARG "" "URL;SOURCE_SUBDIR" "") + message(STATUS "Fetch ${contentName} from ${ARG_URL}") FetchContent_Declare(${ARGV}) string(TOLOWER ${contentName} contentNameLower) - list(FIND ARGN SOURCE_SUBDIR index_SOURCE_SUBDIR) - if(index_SOURCE_SUBDIR GREATER_EQUAL 0) - cmake_parse_arguments(PARSE_ARGV 1 ARG "" "SOURCE_SUBDIR" "") + if(NOT "${ARG_SOURCE_SUBDIR}" STREQUAL "") set(onnxruntime_${contentNameLower}_cmake_src_dir "${ARG_SOURCE_SUBDIR}" PARENT_SCOPE) endif() endfunction() diff --git a/cmake/external/onnx b/cmake/external/onnx index 54b72a5edd399..e709452ef2bbc 160000 --- a/cmake/external/onnx +++ b/cmake/external/onnx @@ -1 +1 @@ -Subproject commit 54b72a5edd399eb096ee09fecdef03201e9bde89 +Subproject commit e709452ef2bbc1d113faf678c24e6d3467696e83 diff --git a/cmake/external/onnxruntime_external_deps.cmake b/cmake/external/onnxruntime_external_deps.cmake index 8e1a880579b34..b6a741d8b0fe7 100644 --- a/cmake/external/onnxruntime_external_deps.cmake +++ b/cmake/external/onnxruntime_external_deps.cmake @@ -20,7 +20,7 @@ foreach(ONNXRUNTIME_DEP IN LISTS ONNXRUNTIME_DEPS_LIST) if(ONNXRUNTIME_DEP_URL MATCHES "^https://") # Search a local mirror folder - string(REGEX REPLACE "^https://" "${REPO_ROOT}/mirror/" LOCAL_URL "${ONNXRUNTIME_DEP_URL}") + string(REGEX REPLACE "^https://" "${onnxruntime_CMAKE_DEPS_MIRROR_DIR}/" LOCAL_URL "${ONNXRUNTIME_DEP_URL}") if(EXISTS "${LOCAL_URL}") cmake_path(ABSOLUTE_PATH LOCAL_URL) @@ -498,13 +498,7 @@ else() endif() if(Patch_FOUND) - set(ONNXRUNTIME_ONNX_PATCH_COMMAND - ${Patch_EXECUTABLE} --binary --ignore-whitespace -p1 < ${PROJECT_SOURCE_DIR}/patches/onnx/onnx.patch && - # Patch changes from https://github.com/onnx/onnx/pull/7253 to avoid unnecessary rebuilding. - # This change should be included in ONNX 1.19.1. 
- ${Patch_EXECUTABLE} --binary --ignore-whitespace -p1 < - ${PROJECT_SOURCE_DIR}/patches/onnx/avoid_regenerating_proto_files.patch - ) + set(ONNXRUNTIME_ONNX_PATCH_COMMAND ${Patch_EXECUTABLE} --binary --ignore-whitespace -p1 < ${PROJECT_SOURCE_DIR}/patches/onnx/onnx.patch) else() set(ONNXRUNTIME_ONNX_PATCH_COMMAND "") endif() diff --git a/cmake/onnxruntime_providers_cuda.cmake b/cmake/onnxruntime_providers_cuda.cmake index 68a3e9014b7b0..1d31eb1fbd207 100644 --- a/cmake/onnxruntime_providers_cuda.cmake +++ b/cmake/onnxruntime_providers_cuda.cmake @@ -182,8 +182,8 @@ # Since CUDA 12.8, compiling diagnostics become stricter if (CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 12.8) - target_compile_options(${target} PRIVATE "$<$:--relocatable-device-code=true>") - set_target_properties(${target} PROPERTIES CUDA_SEPARABLE_COMPILATION ON) + target_compile_options(${target} PRIVATE "$<$:--static-global-template-stub=false>") + if (MSVC) target_compile_options(${target} PRIVATE "$<$:SHELL:-Xcompiler /wd4505>") endif() diff --git a/cmake/onnxruntime_test_pch.cmake b/cmake/onnxruntime_test_pch.cmake index f989774ade35b..4a8735a9c346c 100644 --- a/cmake/onnxruntime_test_pch.cmake +++ b/cmake/onnxruntime_test_pch.cmake @@ -5,9 +5,11 @@ if(CMAKE_CXX_COMPILER_ID MATCHES "MSVC") target_precompile_headers(onnxruntime_test_all PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}/test_pch.h" ) - target_precompile_headers(onnxruntime_provider_test PRIVATE - "${CMAKE_CURRENT_SOURCE_DIR}/test_pch.h" - ) + if (TARGET onnxruntime_provider_test) + target_precompile_headers(onnxruntime_provider_test PRIVATE + "${CMAKE_CURRENT_SOURCE_DIR}/test_pch.h" + ) + endif() endif() # Exclude certain files that might conflict with PCH diff --git a/cmake/onnxruntime_unittests.cmake b/cmake/onnxruntime_unittests.cmake index 177bc4229df31..460736ff8506e 100644 --- a/cmake/onnxruntime_unittests.cmake +++ b/cmake/onnxruntime_unittests.cmake @@ -1228,6 +1228,11 @@ block() LIBS ${onnxruntime_provider_test_libs} DEPENDS ${onnxruntime_provider_test_deps} ) + if (UNIX AND (onnxruntime_USE_TENSORRT OR onnxruntime_USE_NV)) + # The test_main.cc includes NvInfer.h where it has many deprecated declarations + # simply ignore them for TensorRT EP build + set_property(TARGET onnxruntime_provider_test APPEND_STRING PROPERTY COMPILE_FLAGS "-Wno-deprecated-declarations") + endif() # enable dynamic plugin EP usage target_compile_definitions(onnxruntime_provider_test PRIVATE ORT_UNIT_TEST_ENABLE_DYNAMIC_PLUGIN_EP_USAGE) diff --git a/cmake/patches/onnx/avoid_regenerating_proto_files.patch b/cmake/patches/onnx/avoid_regenerating_proto_files.patch deleted file mode 100644 index 804dfeb8f59c2..0000000000000 --- a/cmake/patches/onnx/avoid_regenerating_proto_files.patch +++ /dev/null @@ -1,46 +0,0 @@ -diff --git a/CMakeLists.txt b/CMakeLists.txt -index 479955793..cc3ef1400 100644 ---- a/CMakeLists.txt -+++ b/CMakeLists.txt -@@ -321,7 +321,7 @@ function(RELATIVE_PROTOBUF_GENERATE_CPP SRCS) - set(${SRCS}) - - set(GEN_PROTO_PY "${ONNX_ROOT}/onnx/gen_proto.py") -- set(GENERATED_FILE_TARGETS) -+ set(GENERATED_FILES) - foreach(INFILE ${ARGN}) - set(ABS_FILE "${ONNX_ROOT}/${INFILE}") - get_filename_component(FILE_DIR ${ABS_FILE} DIRECTORY) -@@ -371,12 +371,11 @@ function(RELATIVE_PROTOBUF_GENERATE_CPP SRCS) - list(APPEND GEN_PROTO_ARGS "${ONNX_PROTOC_EXECUTABLE}") - endif() - -- add_custom_target("${GENERATED_FILE_WE}_proto_file" -- COMMAND ${ONNX_PYTHON_INTERPRETER} "${GEN_PROTO_PY}" ${GEN_PROTO_ARGS} -- BYPRODUCTS "${GENERATED_PROTO}" -- DEPENDS ${INFILE} -- COMMENT 
"Running gen_proto.py on ${INFILE}" -- ) -+ # Use add_custom_command to avoid re-generate of PROTO files -+ add_custom_command(OUTPUT "${GENERATED_PROTO}" -+ COMMAND ${ONNX_PYTHON_INTERPRETER} "${GEN_PROTO_PY}" ${GEN_PROTO_ARGS} -+ DEPENDS ${INFILE} -+ COMMENT "Running gen_proto.py on ${INFILE}") - message("Generated: ${GENERATED_PROTO}") - - set(PROTOC_ARGS -@@ -393,11 +392,10 @@ function(RELATIVE_PROTOBUF_GENERATE_CPP SRCS) - list(APPEND PROTOC_ARGS ${CMAKE_CURRENT_BINARY_DIR}) - endif() - endif() -- list(APPEND GENERATED_FILE_TARGETS ${GENERATED_FILE_WE}_proto_file) -- add_custom_target(${GENERATED_FILE_WE}_src -+ list(APPEND GENERATED_FILES "${GENERATED_PROTO}") -+ add_custom_command(OUTPUT "${OUTPUT_PB_SRC}" - COMMAND "${ONNX_PROTOC_EXECUTABLE}" ${PROTOC_ARGS} -- BYPRODUCTS "${OUTPUT_PB_SRC}" -- DEPENDS ${GENERATED_FILE_TARGETS} -+ DEPENDS ${GENERATED_FILES} - COMMENT "Running C++ protocol buffer compiler on ${GENERATED_PROTO}") - endforeach() - diff --git a/cmake/patches/onnx/onnx.patch b/cmake/patches/onnx/onnx.patch index e8ae766062d08..047cb527bb4da 100644 --- a/cmake/patches/onnx/onnx.patch +++ b/cmake/patches/onnx/onnx.patch @@ -1,5 +1,5 @@ diff --git a/CMakeLists.txt b/CMakeLists.txt -index 47995579..6cc439f6 100644 +index cc3ef140..f70312ba 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -57,6 +57,7 @@ option(ONNX_USE_LITE_PROTO "Use lite protobuf instead of full." OFF) @@ -10,7 +10,7 @@ index 47995579..6cc439f6 100644 if(WIN32) option(ONNX_USE_MSVC_STATIC_RUNTIME "Build with MSVC static runtime" OFF) endif() -@@ -411,14 +412,28 @@ relative_protobuf_generate_cpp(ONNX_PROTO_SRCS +@@ -409,14 +410,28 @@ relative_protobuf_generate_cpp(ONNX_PROTO_SRCS add_library(onnx_proto ${ONNX_PROTO_SRCS}) @@ -47,7 +47,7 @@ index 47995579..6cc439f6 100644 # Hide all symbols we don't need set_target_properties(onnx_proto PROPERTIES CXX_VISIBILITY_PRESET hidden) -@@ -440,19 +455,6 @@ add_onnx_global_defines(onnx_proto) +@@ -438,19 +453,6 @@ add_onnx_global_defines(onnx_proto) target_include_directories(onnx_proto PUBLIC $ $) @@ -68,10 +68,10 @@ index 47995579..6cc439f6 100644 if(CMAKE_SYSTEM_NAME STREQUAL "AIX") # whole-archive linker option not available on AIX. 
diff --git a/onnx/defs/nn/old.cc b/onnx/defs/nn/old.cc -index 40635f97..44770774 100644 +index ad6dd0c1..50259f32 100644 --- a/onnx/defs/nn/old.cc +++ b/onnx/defs/nn/old.cc -@@ -4090,7 +4090,6 @@ ONNX_OPERATOR_SET_SCHEMA( +@@ -4091,7 +4091,6 @@ ONNX_OPERATOR_SET_SCHEMA( GroupNormalization, 18, OpSchema() @@ -80,7 +80,7 @@ index 40635f97..44770774 100644 .Attr("epsilon", "The epsilon value to use to avoid division by zero.", AttributeProto::FLOAT, 1e-5f) .Attr( diff --git a/onnx/defs/schema.h b/onnx/defs/schema.h -index ddd95454..34647987 100644 +index 7e9bc27f..4b87c5a5 100644 --- a/onnx/defs/schema.h +++ b/onnx/defs/schema.h @@ -999,7 +999,7 @@ class OpSchemaRegistry final : public ISchemaRegistry { diff --git a/cmake/vcpkg-ports/onnx/avoid_regenerating_proto_files.patch b/cmake/vcpkg-ports/onnx/avoid_regenerating_proto_files.patch deleted file mode 100644 index 804dfeb8f59c2..0000000000000 --- a/cmake/vcpkg-ports/onnx/avoid_regenerating_proto_files.patch +++ /dev/null @@ -1,46 +0,0 @@ -diff --git a/CMakeLists.txt b/CMakeLists.txt -index 479955793..cc3ef1400 100644 ---- a/CMakeLists.txt -+++ b/CMakeLists.txt -@@ -321,7 +321,7 @@ function(RELATIVE_PROTOBUF_GENERATE_CPP SRCS) - set(${SRCS}) - - set(GEN_PROTO_PY "${ONNX_ROOT}/onnx/gen_proto.py") -- set(GENERATED_FILE_TARGETS) -+ set(GENERATED_FILES) - foreach(INFILE ${ARGN}) - set(ABS_FILE "${ONNX_ROOT}/${INFILE}") - get_filename_component(FILE_DIR ${ABS_FILE} DIRECTORY) -@@ -371,12 +371,11 @@ function(RELATIVE_PROTOBUF_GENERATE_CPP SRCS) - list(APPEND GEN_PROTO_ARGS "${ONNX_PROTOC_EXECUTABLE}") - endif() - -- add_custom_target("${GENERATED_FILE_WE}_proto_file" -- COMMAND ${ONNX_PYTHON_INTERPRETER} "${GEN_PROTO_PY}" ${GEN_PROTO_ARGS} -- BYPRODUCTS "${GENERATED_PROTO}" -- DEPENDS ${INFILE} -- COMMENT "Running gen_proto.py on ${INFILE}" -- ) -+ # Use add_custom_command to avoid re-generate of PROTO files -+ add_custom_command(OUTPUT "${GENERATED_PROTO}" -+ COMMAND ${ONNX_PYTHON_INTERPRETER} "${GEN_PROTO_PY}" ${GEN_PROTO_ARGS} -+ DEPENDS ${INFILE} -+ COMMENT "Running gen_proto.py on ${INFILE}") - message("Generated: ${GENERATED_PROTO}") - - set(PROTOC_ARGS -@@ -393,11 +392,10 @@ function(RELATIVE_PROTOBUF_GENERATE_CPP SRCS) - list(APPEND PROTOC_ARGS ${CMAKE_CURRENT_BINARY_DIR}) - endif() - endif() -- list(APPEND GENERATED_FILE_TARGETS ${GENERATED_FILE_WE}_proto_file) -- add_custom_target(${GENERATED_FILE_WE}_src -+ list(APPEND GENERATED_FILES "${GENERATED_PROTO}") -+ add_custom_command(OUTPUT "${OUTPUT_PB_SRC}" - COMMAND "${ONNX_PROTOC_EXECUTABLE}" ${PROTOC_ARGS} -- BYPRODUCTS "${OUTPUT_PB_SRC}" -- DEPENDS ${GENERATED_FILE_TARGETS} -+ DEPENDS ${GENERATED_FILES} - COMMENT "Running C++ protocol buffer compiler on ${GENERATED_PROTO}") - endforeach() - diff --git a/cmake/vcpkg-ports/onnx/binskim.patch b/cmake/vcpkg-ports/onnx/binskim.patch index e8ae766062d08..047cb527bb4da 100644 --- a/cmake/vcpkg-ports/onnx/binskim.patch +++ b/cmake/vcpkg-ports/onnx/binskim.patch @@ -1,5 +1,5 @@ diff --git a/CMakeLists.txt b/CMakeLists.txt -index 47995579..6cc439f6 100644 +index cc3ef140..f70312ba 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -57,6 +57,7 @@ option(ONNX_USE_LITE_PROTO "Use lite protobuf instead of full." 
OFF) @@ -10,7 +10,7 @@ index 47995579..6cc439f6 100644 if(WIN32) option(ONNX_USE_MSVC_STATIC_RUNTIME "Build with MSVC static runtime" OFF) endif() -@@ -411,14 +412,28 @@ relative_protobuf_generate_cpp(ONNX_PROTO_SRCS +@@ -409,14 +410,28 @@ relative_protobuf_generate_cpp(ONNX_PROTO_SRCS add_library(onnx_proto ${ONNX_PROTO_SRCS}) @@ -47,7 +47,7 @@ index 47995579..6cc439f6 100644 # Hide all symbols we don't need set_target_properties(onnx_proto PROPERTIES CXX_VISIBILITY_PRESET hidden) -@@ -440,19 +455,6 @@ add_onnx_global_defines(onnx_proto) +@@ -438,19 +453,6 @@ add_onnx_global_defines(onnx_proto) target_include_directories(onnx_proto PUBLIC $ $) @@ -68,10 +68,10 @@ index 47995579..6cc439f6 100644 if(CMAKE_SYSTEM_NAME STREQUAL "AIX") # whole-archive linker option not available on AIX. diff --git a/onnx/defs/nn/old.cc b/onnx/defs/nn/old.cc -index 40635f97..44770774 100644 +index ad6dd0c1..50259f32 100644 --- a/onnx/defs/nn/old.cc +++ b/onnx/defs/nn/old.cc -@@ -4090,7 +4090,6 @@ ONNX_OPERATOR_SET_SCHEMA( +@@ -4091,7 +4091,6 @@ ONNX_OPERATOR_SET_SCHEMA( GroupNormalization, 18, OpSchema() @@ -80,7 +80,7 @@ index 40635f97..44770774 100644 .Attr("epsilon", "The epsilon value to use to avoid division by zero.", AttributeProto::FLOAT, 1e-5f) .Attr( diff --git a/onnx/defs/schema.h b/onnx/defs/schema.h -index ddd95454..34647987 100644 +index 7e9bc27f..4b87c5a5 100644 --- a/onnx/defs/schema.h +++ b/onnx/defs/schema.h @@ -999,7 +999,7 @@ class OpSchemaRegistry final : public ISchemaRegistry { diff --git a/cmake/vcpkg-ports/onnx/portfile.cmake b/cmake/vcpkg-ports/onnx/portfile.cmake index 27f5ea5fadd79..882850963a0c0 100644 --- a/cmake/vcpkg-ports/onnx/portfile.cmake +++ b/cmake/vcpkg-ports/onnx/portfile.cmake @@ -4,12 +4,9 @@ vcpkg_from_github( OUT_SOURCE_PATH SOURCE_PATH REPO onnx/onnx REF "v${VERSION}" - SHA512 e6f7b5782a43a91783607549e4d0f0a9cbd46dfb67a602f81aaffc7bcdd8f450fe9c225f0bc314704f2923e396f0df5b03ea91af4a7887203c0b8372bc2749d0 + SHA512 cf6ff4c0bb6cc16ce5f4d6267480d35f3c7a5fde94d10e1358928ff6e4ec6d756a7c5d34a500e60bbd8eb1912c8af21aa763719321b330f56a0eb6b9b810ef60 PATCHES fix-cmakelists.patch - # Patch changes from https://github.com/onnx/onnx/pull/7253 to avoid unnecessary rebuilding. - # This change should be included in ONNX 1.19.1. - avoid_regenerating_proto_files.patch fix-dependency-protobuf.patch binskim.patch ) diff --git a/cmake/vcpkg-ports/onnx/vcpkg.json b/cmake/vcpkg-ports/onnx/vcpkg.json index 350db2e35061a..ad0d1aaf15f51 100644 --- a/cmake/vcpkg-ports/onnx/vcpkg.json +++ b/cmake/vcpkg-ports/onnx/vcpkg.json @@ -1,6 +1,6 @@ { "name": "onnx", - "version-semver": "1.19.0", + "version-semver": "1.19.1", "port-version": 1, "description": "Open standard for machine learning interoperability", "homepage": "https://onnx.ai", diff --git a/docs/How_To_Update_ONNX_Dev_Notes.md b/docs/How_To_Update_ONNX_Dev_Notes.md index 8da19ddc51cb7..8c1280431c384 100644 --- a/docs/How_To_Update_ONNX_Dev_Notes.md +++ b/docs/How_To_Update_ONNX_Dev_Notes.md @@ -35,7 +35,7 @@ git add onnx 1. Modify [cmake/vcpkg-ports/onnx/binskim.patch](/cmake/vcpkg-ports/onnx/binskim.patch) to be the same as [cmake/patches/onnx/onnx.patch](/cmake/patches/onnx/onnx.patch). 2. The other patches are required/created by vcpkg repository to build ONNX. We just need to re-run diff to makes sure the patches can be applied in the updated ONNX version. 3. Update [cmake/vcpkg-ports/onnx/portfile.cmake](/cmake/vcpkg-ports/onnx/portfile.cmake) with the correct commit id and SHA512. 
(alternatively, build it with the wrong SHA and ORT should tell you the expected one.) -4. Upload your package: [Follow the instructions](https://microsoft.sharepoint.com/teams/ONNX2/_layouts/15/Doc.aspx?sourcedoc={170774be-e1c6-4f8b-a3ae-984f211fe410}&action=edit&wd=target%28Development.)one%7C63d3ab47-51d1-4a62-9965-66882234bd44%2FAdd%20or%20Update%20a%20C%2B%2B%20dependency%7Cb6ae6a97-94fc-4436-8fc6-08c21ae895da%2F%29&wdorigin=NavigationUrl +4. Upload your package: [Follow the instructions](https://microsoft.sharepoint.com/:o:/r/teams/ONNX2/_layouts/15/Doc.aspx?sourcedoc=%7B170774BE-E1C6-4F8B-A3AE-984F211FE410%7D&wd=target(Development.one%7C63D3AB47-51D1-4A62-9965-66882234BD44%2FUpdate%20a%20VCPKG%20package%7CB6AE6A97-94FC-4436-8FC6-08C21AE895DA%2F)&wdpartid=%7BB5CF19CC-40FE-0EC7-32B6-8119B427B32A%7D%7B1%7D&wdsectionfileid=%7B9DD25660-A195-48EA-B9E0-DF8B902AFDD7%7D&ovuser=72f988bf-86f1-41af-91ab-2d7cd011db47%2Ctitaiwang%40microsoft.com&clickparams=eyJBcHBOYW1lIjoiVGVhbXMtRGVza3RvcCIsIkFwcFZlcnNpb24iOiI0OS8yNTA5MTExNjAxNiIsIkhhc0ZlZGVyYXRlZFVzZXIiOmZhbHNlfQ%3D%3D&CID=fb9dcaa1-c0b5-1000-5597-c19e3adf468c&cidOR=SPO)one%7C63d3ab47-51d1-4a62-9965-66882234bd44%2FAdd%20or%20Update%20a%20C%2B%2B%20dependency%7Cb6ae6a97-94fc-4436-8fc6-08c21ae895da%2F%29&wdorigin=NavigationUrl Alternatively, directly run Terrapin to upload ONNX package (need SHA512): diff --git a/docs/python/README.rst b/docs/python/README.rst index fdef200c1d0de..f610b36958fe1 100644 --- a/docs/python/README.rst +++ b/docs/python/README.rst @@ -8,6 +8,11 @@ For more information on ONNX Runtime, please see `aka.ms/onnxruntime = 2.6.0 diff --git a/js/.nvmrc b/js/.nvmrc new file mode 100644 index 0000000000000..0a39d73000b91 --- /dev/null +++ b/js/.nvmrc @@ -0,0 +1 @@ +v24.9.0 \ No newline at end of file diff --git a/js/common/lib/inference-session.ts b/js/common/lib/inference-session.ts index 4a670e24aa6b7..09316966a2fd1 100644 --- a/js/common/lib/inference-session.ts +++ b/js/common/lib/inference-session.ts @@ -245,7 +245,23 @@ export declare namespace InferenceSession { } export interface WebGpuExecutionProviderOption extends ExecutionProviderOption { readonly name: 'webgpu'; + + /** + * Specify the preferred layout when running layout sensitive operators. + * + * @default 'NCHW' + */ preferredLayout?: 'NCHW' | 'NHWC'; + + /** + * Specify a list of node names that should be executed on CPU even when WebGPU EP is used. + */ + forceCpuNodeNames?: readonly string[]; + + /** + * Specify an optional WebGPU device to be used by the WebGPU execution provider. + */ + device?: TryGetGlobalType<'GPUDevice'>; } // #region WebNN options diff --git a/js/common/lib/version.ts b/js/common/lib/version.ts index 994eb6f4300c1..1bf7e3ff6b819 100644 --- a/js/common/lib/version.ts +++ b/js/common/lib/version.ts @@ -4,4 +4,4 @@ // This file is generated by /js/scripts/update-version.ts // Do not modify file content manually. 
-export const version = '1.23.0'; +export const version = '1.24.0'; diff --git a/js/common/package-lock.json b/js/common/package-lock.json index 12e960e239b29..8b8fe876a16d1 100644 --- a/js/common/package-lock.json +++ b/js/common/package-lock.json @@ -1,12 +1,12 @@ { "name": "onnxruntime-common", - "version": "1.23.0", + "version": "1.24.0", "lockfileVersion": 2, "requires": true, "packages": { "": { "name": "onnxruntime-common", - "version": "1.23.0", + "version": "1.24.0", "license": "MIT", "devDependencies": { "typedoc": "^0.25.7" diff --git a/js/common/package.json b/js/common/package.json index a0eff9095e6d7..df1e50f995390 100644 --- a/js/common/package.json +++ b/js/common/package.json @@ -2,7 +2,7 @@ "license": "MIT", "type": "module", "name": "onnxruntime-common", - "version": "1.23.0", + "version": "1.24.0", "repository": { "url": "https://github.com/Microsoft/onnxruntime.git", "type": "git" diff --git a/js/node/lib/version.ts b/js/node/lib/version.ts index 994eb6f4300c1..1bf7e3ff6b819 100644 --- a/js/node/lib/version.ts +++ b/js/node/lib/version.ts @@ -4,4 +4,4 @@ // This file is generated by /js/scripts/update-version.ts // Do not modify file content manually. -export const version = '1.23.0'; +export const version = '1.24.0'; diff --git a/js/node/package-lock.json b/js/node/package-lock.json index 740be4dd8d9a3..145d11ada7aa3 100644 --- a/js/node/package-lock.json +++ b/js/node/package-lock.json @@ -1,12 +1,12 @@ { "name": "onnxruntime-node", - "version": "1.23.0", + "version": "1.24.0", "lockfileVersion": 2, "requires": true, "packages": { "": { "name": "onnxruntime-node", - "version": "1.23.0", + "version": "1.24.0", "hasInstallScript": true, "license": "MIT", "os": [ @@ -30,7 +30,7 @@ }, "../common": { "name": "onnxruntime-common", - "version": "1.23.0", + "version": "1.24.0", "license": "MIT", "devDependencies": { "typedoc": "^0.25.7" diff --git a/js/node/package.json b/js/node/package.json index 5520a48aa124a..3490ae8cf0cce 100644 --- a/js/node/package.json +++ b/js/node/package.json @@ -11,7 +11,7 @@ 6 ] }, - "version": "1.23.0", + "version": "1.24.0", "dependencies": { "adm-zip": "^0.5.16", "global-agent": "^3.0.0", diff --git a/js/node/script/install-metadata-versions.js b/js/node/script/install-metadata-versions.js index 3147f90904e7a..f03a78878788b 100644 --- a/js/node/script/install-metadata-versions.js +++ b/js/node/script/install-metadata-versions.js @@ -4,4 +4,4 @@ // This file is generated by /js/scripts/update-version.ts // Do not modify file content manually. 
-module.exports = { nuget: [{ feed: 'nuget', version: '1.23.0' }] }; +module.exports = { nuget: [{ feed: 'nuget', version: '1.24.0' }] }; diff --git a/js/node/src/session_options_helper.cc b/js/node/src/session_options_helper.cc index 7fff751a29186..9f979110fd644 100644 --- a/js/node/src/session_options_helper.cc +++ b/js/node/src/session_options_helper.cc @@ -73,12 +73,37 @@ void ParseExecutionProviders(const Napi::Array epList, Ort::SessionOptions& sess for (const auto& nameIter : obj.GetPropertyNames()) { Napi::Value nameVar = nameIter.second; std::string name = nameVar.As().Utf8Value(); - if (name != "name") { - Napi::Value valueVar = obj.Get(nameVar); - ORT_NAPI_THROW_TYPEERROR_IF(!valueVar.IsString(), epList.Env(), "Invalid argument: sessionOptions.executionProviders must be a string or an object with property 'name'."); - std::string value = valueVar.As().Utf8Value(); - webgpu_options[name] = value; + Napi::Value valueVar = obj.Get(nameVar); + std::string value; + if (name == "preferredLayout" || + name == "validationMode" || + name == "storageBufferCacheMode" || + name == "uniformBufferCacheMode" || + name == "queryResolveBufferCacheMode" || + name == "defaultBufferCacheMode") { + ORT_NAPI_THROW_TYPEERROR_IF(!valueVar.IsString(), epList.Env(), + "Invalid argument: \"", name, "\" must be a string."); + value = valueVar.As().Utf8Value(); + } else if (name == "forceCpuNodeNames") { + ORT_NAPI_THROW_TYPEERROR_IF(!valueVar.IsArray(), epList.Env(), + "Invalid argument: \"forceCpuNodeNames\" must be a string array."); + auto arr = valueVar.As(); + for (uint32_t i = 0; i < arr.Length(); i++) { + Napi::Value v = arr[i]; + ORT_NAPI_THROW_TYPEERROR_IF(!v.IsString(), epList.Env(), + "Invalid argument: elements of \"forceCpuNodeNames\" must be strings."); + if (i > 0) { + value += '\n'; + } + value += v.As().Utf8Value(); + } + } else { + // unrecognized option + ORT_NAPI_THROW_TYPEERROR_IF(name != "name", epList.Env(), + "Invalid argument: WebGPU EP has an unrecognized option: '", name, "'."); + continue; } + webgpu_options[name] = value; } } #endif diff --git a/js/react_native/lib/version.ts b/js/react_native/lib/version.ts index 994eb6f4300c1..1bf7e3ff6b819 100644 --- a/js/react_native/lib/version.ts +++ b/js/react_native/lib/version.ts @@ -4,4 +4,4 @@ // This file is generated by /js/scripts/update-version.ts // Do not modify file content manually. 
-export const version = '1.23.0'; +export const version = '1.24.0'; diff --git a/js/react_native/package-lock.json b/js/react_native/package-lock.json index ec2147b2cc4ba..f83bc60642247 100644 --- a/js/react_native/package-lock.json +++ b/js/react_native/package-lock.json @@ -1,12 +1,12 @@ { "name": "onnxruntime-react-native", - "version": "1.23.0", + "version": "1.24.0", "lockfileVersion": 3, "requires": true, "packages": { "": { "name": "onnxruntime-react-native", - "version": "1.23.0", + "version": "1.24.0", "license": "MIT", "dependencies": { "buffer": "^6.0.3", @@ -31,7 +31,7 @@ }, "../common": { "name": "onnxruntime-common", - "version": "1.23.0", + "version": "1.24.0", "license": "MIT", "devDependencies": { "typedoc": "^0.25.7" diff --git a/js/react_native/package.json b/js/react_native/package.json index 7a5ee35bdb25a..e776222c56f12 100644 --- a/js/react_native/package.json +++ b/js/react_native/package.json @@ -37,7 +37,7 @@ "registry": "https://registry.npmjs.org/" }, "source": "lib/index", - "version": "1.23.0", + "version": "1.24.0", "main": "dist/commonjs/index", "homepage": "https://github.com/microsoft/onnxruntime/blob/main/js/react_native/README.md", "files": [ diff --git a/js/web/docs/webnn-operators.md b/js/web/docs/webnn-operators.md index 295aacc6fffa3..ea88f291e5597 100644 --- a/js/web/docs/webnn-operators.md +++ b/js/web/docs/webnn-operators.md @@ -46,7 +46,7 @@ platforms. Check the [WebNN status](https://webmachinelearning.github.io/webnn-s | GatherElements | ai.onnx(11-12, 13+) | gatherElements | | | GatherND | ai.onnx(11, 12, 13+) | gatherND | Only supports 'batch_dims' == 0 | | Gelu | ai.onnx(20+) | gelu | | -| Gemm | ai.onnx(7-8, 9-10, 11-12, 13+) | gemm | Only supports 1-D 'C' input | +| Gemm | ai.onnx(7-8, 9-10, 11-12, 13+) | gemm | | | GlobalAveragePool | ai.onnx(7+) | averagePool2d | Only supports 4-D input | | GlobalMaxPool | ai.onnx(7+) | maxPool2d | Only supports 4-D input | | GlobalLpPool| ai.onnx(7+) | l2Pool2d | Only supports 4-D input, 'p' value is 2 | diff --git a/js/web/lib/version.ts b/js/web/lib/version.ts index 994eb6f4300c1..1bf7e3ff6b819 100644 --- a/js/web/lib/version.ts +++ b/js/web/lib/version.ts @@ -4,4 +4,4 @@ // This file is generated by /js/scripts/update-version.ts // Do not modify file content manually. -export const version = '1.23.0'; +export const version = '1.24.0'; diff --git a/js/web/lib/wasm/session-options.ts b/js/web/lib/wasm/session-options.ts index 52d40bb403c77..d9f3ad70f0c23 100644 --- a/js/web/lib/wasm/session-options.ts +++ b/js/web/lib/wasm/session-options.ts @@ -72,9 +72,10 @@ const appendEpOption = (epOptions: Array<[number, number]>, key: string, value: const setExecutionProviders = async ( sessionOptionsHandle: number, - executionProviders: readonly InferenceSession.ExecutionProviderConfig[], + sessionOptions: InferenceSession.SessionOptions, allocs: number[], ): Promise => { + const executionProviders = sessionOptions.executionProviders!; for (const ep of executionProviders) { let epName = typeof ep === 'string' ? 
ep : ep.name; const epOptions: Array<[number, number]> = []; @@ -98,16 +99,36 @@ const setExecutionProviders = async ( let customDevice: GPUDevice | undefined; if (typeof ep !== 'string') { - const customOptions = ep as unknown as { device: GPUDevice }; - if (customOptions.device) { - if (typeof GPUDevice !== 'undefined' && customOptions.device instanceof GPUDevice) { - customDevice = customOptions.device; + const webgpuOptions = ep as InferenceSession.WebGpuExecutionProviderOption; + + // set custom GPU device + if (webgpuOptions.device) { + if (typeof GPUDevice !== 'undefined' && webgpuOptions.device instanceof GPUDevice) { + customDevice = webgpuOptions.device; } else { throw new Error('Invalid GPU device set in WebGPU EP options.'); } } - // TODO: handle more options + // set graph capture option from session options + const { enableGraphCapture } = sessionOptions; + if (typeof enableGraphCapture === 'boolean' && enableGraphCapture) { + appendEpOption(epOptions, 'enableGraphCapture', '1', allocs); + } + + // set layout option + if (typeof webgpuOptions.preferredLayout === 'string') { + appendEpOption(epOptions, 'preferredLayout', webgpuOptions.preferredLayout, allocs); + } + + // set force CPU fallback nodes + if (webgpuOptions.forceCpuNodeNames) { + const names = Array.isArray(webgpuOptions.forceCpuNodeNames) + ? webgpuOptions.forceCpuNodeNames + : [webgpuOptions.forceCpuNodeNames]; + + appendEpOption(epOptions, 'forceCpuNodeNames', names.join('\n'), allocs); + } } const info = getInstance().webgpuRegisterDevice!(customDevice); @@ -211,7 +232,7 @@ export const setSessionOptions = async (options?: InferenceSession.SessionOption } if (sessionOptions.executionProviders) { - await setExecutionProviders(sessionOptionsHandle, sessionOptions.executionProviders, allocs); + await setExecutionProviders(sessionOptionsHandle, sessionOptions, allocs); } if (sessionOptions.enableGraphCapture !== undefined) { diff --git a/js/web/package-lock.json b/js/web/package-lock.json index 2b0a353b59832..86438200886e3 100644 --- a/js/web/package-lock.json +++ b/js/web/package-lock.json @@ -1,12 +1,12 @@ { "name": "onnxruntime-web", - "version": "1.23.0", + "version": "1.24.0", "lockfileVersion": 2, "requires": true, "packages": { "": { "name": "onnxruntime-web", - "version": "1.23.0", + "version": "1.24.0", "license": "MIT", "dependencies": { "flatbuffers": "^25.1.24", @@ -50,7 +50,7 @@ }, "../common": { "name": "onnxruntime-common", - "version": "1.23.0", + "version": "1.24.0", "license": "MIT", "devDependencies": { "typedoc": "^0.25.7" diff --git a/js/web/package.json b/js/web/package.json index d5425931bfc9e..ecd87fab4302b 100644 --- a/js/web/package.json +++ b/js/web/package.json @@ -7,7 +7,7 @@ "type": "git" }, "author": "fs-eire", - "version": "1.23.0", + "version": "1.24.0", "jsdelivr": "dist/ort.min.js", "dependencies": { "flatbuffers": "^25.1.24", diff --git a/js/web/test/e2e/exports/testcases/vite-default/package-lock.json b/js/web/test/e2e/exports/testcases/vite-default/package-lock.json index 48f0a8f3e9d5c..e880f6bca2ac4 100644 --- a/js/web/test/e2e/exports/testcases/vite-default/package-lock.json +++ b/js/web/test/e2e/exports/testcases/vite-default/package-lock.json @@ -12,7 +12,7 @@ }, "devDependencies": { "@vitejs/plugin-vue": "^5.2.1", - "vite": "^6.3.5" + "vite": "^6.3.6" } }, "node_modules/@babel/helper-string-parser": { @@ -1114,9 +1114,9 @@ } }, "node_modules/vite": { - "version": "6.3.5", - "resolved": "https://registry.npmjs.org/vite/-/vite-6.3.5.tgz", - "integrity": 
"sha512-cZn6NDFE7wdTpINgs++ZJ4N49W2vRp8LCKrn3Ob1kYNtOo21vfDoaV5GzBfLU4MovSAB8uNRm4jgzVQZ+mBzPQ==", + "version": "6.3.6", + "resolved": "https://registry.npmjs.org/vite/-/vite-6.3.6.tgz", + "integrity": "sha512-0msEVHJEScQbhkbVTb/4iHZdJ6SXp/AvxL2sjwYQFfBqleHtnCqv1J3sa9zbWz/6kW1m9Tfzn92vW+kZ1WV6QA==", "dev": true, "license": "MIT", "dependencies": { diff --git a/js/web/test/e2e/exports/testcases/vite-default/package.json b/js/web/test/e2e/exports/testcases/vite-default/package.json index f7d5751354905..84013e2aecb88 100644 --- a/js/web/test/e2e/exports/testcases/vite-default/package.json +++ b/js/web/test/e2e/exports/testcases/vite-default/package.json @@ -13,6 +13,6 @@ }, "devDependencies": { "@vitejs/plugin-vue": "^5.2.1", - "vite": "^6.3.5" + "vite": "^6.3.6" } } diff --git a/js/web/test/suite-test-list.jsonc b/js/web/test/suite-test-list.jsonc index 3f1face2a043c..80991a3ebbb5f 100644 --- a/js/web/test/suite-test-list.jsonc +++ b/js/web/test/suite-test-list.jsonc @@ -2147,66 +2147,66 @@ "test_reduce_log_sum_default", "test_reduce_log_sum_desc_axes", // tests "test_reduce_log_sum_exp_*" on opset17/opset18 are excluded because they use float64. - // "opset{7,8,9}/test_reduce_log_sum_exp_default_axes_keepdims_example", - // "opset{7,8,9}/test_reduce_log_sum_exp_default_axes_keepdims_random", - // "opset{7,8,9}/test_reduce_log_sum_exp_do_not_keepdims_example", - // "opset{7,8,9}/test_reduce_log_sum_exp_do_not_keepdims_random", - // "opset{7,8,9}/test_reduce_log_sum_exp_keepdims_example", - // "opset{7,8,9}/test_reduce_log_sum_exp_keepdims_random", - // "opset11/test_reduce_log_sum_exp_negative_axes_keepdims_example", - // "opset11/test_reduce_log_sum_exp_negative_axes_keepdims_random", + "opset{7,8,9}/test_reduce_log_sum_exp_default_axes_keepdims_example", + "opset{7,8,9}/test_reduce_log_sum_exp_default_axes_keepdims_random", + "opset{7,8,9}/test_reduce_log_sum_exp_do_not_keepdims_example", + "opset{7,8,9}/test_reduce_log_sum_exp_do_not_keepdims_random", + "opset{7,8,9}/test_reduce_log_sum_exp_keepdims_example", + "opset{7,8,9}/test_reduce_log_sum_exp_keepdims_random", + "opset11/test_reduce_log_sum_exp_negative_axes_keepdims_example", + "opset11/test_reduce_log_sum_exp_negative_axes_keepdims_random", "test_reduce_log_sum_negative_axes", "test_reduce_log_sum", "test_reduce_max_default_axes_keepdim_example", - // "test_reduce_max_default_axes_keepdims_random", - // "test_reduce_max_do_not_keepdims_example", - // "test_reduce_max_do_not_keepdims_random", - // "test_reduce_max_keepdims_example", - // "test_reduce_max_keepdims_random", - // "test_reduce_max_negative_axes_keepdims_example", - // "test_reduce_max_negative_axes_keepdims_random", - // "test_reduce_mean_default_axes_keepdims_example", - // "test_reduce_mean_default_axes_keepdims_random", - // "test_reduce_mean_do_not_keepdims_example", - // "test_reduce_mean_do_not_keepdims_random", - // "test_reduce_mean_keepdims_example", - // "test_reduce_mean_keepdims_random", - // "test_reduce_mean_negative_axes_keepdims_example", - // "test_reduce_mean_negative_axes_keepdims_random", - // "test_reduce_min_default_axes_keepdims_example", - // "test_reduce_min_default_axes_keepdims_random", - // "test_reduce_min_do_not_keepdims_example", - // "test_reduce_min_do_not_keepdims_random", - // "test_reduce_min_keepdims_example", - // "test_reduce_min_keepdims_random", - // "test_reduce_min_negative_axes_keepdims_example", - // "test_reduce_min_negative_axes_keepdims_random", - // "test_reduce_prod_default_axes_keepdims_example", - // 
"test_reduce_prod_default_axes_keepdims_random", - // "test_reduce_prod_do_not_keepdims_example", - // "test_reduce_prod_do_not_keepdims_random", - // "test_reduce_prod_keepdims_example", - // "test_reduce_prod_keepdims_random", - // "test_reduce_prod_negative_axes_keepdims_example", - // "test_reduce_prod_negative_axes_keepdims_random", - // "test_reduce_sum_default_axes_keepdims_example", - // "test_reduce_sum_default_axes_keepdims_random", - // "test_reduce_sum_do_not_keepdims_example", - // "test_reduce_sum_do_not_keepdims_random", + "test_reduce_max_default_axes_keepdims_random", + "test_reduce_max_do_not_keepdims_example", + "test_reduce_max_do_not_keepdims_random", + "test_reduce_max_keepdims_example", + "test_reduce_max_keepdims_random", + "test_reduce_max_negative_axes_keepdims_example", + "test_reduce_max_negative_axes_keepdims_random", + "test_reduce_mean_default_axes_keepdims_example", + "test_reduce_mean_default_axes_keepdims_random", + "test_reduce_mean_do_not_keepdims_example", + "test_reduce_mean_do_not_keepdims_random", + "test_reduce_mean_keepdims_example", + "test_reduce_mean_keepdims_random", + "test_reduce_mean_negative_axes_keepdims_example", + "test_reduce_mean_negative_axes_keepdims_random", + "test_reduce_min_default_axes_keepdims_example", + "test_reduce_min_default_axes_keepdims_random", + "test_reduce_min_do_not_keepdims_example", + "test_reduce_min_do_not_keepdims_random", + "test_reduce_min_keepdims_example", + "test_reduce_min_keepdims_random", + "test_reduce_min_negative_axes_keepdims_example", + "test_reduce_min_negative_axes_keepdims_random", + "test_reduce_prod_default_axes_keepdims_example", + "test_reduce_prod_default_axes_keepdims_random", + "test_reduce_prod_do_not_keepdims_example", + "test_reduce_prod_do_not_keepdims_random", + "test_reduce_prod_keepdims_example", + "test_reduce_prod_keepdims_random", + "test_reduce_prod_negative_axes_keepdims_example", + "test_reduce_prod_negative_axes_keepdims_random", + "test_reduce_sum_default_axes_keepdims_example", + "test_reduce_sum_default_axes_keepdims_random", + "test_reduce_sum_do_not_keepdims_example", + "test_reduce_sum_do_not_keepdims_random", "test_reduce_sum_empty_axes_input_noop_example", "test_reduce_sum_empty_axes_input_noop_random", - // "test_reduce_sum_keepdims_example", - // "test_reduce_sum_keepdims_random", - // "test_reduce_sum_negative_axes_keepdims_example", - // "test_reduce_sum_negative_axes_keepdims_random", - // "test_reduce_sum_square_default_axes_keepdims_example", - // "test_reduce_sum_square_default_axes_keepdims_random", - // "test_reduce_sum_square_do_not_keepdims_example", - // "test_reduce_sum_square_do_not_keepdims_random", - // "test_reduce_sum_square_keepdims_example", - // "test_reduce_sum_square_keepdims_random", - // "test_reduce_sum_square_negative_axes_keepdims_example", - // "test_reduce_sum_square_negative_axes_keepdims_random", + "test_reduce_sum_keepdims_example", + "test_reduce_sum_keepdims_random", + "test_reduce_sum_negative_axes_keepdims_example", + "test_reduce_sum_negative_axes_keepdims_random", + "test_reduce_sum_square_default_axes_keepdims_example", + "test_reduce_sum_square_default_axes_keepdims_random", + "test_reduce_sum_square_do_not_keepdims_example", + "test_reduce_sum_square_do_not_keepdims_random", + "test_reduce_sum_square_keepdims_example", + "test_reduce_sum_square_keepdims_random", + "test_reduce_sum_square_negative_axes_keepdims_example", + "test_reduce_sum_square_negative_axes_keepdims_random", // "test_reflect_pad", "test_relu", 
"test_reshape_allowzero_reordered", diff --git a/onnxruntime/__init__.py b/onnxruntime/__init__.py index 8b019f60d3e99..6f303acb4e97a 100644 --- a/onnxruntime/__init__.py +++ b/onnxruntime/__init__.py @@ -8,7 +8,7 @@ or the `Github project `_. """ -__version__ = "1.23.0" +__version__ = "1.24.0" __author__ = "Microsoft" # we need to do device version validation (for example to check Cuda version for an onnxruntime-training package). diff --git a/onnxruntime/contrib_ops/cpu/sparse/sparse_attention.cc b/onnxruntime/contrib_ops/cpu/sparse/sparse_attention.cc index 469084e7b4491..c51fc1cf54815 100644 --- a/onnxruntime/contrib_ops/cpu/sparse/sparse_attention.cc +++ b/onnxruntime/contrib_ops/cpu/sparse/sparse_attention.cc @@ -130,6 +130,11 @@ Status SparseAttention::Compute(OpKernelContext* context) const { allocator, batch_size, kv_num_heads_, sequence_length, head_size, value, V)); } + OrtValue RotaryQKV; + OrtValue RotaryQ; + OrtValue RotaryK; + T* q_rotary = Q.GetMutable()->MutableData(); + T* k_rotary = packed_qkv ? nullptr : K.GetMutable()->MutableData(); if (do_rotary_) { rotary_embedding_helper::RotaryParameters rotary_params = {}; rotary_params.batch_size = batch_size; @@ -167,30 +172,22 @@ Status SparseAttention::Compute(OpKernelContext* context) const { const T* q_input; const T* k_input; - T* q_rotary; - T* k_rotary; if (packed_qkv) { - OrtValue RotaryQKV; TensorShape qkv_shape({batch_size, num_heads_ + 2 * kv_num_heads_, sequence_length, head_size}); Tensor::InitOrtValue(element_type, qkv_shape, allocator, RotaryQKV); q_input = Q.Get().Data(); k_input = q_input + num_heads_ * sequence_length * head_size; q_rotary = RotaryQKV.GetMutable()->MutableData(); k_rotary = q_rotary + num_heads_ * sequence_length * head_size; - Q = RotaryQKV; } else { - OrtValue RotaryQ; TensorShape q_shape({batch_size, num_heads_, sequence_length, head_size}); Tensor::InitOrtValue(element_type, q_shape, allocator, RotaryQ); - OrtValue RotaryK; TensorShape k_shape({batch_size, kv_num_heads_, sequence_length, head_size}); Tensor::InitOrtValue(element_type, k_shape, allocator, RotaryK); q_input = Q.Get().Data(); k_input = K.Get().Data(); q_rotary = RotaryQ.GetMutable()->MutableData(); k_rotary = RotaryK.GetMutable()->MutableData(); - Q = RotaryQ; - K = RotaryK; } ORT_RETURN_IF_ERROR(RunRotaryEmbedding(tp, rotary_params, q_input, @@ -221,9 +218,8 @@ Status SparseAttention::Compute(OpKernelContext* context) const { ORT_RETURN_IF_ERROR(context->GetTempSpaceAllocator(&allocator)); // Compute the attention score and apply the score to V - return ApplyAttention(Q.Get().Data(), packed_qkv ? nullptr : K.Get().Data(), - packed_qkv ? nullptr : V.Get().Data(), past_key, past_value, - output, present_key, present_value, + return ApplyAttention(q_rotary, packed_qkv ? nullptr : k_rotary, packed_qkv ? 
nullptr : V.Get().Data(), + past_key, past_value, output, present_key, present_value, total_key_lengths, block_row_indices, block_col_indices, parameters, allocator, context); } } // namespace contrib diff --git a/onnxruntime/contrib_ops/cuda/llm/fpA_intB_gemm/launchers/fpA_intB_launcher_sm90.inl b/onnxruntime/contrib_ops/cuda/llm/fpA_intB_gemm/launchers/fpA_intB_launcher_sm90.inl index 4f901a550e8bf..588f37051b534 100644 --- a/onnxruntime/contrib_ops/cuda/llm/fpA_intB_gemm/launchers/fpA_intB_launcher_sm90.inl +++ b/onnxruntime/contrib_ops/cuda/llm/fpA_intB_gemm/launchers/fpA_intB_launcher_sm90.inl @@ -60,7 +60,7 @@ namespace cutlass_kernels { template -#ifdef COMPILE_HOPPER_TMA_GEMMS +#if defined(COMPILE_HOPPER_TMA_GEMMS) && defined(__CUDA_ARCH__) && (__CUDA_ARCH__ == 900) && defined(__NV_SASS_VERSION__) void sm90_generic_mixed_gemm_kernelLauncher( ActivationType const* A, WeightType const* B, ScaleZeroType const* weight_scales, ScaleZeroType const* weight_zero_points, BiasType const* biases, @@ -269,6 +269,7 @@ void sm90_generic_mixed_gemm_kernelLauncher( } } #else // COMPILE_HOPPER_TMA_GEMMS +// This stub is now used for ALL non-SASS or non-SM90A compilation passes, including the 90-virtual (PTX) pass. void sm90_generic_mixed_gemm_kernelLauncher(ActivationType const*, WeightType const*, ScaleZeroType const*, ScaleZeroType const*, BiasType const*, float const, OutputType*, int, int, int, int const, tkc::CutlassGemmConfig, diff --git a/onnxruntime/contrib_ops/cuda/llm/fpA_intB_gemm_profiler.cc b/onnxruntime/contrib_ops/cuda/llm/fpA_intB_gemm_profiler.cc index 925a6913a2890..e5b15856a6c05 100644 --- a/onnxruntime/contrib_ops/cuda/llm/fpA_intB_gemm_profiler.cc +++ b/onnxruntime/contrib_ops/cuda/llm/fpA_intB_gemm_profiler.cc @@ -14,6 +14,7 @@ * See the License for the specific language governing permissions and * limitations under the License.
*/ +#if USE_FPA_INTB_GEMM #include "contrib_ops/cuda/llm/fpA_intB_gemm_profiler.h" #include "contrib_ops/cuda/llm/common/workspace.h" @@ -97,3 +98,4 @@ bool WeightOnlyGroupwiseQuantGemmPluginProfiler::checkTactic(int m, int /*n*/, i } } // namespace onnxruntime::llm::kernels::weight_only +#endif diff --git a/onnxruntime/contrib_ops/webgpu/bert/flash_attention.cc b/onnxruntime/contrib_ops/webgpu/bert/flash_attention.cc index b5c1f73d1678d..a9bd4afc5cd09 100644 --- a/onnxruntime/contrib_ops/webgpu/bert/flash_attention.cc +++ b/onnxruntime/contrib_ops/webgpu/bert/flash_attention.cc @@ -31,6 +31,11 @@ Status CopyKVCacheProgram::GenerateShaderCode(ShaderHelper& shader) const { const auto& present_key = shader.AddOutput("present_key", ShaderUsage::UseUniform | ShaderUsage::UseIndicesTypeAlias); const auto& present_value = shader.AddOutput("present_value", ShaderUsage::UseUniform); const auto& copy_kv_shape = shader.AddIndices("copy_kv_shape"); + // If prepare_indirect_dispatch is enabled, add seqlen_k input and indirect_buffer output + if (prepare_indirect_dispatch_) { + shader.AddInput("seqlen_k", ShaderUsage::None); + shader.AddOutput("indirect_buffer", ShaderUsage::None); + } shader.MainFunctionBody() << shader.GuardAgainstOutOfBoundsWorkgroupSizes("uniforms.copy_size") << " let output_indices = " << copy_kv_shape.OffsetToIndices("global_idx") << ";\n" @@ -38,8 +43,26 @@ Status CopyKVCacheProgram::GenerateShaderCode(ShaderHelper& shader) const { " let sequence_id = output_indices[2];\n" " let num_head_id = output_indices[1];\n" " let batch = output_indices[0];\n"; + if (prepare_indirect_dispatch_) { + shader.MainFunctionBody() << " let total_seq_length = u32(seqlen_k[0u]) + 1u;\n"; + } else { + shader.MainFunctionBody() << " let total_seq_length = uniforms.total_sequence_length;\n"; + } + + // Add indirect dispatch logic for thread 0 + if (prepare_indirect_dispatch_) { + // TODO: Add NormalizeDispatchGroupSize logic here to avoid exceeding max dispatch size. + shader.MainFunctionBody() << " // Prepare indirect dispatch buffer for thread 0\n" + << " if (global_idx == 0u) {\n" + << " let num_total_seq_length_tile = (total_seq_length + uniforms.tile_size - 1u) / uniforms.tile_size;\n" + << " indirect_buffer[0] = num_total_seq_length_tile;\n" + << " indirect_buffer[1] = uniforms.num_heads;\n" + << " indirect_buffer[2] = 1u;\n" + << " }\n\n"; + } + if (has_past_) { - shader.MainFunctionBody() << "let past_sequence_length = uniforms.past_sequence_length;\n"; + shader.MainFunctionBody() << "let past_sequence_length = total_seq_length - uniforms.kv_sequence_length;\n"; if (past_present_share_buffer_) { shader.MainFunctionBody() << " let present_offset = " << present_key.IndicesToOffset("present_key_indices_t(batch, num_head_id, past_sequence_length + sequence_id, head_size_id)") << ";\n" << " let offset = " << key.IndicesToOffset(kv_BNSH_ ? 
"key_indices_t(batch, num_head_id, sequence_id, head_size_id)" : "key_indices_t(batch, sequence_id, num_head_id, head_size_id)") << ";\n" @@ -70,10 +93,12 @@ Status CopyKVCacheProgram::GenerateShaderCode(ShaderHelper& shader) const { Status CopyKVCache(onnxruntime::webgpu::ComputeContext& context, const WebgpuAttentionParameters& parameters, const Tensor* K, const Tensor* past_key, Tensor* present_key, - const Tensor* V, const Tensor* past_value, Tensor* present_value) { + const Tensor* V, const Tensor* past_value, Tensor* present_value, + uint32_t tile_size, const Tensor* seqlen_k, Tensor* indirect_buffer) { // CopyKVCache takes past key/value and current key/value and copies them to present key and value. // This makes it so that FlashAttention only needs to look at present key and value, and saves // number of input buffers in the shader, which we run out of (<=8) without this optimization. + // If indirect_buffer is provided, also prepare indirect dispatch buffer for flash attention. const int components = parameters.head_size_ % 4 == 0 ? 4 : (parameters.head_size_ % 2 == 0 ? 2 : 1); bool has_past = (parameters.total_sequence_length_ - parameters.kv_sequence_length_) > 0; // parameters.total_sequence_length_ is past_sequence_length + kv_sequence_length. @@ -83,7 +108,12 @@ Status CopyKVCache(onnxruntime::webgpu::ComputeContext& context, const WebgpuAtt int copy_sequence_length = has_past && parameters.past_present_share_buffer_ ? parameters.kv_sequence_length_ : parameters.total_sequence_length_; TensorShape copy_kv_shape{parameters.batch_size_, num_heads, copy_sequence_length, parameters.head_size_ / components}; int64_t copy_size = copy_kv_shape.Size(); - CopyKVCacheProgram program{"CopyKVCache", has_past, parameters.qkv_format_ == Q_K_V_BSNH_BNSH_BNSH, parameters.past_present_share_buffer_}; + + // Determine if we need to prepare indirect dispatch + bool prepare_indirect_dispatch = (indirect_buffer != nullptr); + + CopyKVCacheProgram program{"CopyKVCache", has_past, parameters.qkv_format_ == Q_K_V_BSNH_BNSH_BNSH, parameters.past_present_share_buffer_, + prepare_indirect_dispatch}; if (parameters.qkv_format_ == Q_K_V_BSNH_BNSH_BNSH) { program.AddInputs({{K, ProgramTensorMetadataDependency::TypeAndRank, components}, {V, ProgramTensorMetadataDependency::TypeAndRank, components}}); @@ -94,20 +124,31 @@ Status CopyKVCache(onnxruntime::webgpu::ComputeContext& context, const WebgpuAtt program.AddInputs({{K, ProgramTensorMetadataDependency::TypeAndRank, reshaped_KV_shape, components}, {V, ProgramTensorMetadataDependency::TypeAndRank, reshaped_KV_shape, components}}); } + + if (prepare_indirect_dispatch) { + program.AddInput({seqlen_k, ProgramTensorMetadataDependency::None}); + } + if (has_past && !parameters.past_present_share_buffer_) { program.AddInputs({{past_key, ProgramTensorMetadataDependency::TypeAndRank, components}, {past_value, ProgramTensorMetadataDependency::TypeAndRank, components}}); } program.AddOutputs({{present_key, ProgramTensorMetadataDependency::Rank, components}, - {present_value, ProgramTensorMetadataDependency::Rank, components}}) - .AddIndices(std::move(copy_kv_shape)); + {present_value, ProgramTensorMetadataDependency::Rank, components}}); + + if (prepare_indirect_dispatch) { + program.AddOutput({indirect_buffer, ProgramTensorMetadataDependency::None}); + } + + program.AddIndices(std::move(copy_kv_shape)); program.SetDispatchGroupSize(static_cast((copy_size + 63) / 64)) .SetWorkgroupSize(64) - .CacheHint(has_past, parameters.qkv_format_, 
parameters.past_present_share_buffer_) + .CacheHint(has_past, parameters.qkv_format_, parameters.past_present_share_buffer_, prepare_indirect_dispatch) .AddUniformVariables({{static_cast(copy_size)}, - // Note that when parameters.past_present_share_buffer_ is true, parameters.past_sequence_length_ will become to - // max_sequence_length. To get a valid past_sequence_length, we use total_sequence_length - kv_sequence_length. - {static_cast(parameters.total_sequence_length_ - parameters.kv_sequence_length_)}}); + {static_cast(parameters.total_sequence_length_)}, + {static_cast(parameters.kv_sequence_length_)}, + {tile_size}, + {static_cast(parameters.num_heads_)}}); return context.RunProgram(program); } @@ -147,6 +188,9 @@ Status FlashAttentionProgram::GenerateShaderCode(ShaderHelper& shader) const { Status FlashAttentionDecodeQKTProgram::GenerateShaderCode(ShaderHelper& shader) const { shader.AddInput("q", ShaderUsage::UseUniform | ShaderUsage::UseValueTypeAlias | ShaderUsage::UseElementTypeAlias); shader.AddInput("present_key", ShaderUsage::UseUniform | ShaderUsage::UseValueTypeAlias); + if (use_indirect_dispatch_) { + shader.AddInput("seqlens_k", ShaderUsage::None); + } if (has_attention_bias_) { shader.AddInput("attention_bias", ShaderUsage::UseUniform); } @@ -159,23 +203,25 @@ Status FlashAttentionDecodeQKTProgram::GenerateShaderCode(ShaderHelper& shader) WGSL_TEMPLATE_PARAMETER(has_attention_bias, has_attention_bias_), WGSL_TEMPLATE_PARAMETER(sub_tile_count, sub_tile_count), WGSL_TEMPLATE_PARAMETER(tile_size, tile_size_), - WGSL_TEMPLATE_PARAMETER(tile_size_k_vec, tile_size_k_vec)); + WGSL_TEMPLATE_PARAMETER(tile_size_k_vec, tile_size_k_vec), + WGSL_TEMPLATE_PARAMETER(use_indirect_dispatch, use_indirect_dispatch_)); } Status ComputeFlashAttentionDecodeQKT(onnxruntime::webgpu::ComputeContext& context, const Tensor* Q, - const Tensor* attention_bias, Tensor* output, Tensor* present_key, Tensor* metadata, - const WebgpuAttentionParameters& parameters, uint32_t num_total_seq_length_tile, - uint32_t num_present_sequence_length_tile, uint32_t tile_size, - uint32_t present_sequence_length) { + const Tensor* attention_bias, Tensor* output, Tensor* present_key, Tensor* metadata, const Tensor* seqlen_k, + const WebgpuAttentionParameters& parameters, const Tensor* indirect_buffer, uint32_t num_total_seq_length_tile, uint32_t num_present_sequence_length_tile, uint32_t tile_size, bool use_indirect_dispatch, uint32_t present_sequence_length) { const float alpha = parameters.scale_ == 0.0f ? 
1.f / sqrt(static_cast(parameters.head_size_)) : parameters.scale_; const bool has_attention_bias = attention_bias != nullptr; const int components = 4; - FlashAttentionDecodeQKTProgram program{"FlashAttentionDecodeQKT", has_attention_bias, tile_size}; + FlashAttentionDecodeQKTProgram program{"FlashAttentionDecodeQKT", has_attention_bias, tile_size, use_indirect_dispatch}; program.AddInputs({{Q, ProgramTensorMetadataDependency::TypeAndRank, components}, {present_key, ProgramTensorMetadataDependency::TypeAndRank, components}}); + if (use_indirect_dispatch) { + program.AddInput({seqlen_k, ProgramTensorMetadataDependency::None}); + } if (has_attention_bias) { program.AddInput({attention_bias, ProgramTensorMetadataDependency::TypeAndRank}); } @@ -183,15 +229,18 @@ Status ComputeFlashAttentionDecodeQKT(onnxruntime::webgpu::ComputeContext& conte {metadata, ProgramTensorMetadataDependency::Rank, 2}}); const uint32_t vectorized_head_size = parameters.head_size_ / components; - program.SetDispatchGroupSize(parameters.num_heads_ * num_total_seq_length_tile) - .SetWorkgroupSize(64) - .CacheHint(tile_size, has_attention_bias) + if (use_indirect_dispatch) { + program.SetIndirectDispatchTensor(indirect_buffer); + } else { + program.SetDispatchGroupSize(parameters.num_heads_ * num_total_seq_length_tile); + } + program.SetWorkgroupSize(64) + .CacheHint(tile_size, has_attention_bias, use_indirect_dispatch) .AddUniformVariables({{static_cast(vectorized_head_size)}, {static_cast(parameters.total_sequence_length_)}, {static_cast(alpha)}, present_sequence_length, {static_cast(parameters.n_reps)}, - {num_total_seq_length_tile}, {num_present_sequence_length_tile}, {static_cast(parameters.num_heads_)}}); @@ -202,6 +251,9 @@ Status FlashAttentionDecodeSplitVxProgram::GenerateShaderCode(ShaderHelper& shad shader.AddInput("metadata", ShaderUsage::UseUniform); shader.AddInput("qk", ShaderUsage::UseUniform | ShaderUsage::UseValueTypeAlias); shader.AddInput("present_value", ShaderUsage::UseUniform | ShaderUsage::UseValueTypeAlias | ShaderUsage::UseElementTypeAlias); + if (use_indirect_dispatch_) { + shader.AddInput("seqlens_k", ShaderUsage::None); + } shader.AddOutput("out_split_vx", ShaderUsage::UseUniform); const uint32_t tile_size_k_vec = 8u; @@ -210,7 +262,8 @@ Status FlashAttentionDecodeSplitVxProgram::GenerateShaderCode(ShaderHelper& shad WGSL_TEMPLATE_PARAMETER(head_size_vec, head_size_vec_), WGSL_TEMPLATE_PARAMETER(sub_tile_count, WorkgroupSizeX() / tile_size_k_vec), WGSL_TEMPLATE_PARAMETER(tile_size, tile_size_), - WGSL_TEMPLATE_PARAMETER(tile_size_k_vec, tile_size_k_vec)); + WGSL_TEMPLATE_PARAMETER(tile_size_k_vec, tile_size_k_vec), + WGSL_TEMPLATE_PARAMETER(use_indirect_dispatch, use_indirect_dispatch_)); } Status ComputeFlashAttentionDecodeSplitVxScore(onnxruntime::webgpu::ComputeContext& context, @@ -218,26 +271,33 @@ Status ComputeFlashAttentionDecodeSplitVxScore(onnxruntime::webgpu::ComputeConte const Tensor* qk, Tensor* out_split_vx, Tensor* present_value, + const Tensor* seqlen_k, const WebgpuAttentionParameters& parameters, + const Tensor* indirect_buffer, uint32_t num_total_seq_length_tile, uint32_t num_present_sequence_length_tile, uint32_t tile_size, + bool use_indirect_dispatch, uint32_t present_sequence_length) { const int components = 4; int head_size_vec = parameters.v_head_size_ / components; - FlashAttentionDecodeSplitVxProgram program{"FlashAttentionDecodeSplitVx", tile_size, head_size_vec}; + FlashAttentionDecodeSplitVxProgram program{"FlashAttentionDecodeSplitVx", tile_size, head_size_vec, 
use_indirect_dispatch}; program.AddInputs({{metadata, ProgramTensorMetadataDependency::TypeAndRank, 2}, {qk, ProgramTensorMetadataDependency::TypeAndRank}, {present_value, ProgramTensorMetadataDependency::TypeAndRank, components}}); program.AddOutputs({{out_split_vx, ProgramTensorMetadataDependency::TypeAndRank, components}}); // [B, N, split_k, head_size] - program.SetDispatchGroupSize(parameters.num_heads_ * num_total_seq_length_tile) - .CacheHint(tile_size, head_size_vec) + if (use_indirect_dispatch) { + program.AddInput({seqlen_k, ProgramTensorMetadataDependency::None}) + .SetIndirectDispatchTensor(indirect_buffer); + } else { + program.SetDispatchGroupSize(parameters.num_heads_ * num_total_seq_length_tile); + } + program.CacheHint(tile_size, head_size_vec, use_indirect_dispatch) .SetWorkgroupSize(64) .AddUniformVariables({{static_cast(parameters.total_sequence_length_)}, {static_cast(head_size_vec)}, present_sequence_length, {static_cast(parameters.n_reps)}, - num_total_seq_length_tile, num_present_sequence_length_tile, {static_cast(parameters.num_heads_)}}); @@ -246,27 +306,38 @@ Status ComputeFlashAttentionDecodeSplitVxScore(onnxruntime::webgpu::ComputeConte Status FlashAttentionDecodeVxReduceProgram::GenerateShaderCode(ShaderHelper& shader) const { shader.AddInput("input", ShaderUsage::UseUniform); + if (use_indirect_dispatch_) { + shader.AddInput("seqlens_k", ShaderUsage::None); + } shader.AddOutput("output", ShaderUsage::UseUniform | ShaderUsage::UseValueTypeAlias); return WGSL_TEMPLATE_APPLY(shader, "bert/flash_attention_decode_vx_reduce.wgsl.template", - WGSL_TEMPLATE_PARAMETER(tile_size, tile_size_)); + WGSL_TEMPLATE_PARAMETER(seq_tile_size, seq_tile_size_), + WGSL_TEMPLATE_PARAMETER(tile_size, tile_size_), + WGSL_TEMPLATE_PARAMETER(use_indirect_dispatch, use_indirect_dispatch_)); } Status ComputeFlashAttentionDecodeVxReduce(onnxruntime::webgpu::ComputeContext& context, const Tensor* out_split_vx, Tensor* output, + const Tensor* seqlen_k, const WebgpuAttentionParameters& parameters, uint32_t num_total_seq_length_tile, - uint32_t num_present_sequence_length_tile) { + uint32_t num_present_sequence_length_tile, + uint32_t seq_tile_size, + bool use_indirect_dispatch) { const int components = 4; constexpr int tile_size = 8; int tile_head_size = tile_size * components; - FlashAttentionDecodeVxReduceProgram program{"FlashAttentionDecodeVxReduce", tile_size}; + FlashAttentionDecodeVxReduceProgram program{"FlashAttentionDecodeVxReduce", tile_size, seq_tile_size, use_indirect_dispatch}; program.AddInputs({{out_split_vx, ProgramTensorMetadataDependency::TypeAndRank, components}}); + if (use_indirect_dispatch) { + program.AddInput({seqlen_k, ProgramTensorMetadataDependency::None}); + } program.AddOutputs({{output, ProgramTensorMetadataDependency::TypeAndRank, components}}); const uint32_t num_head_size_tile = static_cast((parameters.v_head_size_ + tile_head_size - 1) / tile_head_size); program.SetDispatchGroupSize(parameters.num_heads_ * num_head_size_tile) - .CacheHint(tile_size) + .CacheHint(tile_size, seq_tile_size, use_indirect_dispatch) .SetWorkgroupSize(tile_size * tile_size) .AddUniformVariables({{static_cast(parameters.v_head_size_ / components)}, num_total_seq_length_tile, @@ -279,14 +350,15 @@ Status ComputeFlashAttentionDecodeVxReduce(onnxruntime::webgpu::ComputeContext& Status ApplyFlashAttention(const Tensor* Q, const Tensor* K, const Tensor* V, const Tensor* attention_bias, Tensor* output, const Tensor* past_key, Tensor* present_key, const Tensor* past_value, Tensor* 
present_value, - const WebgpuAttentionParameters& parameters, onnxruntime::webgpu::ComputeContext& context) { - ORT_RETURN_IF_ERROR(CopyKVCache(context, parameters, K, past_key, present_key, V, past_value, present_value)); - + const WebgpuAttentionParameters& parameters, onnxruntime::webgpu::ComputeContext& context, const Tensor* seqlen_k) { // Extract present_sequence_length directly from present_key tensor shape: // (batch_size, num_heads, total_sequence_length/max_sequence_length, head_size) const uint32_t present_sequence_length = static_cast(present_key->Shape()[2]); + if (parameters.sequence_length_ > 1) { const uint32_t tile_size = 64; + // For encode path, use the original CopyKVCache without indirect dispatch preparation + ORT_RETURN_IF_ERROR(CopyKVCache(context, parameters, K, past_key, present_key, V, past_value, present_value, tile_size, seqlen_k, nullptr)); bool has_attention_bias = attention_bias != nullptr; bool is_qualcomm = context.AdapterInfo().vendor == std::string_view{"qualcomm"}; bool is_nvidia = context.AdapterInfo().vendor == std::string_view{"nvidia"}; @@ -323,7 +395,7 @@ Status ApplyFlashAttention(const Tensor* Q, const Tensor* K, const Tensor* V, co return context.RunProgram(program); } - // Use present_sequence_length instead of total_sequence_length to make sure the |qk| buffer is static when static qv cache is enabled. + // For decode path (sequence_length == 1) const TensorShapeVector qk_dims({parameters.batch_size_, parameters.num_heads_, parameters.sequence_length_, present_sequence_length}); const TensorShape qk_shape(qk_dims); @@ -331,21 +403,48 @@ Status ApplyFlashAttention(const Tensor* Q, const Tensor* K, const Tensor* V, co constexpr uint32_t tile_size = 64; const uint32_t num_total_seq_length_tile = (parameters.total_sequence_length_ + tile_size - 1) / tile_size; const uint32_t num_present_sequence_length_tile = (present_sequence_length + tile_size - 1) / tile_size; + + // Determine if we should use indirect dispatch + const bool use_indirect_dispatch = parameters.past_present_share_buffer_ && + seqlen_k != nullptr && + context.IsGraphCaptureEnabled(); + + // Create indirect dispatch buffer if using indirect dispatch + Tensor* indirect_buffer_ptr = nullptr; + Tensor indirect_buffer; + if (use_indirect_dispatch) { + const TensorShape indirect_buffer_shape{3}; // 3 uint32 values for dispatch dimensions + indirect_buffer = context.CreateGPUTensor(DataTypeImpl::GetType(), indirect_buffer_shape); + indirect_buffer_ptr = &indirect_buffer; + // Use the fused CopyKVCache that also prepares the indirect dispatch buffer + ORT_RETURN_IF_ERROR(CopyKVCache(context, parameters, K, past_key, present_key, V, past_value, present_value, tile_size, seqlen_k, indirect_buffer_ptr)); + } else { + // Use the original CopyKVCache without indirect dispatch preparation + ORT_RETURN_IF_ERROR(CopyKVCache(context, parameters, K, past_key, present_key, V, past_value, present_value, tile_size, seqlen_k, nullptr)); + } + // The metadata is used to store the max and sum of each tile. 
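// Layout per (batch, head): num_present_sequence_length_tile pairs of (tile max, tile sum of exp).
// Roughly, the decode path is a split-K online softmax (a sketch of the math, not the shader code):
//   QKT pass, per tile t:     m_t = max_i qk[t, i];   s_t = sum_i exp(qk[t, i] - m_t)
//   SplitVx pass, per tile t: M = max_t m_t;   S = sum_t s_t * exp(m_t - M);
//                             partial_t = sum_i (exp(qk[t, i] - M) / S) * v[t, i]
//   VxReduce pass:            output = sum_t partial_t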
const TensorShapeVector metadata_dims({parameters.batch_size_, parameters.num_heads_, num_present_sequence_length_tile, 2}); const TensorShape metadata_shape(metadata_dims); Tensor metadata = context.CreateGPUTensor(DataTypeImpl::GetType(), metadata_shape); - ORT_RETURN_IF_ERROR(ComputeFlashAttentionDecodeQKT(context, Q, attention_bias, &qk, present_key, &metadata, - parameters, num_total_seq_length_tile, num_present_sequence_length_tile, tile_size, + ORT_RETURN_IF_ERROR(ComputeFlashAttentionDecodeQKT(context, Q, attention_bias, &qk, present_key, &metadata, seqlen_k, + parameters, indirect_buffer_ptr, num_total_seq_length_tile, + num_present_sequence_length_tile, tile_size, use_indirect_dispatch, present_sequence_length)); - const TensorShapeVector out_split_vx_dims({parameters.batch_size_, parameters.num_heads_, num_present_sequence_length_tile, parameters.head_size_}); + const TensorShapeVector out_split_vx_dims({parameters.batch_size_, parameters.num_heads_, + num_present_sequence_length_tile, parameters.head_size_}); const TensorShape out_split_vx_shape(out_split_vx_dims); Tensor out_split_vx = context.CreateGPUTensor(Q->DataType(), out_split_vx_shape); - ORT_RETURN_IF_ERROR(ComputeFlashAttentionDecodeSplitVxScore(context, &metadata, &qk, &out_split_vx, present_value, parameters, - num_total_seq_length_tile, num_present_sequence_length_tile, tile_size, present_sequence_length)); - ORT_RETURN_IF_ERROR(ComputeFlashAttentionDecodeVxReduce(context, &out_split_vx, output, parameters, num_total_seq_length_tile, num_present_sequence_length_tile)); + ORT_RETURN_IF_ERROR(ComputeFlashAttentionDecodeSplitVxScore(context, &metadata, &qk, &out_split_vx, present_value, + seqlen_k, parameters, indirect_buffer_ptr, + num_total_seq_length_tile, + num_present_sequence_length_tile, tile_size, + use_indirect_dispatch, present_sequence_length)); + ORT_RETURN_IF_ERROR(ComputeFlashAttentionDecodeVxReduce(context, &out_split_vx, output, seqlen_k, parameters, + num_total_seq_length_tile, + num_present_sequence_length_tile, tile_size, use_indirect_dispatch)); return Status::OK(); } diff --git a/onnxruntime/contrib_ops/webgpu/bert/flash_attention.h b/onnxruntime/contrib_ops/webgpu/bert/flash_attention.h index c75494df253c1..7d71dc0f4d42d 100644 --- a/onnxruntime/contrib_ops/webgpu/bert/flash_attention.h +++ b/onnxruntime/contrib_ops/webgpu/bert/flash_attention.h @@ -17,19 +17,24 @@ using namespace onnxruntime::webgpu; class CopyKVCacheProgram final : public Program { public: - CopyKVCacheProgram(const std::string& kernel_name, bool has_past, bool kv_BNSH, bool past_present_share_buffer) - : Program{kernel_name}, has_past_(has_past), kv_BNSH_(kv_BNSH), past_present_share_buffer_(past_present_share_buffer) { + CopyKVCacheProgram(const std::string& kernel_name, bool has_past, bool kv_BNSH, bool past_present_share_buffer, + bool prepare_indirect_dispatch = false) + : Program{kernel_name}, has_past_(has_past), kv_BNSH_(kv_BNSH), past_present_share_buffer_(past_present_share_buffer), prepare_indirect_dispatch_(prepare_indirect_dispatch) { } Status GenerateShaderCode(ShaderHelper& sh) const override; WEBGPU_PROGRAM_DEFINE_UNIFORM_VARIABLES({"copy_size", ProgramUniformVariableDataType::Uint32}, - {"past_sequence_length", ProgramUniformVariableDataType::Uint32}); + {"total_sequence_length", ProgramUniformVariableDataType::Uint32}, + {"kv_sequence_length", ProgramUniformVariableDataType::Uint32}, + {"tile_size", ProgramUniformVariableDataType::Uint32}, + {"num_heads", ProgramUniformVariableDataType::Uint32}); private: 
bool has_past_; bool kv_BNSH_; bool past_present_share_buffer_; + bool prepare_indirect_dispatch_; }; class FlashAttentionProgram final : public Program { @@ -75,8 +80,8 @@ class FlashAttentionProgram final : public Program { class FlashAttentionDecodeQKTProgram final : public Program { public: FlashAttentionDecodeQKTProgram(const std::string& kernel_name, - bool has_attention_bias, uint32_t tile_size) - : Program{kernel_name}, has_attention_bias_(has_attention_bias), tile_size_(tile_size) { + bool has_attention_bias, uint32_t tile_size, bool use_indirect_dispatch) + : Program{kernel_name}, has_attention_bias_(has_attention_bias), tile_size_(tile_size), use_indirect_dispatch_(use_indirect_dispatch) { } Status GenerateShaderCode(ShaderHelper& sh) const override; @@ -86,19 +91,19 @@ class FlashAttentionDecodeQKTProgram final : public Program { public: - FlashAttentionDecodeSplitVxProgram(const std::string& kernel_name, uint32_t tile_size, int head_size_vec) - : Program{kernel_name}, tile_size_(tile_size), head_size_vec_(head_size_vec) { + FlashAttentionDecodeSplitVxProgram(const std::string& kernel_name, uint32_t tile_size, int head_size_vec, bool use_indirect_dispatch) + : Program{kernel_name}, tile_size_(tile_size), head_size_vec_(head_size_vec), use_indirect_dispatch_(use_indirect_dispatch) { } Status GenerateShaderCode(ShaderHelper& sh) const override; @@ -107,19 +112,19 @@ class FlashAttentionDecodeSplitVxProgram final : public Program { public: - FlashAttentionDecodeVxReduceProgram(const std::string& kernel_name, uint32_t tile_size) - : Program{kernel_name}, tile_size_(tile_size) { + FlashAttentionDecodeVxReduceProgram(const std::string& kernel_name, uint32_t tile_size, uint32_t seq_tile_size, bool use_indirect_dispatch) + : Program{kernel_name}, tile_size_(tile_size), seq_tile_size_(seq_tile_size), use_indirect_dispatch_(use_indirect_dispatch) { } Status GenerateShaderCode(ShaderHelper& sh) const override; @@ -132,11 +137,13 @@ class FlashAttentionDecodeVxReduceProgram final : public Program tile_qk: array; $MAIN { let local_row = u32(local_idx / tile_size_k_vec); let local_col = local_idx % tile_size_k_vec; - let total_seq_offset = (workgroup_idx % uniforms.num_total_seq_length_tile) * tile_size; - let head_idx = u32(workgroup_idx / uniforms.num_total_seq_length_tile); +#if use_indirect_dispatch + let total_sequence_length = u32(seqlens_k[0]) + 1u; +#else + let total_sequence_length = uniforms.total_sequence_length; +#endif + let num_total_seq_length_tile = (total_sequence_length + tile_size - 1) / tile_size; + let total_seq_offset = (workgroup_idx % num_total_seq_length_tile) * tile_size; + let head_idx = u32(workgroup_idx / num_total_seq_length_tile); let q_offset = head_idx * uniforms.head_size_vec; - var total_sequence_length = uniforms.total_sequence_length; let present_offset = u32(head_idx / uniforms.n_reps) * uniforms.present_sequence_length * uniforms.head_size_vec; for (var k: u32 = 0u; k < uniforms.head_size_vec; k += tile_size_k_vec) { if (local_idx < tile_size_k_vec && k + local_idx < uniforms.head_size_vec) { @@ -95,7 +101,7 @@ $MAIN { for (var i = 0u; i < tile_size && (total_seq_offset + i) < total_sequence_length; i++) { l_sum += exp(f32(tile_qk[i]) - l_max); } - let meta_offset = head_idx * uniforms.num_present_sequence_length_tile + workgroup_idx % uniforms.num_total_seq_length_tile; + let meta_offset = head_idx * uniforms.num_present_sequence_length_tile + workgroup_idx % num_total_seq_length_tile; metadata[meta_offset] = metadata_value_t(l_max, l_sum); } } diff 
--git a/onnxruntime/contrib_ops/webgpu/bert/flash_attention_decode_split_vx.wgsl.template b/onnxruntime/contrib_ops/webgpu/bert/flash_attention_decode_split_vx.wgsl.template index c7593af311ce2..37cf7e8f11b1f 100644 --- a/onnxruntime/contrib_ops/webgpu/bert/flash_attention_decode_split_vx.wgsl.template +++ b/onnxruntime/contrib_ops/webgpu/bert/flash_attention_decode_split_vx.wgsl.template @@ -5,6 +5,7 @@ #param head_size_vec #param tile_size_k_vec #param sub_tile_count +#param use_indirect_dispatch // Note that this shader adopts similar algorithm with dp4a generation shader. // @@ -40,9 +41,14 @@ var qkv_values: array, $MAIN { let local_row = u32(local_idx / tile_size_k_vec); let local_col = local_idx % tile_size_k_vec; - let total_seq_offset = (workgroup_idx % uniforms.num_total_seq_length_tile) * tile_size; - let head_idx = u32(workgroup_idx / uniforms.num_total_seq_length_tile); - var total_sequence_length = uniforms.total_sequence_length; + #if use_indirect_dispatch + let total_sequence_length = u32(seqlens_k[0]) + 1u; + #else + let total_sequence_length = uniforms.total_sequence_length; + #endif + let num_total_seq_length_tile = (total_sequence_length + tile_size - 1) / tile_size; + let total_seq_offset = (workgroup_idx % num_total_seq_length_tile) * tile_size; + let head_idx = u32(workgroup_idx / num_total_seq_length_tile); let present_offset = u32(head_idx / uniforms.n_reps) * head_size_vec * uniforms.present_sequence_length; // Calculate the global max and sum in qk. @@ -50,12 +56,12 @@ $MAIN { { var g_max = f32(-3.402823e+38f); var g_sum = f32(0); - for (var i = 0u; i < uniforms.num_total_seq_length_tile; i++) + for (var i = 0u; i < num_total_seq_length_tile; i++) { let meta_offset = head_idx * uniforms.num_present_sequence_length_tile + i; g_max = max(g_max, metadata[meta_offset].x); } - for (var i = 0u; i < uniforms.num_total_seq_length_tile; i++) + for (var i = 0u; i < num_total_seq_length_tile; i++) { let meta_offset = head_idx * uniforms.num_present_sequence_length_tile + i; let m_value = metadata[meta_offset]; @@ -95,7 +101,7 @@ $MAIN { } for (var i = local_idx; i < head_size_vec; i += workgroup_size_x) { - let out_offset = head_idx * uniforms.num_present_sequence_length_tile * head_size_vec + (workgroup_idx % uniforms.num_total_seq_length_tile) * head_size_vec + i; + let out_offset = head_idx * uniforms.num_present_sequence_length_tile * head_size_vec + (workgroup_idx % num_total_seq_length_tile) * head_size_vec + i; out_split_vx[out_offset] = tile_output[i]; } } diff --git a/onnxruntime/contrib_ops/webgpu/bert/flash_attention_decode_vx_reduce.wgsl.template b/onnxruntime/contrib_ops/webgpu/bert/flash_attention_decode_vx_reduce.wgsl.template index a4381baa638ce..22f18655307de 100644 --- a/onnxruntime/contrib_ops/webgpu/bert/flash_attention_decode_vx_reduce.wgsl.template +++ b/onnxruntime/contrib_ops/webgpu/bert/flash_attention_decode_vx_reduce.wgsl.template @@ -1,7 +1,9 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. +#param seq_tile_size #param tile_size +#param use_indirect_dispatch // Inputs are splits of the GQA output, split into num_total_seq_length_tiles // rows. 
This shader needs to add these splits across the row dimension to @@ -23,10 +25,16 @@ $MAIN { var value = output_value_t(0); let local_row = u32(local_idx / tile_size); let local_col = local_idx % tile_size; + #if use_indirect_dispatch + let total_sequence_length = u32(seqlens_k[0]) + 1u; + let num_total_seq_length_tile = (total_sequence_length + seq_tile_size - 1) / seq_tile_size; + #else + let num_total_seq_length_tile = uniforms.num_total_seq_length_tile; + #endif if (head_size_offset + local_col < uniforms.head_size_vec) { - for (var r = 0u; r < uniforms.num_total_seq_length_tile; r += tile_size) { - if (r + local_row < uniforms.num_total_seq_length_tile) { + for (var r = 0u; r < num_total_seq_length_tile; r += tile_size) { + if (r + local_row < num_total_seq_length_tile) { value += input[in_offset + (r + local_row) * uniforms.head_size_vec + head_size_offset + local_col]; } } diff --git a/onnxruntime/contrib_ops/webgpu/bert/group_query_attention.cc b/onnxruntime/contrib_ops/webgpu/bert/group_query_attention.cc index 8b7b257dd2852..49cc0209785c5 100644 --- a/onnxruntime/contrib_ops/webgpu/bert/group_query_attention.cc +++ b/onnxruntime/contrib_ops/webgpu/bert/group_query_attention.cc @@ -110,35 +110,87 @@ Status GeneratePositionIDs(onnxruntime::webgpu::ComputeContext& context, const W return context.RunProgram(program); } -Status RunRotaryEmbedding(onnxruntime::webgpu::ComputeContext& context, const WebgpuAttentionParameters& params, const Tensor* input, const Tensor* pos_ids, const Tensor* cos_cache, const Tensor* sin_cache, Tensor* output, bool is_query_input) { +// Fused Q/K rotary embedding +Status RunFusedQKRotaryEmbedding(onnxruntime::webgpu::ComputeContext& context, + const WebgpuAttentionParameters& params, + const Tensor* query_in, + const Tensor* key_in, + const Tensor* seqlen_k, + const Tensor* cos_cache, + const Tensor* sin_cache, + Tensor* query_out, + Tensor* key_out) { + Tensor pos_ids = context.CreateGPUTensor(DataTypeImpl::GetType(), + TensorShape({params.batch_size_, params.sequence_length_})); + ORT_RETURN_IF_ERROR(GeneratePositionIDs(context, params, seqlen_k, &pos_ids)); + const auto half_rotary_embedding_dim = gsl::narrow_cast(cos_cache->Shape()[1]); const auto head_size = params.head_size_; - const auto hidden_size = is_query_input ? 
params.hidden_size_ : params.kv_hidden_size_; - const TensorShape global_shape({params.batch_size_, params.sequence_length_, hidden_size / head_size, static_cast(head_size - half_rotary_embedding_dim)}); - const auto rank = global_shape.NumDimensions(); - std::vector global_dims(rank); - std::vector global_strides(rank); + + // Build Q domain + const auto hidden_size_q = params.hidden_size_; + const TensorShape q_global_shape({params.batch_size_, params.sequence_length_, + hidden_size_q / head_size, + static_cast(head_size - half_rotary_embedding_dim)}); + const auto rank = q_global_shape.NumDimensions(); + std::vector q_global_dims(rank); + std::vector q_global_strides(rank); for (size_t j = 0; j < rank; ++j) { - global_dims[j] = gsl::narrow_cast(global_shape[j]); - global_strides[j] = gsl::narrow_cast(global_shape.SizeFromDimension(j + 1)); + q_global_dims[j] = gsl::narrow_cast(q_global_shape[j]); + q_global_strides[j] = gsl::narrow_cast(q_global_shape.SizeFromDimension(j + 1)); } - const auto input_output_strides = std::vector({gsl::narrow_cast(input->Shape().SizeFromDimension(1)), gsl::narrow_cast(hidden_size), gsl::narrow_cast(head_size), 1}); - const auto output_size = gsl::narrow_cast(global_shape.Size()); - RotaryEmbeddingProgram program(params.rotary_interleaved_); + // Build K domain + const auto hidden_size_k = params.kv_hidden_size_; + const TensorShape k_global_shape({params.batch_size_, params.sequence_length_, + hidden_size_k / head_size, + static_cast(head_size - half_rotary_embedding_dim)}); + std::vector k_global_dims(rank); + for (size_t j = 0; j < rank; ++j) { + k_global_dims[j] = gsl::narrow_cast(k_global_shape[j]); + } + + const auto q_domain_size = gsl::narrow_cast(q_global_shape.Size()); + + const auto q_input_output_strides = std::vector( + {gsl::narrow_cast(query_in->Shape().SizeFromDimension(1)), + gsl::narrow_cast(hidden_size_q), + gsl::narrow_cast(head_size), + 1u}); + + const auto k_input_output_strides = std::vector( + {gsl::narrow_cast(key_in->Shape().SizeFromDimension(1)), + gsl::narrow_cast(hidden_size_k), + gsl::narrow_cast(head_size), + 1u}); + + // Dispatch computations only over the Q domain, and fuse K write operations using a head-index-based condition. 
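// In GQA num_heads_ >= kv_num_heads_, so the K rotation domain is a subset of the Q domain: each
// invocation rotates one Q element and, when its head index also exists in K
// (bsnh[2] < k_global_shape[2] in the shader), the matching K element, so a single dispatch
// replaces the two RunRotaryEmbedding dispatches used before. The rotation is the usual pairwise
// form (sketched here; j = i + half_rotary_dim, or i + 1 when interleaved):
//   q_out[i] = q[i] * cos - q[j] * sin;   q_out[j] = q[i] * sin + q[j] * cos;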
+ FusedQKRotaryEmbeddingProgram program(params.rotary_interleaved_); program .CacheHint(params.rotary_interleaved_) - .AddInputs({{input, ProgramTensorMetadataDependency::Rank}, - {pos_ids, ProgramTensorMetadataDependency::Rank}, - {cos_cache, ProgramTensorMetadataDependency::Rank}, - {sin_cache, ProgramTensorMetadataDependency::Rank}}) - .AddOutput(output) - .SetDispatchGroupSize((output_size + WORKGROUP_SIZE - 1) / WORKGROUP_SIZE) - .AddUniformVariables({{params.scale_}, - {gsl::make_span(global_dims)}, - {gsl::make_span(global_strides)}, - {gsl::make_span(input_output_strides)}}) + .AddInputs({ + {query_in, ProgramTensorMetadataDependency::Rank}, + {key_in, ProgramTensorMetadataDependency::Rank}, + {&pos_ids, ProgramTensorMetadataDependency::Rank}, + {cos_cache, ProgramTensorMetadataDependency::Rank}, + {sin_cache, ProgramTensorMetadataDependency::Rank}, + }) + .AddOutputs({ + {query_out, ProgramTensorMetadataDependency::Rank}, + {key_out, ProgramTensorMetadataDependency::Rank}, + }) + .SetDispatchGroupSize((q_domain_size + WORKGROUP_SIZE - 1) / WORKGROUP_SIZE) + .AddUniformVariables({ + {params.scale_}, + {gsl::make_span(q_global_dims)}, + {gsl::make_span(q_global_strides)}, + {gsl::make_span(q_input_output_strides)}, + {gsl::make_span(k_global_dims)}, + {gsl::make_span(k_input_output_strides)}, + {q_domain_size}, + }) .AddIndices(TensorShape{1, 1}); + return context.RunProgram(program); } @@ -199,15 +251,6 @@ Status GroupQueryAttention::ComputeInternal(onnxruntime::webgpu::ComputeContext& parameters.past_present_share_buffer_ = present_key != nullptr && present_value != nullptr && past_key != nullptr && past_value != nullptr && past_key->DataRaw() == present_key->DataRaw() && past_value->DataRaw() == present_value->DataRaw(); ORT_ENFORCE(parameters.total_sequence_length_ <= parameters.seqlen_present_kv_cache_, "Total sequence length cannot be greater than the existing KV cache length."); - // Use a sliding window if the total sequence exceeds the window's length. 
- bool use_sliding_window = (local_window_size_ != -1 && local_window_size_ < parameters.total_sequence_length_); - if (!do_rotary_ && - head_sink == nullptr && !use_smooth_softmax_ && - !use_sliding_window && - CanApplyFlashAttention(attention_bias, present_key, present_value, parameters, context)) { - return ApplyFlashAttention(query, key, value, attention_bias, output, past_key, present_key, past_value, - present_value, parameters, context); - } Tensor qSplit; Tensor kSplit; @@ -218,6 +261,7 @@ Status GroupQueryAttention::ComputeInternal(onnxruntime::webgpu::ComputeContext& vSplit = context.CreateGPUTensor(query->DataType(), TensorShape({parameters.batch_size_, parameters.sequence_length_, parameters.kv_hidden_size_})); ORT_RETURN_IF_ERROR(SplitPackedQKV(context, parameters, query, &qSplit, &kSplit, &vSplit)); parameters.is_packed_qkv_ = false; + parameters.qkv_format_ = Q_K_V_BSNH; query = &qSplit; key = &kSplit; value = &vSplit; @@ -228,15 +272,24 @@ Status GroupQueryAttention::ComputeInternal(onnxruntime::webgpu::ComputeContext& if (do_rotary_) { qRotary = context.CreateGPUTensor(query->DataType(), query->Shape()); kRotary = context.CreateGPUTensor(key->DataType(), key->Shape()); - auto pos_ids_shape = TensorShape({parameters.batch_size_, parameters.sequence_length_}); - Tensor pos_ids = context.CreateGPUTensor(DataTypeImpl::GetType(), pos_ids_shape); - ORT_RETURN_IF_ERROR(GeneratePositionIDs(context, parameters, seqlen_k, &pos_ids)); - ORT_RETURN_IF_ERROR(RunRotaryEmbedding(context, parameters, query, &pos_ids, cos_cache, sin_cache, &qRotary, /* is_query_input = */ true)); - ORT_RETURN_IF_ERROR(RunRotaryEmbedding(context, parameters, key, &pos_ids, cos_cache, sin_cache, &kRotary, /* is_query_input = */ false)); + ORT_RETURN_IF_ERROR(RunFusedQKRotaryEmbedding(context, parameters, + query, key, + seqlen_k, + cos_cache, sin_cache, + &qRotary, &kRotary)); query = &qRotary; key = &kRotary; } + // Use a sliding window if the total sequence exceeds the window's length. 
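// (The FlashAttention early-out now sits after the rotary block: Q/K are already rotated at this
// point, so the old !do_rotary_ guard is no longer needed to force the generic attention path.)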
+ bool use_sliding_window = (local_window_size_ != -1 && local_window_size_ < parameters.total_sequence_length_); + if (head_sink == nullptr && !use_smooth_softmax_ && + !use_sliding_window && + CanApplyFlashAttention(attention_bias, present_key, present_value, parameters, context)) { + return ApplyFlashAttention(query, key, value, attention_bias, output, past_key, present_key, past_value, + present_value, parameters, context); + } + TensorShapeVector q_new_dims({parameters.batch_size_, parameters.num_heads_, parameters.sequence_length_, parameters.head_size_}); TensorShape q_new_shape(q_new_dims); diff --git a/onnxruntime/contrib_ops/webgpu/bert/rotary_embedding.cc b/onnxruntime/contrib_ops/webgpu/bert/rotary_embedding.cc index 8f316cfae80e9..79c8f45fb7832 100644 --- a/onnxruntime/contrib_ops/webgpu/bert/rotary_embedding.cc +++ b/onnxruntime/contrib_ops/webgpu/bert/rotary_embedding.cc @@ -50,6 +50,57 @@ Status RotaryEmbeddingProgram::GenerateShaderCode(ShaderHelper& shader) const { return Status::OK(); } +Status FusedQKRotaryEmbeddingProgram::GenerateShaderCode(ShaderHelper& shader) const { + // Inputs + const auto& q_input = shader.AddInput("q_input", ShaderUsage::UseUniform); + const auto& k_input = shader.AddInput("k_input", ShaderUsage::UseUniform); + const auto& position_ids = shader.AddInput("position_ids", ShaderUsage::UseUniform); + const auto& cos_cache = shader.AddInput("cos_cache", ShaderUsage::UseUniform); + const auto& sin_cache = shader.AddInput("sin_cache", ShaderUsage::UseUniform); + // Outputs + const auto& q_output = shader.AddOutput("q_output", ShaderUsage::UseUniform); + const auto& k_output = shader.AddOutput("k_output", ShaderUsage::UseUniform); + // Indices helper + const auto& dummy_indices = shader.AddIndices("dummy_indices", ShaderUsage::None); + + const auto interleaved_str = interleaved_ ? 
"true" : "false"; + + shader.MainFunctionBody() + << " if (global_idx >= uniforms.q_domain_size) { return; }\n" + << " let half_rotary_dim = uniforms.cos_cache_shape[1];\n" + << " let bsnh = global_idx / uniforms.q_global_stride % uniforms.q_global_shape;\n" + << " if (bsnh[3] < half_rotary_dim) {\n" + << " let pos_ids_idx = " << position_ids.BroadcastedIndicesToOffset("bsnh.xy", dummy_indices) << ";\n" + << " let position_id = u32(" << position_ids.GetByOffset("pos_ids_idx") << ") + select(0u, bsnh[1], pos_ids_idx == 0u);\n" + << " let cos_v = " << cos_cache.GetByIndices("vec2(position_id, bsnh[3])") << ";\n" + << " let sin_v = " << sin_cache.GetByIndices("vec2(position_id, bsnh[3])") << ";\n" + << " let qi = dot(bsnh, uniforms.q_input_output_stride) + select(0u, bsnh[3], " << interleaved_str << ");\n" + << " let qj = qi + select(half_rotary_dim, 1u, " << interleaved_str << ");\n" + << " let q_re = " << q_input.GetByOffset("qi") << " * cos_v - " << q_input.GetByOffset("qj") << " * sin_v;\n" + << " " << q_output.SetByOffset("qi", "q_re") << "\n" + << " let q_im = " << q_input.GetByOffset("qi") << " * sin_v + " << q_input.GetByOffset("qj") << " * cos_v;\n" + << " " << q_output.SetByOffset("qj", "q_im") << "\n" + // Conditionally process Key (only for heads that exist in K domain) + << " if (bsnh[2] < uniforms.k_global_shape[2]) {\n" + << " let ki = dot(bsnh, uniforms.k_input_output_stride) + select(0u, bsnh[3], " << interleaved_str << ");\n" + << " let kj = ki + select(half_rotary_dim, 1u, " << interleaved_str << ");\n" + << " let k_re = " << k_input.GetByOffset("ki") << " * cos_v - " << k_input.GetByOffset("kj") << " * sin_v;\n" + << " " << k_output.SetByOffset("ki", "k_re") << "\n" + << " let k_im = " << k_input.GetByOffset("ki") << " * sin_v + " << k_input.GetByOffset("kj") << " * cos_v;\n" + << " " << k_output.SetByOffset("kj", "k_im") << "\n" + << " }\n" + << " } else {\n" + << " let qk = dot(bsnh, uniforms.q_input_output_stride) + half_rotary_dim;\n" + << " " << q_output.SetByOffset("qk", q_input.GetByOffset("qk")) << "\n" + // Conditionally process Key (only for heads that exist in K domain) + << " if (bsnh[2] < uniforms.k_global_shape[2]) {\n" + << " let kk = dot(bsnh, uniforms.k_input_output_stride) + half_rotary_dim;\n" + << " " << k_output.SetByOffset("kk", k_input.GetByOffset("kk")) << "\n" + << " }\n" + << " }\n"; + return Status::OK(); +} + RotaryEmbedding::RotaryEmbedding(const OpKernelInfo& info) : WebGpuKernel(info) { scale_ = info.GetAttrOrDefault("scale", 1.0); rotary_embedding_dim_ = static_cast(info.GetAttrOrDefault("rotary_embedding_dim", 0)); diff --git a/onnxruntime/contrib_ops/webgpu/bert/rotary_embedding.h b/onnxruntime/contrib_ops/webgpu/bert/rotary_embedding.h index 0d73b89fb62df..e3dc4468cb3ed 100644 --- a/onnxruntime/contrib_ops/webgpu/bert/rotary_embedding.h +++ b/onnxruntime/contrib_ops/webgpu/bert/rotary_embedding.h @@ -29,6 +29,27 @@ class RotaryEmbeddingProgram final : public Program { const bool interleaved_; }; +class FusedQKRotaryEmbeddingProgram final : public Program { + public: + FusedQKRotaryEmbeddingProgram(bool interleaved) : Program{"FusedQKRotaryEmbedding"}, interleaved_{interleaved} {} + + Status GenerateShaderCode(ShaderHelper& sh) const override; + + // q_* describes query rotation domain (same definition as existing program) + // k_* describes key rotation domain + WEBGPU_PROGRAM_DEFINE_UNIFORM_VARIABLES( + {"scale", ProgramUniformVariableDataType::Float32}, + {"q_global_shape", ProgramUniformVariableDataType::Uint32}, + {"q_global_stride", 
ProgramUniformVariableDataType::Uint32}, + {"q_input_output_stride", ProgramUniformVariableDataType::Uint32}, + {"k_global_shape", ProgramUniformVariableDataType::Uint32}, + {"k_input_output_stride", ProgramUniformVariableDataType::Uint32}, + {"q_domain_size", ProgramUniformVariableDataType::Uint32}); + + private: + const bool interleaved_; +}; + class RotaryEmbedding final : public WebGpuKernel { public: RotaryEmbedding(const OpKernelInfo& info); diff --git a/onnxruntime/contrib_ops/webgpu/quantization/dp4a_matmul.wgsl.template b/onnxruntime/contrib_ops/webgpu/quantization/dp4a_matmul.wgsl.template index ee6dde3788157..eebe329c104e7 100644 --- a/onnxruntime/contrib_ops/webgpu/quantization/dp4a_matmul.wgsl.template +++ b/onnxruntime/contrib_ops/webgpu/quantization/dp4a_matmul.wgsl.template @@ -6,6 +6,8 @@ #param has_zero_points #param is_qualcomm +#use .getByOffset .setByOffset + #include "quantization/dp4a_matmul_common.wgsl.template" // This shader implements co-operative matrix multiply. The key idea here is to @@ -57,11 +59,11 @@ fn loadSHMA(a_global_base:u32, kidx_v:u32, row: u32, col: u32) { return; } - tile_A[col][row] = input_a[a_global*uniforms.K16+kidx_v+col]; + tile_A[col][row] = a.getByOffset(a_global*uniforms.K16+kidx_v+col); if (col == 0) { // kidx_v - covers 16 values of k - scale_A[row] = scales_a[a_global*(uniforms.K/128) + kidx_v/8]; + scale_A[row] = scales_a.getByOffset(a_global*(uniforms.K/128) + kidx_v/8); } } @@ -74,14 +76,14 @@ fn loadSHMA(a_global_base:u32, kidx_v:u32, row: u32, col: u32) return; } - let b_value = input_b[b_global*uniforms.K16+kidx_v+col]; + let b_value = b.getByOffset(b_global*uniforms.K16+kidx_v+col); let block_idx = kidx_v/(block_size/16); let zero = mm_read_zero(b_global, block_idx, uniforms.N, uniforms.zero_blocks_per_col); tile_B[col][row] = DequantizedFrom4BitsTo8Bits(b_value, zero); if (col == 0) { // kidx_v - each kidx_v covers 16 values of k - scale_B[row] = scales_b[b_global*(uniforms.K/block_size) + block_idx]; + scale_B[row] = scales_b.getByOffset(b_global*(uniforms.K/block_size) + block_idx); } } #endif @@ -95,13 +97,13 @@ fn loadSHMA(a_global_base:u32, kidx_v:u32, row: u32, col: u32) return; } - let b_value = input_b[b_global*uniforms.K16+kidx_v+col]; + let b_value = b.getByOffset(b_global*uniforms.K16+kidx_v+col); tile_B[col][row] = AlignWithZeroPoint(b_value); if (col == 0) { // kidx_v - each kidx_v covers 16 values of k let block_idx = kidx_v/(block_size/16); - scale_B[row] = scales_b[b_global*(uniforms.K/block_size) + block_idx]; + scale_B[row] = scales_b.getByOffset(b_global*(uniforms.K/block_size) + block_idx); #if has_zero_points zeroes[row] = mm_read_zero(b_global, block_idx, uniforms.N, uniforms.zero_blocks_per_col); #endif @@ -117,10 +119,10 @@ fn loadSHMA(a_global_base:u32, kidx_v:u32, row: u32, col: u32) { return; } - let b_value = input_b[b_global*uniforms.K16+kidx_v+col]; + let b_value = b.getByOffset(b_global*uniforms.K16+kidx_v+col); tile_B[col][row] = DequantizedFrom2BitsTo8Bits(b_value); let block_idx = kidx_v/(block_size/16); - scale_B[row] = scales_b[b_global*(uniforms.K/block_size) + block_idx]; + scale_B[row] = scales_b.getByOffset(b_global*(uniforms.K/block_size) + block_idx); } #endif @@ -362,15 +364,15 @@ $MAIN { if (a_global < uniforms.M && b_global < uniforms.N) { #if is_qualcomm - output[output_idx] = vec4(lane_outputs[0], lane_outputs[1], lane_outputs[2], lane_outputs[3]); - output[output_idx+1] = vec4(lane_outputs[4], lane_outputs[5], lane_outputs[6], lane_outputs[7]); - output[output_idx+2] = 
vec4(lane_outputs[8], lane_outputs[9], lane_outputs[10], lane_outputs[11]); - output[output_idx+3] = vec4(lane_outputs[12], lane_outputs[13], lane_outputs[14], lane_outputs[15]); + output.setByOffset(output_idx, vec4(lane_outputs[0], lane_outputs[1], lane_outputs[2], lane_outputs[3])); + output.setByOffset(output_idx+1, vec4(lane_outputs[4], lane_outputs[5], lane_outputs[6], lane_outputs[7])); + output.setByOffset(output_idx+2, vec4(lane_outputs[8], lane_outputs[9], lane_outputs[10], lane_outputs[11])); + output.setByOffset(output_idx+3, vec4(lane_outputs[12], lane_outputs[13], lane_outputs[14], lane_outputs[15])); #else - output[output_idx] = lane_output1; - output[output_idx+1] = lane_output2; - output[output_idx+2] = lane_output3; - output[output_idx+3] = lane_output4; + output.setByOffset(output_idx, lane_output1); + output.setByOffset(output_idx+1, lane_output2); + output.setByOffset(output_idx+2, lane_output3); + output.setByOffset(output_idx+3, lane_output4); #endif } } // MAIN diff --git a/onnxruntime/contrib_ops/webgpu/quantization/dp4a_matmul_nbits.cc b/onnxruntime/contrib_ops/webgpu/quantization/dp4a_matmul_nbits.cc index 84954946fa6be..d6e15e56f193f 100644 --- a/onnxruntime/contrib_ops/webgpu/quantization/dp4a_matmul_nbits.cc +++ b/onnxruntime/contrib_ops/webgpu/quantization/dp4a_matmul_nbits.cc @@ -10,39 +10,47 @@ namespace contrib { namespace webgpu { Status DP4AMatMulQuantizeProgram::GenerateShaderCode(ShaderHelper& shader) const { - shader.AddInput("input_a", ShaderUsage::UseUniform | ShaderUsage::UseIndicesTypeAlias | ShaderUsage::UseValueTypeAlias | ShaderUsage::UseElementTypeAlias); - shader.AddOutput("output", ShaderUsage::UseUniform); - shader.AddOutput("scales", ShaderUsage::UseUniform); - return WGSL_TEMPLATE_APPLY(shader, "quantization/dp4a_quantize.wgsl.template"); + const auto& a = shader.AddInput("input_a", ShaderUsage::UseUniform | ShaderUsage::UseIndicesTypeAlias | ShaderUsage::UseValueTypeAlias | ShaderUsage::UseElementTypeAlias); + const auto& output = shader.AddOutput("output", ShaderUsage::UseUniform); + const auto& scales = shader.AddOutput("scales", ShaderUsage::UseUniform); + return WGSL_TEMPLATE_APPLY(shader, "quantization/dp4a_quantize.wgsl.template", + WGSL_TEMPLATE_VARIABLE(a, a), + WGSL_TEMPLATE_VARIABLE(output, output), + WGSL_TEMPLATE_VARIABLE(scales, scales)); } Status DP4AMatMulNBitsProgram::GenerateShaderCode(ShaderHelper& shader) const { - shader.AddInput("input_a", ShaderUsage::UseUniform | ShaderUsage::UseIndicesTypeAlias | ShaderUsage::UseValueTypeAlias); - shader.AddInput("scales_a", ShaderUsage::UseUniform); - shader.AddInput("input_b", ShaderUsage::UseUniform); - shader.AddInput("scales_b", ShaderUsage::UseUniform); + const auto& a = shader.AddInput("input_a", ShaderUsage::UseUniform | ShaderUsage::UseIndicesTypeAlias | ShaderUsage::UseValueTypeAlias); + const auto& scales_a = shader.AddInput("scales_a", ShaderUsage::UseUniform); + const auto& b = shader.AddInput("input_b", ShaderUsage::UseUniform); + const auto& scales_b = shader.AddInput("scales_b", ShaderUsage::UseUniform); if (has_zero_points_) { shader.AddInput("zero_points", ShaderUsage::UseUniform); } - shader.AddOutput("output", ShaderUsage::UseUniform | ShaderUsage::UseElementTypeAlias); + const auto& output = shader.AddOutput("output", ShaderUsage::UseUniform | ShaderUsage::UseElementTypeAlias); return WGSL_TEMPLATE_APPLY(shader, "quantization/dp4a_matmul.wgsl.template", WGSL_TEMPLATE_PARAMETER(block_size, block_size_), WGSL_TEMPLATE_PARAMETER(has_zero_points, 
has_zero_points_), WGSL_TEMPLATE_PARAMETER(is_qualcomm, is_qualcomm_), WGSL_TEMPLATE_PARAMETER(n_bits, nbits_), - WGSL_TEMPLATE_PARAMETER(output_type_i32, true)); + WGSL_TEMPLATE_PARAMETER(output_type_i32, true), + WGSL_TEMPLATE_VARIABLE(a, a), + WGSL_TEMPLATE_VARIABLE(b, b), + WGSL_TEMPLATE_VARIABLE(output, output), + WGSL_TEMPLATE_VARIABLE(scales_a, scales_a), + WGSL_TEMPLATE_VARIABLE(scales_b, scales_b)); } // scale_A components = 1, b components = 4, output components = 1 Status DP4AMatMulNBitsSmallMProgram::GenerateShaderCode(ShaderHelper& shader) const { - shader.AddInput("input_a", ShaderUsage::UseUniform); - shader.AddInput("scales_a", ShaderUsage::UseUniform); - shader.AddInput("input_b", ShaderUsage::UseUniform); - shader.AddInput("scales_b", ShaderUsage::UseUniform); + const auto& a = shader.AddInput("input_a", ShaderUsage::UseUniform); + const auto& scales_a = shader.AddInput("scales_a", ShaderUsage::UseUniform); + const auto& b = shader.AddInput("input_b", ShaderUsage::UseUniform); + const auto& scales_b = shader.AddInput("scales_b", ShaderUsage::UseUniform); if (has_zero_points_) { shader.AddInput("zero_points", ShaderUsage::UseUniform); } - shader.AddOutput("output", ShaderUsage::UseUniform | ShaderUsage::UseElementTypeAlias); + const auto& output = shader.AddOutput("output", ShaderUsage::UseUniform | ShaderUsage::UseElementTypeAlias); ORT_ENFORCE(WorkgroupSizeX() % tile_size_k_vec_ == 0 && tile_size_k_vec_ % 4 == 0, "tile_size_k_vec_ must evenly divide workgroup size X and be divisible by 4"); const uint32_t sub_tile_count = WorkgroupSizeX() / tile_size_k_vec_; @@ -55,7 +63,12 @@ Status DP4AMatMulNBitsSmallMProgram::GenerateShaderCode(ShaderHelper& shader) co WGSL_TEMPLATE_PARAMETER(single_scale_weights, single_scale_weights_), WGSL_TEMPLATE_PARAMETER(sub_tile_count, sub_tile_count), WGSL_TEMPLATE_PARAMETER(tile_size, tile_size_), - WGSL_TEMPLATE_PARAMETER(tile_size_k_vec, tile_size_k_vec_)); + WGSL_TEMPLATE_PARAMETER(tile_size_k_vec, tile_size_k_vec_), + WGSL_TEMPLATE_VARIABLE(a, a), + WGSL_TEMPLATE_VARIABLE(b, b), + WGSL_TEMPLATE_VARIABLE(output, output), + WGSL_TEMPLATE_VARIABLE(scales_a, scales_a), + WGSL_TEMPLATE_VARIABLE(scales_b, scales_b)); } Status ApplyDP4AMatrixMatMulNBits(const Tensor* a, const Tensor* b, const Tensor* scales, diff --git a/onnxruntime/contrib_ops/webgpu/quantization/dp4a_matmul_small_m.wgsl.template b/onnxruntime/contrib_ops/webgpu/quantization/dp4a_matmul_small_m.wgsl.template index 57e4903ad219f..dc4e244b1ad28 100644 --- a/onnxruntime/contrib_ops/webgpu/quantization/dp4a_matmul_small_m.wgsl.template +++ b/onnxruntime/contrib_ops/webgpu/quantization/dp4a_matmul_small_m.wgsl.template @@ -8,6 +8,9 @@ #param n_bits #param has_zero_points +#use .getByOffset .setByOffset + + #include "quantization/dp4a_matmul_common.wgsl.template" // This algorithm works to compute dot product of k in parallel, by processing k at each step amongst tile_size_k_vec threads, @@ -47,11 +50,11 @@ fn loadSHMA(a_global: u32, kidx_v: u32, col: u32) return; } - tile_A[col] = input_a[a_global*uniforms.K16+k_offset]; + tile_A[col] = a.getByOffset(a_global*uniforms.K16+k_offset); if (col < scale_a_size_in_tile_a) { // kidx_v - covers 16 values of k in input_a - scale_A[col] = scales_a[a_global*(uniforms.K/128) + kidx_v/8 + col]; + scale_A[col] = scales_a.getByOffset(a_global*(uniforms.K/128) + kidx_v/8 + col); } } @@ -70,7 +73,7 @@ $MAIN { #endif #if single_scale_weights let zero = mm_read_zero(0, 0, uniforms.N, uniforms.zero_blocks_per_col); - let own_scale_b = scales_b[0]; + 
let own_scale_b = scales_b.getByOffset(0); #endif for (var kidx_v:u32 = 0; kidx_v < uniforms.K32; kidx_v += tile_size_k_vec) @@ -95,16 +98,16 @@ $MAIN { let b_offset = b_global * uniforms.K32 + k_offset; #if !single_scale_weights let zero = mm_read_zero(b_global, block_idx, uniforms.N, uniforms.zero_blocks_per_col); - let own_scale_b = scales_b[b_global * uniforms.K / uniforms.block_size + block_idx]; + let own_scale_b = scales_b.getByOffset(b_global * uniforms.K / uniforms.block_size + block_idx); #endif #if n_bits == 4 - let b_value = input_b[b_offset]; + let b_value = b.getByOffset(b_offset); let own_b = DequantizedFrom4BitsTo8Bits(b_value.xy, zero); let own_b1 = DequantizedFrom4BitsTo8Bits(b_value.zw, zero); inter_results[row_offset + local_row][local_col] += SDP8AI(own_a, own_b, own_a1, own_b1, own_scale_a * own_scale_b); #elif n_bits == 8 - let own_b = AlignWithZeroPoint(input_b[b_offset * 2]); - let own_b1 = AlignWithZeroPoint(input_b[b_offset * 2 + 1]); + let own_b = AlignWithZeroPoint(b.getByOffset(b_offset * 2)); + let own_b1 = AlignWithZeroPoint(b.getByOffset(b_offset * 2 + 1)); #if has_zero_points inter_results[row_offset + local_row][local_col] += SDP8AI(own_a, own_b, own_a1, own_b1, own_scale_a * own_scale_b, zero); #else @@ -112,7 +115,7 @@ $MAIN { #endif #elif n_bits == 2 - let b_value = input_b[b_offset]; + let b_value = b.getByOffset(b_offset); let own_b = DequantizedFrom2BitsTo8Bits(b_value.x); let own_b1 = DequantizedFrom2BitsTo8Bits(b_value.y); inter_results[row_offset + local_row][local_col] += SDP8AI(own_a, own_b, own_a1, own_b1, own_scale_a * own_scale_b); @@ -131,7 +134,7 @@ $MAIN { let b_global = b_global_base + local_idx; let output_idx = a_global * uniforms.N + b_global; if (b_global < uniforms.N) { - output[output_idx] = output_value; + output.setByOffset(output_idx, output_value); } } } // MAIN diff --git a/onnxruntime/contrib_ops/webgpu/quantization/dp4a_quantize.wgsl.template b/onnxruntime/contrib_ops/webgpu/quantization/dp4a_quantize.wgsl.template index 8576dfeed7b82..09cbd78fd6ccd 100644 --- a/onnxruntime/contrib_ops/webgpu/quantization/dp4a_quantize.wgsl.template +++ b/onnxruntime/contrib_ops/webgpu/quantization/dp4a_quantize.wgsl.template @@ -5,6 +5,8 @@ // Quantizes input matrix A for DP4A computation // This shader quantizes float values to 8-bit signed integers using pack4x8snorm +#use .getByOffset .setByOffset + var a_values : array, 2>; var max_values : array; @@ -13,7 +15,7 @@ fn readInput(offset: u32) -> input_a_value_t if (offset >= uniforms.output_size) { return input_a_value_t(0); } - return input_a[offset]; + return a.getByOffset(offset); } $MAIN { @@ -26,11 +28,11 @@ $MAIN { let max_temp = max(max_val.xy, max_val.zw); let scale = max(max_temp[0], max_temp[1]); let norm_a = local_a/scale; - output[global_idx] = pack4x8snorm(vec4(norm_a)); + output.setByOffset(global_idx, pack4x8snorm(vec4(norm_a))); if (local_idx % 32 == 0) { // 127 is the max value of signed int8 [-127,127] used by pack4x8snorm for 1.0f. 
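// In other words, pack4x8snorm stores round(clamp(v, -1, 1) * 127) per component, so after the
// group is divided by its max-abs value `scale`, each stored int8 is roughly value * 127 / scale;
// persisting scale / 127 lets the matmul recover value ~= int8 * (scale / 127) per group.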
- scales[workgroup_idx * 2 + local_idx / 32] = scale/127; + scales.setByOffset(workgroup_idx * 2 + local_idx / 32, scale/127); } } else if (sg_size == 16) { let local_a = readInput(global_idx); @@ -53,11 +55,11 @@ $MAIN { let max_temp = max(max_val.xy, max_val.zw); let scale = max(max_temp[0], max_temp[1]); let norm_a = local_a/scale; - output[global_idx] = pack4x8snorm(vec4(norm_a)); + output.setByOffset(global_idx, pack4x8snorm(vec4(norm_a))); if (local_idx % 32 == 0) { // 127 is the max value of signed int8 [-127,127] used by pack4x8snorm for 1.0f. - scales[workgroup_idx * 2 + local_idx / 32] = scale/127; + scales.setByOffset(workgroup_idx * 2 + local_idx / 32, scale/127); } } else { let local_row = local_idx / 32u; @@ -78,11 +80,11 @@ $MAIN { let max_temp = max(max_val.xy, max_val.zw); let scale = max(max_temp[0], max_temp[1]); let norm_a = a_values[local_row][local_col]/scale; - output[global_idx] = pack4x8snorm(vec4(norm_a)); + output.setByOffset(global_idx, pack4x8snorm(vec4(norm_a))); if (local_col == 0u) { // 127 is the max value of signed int8 [-127,127] used by pack4x8snorm for 1.0f. - scales[workgroup_idx * 2 + local_row] = scale/127; + scales.setByOffset(workgroup_idx * 2 + local_row, scale/127); } } } diff --git a/onnxruntime/contrib_ops/webgpu/quantization/matmul_nbits.cc b/onnxruntime/contrib_ops/webgpu/quantization/matmul_nbits.cc index a78eef98ce1ad..f0480a2e3c886 100644 --- a/onnxruntime/contrib_ops/webgpu/quantization/matmul_nbits.cc +++ b/onnxruntime/contrib_ops/webgpu/quantization/matmul_nbits.cc @@ -42,13 +42,13 @@ ONNX_OPERATOR_KERNEL_EX( MatMulNBits); Status MatMulNBitsWideTileProgram::GenerateShaderCode(ShaderHelper& shader) const { - shader.AddInput("input_a", ShaderUsage::UseValueTypeAlias | ShaderUsage::UseElementTypeAlias); - shader.AddInput("input_b", ShaderUsage::UseValueTypeAlias | ShaderUsage::UseElementTypeAlias); - shader.AddInput("scales", ShaderUsage::UseValueTypeAlias | ShaderUsage::UseElementTypeAlias); + const auto& a = shader.AddInput("input_a", ShaderUsage::UseValueTypeAlias | ShaderUsage::UseElementTypeAlias); + const auto& b = shader.AddInput("input_b", ShaderUsage::UseValueTypeAlias | ShaderUsage::UseElementTypeAlias); + const auto& scales = shader.AddInput("scales", ShaderUsage::UseValueTypeAlias | ShaderUsage::UseElementTypeAlias); if (has_zero_points_) { shader.AddInput("zero_points", ShaderUsage::UseValueTypeAlias | ShaderUsage::UseElementTypeAlias); } - shader.AddOutput("output", ShaderUsage::UseValueTypeAlias | ShaderUsage::UseElementTypeAlias); + const auto& output = shader.AddOutput("output", ShaderUsage::UseValueTypeAlias | ShaderUsage::UseElementTypeAlias); const uint32_t workgroup_size = WorkgroupSizeX() * WorkgroupSizeY(); ORT_ENFORCE(tile_m_ == workgroup_size / 8, "tile_m must be workgroup_size / 8."); @@ -59,18 +59,22 @@ Status MatMulNBitsWideTileProgram::GenerateShaderCode(ShaderHelper& shader) cons WGSL_TEMPLATE_PARAMETER(has_zero_points, has_zero_points_), WGSL_TEMPLATE_PARAMETER(nbits, nbits_), WGSL_TEMPLATE_PARAMETER(tile_m, tile_m_), - WGSL_TEMPLATE_PARAMETER(tile_n, tile_n_)); + WGSL_TEMPLATE_PARAMETER(tile_n, tile_n_), + WGSL_TEMPLATE_VARIABLE(a, a), + WGSL_TEMPLATE_VARIABLE(b, b), + WGSL_TEMPLATE_VARIABLE(output, output), + WGSL_TEMPLATE_VARIABLE(scales, scales)); } // Apply similar idea with DP4AMatMulNBitsSmallMProgram algorithm. 
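// --- Illustrative aside (not part of this patch) ----------------------------
// The changes above stop indexing storage buffers directly (e.g. input_a[offset])
// and instead route every access through the shader variables that are now handed
// to the WGSL templates via WGSL_TEMPLATE_VARIABLE, which expand into
// getByOffset(...)/setByOffset(...) calls in the generated WGSL. The class below is
// a hypothetical, heavily simplified stand-in used only to illustrate that
// string-expansion idea; it is not onnxruntime's actual ShaderVariableHelper.
#include <string>
#include <utility>

class ExampleShaderVariable {
 public:
  explicit ExampleShaderVariable(std::string name) : name_(std::move(name)) {}

  // Produces a WGSL read expression, e.g. "a.getByOffset(row * uniforms.K + col)".
  std::string GetByOffset(const std::string& offset_expr) const {
    return name_ + ".getByOffset(" + offset_expr + ")";
  }

  // Produces a WGSL write statement, e.g. "output.setByOffset(idx, value)".
  std::string SetByOffset(const std::string& offset_expr, const std::string& value_expr) const {
    return name_ + ".setByOffset(" + offset_expr + ", " + value_expr + ")";
  }

 private:
  std::string name_;
};

// Example use when emitting shader source:
//   ExampleShaderVariable b("b");
//   code << "let b_value = " << b.GetByOffset("b_global * uniforms.K_of_b + k_offset") << ";";
// --- End of aside ------------------------------------------------------------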
Status MatMulNBitsProgram::GenerateShaderCode(ShaderHelper& shader) const { const auto& a = shader.AddInput("input_a", ShaderUsage::UseValueTypeAlias); const auto& b = shader.AddInput("input_b"); - shader.AddInput("scales_b"); + const auto& scales_b = shader.AddInput("scales_b"); if (has_zero_points_) { shader.AddInput("zero_points", ShaderUsage::UseUniform); } - shader.AddOutput("output", ShaderUsage::UseElementTypeAlias); + const auto& output = shader.AddOutput("output", ShaderUsage::UseElementTypeAlias); const uint32_t components_a = a.NumComponents(); const uint32_t components_b = b.NumComponents() / 4; // b is stored as uint32 which includes 4 uint8. @@ -92,7 +96,11 @@ Status MatMulNBitsProgram::GenerateShaderCode(ShaderHelper& shader) const { WGSL_TEMPLATE_PARAMETER(sub_tile_count, sub_tile_count), WGSL_TEMPLATE_PARAMETER(tile_size, tile_size_), WGSL_TEMPLATE_PARAMETER(tile_size_k, tile_size_k), - WGSL_TEMPLATE_PARAMETER(tile_size_k_vec, tile_size_k_vec)); + WGSL_TEMPLATE_PARAMETER(tile_size_k_vec, tile_size_k_vec), + WGSL_TEMPLATE_VARIABLE(a, a), + WGSL_TEMPLATE_VARIABLE(b, b), + WGSL_TEMPLATE_VARIABLE(output, output), + WGSL_TEMPLATE_VARIABLE(scales_b, scales_b)); } Status MatMulNBits::ComputeInternal(onnxruntime::webgpu::ComputeContext& context) const { diff --git a/onnxruntime/contrib_ops/webgpu/quantization/matmul_nbits.wgsl.template b/onnxruntime/contrib_ops/webgpu/quantization/matmul_nbits.wgsl.template index aba6e3d57c72a..0fe3ec92ef3de 100644 --- a/onnxruntime/contrib_ops/webgpu/quantization/matmul_nbits.wgsl.template +++ b/onnxruntime/contrib_ops/webgpu/quantization/matmul_nbits.wgsl.template @@ -12,6 +12,8 @@ #param tile_size_k #param tile_size +#use .getByOffset .setByOffset + #include "quantization/matmul_nbits_zero_pt.wgsl.template" // Shared memory @@ -22,7 +24,7 @@ fn loadSHMA(batch: u32, a_global: u32, kidx: u32, col: u32) { let k_offset = kidx / component_a + col; if (batch < uniforms.batch_count && k_offset < uniforms.K_of_a) { - tile_A[col] = input_a[batch * uniforms.M * uniforms.K_of_a + a_global * uniforms.K_of_a + k_offset]; + tile_A[col] = a.getByOffset(batch * uniforms.M * uniforms.K_of_a + a_global * uniforms.K_of_a + k_offset); } else { tile_A[col] = input_a_value_t(0); } @@ -38,7 +40,7 @@ $MAIN { #if single_scale_weights let block_idx = 0; - let scale_b = scales_b[0]; + let scale_b = scales_b.getByOffset(0); let zero = mm_read_zero(0, 0, uniforms.N, uniforms.zero_blocks_per_col); #endif @@ -58,10 +60,10 @@ $MAIN { { #if !single_scale_weights let block_idx = (kidx + idx * elements_in_value_b) / uniforms.block_size; - let scale_b = scales_b[b_global * uniforms.blocks_per_col + block_idx]; + let scale_b = scales_b.getByOffset(b_global * uniforms.blocks_per_col + block_idx); let zero = mm_read_zero(b_global, block_idx, uniforms.N, uniforms.zero_blocks_per_col); #endif - var b_value = input_b[b_global * uniforms.K_of_b + k_offset]; + var b_value = b.getByOffset(b_global * uniforms.K_of_b + k_offset); #if n_bits == 4 var sum = output_element_t(0); @@ -152,7 +154,7 @@ $MAIN { let b_global = b_global_base + local_idx; let output_idx = batch * uniforms.M * uniforms.N + a_global * uniforms.N + b_global; if (b_global < uniforms.N) { - output[output_idx] = output_value; + output.setByOffset(output_idx, output_value); } } } // MAIN diff --git a/onnxruntime/contrib_ops/webgpu/quantization/matmul_nbits_wide_tile.wgsl.template b/onnxruntime/contrib_ops/webgpu/quantization/matmul_nbits_wide_tile.wgsl.template index 462f9a340c1b8..7c2fca615a99b 100644 --- 
a/onnxruntime/contrib_ops/webgpu/quantization/matmul_nbits_wide_tile.wgsl.template +++ b/onnxruntime/contrib_ops/webgpu/quantization/matmul_nbits_wide_tile.wgsl.template @@ -6,6 +6,8 @@ #param tile_m #param tile_n +#use .getByOffset .setByOffset + // Only support Block32 at the moment. const KAVecSizeForBlock32 = 8u; @@ -58,7 +60,7 @@ fn load_zero(row : u32, col : u32, r_dim : u32, c_dim : u32) -> output_element_t fn load_a(batch : u32, row : u32, col : u32) -> input_a_value_t { if (batch < uniforms.Batch && row < uniforms.M && col < uniforms.K_of_a) { let offset = batch * uniforms.M * uniforms.K_of_a + row * uniforms.K_of_a + col; - return input_a[offset]; + return a.getByOffset(offset); } return input_a_value_t(); } @@ -66,7 +68,7 @@ fn load_a(batch : u32, row : u32, col : u32) -> input_a_value_t { fn load_scale(row : u32, block_idx : u32) -> output_element_t { if (row < uniforms.N && block_idx < uniforms.n_blocks_per_col) { let offset = row * uniforms.n_blocks_per_col + block_idx; - return scales[offset]; + return scales.getByOffset(offset); } return output_element_t(); } @@ -74,7 +76,7 @@ fn load_scale(row : u32, block_idx : u32) -> output_element_t { fn write_output(batch : u32, row : u32, col : u32, value : output_element_t) { if (batch < uniforms.Batch && row < uniforms.M && col < uniforms.N) { let offset = batch * uniforms.M * uniforms.N + row * uniforms.N + col; - output[offset] = value; + output.setByOffset(offset, value); } } @@ -82,7 +84,7 @@ fn write_output(batch : u32, row : u32, col : u32, value : output_element_t) { fn load_b(row : u32, block_idx : u32) -> vec4 { if (row < uniforms.N && block_idx < uniforms.K_of_b) { let offset = row * uniforms.K_of_b + block_idx; - return input_b[offset]; + return b.getByOffset(offset); } return vec4(); } @@ -112,10 +114,10 @@ fn load_b(row : u32, block_idx : u32) -> array, 4> { if (row < uniforms.N) { let offset = 2 * block_idx; let b_data_0 = select(input_b_value_t(), - input_b[row * uniforms.K_of_b + offset], + b.getByOffset(row * uniforms.K_of_b + offset), offset < uniforms.K_of_b); let b_data_1 = select(input_b_value_t(), - input_b[row * uniforms.K_of_b + offset + 1], + b.getByOffset(row * uniforms.K_of_b + offset + 1), offset + 1 < uniforms.K_of_b); let b_data = array, 4>( diff --git a/onnxruntime/contrib_ops/webgpu/quantization/subgroup_matrix_matmul_nbits.cc b/onnxruntime/contrib_ops/webgpu/quantization/subgroup_matrix_matmul_nbits.cc index c8781631fb19c..db1a6319b3247 100644 --- a/onnxruntime/contrib_ops/webgpu/quantization/subgroup_matrix_matmul_nbits.cc +++ b/onnxruntime/contrib_ops/webgpu/quantization/subgroup_matrix_matmul_nbits.cc @@ -132,7 +132,9 @@ Status PrepackProgram::GenerateShaderCode(ShaderHelper& shader) const { return Status::OK(); } -Status GenerateShaderCodeOnIntel(ShaderHelper& shader, uint32_t nbits, int32_t config_index, bool has_zero_points) { +Status GenerateShaderCodeOnIntel(ShaderHelper& shader, const ShaderVariableHelper& b, + const ShaderVariableHelper& scales_b, + uint32_t nbits, int32_t config_index, bool has_zero_points) { auto& config = intel_supported_subgroup_matrix_configs[config_index]; shader.AdditionalImplementation() << "alias component_type = " << ComponentTypeName[static_cast(std::get<2>(config))] << ";\n" << "alias result_component_type = " << ComponentTypeName[static_cast(std::get<3>(config))] << ";\n" @@ -150,7 +152,7 @@ Status GenerateShaderCodeOnIntel(ShaderHelper& shader, uint32_t nbits, int32_t c var tile_B: array; // 64 x 32 - RxC )ADDNL_FN" << GenerateZeroPointReadingCode(nbits, 
has_zero_points, "component_type"); if (nbits == 4) { - shader.AdditionalImplementation() << R"ADDNL_FN( + shader.AdditionalImplementation() << R"ADDNL_FN_PART( fn loadSHMB(tile_base: u32, k_idx: u32, row: u32, c_idx: u32) { let b_global = tile_base + row; if (b_global >= uniforms.N) { @@ -161,9 +163,14 @@ Status GenerateShaderCodeOnIntel(ShaderHelper& shader, uint32_t nbits, int32_t c // 256 threads need to load 64 x 32. 4 threads per row or 8 col per thread. // Stored in column major fashion. let b_idx = u32((b_global * uniforms.K + k_idx + col) / 8); - let scale = component_type(scales_b[(b_global * uniforms.K + k_idx + col) / quantization_block_size]); - let zero = mm_read_zero(b_global, (k_idx + col) / quantization_block_size, uniforms.N, uniforms.zero_blocks_per_col); - let b_value = input_b[b_idx]; + )ADDNL_FN_PART"; + shader.AdditionalImplementation() << "let scale = component_type(" + << scales_b.GetByOffset("(b_global * uniforms.K + k_idx + col) / quantization_block_size") + << ");" + << "let zero = mm_read_zero(b_global, (k_idx + col) / quantization_block_size, uniforms.N, uniforms.zero_blocks_per_col);" + << "let b_value = " + << b.GetByOffset("b_idx") << ';'; + shader.AdditionalImplementation() << R"ADDNL_FN_PART( let b_value_lower = (vec4(unpack4xU8(b_value & 0x0F0F0F0Fu)) - vec4(zero)) * scale; let b_value_upper = (vec4(unpack4xU8((b_value >> 4) & 0x0F0F0F0Fu)) - vec4(zero)) * scale; let tile_b_base = row * tile_k + col; @@ -176,10 +183,10 @@ Status GenerateShaderCodeOnIntel(ShaderHelper& shader, uint32_t nbits, int32_t c tile_B[tile_b_base + 6] = b_value_lower[3]; tile_B[tile_b_base + 7] = b_value_upper[3]; } - )ADDNL_FN"; + )ADDNL_FN_PART"; } else { ORT_ENFORCE(nbits == 8, "Only 4/8 bits are supported for webgpu matmulnbits"); - shader.AdditionalImplementation() << R"ADDNL_FN( + shader.AdditionalImplementation() << R"ADDNL_FN_PART( fn loadSHMB(tile_base: u32, k_idx: u32, row: u32, c_idx: u32) { let b_global = tile_base + row; if (b_global >= uniforms.N) { @@ -190,22 +197,28 @@ Status GenerateShaderCodeOnIntel(ShaderHelper& shader, uint32_t nbits, int32_t c // 256 threads need to load 64 x 32. 4 threads per row or 8 col per thread. // Stored in column major fashion. 
let b_idx = u32((b_global * uniforms.K + k_idx + col) / 8); - let scale = component_type(scales_b[(b_global * uniforms.K + k_idx + col) / quantization_block_size]); - let zero = mm_read_zero(b_global, (k_idx + col) / quantization_block_size, uniforms.N, uniforms.zero_blocks_per_col); - let b_value = input_b[b_idx]; - let b_value0 = (vec4(unpack4xU8(b_value[0])) - vec4(zero)) * scale; - let b_value1 = (vec4(unpack4xU8(b_value[1])) - vec4(zero)) * scale; - let tile_b_base = row * tile_k + col; - tile_B[tile_b_base] = b_value0[0]; - tile_B[tile_b_base + 1] = b_value0[1]; - tile_B[tile_b_base + 2] = b_value0[2]; - tile_B[tile_b_base + 3] = b_value0[3]; - tile_B[tile_b_base + 4] = b_value1[0]; - tile_B[tile_b_base + 5] = b_value1[1]; - tile_B[tile_b_base + 6] = b_value1[2]; - tile_B[tile_b_base + 7] = b_value1[3]; - } - )ADDNL_FN"; + )ADDNL_FN_PART"; + shader.AdditionalImplementation() << "let scale = component_type(" + << scales_b.GetByOffset("(b_global * uniforms.K + k_idx + col) / quantization_block_size") + << ");" + << " let zero = mm_read_zero(b_global, (k_idx + col) / quantization_block_size, uniforms.N, uniforms.zero_blocks_per_col);" + << "let b_value = " + << b.GetByOffset("b_idx") << ';'; + + shader.AdditionalImplementation() << + R"ADDNL_FN_PART(let b_value0 = (vec4(unpack4xU8(b_value[0])) - vec4(zero)) * scale; + let b_value1 = (vec4(unpack4xU8(b_value[1])) - vec4(zero)) * scale; + let tile_b_base = row * tile_k + col; + tile_B[tile_b_base] = b_value0[0]; + tile_B[tile_b_base + 1] = b_value0[1]; + tile_B[tile_b_base + 2] = b_value0[2]; + tile_B[tile_b_base + 3] = b_value0[3]; + tile_B[tile_b_base + 4] = b_value1[0]; + tile_B[tile_b_base + 5] = b_value1[1]; + tile_B[tile_b_base + 6] = b_value1[2]; + tile_B[tile_b_base + 7] = b_value1[3]; + } + )ADDNL_FN_PART"; } shader.MainFunctionBody() << R"MAIN_FN( @@ -266,10 +279,12 @@ Status GenerateShaderCodeOnIntel(ShaderHelper& shader, uint32_t nbits, int32_t c return Status::OK(); } -Status GenerateShaderCodeOnApple(ShaderHelper& shader, uint32_t nbits, bool has_zero_points) { +Status GenerateShaderCodeOnApple(ShaderHelper& shader, const ShaderVariableHelper& a, const ShaderVariableHelper& b, + const ShaderVariableHelper& scales_b, + const ShaderVariableHelper& output, uint32_t nbits, bool has_zero_points) { // tile/subtile sizes and work distribution are inspired from metal shaders in llama.cpp (kernel_mul_mm) // https://github.com/ggml-org/llama.cpp/blob/d04e7163c85a847bc61d58c22f2c503596db7aa8/ggml/src/ggml-metal/ggml-metal.metal#L6066 - shader.AdditionalImplementation() << R"ADDNL_FN( + shader.AdditionalImplementation() << R"ADDNL_FN_PART( const tile_cols = 64; const tile_rows = 32; const tile_k = 32; @@ -292,13 +307,17 @@ Status GenerateShaderCodeOnApple(ShaderHelper& shader, uint32_t nbits, bool has_ // 128 threads need to load 32 x 32. 4 threads per row or 8 col per thread. 
for (var col_offset:u32 = 0; col_offset < 8; col_offset++) { - tile_A[row * tile_k + col + col_offset] = compute_precision(input_a[a_global*uniforms.K + k_idx + col + col_offset]); + )ADDNL_FN_PART"; + shader.AdditionalImplementation() + << " tile_A[row * tile_k + col + col_offset] = compute_precision(" + << a.GetByOffset("a_global * uniforms.K + k_idx + col + col_offset") + << ");"; + shader.AdditionalImplementation() << R"ADDNL_FN_PART( } - } - )ADDNL_FN" - << GenerateZeroPointReadingCode(nbits, has_zero_points, "compute_precision"); + })ADDNL_FN_PART"; + shader.AdditionalImplementation() << GenerateZeroPointReadingCode(nbits, has_zero_points, "compute_precision"); if (nbits == 4) { - shader.AdditionalImplementation() << R"ADDNL_FN( + shader.AdditionalImplementation() << R"ADDNL_FN_PART( fn loadSHMB(tile_base: u32, k_idx: u32, row: u32, c_idx: u32) { let b_global = tile_base + row; if (b_global >= uniforms.N) { @@ -309,28 +328,35 @@ Status GenerateShaderCodeOnApple(ShaderHelper& shader, uint32_t nbits, bool has_ // 128 threads need to load 64 x 32. 2 threads per row or 16 col per thread. // Stored in column major fashion. let b_idx = u32((b_global*uniforms.K + k_idx + col)/8); - let scale = compute_precision(scales_b[(b_global*uniforms.K + k_idx + col)/quantization_block_size]); + )ADDNL_FN_PART"; + shader.AdditionalImplementation() << "let scale = compute_precision(" + << scales_b.GetByOffset("(b_global * uniforms.K + k_idx + col) / quantization_block_size") + << ");"; + shader.AdditionalImplementation() << R"ADDNL_FN_PART( let zero = mm_read_zero(b_global, (k_idx + col) / quantization_block_size, uniforms.N, uniforms.zero_blocks_per_col); for (var step:u32 = 0; step < 2; step++) { - var b_value = input_b[b_idx+step]; - var b_value_lower = (vec4(unpack4xU8(b_value & 0x0F0F0F0Fu)) - vec4(zero)) * scale; - var b_value_upper = (vec4(unpack4xU8((b_value >> 4) & 0x0F0F0F0Fu)) - vec4(zero)) * scale; - let tile_b_base = row * tile_k + col + step * 8; - tile_B[tile_b_base] = b_value_lower[0]; - tile_B[tile_b_base + 1] = b_value_upper[0]; - tile_B[tile_b_base + 2] = b_value_lower[1]; - tile_B[tile_b_base + 3] = b_value_upper[1]; - tile_B[tile_b_base + 4] = b_value_lower[2]; - tile_B[tile_b_base + 5] = b_value_upper[2]; - tile_B[tile_b_base + 6] = b_value_lower[3]; - tile_B[tile_b_base + 7] = b_value_upper[3]; - } - } - )ADDNL_FN"; + )ADDNL_FN_PART"; + shader.AdditionalImplementation() << "var b_value = " + << b.GetByOffset("b_idx+step") + << ';'; + shader.AdditionalImplementation() << R"ADDNL_FN_PART(var b_value_lower = (vec4(unpack4xU8(b_value & 0x0F0F0F0Fu)) - vec4(zero)) * scale; + var b_value_upper = (vec4(unpack4xU8((b_value >> 4) & 0x0F0F0F0Fu)) - vec4(zero)) * scale; + let tile_b_base = row * tile_k + col + step * 8; + tile_B[tile_b_base] = b_value_lower[0]; + tile_B[tile_b_base + 1] = b_value_upper[0]; + tile_B[tile_b_base + 2] = b_value_lower[1]; + tile_B[tile_b_base + 3] = b_value_upper[1]; + tile_B[tile_b_base + 4] = b_value_lower[2]; + tile_B[tile_b_base + 5] = b_value_upper[2]; + tile_B[tile_b_base + 6] = b_value_lower[3]; + tile_B[tile_b_base + 7] = b_value_upper[3]; + } +} + )ADDNL_FN_PART"; } else { ORT_ENFORCE(nbits == 8, "Only 4/8 bits are supported for webgpu matmulnbits"); - shader.AdditionalImplementation() << R"ADDNL_FN( + shader.AdditionalImplementation() << R"ADDNL_FN_PART( fn loadSHMB(tile_base: u32, k_idx: u32, row: u32, c_idx: u32) { let b_global = tile_base + row; if (b_global >= uniforms.N) { @@ -341,42 +367,49 @@ Status GenerateShaderCodeOnApple(ShaderHelper& 
shader, uint32_t nbits, bool has_ // 128 threads need to load 64 x 32. 2 threads per row or 16 col per thread. // Stored in column major fashion. let b_idx = u32((b_global*uniforms.K + k_idx + col)/8); - let scale = compute_precision(scales_b[(b_global*uniforms.K + k_idx + col)/quantization_block_size]); - let zero = mm_read_zero(b_global, (k_idx + col) / quantization_block_size, uniforms.N, uniforms.zero_blocks_per_col); - for (var step:u32 = 0; step < 2; step++) - { - var b_value = input_b[b_idx+step]; - var b_value0 = (vec4(unpack4xU8(b_value[0])) - vec4(zero)) * scale; - var b_value1 = (vec4(unpack4xU8(b_value[1])) - vec4(zero)) * scale; - let tile_b_base = row * tile_k + col + step * 8; - tile_B[tile_b_base] = b_value0[0]; - tile_B[tile_b_base + 1] = b_value0[1]; - tile_B[tile_b_base + 2] = b_value0[2]; - tile_B[tile_b_base + 3] = b_value0[3]; - tile_B[tile_b_base + 4] = b_value1[0]; - tile_B[tile_b_base + 5] = b_value1[1]; - tile_B[tile_b_base + 6] = b_value1[2]; - tile_B[tile_b_base + 7] = b_value1[3]; - } - } - )ADDNL_FN"; + )ADDNL_FN_PART"; + shader.AdditionalImplementation() << "let scale = compute_precision(" + << scales_b.GetByOffset("(b_global * uniforms.K + k_idx + col) / quantization_block_size") + << ");"; + shader.AdditionalImplementation() << R"ADDNL_FN_PART( + let zero = mm_read_zero(b_global, (k_idx + col) / quantization_block_size, uniforms.N, uniforms.zero_blocks_per_col); + for (var step : u32 = 0; step < 2; step++) { + )ADDNL_FN_PART"; + shader.AdditionalImplementation() << "var b_value = " + << b.GetByOffset("b_idx+step") + << ';'; + + shader.AdditionalImplementation() << R"ADDNL_FN_PART( + var b_value0 = (vec4(unpack4xU8(b_value[0])) - vec4(zero)) * scale; + var b_value1 = (vec4(unpack4xU8(b_value[1])) - vec4(zero)) * scale; + let tile_b_base = row * tile_k + col + step * 8; + tile_B[tile_b_base] = b_value0[0]; + tile_B[tile_b_base + 1] = b_value0[1]; + tile_B[tile_b_base + 2] = b_value0[2]; + tile_B[tile_b_base + 3] = b_value0[3]; + tile_B[tile_b_base + 4] = b_value1[0]; + tile_B[tile_b_base + 5] = b_value1[1]; + tile_B[tile_b_base + 6] = b_value1[2]; + tile_B[tile_b_base + 7] = b_value1[3]; } - shader.AdditionalImplementation() << R"ADDNL_FN( - fn storeOutput(offset:u32, row: u32, col:u32, src_slot:u32, row_limit:i32) { - if (row_limit > 0 && row < u32(row_limit)) - { - output[offset + row * uniforms.N + col] = output_element_t(scratch[src_slot][0][row * 8 + col]); - output[offset + row * uniforms.N + col + 8] = output_element_t(scratch[src_slot][1][row * 8 + col]); - output[offset + row * uniforms.N + col + 16] = output_element_t(scratch[src_slot][2][row * 8 + col]); - output[offset + row * uniforms.N + col + 24] = output_element_t(scratch[src_slot][3][row * 8 + col]); - let col2 = col + 1; - output[offset + row * uniforms.N + col2] = output_element_t(scratch[src_slot][0][row * 8 + col2]); - output[offset + row * uniforms.N + col2 + 8] = output_element_t(scratch[src_slot][1][row * 8 + col2]); - output[offset + row * uniforms.N + col2 + 16] = output_element_t(scratch[src_slot][2][row * 8 + col2]); - output[offset + row * uniforms.N + col2 + 24] = output_element_t(scratch[src_slot][3][row * 8 + col2]); - } - } - )ADDNL_FN"; +} + )ADDNL_FN_PART"; + } + shader.AdditionalImplementation() + << " fn storeOutput(offset:u32, row: u32, col:u32, src_slot:u32, row_limit:i32) {\n" + << " if (row_limit > 0 && row < u32(row_limit))\n" + << " {\n" + << " " << output.SetByOffset("offset + row * uniforms.N + col", "output_element_t(scratch[src_slot][0][row * 8 + col])") << ";\n" 
+ << " " << output.SetByOffset("offset + row * uniforms.N + col + 8", "output_element_t(scratch[src_slot][1][row * 8 + col])") << ";\n" + << " " << output.SetByOffset("offset + row * uniforms.N + col + 16", "output_element_t(scratch[src_slot][2][row * 8 + col])") << ";\n" + << " " << output.SetByOffset("offset + row * uniforms.N + col + 24", "output_element_t(scratch[src_slot][3][row * 8 + col])") << ";\n" + << " let col2 = col + 1;\n" + << " " << output.SetByOffset("offset + row * uniforms.N + col2", "output_element_t(scratch[src_slot][0][row * 8 + col2])") << ";\n" + << " " << output.SetByOffset("offset + row * uniforms.N + col2 + 8", "output_element_t(scratch[src_slot][1][row * 8 + col2])") << ";\n" + << " " << output.SetByOffset("offset + row * uniforms.N + col2 + 16", "output_element_t(scratch[src_slot][2][row * 8 + col2])") << ";\n" + << " " << output.SetByOffset("offset + row * uniforms.N + col2 + 24", "output_element_t(scratch[src_slot][3][row * 8 + col2])") << ";\n" + << " }\n" + << " }\n"; shader.MainFunctionBody() << R"MAIN_FN( let a_global_base = workgroup_id.y * tile_rows; @@ -463,18 +496,18 @@ Status GenerateShaderCodeOnApple(ShaderHelper& shader, uint32_t nbits, bool has_ } Status SubgroupMatrixMatMulNBitsProgram::GenerateShaderCode(ShaderHelper& shader) const { - shader.AddInput("input_a", ShaderUsage::UseUniform | ShaderUsage::UseIndicesTypeAlias | ShaderUsage::UseValueTypeAlias); - shader.AddInput("input_b", ShaderUsage::UseUniform); - shader.AddInput("scales_b", ShaderUsage::UseUniform); + const auto& a = shader.AddInput("input_a", ShaderUsage::UseUniform | ShaderUsage::UseIndicesTypeAlias | ShaderUsage::UseValueTypeAlias); + const auto& b = shader.AddInput("input_b", ShaderUsage::UseUniform); + const auto& scales_b = shader.AddInput("scales_b", ShaderUsage::UseUniform); if (has_zero_points_) { shader.AddInput("zero_points", ShaderUsage::UseUniform); } - shader.AddOutput("output", ShaderUsage::UseUniform | ShaderUsage::UseElementTypeAlias); + const auto& output = shader.AddOutput("output", ShaderUsage::UseUniform | ShaderUsage::UseElementTypeAlias); if (!vendor_.compare("apple")) { - return GenerateShaderCodeOnApple(shader, nbits_, has_zero_points_); + return GenerateShaderCodeOnApple(shader, a, b, scales_b, output, nbits_, has_zero_points_); } else if (!vendor_.compare("intel")) { - return GenerateShaderCodeOnIntel(shader, nbits_, config_index_, has_zero_points_); + return GenerateShaderCodeOnIntel(shader, b, scales_b, nbits_, config_index_, has_zero_points_); } else { return Status(onnxruntime::common::ONNXRUNTIME, onnxruntime::common::NOT_IMPLEMENTED, "onnxruntime does not support subgroup matrix on this verdor."); diff --git a/onnxruntime/core/graph/graph.cc b/onnxruntime/core/graph/graph.cc index 3f6443aa73d4c..8b599dc86d997 100644 --- a/onnxruntime/core/graph/graph.cc +++ b/onnxruntime/core/graph/graph.cc @@ -2678,6 +2678,27 @@ class InferenceContextImpl : public ONNX_NAMESPACE::InferenceContext { // only return data if it's for a constant initializer. checks for outer scope initializers // if this is a subgraph and the name isn't found locally. 
const TensorProto* initializer = graph_.GetConstantInitializer(def->Name(), true); + if (initializer != nullptr) { + // Check if this is in-memory external data (data stored in OrtValue) + // ONNX shape inference cannot handle external data, so we need to materialize it + if (utils::HasExternalDataInMemory(*initializer)) { + // Try to get the OrtValue for this initializer + OrtValue ort_value; + if (graph_.GetOrtValueInitializer(def->Name(), ort_value, true)) { + // Create a temporary TensorProto with the actual data from the OrtValue + // This allows ONNX shape inference to access the data + const Tensor& tensor = ort_value.Get(); + auto temp_tensor_proto = utils::TensorToTensorProto(tensor, initializer->name(), /*use_tensor_buffer=*/false); + // Store the temporary proto so it outlives this call, maintain pointers steady + temp_tensor_protos_.push_back(std::make_unique(std::move(temp_tensor_proto))); + return temp_tensor_protos_.back().get(); + } else { + // If we can't get the OrtValue, it is a bug + ORT_THROW("Initializer ", def->Name(), + " has in-memory external data but cannot get OrtValue during shape inference"); + } + } + } return initializer; } @@ -2717,6 +2738,11 @@ class InferenceContextImpl : public ONNX_NAMESPACE::InferenceContext { std::vector> graph_inferencers_; const Graph& graph_; const Graph::ResolveOptions& options_; + // Temporary TensorProtos created for in-memory external data during shape inference + // These need to outlive the shape inference call, so we store them here + // Inference is per node and the instance of this context is on the stack, + // so this is safe. + mutable InlinedVector> temp_tensor_protos_; }; Status Graph::InferAndVerifySubgraphTypes(const Node& node, Graph& subgraph, diff --git a/onnxruntime/core/mlas/lib/convolve.cpp b/onnxruntime/core/mlas/lib/convolve.cpp index bc1221475fd90..9518134631f2d 100644 --- a/onnxruntime/core/mlas/lib/convolve.cpp +++ b/onnxruntime/core/mlas/lib/convolve.cpp @@ -729,6 +729,82 @@ Return Value: } } +void +MlasConvExpandThenGemmSegmentedThreaded( + void* Context, + ptrdiff_t Index +) +/*++ + +Routine Description: + + This routine is invoked from a worker thread to execute a segment of a + convolution operation. + + If using this, the entire convolution operation is parallelized on the + (batch size * group count) parameter and this routine has logic to + perform a specific thread's shard of the entire Convolution operation. + +Arguments: + + Context - Supplies the pointer to the context for the threaded operation. + + Index - Supplies the current index of the threaded operation. + +Return Value: + + None. 
+ +--*/ + +{ + MLAS_CONV_WORK_BLOCK* WorkBlock = (MLAS_CONV_WORK_BLOCK*)Context; + + const MLAS_CONV_PARAMETERS* Parameters = WorkBlock->Parameters; + + const size_t GroupCount = Parameters->GroupCount; + const size_t BatchGroupCount = Parameters->BatchCount * GroupCount; + + const size_t TargetThreadCount = WorkBlock->TargetThreadCount; + + const size_t BatchGroupCountPerThread = BatchGroupCount / TargetThreadCount; + const size_t BatchGroupCountExtra = BatchGroupCount % TargetThreadCount; + + size_t BatchGroupStart; + size_t BatchGroupEnd; + + if (static_cast(Index) < BatchGroupCountExtra) { + BatchGroupStart = (BatchGroupCountPerThread + 1) * Index; + BatchGroupEnd = BatchGroupStart + BatchGroupCountPerThread + 1; + } else { + BatchGroupStart = BatchGroupCountPerThread * Index + BatchGroupCountExtra; + BatchGroupEnd = BatchGroupStart + BatchGroupCountPerThread; + } + + const size_t FilterCount = Parameters->FilterCount; + const size_t OutputSize = Parameters->OutputSize; + const size_t K = Parameters->K; + + const size_t InputGroupSize = Parameters->InputChannels * Parameters->InputSize; + const size_t OutputGroupSize = FilterCount * OutputSize; + const size_t FilterGroupSize = FilterCount * K; + + for (size_t bg = BatchGroupStart; bg < BatchGroupEnd; bg++) { + size_t group = bg % GroupCount; + + const float* input = WorkBlock->Input + bg * InputGroupSize; + const float* filter = WorkBlock->Filter + group * FilterGroupSize; + float* output = WorkBlock->Output + bg * OutputGroupSize; + const float* bias = WorkBlock->Bias; + if (bias != nullptr) { + bias += group * FilterCount; + } + float* ColumnBuffer = WorkBlock->WorkingBuffer + Index * OutputSize * K; + + MlasConvOperation(Parameters, input, filter, bias, ColumnBuffer, output, 0, OutputSize); + } +} + inline bool MlasConvTryMultithread( @@ -890,8 +966,8 @@ Return Value: ptrdiff_t TargetThreadCount = MlasGetMaximumThreadCount(ThreadPool); - if (size_t(TargetThreadCount) >= BatchGroupCount) { - TargetThreadCount = ptrdiff_t(BatchGroupCount); + if (static_cast(TargetThreadCount) >= BatchGroupCount) { + TargetThreadCount = static_cast(BatchGroupCount); } MLAS_CONV_WORK_BLOCK WorkBlock; @@ -919,6 +995,30 @@ Return Value: #endif + if (Algorithm == MlasConvAlgorithmExpandThenGemmSegmented && ((BatchCount > 1) || (GroupCount > 1))) { + const size_t BatchGroupCount = BatchCount * GroupCount; + + ptrdiff_t TargetThreadCount = MlasGetMaximumThreadCount(ThreadPool); + + if (static_cast(TargetThreadCount) >= BatchGroupCount) { + TargetThreadCount = static_cast(BatchGroupCount); + } + + MLAS_CONV_WORK_BLOCK WorkBlock; + + WorkBlock.Parameters = Parameters; + WorkBlock.Input = Input; + WorkBlock.Filter = Filter; + WorkBlock.Bias = Bias; + WorkBlock.WorkingBuffer = WorkingBuffer; + WorkBlock.Output = Output; + WorkBlock.TargetThreadCount = TargetThreadCount; + + MlasExecuteThreaded(MlasConvExpandThenGemmSegmentedThreaded, &WorkBlock, TargetThreadCount, ThreadPool); + + return; + } + // // Iterate over each batch and group. 
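// --- Illustrative aside (not part of this patch) ----------------------------
// The new MlasConvExpandThenGemmSegmentedThreaded worker above shards the
// (BatchCount * GroupCount) iteration space so that the first
// BatchGroupCountExtra threads each take one extra item. The standalone sketch
// below reproduces only that partitioning arithmetic; ShardRange and the sample
// counts (10 items, 4 threads) are illustrative names and values, not part of MLAS.
#include <cstddef>
#include <cstdio>

// Computes the half-open range [start, end) of items handled by thread `index`
// out of `threads` workers, matching the scheme used by the worker routine.
static void ShardRange(size_t total, size_t threads, size_t index,
                       size_t* start, size_t* end) {
  const size_t per_thread = total / threads;
  const size_t extra = total % threads;
  if (index < extra) {
    *start = (per_thread + 1) * index;
    *end = *start + per_thread + 1;
  } else {
    *start = per_thread * index + extra;
    *end = *start + per_thread;
  }
}

int main() {
  // 10 batch*group items across 4 threads -> shards of size 3, 3, 2, 2.
  for (size_t t = 0; t < 4; ++t) {
    size_t s = 0, e = 0;
    ShardRange(/*total=*/10, /*threads=*/4, t, &s, &e);
    std::printf("thread %zu handles [%zu, %zu)\n", t, s, e);
  }
  return 0;
}
// --- End of aside ------------------------------------------------------------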
// @@ -1308,6 +1408,18 @@ Return Value: Parameters->u.ExpandThenGemmSegmented.ThreadStrideN = StrideN; *WorkingBufferSize = TargetThreadCount * MLAS_CONV_WORKING_BUFFER_SIZE_PER_THREAD; + + if (Parameters->BatchCount > 1 || Parameters->GroupCount > 1) { + + size_t WorkingBufferSizePerThread = std::max({Parameters->OutputSize * Parameters->K, + Parameters->FilterCount * Parameters->OutputSize, + static_cast<size_t>(MLAS_CONV_WORKING_BUFFER_SIZE_PER_THREAD)}); + TargetThreadCount = MaximumThreadCount; + if (static_cast<size_t>(TargetThreadCount) >= Parameters->BatchCount * Parameters->GroupCount) { + TargetThreadCount = static_cast<ptrdiff_t>(Parameters->BatchCount * Parameters->GroupCount); + } + *WorkingBufferSize = TargetThreadCount * WorkingBufferSizePerThread; + } } } #if defined(_MSC_VER) && !defined(__clang__) diff --git a/onnxruntime/core/mlas/lib/kleidiai/mlasi_kleidiai.h b/onnxruntime/core/mlas/lib/kleidiai/mlasi_kleidiai.h index 5136061c4769d..2e9c4574fd057 100644 --- a/onnxruntime/core/mlas/lib/kleidiai/mlasi_kleidiai.h +++ b/onnxruntime/core/mlas/lib/kleidiai/mlasi_kleidiai.h @@ -115,3 +115,37 @@ MlasConv( MLAS_THREADPOOL* ThreadPool ); } + +/*++ + +Routine Description: + + This routine determines if a wraparound will occur when multiplying two size_t variables. + Uses __builtin_mul_overflow if available on the current system and if not falls back + to a default implementation to check this wraparound. + +Arguments: + + a - Supplies the first number to be multiplied. + + b - Supplies the second number to be multiplied. + + out - Supplies a pointer to a size_t which acts as the return value in success cases. + +Return Value: + + Returns false if the operation was successful. + Returns true if wraparound of size_t was detected. + +--*/ +inline bool mul_overflow_size_t_builtin(size_t a, size_t b, size_t* out) { +#if defined(__has_builtin) +# if __has_builtin(__builtin_mul_overflow) + return __builtin_mul_overflow(a, b, out); +# endif +#endif + // Fallback to manual check if builtin not available + if (b != 0 && a > SIZE_MAX / b) return true; + if (out) *out = a * b; + return false; +} diff --git a/onnxruntime/core/mlas/lib/kleidiai/sgemm_kleidiai.cpp b/onnxruntime/core/mlas/lib/kleidiai/sgemm_kleidiai.cpp index ea38f16205a7c..435ff1fb10017 100644 --- a/onnxruntime/core/mlas/lib/kleidiai/sgemm_kleidiai.cpp +++ b/onnxruntime/core/mlas/lib/kleidiai/sgemm_kleidiai.cpp @@ -14,6 +14,16 @@ #include "kai/ukernels/matmul/pack/kai_rhs_pack_nxk_f32p2vlx1biasf32_f32_f32_sme.h" #include "mlasi_kleidiai.h" + +// Thread-local reusable buffers to reduce allocation overhead across tiles. +struct KaiTlsBuffers { + std::vector<float> output_tile; + std::vector<float> bias_zero; + std::vector<std::byte> rhs_packed; + std::vector<std::byte> lhs_packed; +}; +static thread_local KaiTlsBuffers g_kai_tls; + size_t MLASCALL ArmKleidiAI::MlasGemmPackBSize( @@ -51,7 +61,6 @@ Return Value: // Compute the number of bytes required to hold the packed buffer. // size_t bytes = 0; - if (TransA == CblasNoTrans) { switch (TransB) { case CblasNoTrans: @@ -125,15 +134,15 @@ Return Value: const size_t sr = UseSME2 ? kai_get_sr_matmul_clamp_f32_f32p2vlx1_f32p2vlx1biasf32_sme2_mopa() : kai_get_sr_matmul_clamp_f32_f32p2vlx1_f32p2vlx1b_2vlx2vl_sme_mopa(); - // pass zeroed bias values - const std::vector<float> bias(N); + // Ensure size and zero the used span.
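// --- Illustrative aside (not part of this patch) ----------------------------
// mul_overflow_size_t_builtin, added above in mlasi_kleidiai.h, is used to guard
// the stride * batch-count products before the thread-local buffers are resized.
// The self-contained sketch below shows that usage pattern; the helper is
// re-declared here under the hypothetical name mul_overflow_size_t purely so the
// example compiles on its own, and the sizes are made-up sample values.
#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <vector>

static bool mul_overflow_size_t(size_t a, size_t b, size_t* out) {
#if defined(__has_builtin)
#  if __has_builtin(__builtin_mul_overflow)
  return __builtin_mul_overflow(a, b, out);
#  endif
#endif
  // Manual fallback: a * b wraps around iff a > SIZE_MAX / b (for b != 0).
  if (b != 0 && a > SIZE_MAX / b) return true;
  if (out) *out = a * b;
  return false;
}

int main() {
  const size_t packed_stride = 1024;  // bytes per batch (sample value)
  const size_t batch_count = 8;       // number of batches (sample value)
  size_t total_bytes = 0;
  if (mul_overflow_size_t(packed_stride, batch_count, &total_bytes)) {
    // The kernel falls back to the generic MLAS path at this point.
    std::printf("size_t overflow detected, falling back\n");
    return 0;
  }
  std::vector<std::byte> packed(total_bytes);  // safe to size the buffer now
  std::printf("allocated %zu bytes\n", packed.size());
  return 0;
}
// --- End of aside ------------------------------------------------------------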
+ g_kai_tls.bias_zero.resize(N, 0.0f); switch (TransB) { case CblasNoTrans: - kai_run_rhs_pack_kxn_f32p2vlx1biasf32_f32_f32_sme(1, N, K, nr, kr, sr, ldb * sizeof(float), B, bias.data(), nullptr, PackedB, 0, nullptr); + kai_run_rhs_pack_kxn_f32p2vlx1biasf32_f32_f32_sme(1, N, K, nr, kr, sr, ldb * sizeof(float), B, g_kai_tls.bias_zero.data(), nullptr, PackedB, 0, nullptr); break; case CblasTrans: - kai_run_rhs_pack_nxk_f32p2vlx1biasf32_f32_f32_sme(1, N, K, nr, kr, sr, ldb * sizeof(float), B, bias.data(), nullptr, PackedB, 0, nullptr); + kai_run_rhs_pack_nxk_f32p2vlx1biasf32_f32_f32_sme(1, N, K, nr, kr, sr, ldb * sizeof(float), B, g_kai_tls.bias_zero.data(), nullptr, PackedB, 0, nullptr); break; default: return false; @@ -225,22 +234,29 @@ Return Value: size_t n_step = UseSME2 ? kai_get_n_step_matmul_clamp_f32_f32p2vlx1_f32p2vlx1biasf32_sme2_mopa() : kai_get_n_step_matmul_clamp_f32_f32p2vlx1_f32p2vlx1b_2vlx2vl_sme_mopa(); - if (M < m_step && N < n_step && !Data->BIsPacked) { + if ((M < m_step || N < n_step) && !Data->BIsPacked) { // Fallback to MLAS return false; } - std::vector KaiPackedData; - KaiPackedData.resize(BatchSize); - size_t LhsPackedStride = 0; std::byte* LhsPackedData = nullptr; LhsPackedStride = kai_get_lhs_packed_size_lhs_pack_f32p2vlx1_f32_sme(M, K, mr, kr, sr); - auto LhsPacked = std::make_unique(LhsPackedStride * BatchSize); - LhsPackedData = LhsPacked.get(); - std::unique_ptr RhsPacked{nullptr}; + size_t lhs_resize = 0; + if(mul_overflow_size_t_builtin(LhsPackedStride, BatchSize, &lhs_resize)) + { + // size_t wraparound detected for LhsPackedStride, fallback to MLAS + return false; + } + + g_kai_tls.lhs_packed.resize(lhs_resize); + LhsPackedData = g_kai_tls.lhs_packed.data(); + + // RHS packed buffer: use TLS reusable vector to minimize allocations + size_t RhsPackedStride = 0; + std::byte* RhsPackedData = nullptr; // It is assumed all B batches require packing or not if (Data[0].BIsPacked) { @@ -248,36 +264,31 @@ Return Value: MlasTrySimpleParallel(ThreadPool, BatchSize, [&](ptrdiff_t batch_idx) { std::byte* LhsPackedPtr = &(LhsPackedData[LhsPackedStride * batch_idx]); kai_run_lhs_pack_f32p2vlx1_f32_sme(M, K, mr, kr, sr, 0, Data[batch_idx].A, Data[batch_idx].lda * sizeof(float), LhsPackedPtr); - KaiPackedData[batch_idx].A = reinterpret_cast(LhsPackedPtr); - KaiPackedData[batch_idx].B = Data[batch_idx].B; }); } else { // Multithread pack lhs and rhs - size_t RhsPackedStride = 0; - std::byte* RhsPackedData = nullptr; - RhsPackedStride = ArmKleidiAI::MlasGemmPackBSize(TransA, TransB, N, K); - RhsPacked = std::make_unique(RhsPackedStride * BatchSize); - RhsPackedData = RhsPacked.get(); + size_t rhs_resize = 0; + if (mul_overflow_size_t_builtin(RhsPackedStride, BatchSize, &rhs_resize)) + { + // size_t wraparound detected for RhsPackedStride, fallback to MLAS + return false; + } + + g_kai_tls.rhs_packed.resize(rhs_resize); + RhsPackedData = g_kai_tls.rhs_packed.data(); MlasTrySimpleParallel(ThreadPool, BatchSize * 2, [&](ptrdiff_t batch_idx) { - // lhs odd, rhs even if (batch_idx & 0x1) { batch_idx >>= 1; - std::byte* LhsPackedPtr = &(LhsPackedData[LhsPackedStride * batch_idx]); - kai_run_lhs_pack_f32p2vlx1_f32_sme(M, K, mr, kr, sr, 0, Data[batch_idx].A, Data[batch_idx].lda * sizeof(float), LhsPackedPtr); - - KaiPackedData[batch_idx].A = reinterpret_cast(LhsPackedPtr); } else { batch_idx >>= 1; - std::byte* RhsPackedPtr = &(RhsPackedData[RhsPackedStride * batch_idx]); - - ArmKleidiAI::MlasGemmPackB(TransA, TransB, N, K, reinterpret_cast(Data[batch_idx].B), Data[batch_idx].ldb, 
RhsPackedPtr); - - KaiPackedData[batch_idx].B = reinterpret_cast(RhsPackedPtr); + ArmKleidiAI::MlasGemmPackB(TransA, TransB, N, K, + reinterpret_cast(Data[batch_idx].B), + Data[batch_idx].ldb, RhsPackedPtr); } }); } @@ -303,6 +314,14 @@ Return Value: dim[1] = MlasDivRoundup(M, m_step); dim[2] = MlasDivRoundup(N, n_step); + // Pre-check maximum tile size to avoid per-iteration overflow inside the parallel loop. + // Any TileSizeM/TileSizeN used below will be <= m_step/n_step respectively. + size_t max_tile_elems = 0; + if (mul_overflow_size_t_builtin(m_step, n_step, &max_tile_elems)) { + // size_t wraparound detected for tile size, fallback to MLAS + return false; + } + MlasTrySimpleParallel(ThreadPool, static_cast(dim[0] * dim[1] * dim[2]), [=](ptrdiff_t tid) { // compute B,M,N index from iteration index ptrdiff_t BIdx = tid / (dim[1] * dim[2]); @@ -314,18 +333,18 @@ Return Value: UseSME2 ? kai_get_rhs_packed_offset_matmul_clamp_f32_f32p2vlx1_f32p2vlx1biasf32_sme2_mopa(NIdx * n_step, K) : kai_get_rhs_packed_offset_matmul_clamp_f32_f32p2vlx1_f32p2vlx1b_2vlx2vl_sme_mopa(NIdx * n_step, K); - auto BTile = reinterpret_cast( - reinterpret_cast(KaiPackedData[BIdx].B) + rhs_packed_offset - ); + const std::byte* B_base = Data[0].BIsPacked + ? reinterpret_cast(Data[BIdx].B) + : (RhsPackedData + RhsPackedStride * BIdx); + auto BTile = reinterpret_cast(B_base + rhs_packed_offset); // Get lhs tile, A const size_t lhs_packed_offset = UseSME2 ? kai_get_lhs_packed_offset_matmul_clamp_f32_f32p2vlx1_f32p2vlx1biasf32_sme2_mopa(MIdx * m_step, K) : kai_get_lhs_packed_offset_matmul_clamp_f32_f32p2vlx1_f32p2vlx1b_2vlx2vl_sme_mopa(MIdx * m_step, K); - auto ATile = reinterpret_cast( - reinterpret_cast(KaiPackedData[BIdx].A) + lhs_packed_offset - ); + const std::byte* A_base = LhsPackedData + LhsPackedStride * BIdx; + auto ATile = reinterpret_cast(A_base + lhs_packed_offset); auto TileSizeM = (MIdx + 1) * m_step > M ? (M - MIdx * m_step) : m_step; auto TileSizeN = (NIdx + 1) * n_step > N ? (N - NIdx * n_step) : n_step; @@ -336,9 +355,14 @@ Return Value: MIdx * m_step * Data[BIdx].ldc * sizeof(float) + NIdx * n_step * sizeof(float) ); - // Allocate temporary buffer for raw A*B result - std::vector OutputTile(TileSizeM * TileSizeN, 0.0f); - float* temp_tile = OutputTile.data(); + // Allocate temporary buffer for raw A*B result (TLS reusable buffer) + size_t tile_elems = TileSizeM * TileSizeN; + + // resize the tile to the required size + g_kai_tls.output_tile.resize(tile_elems); + + float* temp_tile = g_kai_tls.output_tile.data(); + std::fill_n(temp_tile, tile_elems, 0.0f); if (UseSME2) { kai_run_matmul_clamp_f32_f32p2vlx1_f32p2vlx1biasf32_sme2_mopa( diff --git a/onnxruntime/core/platform/linux/device_discovery.cc b/onnxruntime/core/platform/linux/device_discovery.cc index 6a02a1b46028f..e9c45a6966ef8 100644 --- a/onnxruntime/core/platform/linux/device_discovery.cc +++ b/onnxruntime/core/platform/linux/device_discovery.cc @@ -6,6 +6,7 @@ #include #include #include +#include #include #include "core/common/common.h" @@ -100,27 +101,44 @@ Status ReadValueFromFile(const fs::path& file_path, ValueType& value) { return ParseStringWithClassicLocale(file_text, value); } +std::optional IsGpuDiscrete(uint16_t vendor_id, uint16_t device_id) { + ORT_UNUSED_PARAMETER(device_id); + + // Currently, we only assume that all Nvidia GPUs are discrete. 
+ + constexpr auto kNvidiaPciId = 0x10de; + if (vendor_id == kNvidiaPciId) { + return true; + } + + return std::nullopt; +} + Status GetGpuDeviceFromSysfs(const GpuSysfsPathInfo& path_info, OrtHardwareDevice& gpu_device_out) { OrtHardwareDevice gpu_device{}; const auto& sysfs_path = path_info.path; // vendor id - { - const auto vendor_id_path = sysfs_path / "device" / "vendor"; - ORT_RETURN_IF_ERROR(ReadValueFromFile(vendor_id_path, gpu_device.vendor_id)); - } + uint16_t vendor_id{}; + const auto vendor_id_path = sysfs_path / "device" / "vendor"; + ORT_RETURN_IF_ERROR(ReadValueFromFile(vendor_id_path, vendor_id)); + gpu_device.vendor_id = vendor_id; // TODO vendor name // device id - { - const auto device_id_path = sysfs_path / "device" / "device"; - ORT_RETURN_IF_ERROR(ReadValueFromFile(device_id_path, gpu_device.device_id)); - } + uint16_t device_id{}; + const auto device_id_path = sysfs_path / "device" / "device"; + ORT_RETURN_IF_ERROR(ReadValueFromFile(device_id_path, device_id)); + gpu_device.device_id = device_id; // metadata gpu_device.metadata.Add("card_idx", MakeString(path_info.card_idx)); - // TODO is card discrete? + + if (const auto is_gpu_discrete = IsGpuDiscrete(vendor_id, device_id); + is_gpu_discrete.has_value()) { + gpu_device.metadata.Add("Discrete", (*is_gpu_discrete ? "1" : "0")); + } gpu_device.type = OrtHardwareDeviceType_GPU; diff --git a/onnxruntime/core/providers/cpu/generator/constant_of_shape_base.h b/onnxruntime/core/providers/cpu/generator/constant_of_shape_base.h index ffd954f13e568..f08f134d0c080 100644 --- a/onnxruntime/core/providers/cpu/generator/constant_of_shape_base.h +++ b/onnxruntime/core/providers/cpu/generator/constant_of_shape_base.h @@ -78,8 +78,9 @@ class ConstantOfShapeBase { auto* t_proto_p = t_proto.get(); #endif if (info.GetAttr("value", t_proto_p).IsOK()) { - ORT_ENFORCE(t_proto_p->dims_size() == 1, "Must have a single dimension"); - ORT_ENFORCE(t_proto_p->dims()[0] == 1, "Must have a single dimension of 1"); + for (auto dim : t_proto_p->dims()) { + ORT_ENFORCE(dim == 1, "The value attribute of ConstantOfShape must be a single-element tensor"); + } SetValueFromTensorProto(*t_proto_p); } else { float f_value = 0.f; diff --git a/onnxruntime/core/providers/cpu/ml/tree_ensemble_attribute.h b/onnxruntime/core/providers/cpu/ml/tree_ensemble_attribute.h index ca568e485da11..09db2e4c46245 100644 --- a/onnxruntime/core/providers/cpu/ml/tree_ensemble_attribute.h +++ b/onnxruntime/core/providers/cpu/ml/tree_ensemble_attribute.h @@ -134,11 +134,6 @@ struct TreeEnsembleAttributesV5 { for (auto i : nodes_modes_i) { nodes_modes.push_back(static_cast(i)); } -#else - // GetVectorAttrsOrDefault is not part of the minimal build. - // As a result, TreeEnsemble v5 cannot be available in this build. - ORT_THROW("TreeEnsemble(ai.onnx.ml==5) is not supported with the minimal build."); -#endif aggregate_function = info.GetAttrOrDefault("aggregate_function", 1); leaf_targetids = info.GetAttrsOrDefault("leaf_targetids"); @@ -151,6 +146,11 @@ struct TreeEnsembleAttributesV5 { nodes_truenodeids = info.GetAttrsOrDefault("nodes_truenodeids"); post_transform = info.GetAttrOrDefault("post_transform", 0); tree_roots = info.GetAttrsOrDefault("tree_roots"); +#else + // GetVectorAttrsOrDefault is not part of the minimal build. + // As a result, TreeEnsemble v5 cannot be available in this build. 
+ ORT_THROW("TreeEnsemble(ai.onnx.ml==5) is not supported with the minimal build."); +#endif } void convert_to_v3(TreeEnsembleAttributesV3& output) const { diff --git a/onnxruntime/core/providers/migraphx/migraphx_execution_provider.cc b/onnxruntime/core/providers/migraphx/migraphx_execution_provider.cc index a59347841be95..55f901164bdac 100644 --- a/onnxruntime/core/providers/migraphx/migraphx_execution_provider.cc +++ b/onnxruntime/core/providers/migraphx/migraphx_execution_provider.cc @@ -268,6 +268,7 @@ static bool IsTypeSupported(const NodeArg* node_arg) { case ONNX_NAMESPACE::TensorProto_DataType::TensorProto_DataType_FLOAT16: case ONNX_NAMESPACE::TensorProto_DataType::TensorProto_DataType_BFLOAT16: case ONNX_NAMESPACE::TensorProto_DataType::TensorProto_DataType_FLOAT: + case ONNX_NAMESPACE::TensorProto_DataType::TensorProto_DataType_FLOAT4E2M1: case ONNX_NAMESPACE::TensorProto_DataType::TensorProto_DataType_FLOAT8E4M3FN: case ONNX_NAMESPACE::TensorProto_DataType::TensorProto_DataType_FLOAT8E4M3FNUZ: case ONNX_NAMESPACE::TensorProto_DataType::TensorProto_DataType_FLOAT8E5M2: @@ -318,6 +319,9 @@ static bool getMIGraphXType(ONNXTensorElementDataType type, case ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT8E5M2FNUZ: mgx_type = migraphx_shape_fp8e5m2fnuz_type; break; + case ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT4E2M1: + mgx_type = migraphx_shape_fp4x2_type; + break; case ONNX_TENSOR_ELEMENT_DATA_TYPE_INT4: mgx_type = migraphx_shape_int8_type; break; @@ -949,6 +953,8 @@ GetUnsupportedNodeIndices(const GraphViewer& graph_viewer, "QLinearAdd", "QLinearConv", "QLinearMatMul", + "QLinearAveragePool", + "QLinearGlobalAveragePool", "QuantizeLinear", "QuickGelu", "DynamicQuantizeLinear", diff --git a/onnxruntime/core/providers/qnn/builder/qnn_node_group/qnn_node_group.cc b/onnxruntime/core/providers/qnn/builder/qnn_node_group/qnn_node_group.cc index a47ba7893d8fe..368caa518b7ba 100644 --- a/onnxruntime/core/providers/qnn/builder/qnn_node_group/qnn_node_group.cc +++ b/onnxruntime/core/providers/qnn/builder/qnn_node_group/qnn_node_group.cc @@ -21,6 +21,7 @@ #include "core/providers/qnn/builder/qnn_node_group/udo_fusion.h" #include "core/providers/qnn/builder/qnn_node_group/lpbqgemm_fusion.h" #include "core/providers/qnn/builder/qnn_node_group/lpbqmatmul_fusion.h" +#include "core/providers/qnn/builder/qnn_node_group/reshape_transpose_rank5.h" #include "core/providers/qnn/builder/qnn_utils.h" #include "core/providers/qnn/ort_api.h" @@ -82,6 +83,7 @@ static std::unordered_map> fusions = { {"Gemm", {LowPowerBlockQuantizedGemmFusion::TryFusion, ReshapeGemmFusion::TryFusion}}, {"Mul", {ScaleSoftmaxFusion::TryFusion}}, {"Cast", {CastLoneQFusion::TryFusion}}, + {"Reshape", {Rank6ToRank5Fusion::TryFusion}}, {"Transpose", {ChannelShuffleFusion::TryFusion}}}; void registerUDO(const std::string& node_type, const std::string& op_package) { @@ -117,8 +119,10 @@ static std::unique_ptr TryQnnFusions( const std::unordered_map& node_to_node_unit, const std::unordered_map& node_unit_to_qnn_node_group, const logging::Logger& logger) { - // For now, all fusions involve standalone node units (i.e., no wrapping DQ/Q nodes) except MatMul w/ LPBQ encodings - if (starting_node_unit.UnitType() != NodeUnit::Type::SingleNode && starting_node_unit.OpType() != "MatMul") { + // For now, all fusions involve standalone node units (i.e., no wrapping DQ/Q nodes) except MatMul w/ LPBQ encodings and Reshape + if (starting_node_unit.UnitType() != NodeUnit::Type::SingleNode && + starting_node_unit.OpType() != "MatMul" && + 
starting_node_unit.OpType() != "Reshape") { return nullptr; } diff --git a/onnxruntime/core/providers/qnn/builder/qnn_node_group/reshape_transpose_rank5.cc b/onnxruntime/core/providers/qnn/builder/qnn_node_group/reshape_transpose_rank5.cc new file mode 100644 index 0000000000000..3218e32cac097 --- /dev/null +++ b/onnxruntime/core/providers/qnn/builder/qnn_node_group/reshape_transpose_rank5.cc @@ -0,0 +1,459 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#include "core/providers/qnn/builder/qnn_node_group/reshape_transpose_rank5.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "core/common/inlined_containers.h" +#include "core/providers/qnn/builder/qnn_utils.h" +#include "core/providers/qnn/builder/op_builder_factory.h" +#include "core/providers/qnn/builder/qnn_node_group/utils.h" +#include "core/providers/qnn/builder/qnn_model_wrapper.h" +#include "core/providers/qnn/builder/opbuilder/base_op_builder.h" +#include "core/common/safeint.h" + +namespace onnxruntime { +namespace qnn { +namespace { + +constexpr size_t kRank6 = 6; +constexpr size_t kRank5 = 5; +constexpr const char* kOpTypeReshape = "Reshape"; +constexpr const char* kOpTypeTranspose = "Transpose"; +constexpr const char* kAttrTransposePerm = "perm"; + +using MapNodeToNodeUnit = std::unordered_map; +using MapNodeUnitToGroup = std::unordered_map; + +/// @brief Get the shape of a tensor from its NodeArg +std::optional GetTensorShape(const NodeArg* node_arg) { + if (node_arg == nullptr) { + return std::nullopt; + } + auto shape_proto = node_arg->Shape(); + if (shape_proto == nullptr) { + return std::nullopt; + } + return utils::GetTensorProtoShape(*shape_proto); +} + +/// @brief Get child NodeUnit of specified type, allowing QDQ-wrapped nodes +const NodeUnit* GetChildNodeUnit( + const GraphViewer& graph_viewer, + const NodeUnit& parent_node_unit, + const std::string& child_op_type, + const MapNodeToNodeUnit& node_to_node_unit, + const MapNodeUnitToGroup& node_unit_to_qnn_node_group, + const logging::Logger& logger) { + const Node& parent_node = parent_node_unit.GetNode(); + + ORT_UNUSED_PARAMETER(logger); + // For QDQ NodeUnits, we need to look at the Q node's output, not the target node's output + const Node* search_node = &parent_node; + if (parent_node_unit.UnitType() == NodeUnit::Type::QDQGroup) { + const auto& q_nodes = parent_node_unit.GetQNodes(); + if (!q_nodes.empty()) { + search_node = q_nodes[0]; // Use first Q node + } + } + + // Search node must have a single child (1 output edge) and must not produce a graph output + if (search_node->GetOutputEdgesCount() != 1 || graph_viewer.NodeProducesGraphOutput(*search_node)) { + return nullptr; + } + + // Get the child node from the search node's output edge + const Node* potential_child = &search_node->OutputEdgesBegin()->GetNode(); + if (graph_viewer.GetNode(potential_child->Index()) == nullptr) { + return nullptr; + } + + // If the child is a DequantizeLinear, skip it and look at its child (the target op of the next QDQ group) + if (potential_child->OpType() == "DequantizeLinear") { + if (potential_child->GetOutputEdgesCount() != 1) { + return nullptr; + } + potential_child = &potential_child->OutputEdgesBegin()->GetNode(); + if (graph_viewer.GetNode(potential_child->Index()) == nullptr) { + return nullptr; + } + } + + // Check if this node matches the target type + if (potential_child->OpType() != child_op_type) { + return nullptr; + } + + // Get the NodeUnit for 
the child + const auto child_node_unit_it = node_to_node_unit.find(potential_child); + if (child_node_unit_it == node_to_node_unit.end()) { + return nullptr; + } + + const NodeUnit* child_node_unit = child_node_unit_it->second; + + // Check if child node has already been handled + if (node_unit_to_qnn_node_group.count(child_node_unit) != 0) { + return nullptr; + } + + return child_node_unit; +} + +/// @brief Match the pattern: Reshape -> Transpose -> Reshape with rank-6 intermediate tensors +std::optional> MatchRank6ToRank5Pattern( + const GraphViewer& graph_viewer, + const NodeUnit* reshape1, + const MapNodeToNodeUnit& node_to_node_unit, + const MapNodeUnitToGroup& node_unit_to_qnn_node_group, + const logging::Logger& logger) { + LOGS(logger, VERBOSE) << "[Rank6ToRank5] MatchPattern: Checking node " << reshape1->Name() + << " OpType=" << reshape1->OpType() + << " UnitType=" << static_cast(reshape1->UnitType()); + + // Validate first Reshape in pattern - allow both SingleNode and QDQGroup + if (reshape1->OpType() != kOpTypeReshape) { + LOGS(logger, VERBOSE) << "[Rank6ToRank5] First node in pattern is not a Reshape op"; + return std::nullopt; + } + + // Get Transpose child (middle node in pattern) - allow both SingleNode and QDQGroup + const NodeUnit* transpose = GetChildNodeUnit( + graph_viewer, *reshape1, kOpTypeTranspose, node_to_node_unit, node_unit_to_qnn_node_group, logger); + if (transpose == nullptr) { + LOGS(logger, VERBOSE) << "[Rank6ToRank5] Transpose (middle node in pattern) not found after first Reshape"; + return std::nullopt; + } + + LOGS(logger, VERBOSE) << "[Rank6ToRank5] Found Transpose (middle node): " << transpose->Name(); + + // Get second Reshape child (last node in pattern) - allow both SingleNode and QDQGroup + const NodeUnit* reshape2 = GetChildNodeUnit( + graph_viewer, *transpose, kOpTypeReshape, node_to_node_unit, node_unit_to_qnn_node_group, logger); + if (reshape2 == nullptr) { + LOGS(logger, VERBOSE) << "[Rank6ToRank5] Second Reshape (last node in pattern) not found after Transpose"; + return std::nullopt; + } + + LOGS(logger, VERBOSE) << "[Rank6ToRank5] Found second Reshape (last node): " << reshape2->Name(); + LOGS(logger, INFO) << "[Rank6ToRank5] Pattern matched: Reshape -> Transpose -> Reshape"; + + return std::array{reshape1, transpose, reshape2}; +} + +/// @brief Validate the pattern conditions and find the unit dimension index +std::optional ValidatePatternConditions( + const NodeUnit* reshape1, + const NodeUnit* transpose, + const NodeUnit* reshape2, + const QnnModelWrapper& qnn_model_wrapper, + const logging::Logger& logger) { + // Check if reshape shape inputs are constants + const NodeArg* reshape1_shape_input = reshape1->GetNode().InputDefs()[1]; + const NodeArg* reshape2_shape_input = reshape2->GetNode().InputDefs()[1]; + + if (!qnn_model_wrapper.IsConstantInput(reshape1_shape_input->Name())) { + LOGS(logger, VERBOSE) << "[Rank6ToRank5] ValidateConditions: Reshape1 shape input is not constant"; + return std::nullopt; + } + + if (!qnn_model_wrapper.IsConstantInput(reshape2_shape_input->Name())) { + LOGS(logger, VERBOSE) << "[Rank6ToRank5] ValidateConditions: Reshape2 shape input is not constant"; + return std::nullopt; + } + + // Get tensor shapes + auto t0_shape = GetTensorShape(reshape1->GetNode().InputDefs()[0]); + auto t1_shape = GetTensorShape(reshape1->GetNode().OutputDefs()[0]); + auto t2_shape = GetTensorShape(transpose->GetNode().OutputDefs()[0]); + auto t3_shape = GetTensorShape(reshape2->GetNode().OutputDefs()[0]); + + if 
(!t0_shape.has_value() || !t1_shape.has_value() || + !t2_shape.has_value() || !t3_shape.has_value()) { + LOGS(logger, VERBOSE) << "[Rank6ToRank5] ValidateConditions: Failed to get tensor shapes"; + return std::nullopt; + } + + auto t1_dims = t1_shape->GetDims(); + auto t2_dims = t2_shape->GetDims(); + + // Condition 1: Rank(t1) == Rank(t2) == 6 + if (t1_shape->NumDimensions() != kRank6 || t2_shape->NumDimensions() != kRank6) { + LOGS(logger, VERBOSE) << "[Rank6ToRank5] ValidateConditions: Condition 1 failed - not rank-6: t1_rank=" + << t1_shape->NumDimensions() << " t2_rank=" << t2_shape->NumDimensions(); + return std::nullopt; + } + + if (t1_dims.empty() || t2_dims.empty()) { + LOGS(logger, VERBOSE) << "[Rank6ToRank5] ValidateConditions: Empty dims"; + return std::nullopt; + } + + // Condition 2: Find a dimension with value 1 that exists at the same index in both t1 and t2 + std::optional unit_dim_index; + for (size_t i = 0; i < kRank6; ++i) { + if (t1_dims[i] == 1 && t2_dims[i] == 1) { + unit_dim_index = i; + break; + } + } + + if (!unit_dim_index.has_value()) { + LOGS(logger, VERBOSE) << "[Rank6ToRank5] ValidateConditions: No common unit dimension found in t1 and t2"; + return std::nullopt; + } + + // Condition 3: Transpose must leave the unit dimension in place + NodeAttrHelper transpose_helper(transpose->GetNode()); + std::vector perm = transpose_helper.Get(kAttrTransposePerm, std::vector{}); + if (perm.size() != kRank6) { + LOGS(logger, VERBOSE) << "[Rank6ToRank5] ValidateConditions: Invalid permutation size: " << perm.size(); + return std::nullopt; + } + + if (perm[unit_dim_index.value()] != static_cast(unit_dim_index.value())) { + LOGS(logger, VERBOSE) << "[Rank6ToRank5] ValidateConditions: Transpose moves unit dimension from index " + << unit_dim_index.value() << " to " << perm[unit_dim_index.value()]; + return std::nullopt; + } + + LOGS(logger, INFO) << "[Rank6ToRank5] ValidateConditions: All conditions passed! 
Unit dimension at index " + << unit_dim_index.value(); + return unit_dim_index; +} + +/// @brief Create or validate the QNN nodes with rank-5 tensors +Status CreateOrValidateOnQnn( + QnnModelWrapper* qnn_model_wrapper, + gsl::span node_units, + size_t unit_dim_index, + bool validate, + const logging::Logger& logger) { + LOGS(logger, VERBOSE) << "[Rank6ToRank5] CreateOrValidateOnQnn: validate=" << validate + << " unit_dim_index=" << unit_dim_index; + + const NodeUnit* reshape1 = node_units[0]; + const NodeUnit* transpose = node_units[1]; + const NodeUnit* reshape2 = node_units[2]; + + // Get input and output definitions + const NodeUnitIODef& reshape1_input = reshape1->Inputs()[0]; + const NodeUnitIODef& reshape2_output = reshape2->Outputs()[0]; + + // Get original shapes + auto t1_shape = GetTensorShape(reshape1->GetNode().OutputDefs()[0]); + auto t2_shape = GetTensorShape(transpose->GetNode().OutputDefs()[0]); + + if (!t1_shape.has_value() || !t2_shape.has_value()) { + return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "Failed to get intermediate tensor shapes"); + } + + auto t1_dims = t1_shape->GetDims(); + auto t2_dims = t2_shape->GetDims(); + + // Create rank-5 shape for t1 (remove unit dimension at unit_dim_index) + std::vector t1_rank5_dims; + t1_rank5_dims.reserve(kRank5); + for (size_t i = 0; i < t1_dims.size(); ++i) { + if (i != unit_dim_index) { + t1_rank5_dims.push_back(static_cast(t1_dims[i])); + } + } + + // Create rank-5 shape for t2 (remove unit dimension at unit_dim_index) + std::vector t2_rank5_dims; + t2_rank5_dims.reserve(kRank5); + for (size_t i = 0; i < t2_dims.size(); ++i) { + if (i != unit_dim_index) { + t2_rank5_dims.push_back(static_cast(t2_dims[i])); + } + } + + // Get transpose permutation and adjust for rank-5 + NodeAttrHelper transpose_helper(transpose->GetNode()); + std::vector perm = transpose_helper.Get(kAttrTransposePerm, std::vector{}); + if (perm.size() != kRank6) { + return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "Expected rank-6 permutation, got rank-", perm.size()); + } + + // Remove unit dimension and adjust indices + std::vector perm_rank5; + perm_rank5.reserve(kRank5); + for (size_t i = 0; i < perm.size(); ++i) { + if (i != unit_dim_index) { + int64_t perm_val = perm[i]; + // Adjust index: if perm_val > unit_dim_index, subtract 1 + if (perm_val > static_cast(unit_dim_index)) { + perm_val--; + } + perm_rank5.push_back(static_cast(perm_val)); + } + } + + // Use original tensor names from ONNX + const std::string& t1_name = reshape1->GetNode().OutputDefs()[0]->Name(); + const std::string& t2_name = transpose->GetNode().OutputDefs()[0]->Name(); + + // Get data type from the NodeUnit's output (handles both quantized and float types) + const NodeUnitIODef& reshape1_output = reshape1->Outputs()[0]; + Qnn_DataType_t data_type; + ORT_RETURN_IF_ERROR(utils::GetQnnDataType(reshape1_output.quant_param.has_value(), + reshape1_output.node_arg.TypeAsProto(), + data_type)); + + // Get input shape for first Reshape + std::vector reshape1_input_shape; + ORT_RETURN_IF_NOT(qnn_model_wrapper->GetOnnxShape(reshape1_input.node_arg, reshape1_input_shape), + "Failed to get first Reshape input shape"); + + // Get quantization params for first Reshape input + QnnQuantParamsWrapper quant_param; + ORT_RETURN_IF_ERROR(quant_param.Init(*qnn_model_wrapper, reshape1_input)); + + // Create Reshape1 with rank-5 output using AddReshapeNode + ORT_RETURN_IF_ERROR(qnn_model_wrapper->AddReshapeNode( + reshape1_input.node_arg.Name(), + t1_name, + reshape1_input_shape, + t1_rank5_dims, + data_type, + 
quant_param, + validate, + false, // is_for_input + false // is_for_output + )); + + // Create Transpose with rank-5 input/output + { + // Get quantization params for transpose output + const NodeUnitIODef& transpose_output = transpose->Outputs()[0]; + QnnQuantParamsWrapper transpose_quant_param; + ORT_RETURN_IF_ERROR(transpose_quant_param.Init(*qnn_model_wrapper, transpose_output)); + + // Check if output tensor already exists + if (!qnn_model_wrapper->IsQnnTensorWrapperExist(t2_name)) { + // Create rank-5 output tensor for transpose with proper quantization params + QnnTensorWrapper t2_tensor(t2_name, QNN_TENSOR_TYPE_NATIVE, data_type, std::move(transpose_quant_param), + std::vector(t2_rank5_dims)); + ORT_RETURN_IF_NOT(qnn_model_wrapper->AddTensorWrapper(std::move(t2_tensor)), "Failed to add transpose output"); + } + + // Create perm parameter + std::vector perm_shape = {static_cast(perm_rank5.size())}; + QnnParamWrapper perm_param(transpose->Index(), transpose->Name(), QNN_OP_TRANSPOSE_PARAM_PERM, + std::move(perm_shape), std::move(perm_rank5)); + std::vector param_tensor_names = {perm_param.GetParamTensorName()}; + ORT_RETURN_IF_NOT(qnn_model_wrapper->AddParamWrapper(std::move(perm_param)), "Failed to add perm param"); + + std::vector transpose_input_names = {t1_name}; + std::vector transpose_output_names = {t2_name}; + + ORT_RETURN_IF_NOT(qnn_model_wrapper->CreateQnnNode( + utils::GetUniqueName(*transpose), + QNN_OP_PACKAGE_NAME_QTI_AISW, + QNN_OP_TRANSPOSE, + std::move(transpose_input_names), + std::move(transpose_output_names), + std::move(param_tensor_names), + validate), + "Failed to create rank-5 Transpose node"); + } + + // Get output shape for reshape2 + std::vector reshape2_output_shape; + ORT_RETURN_IF_NOT(qnn_model_wrapper->GetOnnxShape(reshape2_output.node_arg, reshape2_output_shape), + "Failed to get reshape2 output shape"); + + // Get quantization params for reshape2 + QnnQuantParamsWrapper quant_param2; + ORT_RETURN_IF_ERROR(quant_param2.Init(*qnn_model_wrapper, reshape2_output)); + + // Get data type from the NodeUnit's output (handles both quantized and float types) + ORT_RETURN_IF_ERROR(utils::GetQnnDataType(reshape2_output.quant_param.has_value(), + reshape2_output.node_arg.TypeAsProto(), + data_type)); + + // Create Reshape2 with rank-5 input using AddReshapeNode + ORT_RETURN_IF_ERROR(qnn_model_wrapper->AddReshapeNode( + t2_name, + reshape2_output.node_arg.Name(), + t2_rank5_dims, + reshape2_output_shape, + data_type, + quant_param2, + validate, + false, // is_for_input + false // is_for_output + )); + + return Status::OK(); +} + +} // namespace + +std::unique_ptr Rank6ToRank5Fusion::TryFusion( + QnnModelWrapper& qnn_model_wrapper, + const NodeUnit& reshape1_node_unit, + const MapNodeToNodeUnit& node_to_node_unit, + const MapNodeUnitToGroup& node_unit_to_qnn_node_group, + const logging::Logger& logger) { + LOGS(logger, VERBOSE) << "[Rank6ToRank5] TryFusion called for node: " << reshape1_node_unit.Name() + << " OpType: " << reshape1_node_unit.OpType(); + + const GraphViewer& graph_viewer = qnn_model_wrapper.GetGraphViewer(); + + // Match the pattern + std::optional> pattern = MatchRank6ToRank5Pattern( + graph_viewer, &reshape1_node_unit, node_to_node_unit, node_unit_to_qnn_node_group, logger); + + if (!pattern.has_value()) { + LOGS(logger, VERBOSE) << "[Rank6ToRank5] Pattern match failed for node: " << reshape1_node_unit.Name(); + return nullptr; + } + + const NodeUnit* reshape1 = pattern->at(0); + const NodeUnit* transpose = pattern->at(1); + const NodeUnit* 
reshape2 = pattern->at(2);
+
+  // Validate pattern conditions and get unit dimension index
+  auto unit_dim_index = ValidatePatternConditions(reshape1, transpose, reshape2, qnn_model_wrapper, logger);
+  if (!unit_dim_index.has_value()) {
+    LOGS(logger, VERBOSE) << "[Rank6ToRank5] Pattern condition validation failed";
+    return nullptr;
+  }
+
+  // Validate on QNN
+  if (CreateOrValidateOnQnn(&qnn_model_wrapper, pattern.value(), unit_dim_index.value(), /*validate=*/true, logger) != Status::OK()) {
+    LOGS(logger, VERBOSE) << "[Rank6ToRank5] QNN validation failed";
+    return nullptr;
+  }
+
+  LOGS(logger, INFO) << "[Rank6ToRank5] Fusion successful! Creating Rank6ToRank5Fusion node group";
+  return std::make_unique<Rank6ToRank5Fusion>(pattern.value(), unit_dim_index.value());
+}
+
+gsl::span<const NodeUnit* const> Rank6ToRank5Fusion::GetNodeUnits() const {
+  return gsl::span{node_units_.data(), node_units_.size()};
+}
+
+Status Rank6ToRank5Fusion::IsSupported(
+    QnnModelWrapper& qnn_model_wrapper, const logging::Logger& logger) const {
+  return CreateOrValidateOnQnn(&qnn_model_wrapper, GetNodeUnits(), unit_dim_index_, /*validate=*/true, logger);
+}
+
+Status Rank6ToRank5Fusion::AddToModelBuilder(
+    QnnModelWrapper& qnn_model_wrapper, const logging::Logger& logger) const {
+  return CreateOrValidateOnQnn(&qnn_model_wrapper, GetNodeUnits(), unit_dim_index_, /*validate=*/false, logger);
+}
+
+} // namespace qnn
+} // namespace onnxruntime
diff --git a/onnxruntime/core/providers/qnn/builder/qnn_node_group/reshape_transpose_rank5.h b/onnxruntime/core/providers/qnn/builder/qnn_node_group/reshape_transpose_rank5.h
new file mode 100644
index 0000000000000..cbce6933fc8d7
--- /dev/null
+++ b/onnxruntime/core/providers/qnn/builder/qnn_node_group/reshape_transpose_rank5.h
@@ -0,0 +1,65 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+#pragma once
+
+#include <array>
+#include <cstddef>
+#include <memory>
+#include <string_view>
+#include <unordered_map>
+
+#include "core/providers/qnn/builder/qnn_node_group/qnn_node_group.h"
+#include "core/providers/qnn/ort_api.h"
+
+namespace onnxruntime {
+namespace qnn {
+
+class QnnModelWrapper;
+
+///
+/// Represents a fusion of pattern: Reshape -> Transpose -> Reshape where intermediate tensors are rank-6.
+/// QNN doesn't support rank-6 Reshape and Transpose operators, so this fusion converts them to rank-5
+/// by removing a unit dimension (value of 1) from intermediate tensors.
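+/// For example (illustrative shapes, not taken from any particular model): t1 = [1, 8, 12, 64, 32, 2] and
+/// t2 = [1, 12, 8, 64, 32, 2] share a unit dimension at index 0 that the Transpose leaves in place, so the
+/// fused ops can use the rank-5 shapes [8, 12, 64, 32, 2] and [12, 8, 64, 32, 2], with the permutation
+/// adjusted from [0, 2, 1, 3, 4, 5] to [1, 0, 2, 3, 4].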
+/// Pattern: Tensor(t0) -> Reshape(R1) -> Tensor(t1) -> Transpose(T1) -> Tensor(t2) -> Reshape(R2) -> Tensor(t3) +/// Conditions: +/// - Rank(t0) == Rank(t3) AND Last dimension of t0 equals last dimension of t3 +/// - Rank(t1) == Rank(t2) == 6 +/// - There exists a dimension index where both t1 and t2 have value 1 +/// - Transpose must leave that unit dimension in place (perm[unit_dim_index] == unit_dim_index) +/// +class Rank6ToRank5Fusion : public IQnnNodeGroup { + public: + explicit Rank6ToRank5Fusion(gsl::span node_units, size_t unit_dim_index) + : unit_dim_index_(unit_dim_index) { + ORT_ENFORCE(node_units.size() == 3, "Pattern expects exactly 3 NodeUnits."); + node_units_[0] = node_units[0]; + node_units_[1] = node_units[1]; + node_units_[2] = node_units[2]; + } + ORT_DISALLOW_COPY_AND_ASSIGNMENT(Rank6ToRank5Fusion); + + Status IsSupported(QnnModelWrapper& qnn_model_wrapper, const logging::Logger& logger) const override; + Status AddToModelBuilder(QnnModelWrapper& qnn_model_wrapper, const logging::Logger& logger) const override; + gsl::span GetNodeUnits() const override; + const NodeUnit* GetTargetNodeUnit() const override { return node_units_[0]; } + std::string_view Type() const override { return "Rank6ToRank5Fusion"; } + + /// + /// Traverses graph to check if the given starting NodeUnit is part of a valid Reshape -> Transpose -> Reshape + /// pattern with rank-6 intermediate tensors. + /// + static std::unique_ptr TryFusion( + QnnModelWrapper& qnn_model_wrapper, + const NodeUnit& reshape1_node_unit, + const std::unordered_map& node_to_node_unit, + const std::unordered_map& node_unit_to_qnn_node_group, + const logging::Logger& logger); + + private: + std::array node_units_; // Reshape1, Transpose, Reshape2 + size_t unit_dim_index_; // Index of the unit dimension (value 1) to remove +}; + +} // namespace qnn +} // namespace onnxruntime diff --git a/onnxruntime/core/providers/shared_library/provider_wrappedtypes.h b/onnxruntime/core/providers/shared_library/provider_wrappedtypes.h index 1ab32e649ed40..cdbd0c074f443 100644 --- a/onnxruntime/core/providers/shared_library/provider_wrappedtypes.h +++ b/onnxruntime/core/providers/shared_library/provider_wrappedtypes.h @@ -78,6 +78,8 @@ struct int64s final { const int64_t* data() const { return g_host->int64s__data(this); } const int64_t& operator[](int index) const { return Get(index); } void Reserve(int size) { g_host->int64s__Reserve(this, size); } + const int64_t* begin() const { return data(); } + const int64_t* end() const { return data() + size(); } PROVIDER_DISALLOW_ALL(int64s) }; diff --git a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc index 508d932459bf9..cd0c0e4bffdb5 100644 --- a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc +++ b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc @@ -3976,6 +3976,10 @@ Status TensorrtExecutionProvider::CreateNodeComputeInfoFromGraph(const GraphView // Destroy the IExecutionContext objects before destroying an engine object, otherwise it will lead to undefined behavior. trt_state->context->reset(); trt_state->engine->reset(); + + // Clear dds output allocator map since the engine and context will be recreated. 
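+      // (Illustrative note) Entries in this map hold allocators created for the previous engine's
+      // data-dependent-shape outputs, so reusing them after the rebuild could bind stale buffers.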
+ dds_output_allocator_map.clear(); + auto trt_config = std::unique_ptr(trt_builder->createBuilderConfig()); if (max_workspace_size_ > 0) { trt_config->setMemoryPoolLimit(nvinfer1::MemoryPoolType::kWORKSPACE, max_workspace_size_); diff --git a/onnxruntime/core/providers/webgpu/compute_context.h b/onnxruntime/core/providers/webgpu/compute_context.h index fe95917e4e906..c4a88754deffe 100644 --- a/onnxruntime/core/providers/webgpu/compute_context.h +++ b/onnxruntime/core/providers/webgpu/compute_context.h @@ -8,6 +8,7 @@ #include #include "core/framework/execution_provider.h" +#include "core/providers/webgpu/webgpu_execution_provider.h" #include "core/providers/webgpu/program.h" #include "core/providers/webgpu/webgpu_context.h" @@ -16,7 +17,6 @@ namespace onnxruntime { class Tensor; -class WebGpuExecutionProvider; namespace webgpu { @@ -42,6 +42,9 @@ class ComputeContext { inline bool HasFeature(wgpu::FeatureName feature) const { return webgpu_context_.DeviceHasFeature(feature); } + inline bool IsGraphCaptureEnabled() const { + return ep_.IsGraphCaptureEnabled(); + } #if !defined(__wasm__) inline const wgpu::AdapterPropertiesSubgroupMatrixConfigs& SubgroupMatrixConfigs() const { return webgpu_context_.SubgroupMatrixConfigs(); @@ -120,7 +123,7 @@ class ComputeContext { // // Run a compute shader program. // - inline Status RunProgram(const ProgramBase& program) { + inline Status RunProgram(ProgramBase& program) { return webgpu_context_.Run(*this, program); } diff --git a/onnxruntime/core/providers/webgpu/nn/conv2d_mm_webgpu.cc b/onnxruntime/core/providers/webgpu/nn/conv2d_mm_webgpu.cc index ee7a36d17cf55..bf5208883508f 100644 --- a/onnxruntime/core/providers/webgpu/nn/conv2d_mm_webgpu.cc +++ b/onnxruntime/core/providers/webgpu/nn/conv2d_mm_webgpu.cc @@ -103,7 +103,7 @@ std::string Conv2dMMProgram::Conv2dCommonSnippet(const ShaderVariableHelper& x, } } else { sample_w << "let col = colIn * " << inner_element_size_w << ";\n" - << "if (row < i32(uniforms.dim_inner) && col < i32(uniforms.dim_b_outer)) {\n" + << "if (row < i32(uniforms.dim_inner) && col < i32(uniforms.dim_a_outer)) {\n" << " " << get_w_snippet(inner_element_size_w) << "\n" << "}\n" << "return " << TypeSnippet(inner_element_size_w, data_type) << "(0.0);\n"; diff --git a/onnxruntime/core/providers/webgpu/program.h b/onnxruntime/core/providers/webgpu/program.h index 80f6d831d0909..c8f50837cd8e5 100644 --- a/onnxruntime/core/providers/webgpu/program.h +++ b/onnxruntime/core/providers/webgpu/program.h @@ -226,6 +226,7 @@ struct ProgramInput { ProgramInput(const Tensor* tensor, ProgramTensorMetadataDependency dependency, const TensorShape& override_shape, int component); const Tensor* tensor; + uint32_t segments = 1; ProgramTensorMetadataDependency dependency; ProgramVariableDataType var_type; bool use_override_shape; @@ -245,6 +246,7 @@ struct ProgramOutput { ProgramOutput(Tensor* tensor, ProgramTensorMetadataDependency dependency, const TensorShape& override_shape, int component); Tensor* tensor; + uint32_t segments = 1; ProgramTensorMetadataDependency dependency; ProgramVariableDataType var_type; bool is_atomic; @@ -346,6 +348,18 @@ class ProgramBase { inline const ProgramMetadata& Metadata() const { return metadata_; } inline const std::string& CacheHint() const { return cache_hint_; } inline const std::vector& Inputs() const { return inputs_; } + inline void setSegmentsForInput(size_t index, uint32_t segments) { + if (index >= inputs_.size()) { + throw std::out_of_range("input index out of range"); + } + inputs_[index].segments = 
segments; + } + inline void setSegmentsForOutput(size_t index, uint32_t segments) { + if (index >= outputs_.size()) { + throw std::out_of_range("output index out of range"); + } + outputs_[index].segments = segments; + } inline const std::vector& Outputs() const { return outputs_; } inline const std::vector& Indices() const { return indices_; } inline uint32_t DispatchGroupSizeX() const { return dispatch_group_size_x_; } diff --git a/onnxruntime/core/providers/webgpu/program_cache_key.cc b/onnxruntime/core/providers/webgpu/program_cache_key.cc index a351cacc783cf..51c004fadaa8d 100644 --- a/onnxruntime/core/providers/webgpu/program_cache_key.cc +++ b/onnxruntime/core/providers/webgpu/program_cache_key.cc @@ -18,7 +18,7 @@ namespace webgpu { namespace { // append the info of an input or output to the cachekey void AppendTensorInfo(std::ostream& ss, const TensorShape& tensor_shape, ProgramVariableDataType var_type, ProgramTensorMetadataDependency dependency, - bool& first) { + bool& first, uint32_t segments = 1) { if (first) { first = false; } else { @@ -34,6 +34,8 @@ void AppendTensorInfo(std::ostream& ss, const TensorShape& tensor_shape, Program ss << ';'; } + ss D("Segs=") << segments << ';'; + if ((dependency & ProgramTensorMetadataDependency::Shape) == ProgramTensorMetadataDependency::Shape) { ss D("Dims=") << tensor_shape.ToString(); } else if ((dependency & ProgramTensorMetadataDependency::Rank) == ProgramTensorMetadataDependency::Rank) { @@ -97,13 +99,18 @@ std::string CalculateProgramCacheKey(const ProgramBase& program, bool is_1d_disp ss << ":" D("Inputs="); first = true; for (const auto& input : program.Inputs()) { - AppendTensorInfo(ss, input.use_override_shape ? input.override_shape : input.tensor->Shape(), input.var_type, input.dependency, first); + AppendTensorInfo(ss, input.use_override_shape ? input.override_shape : input.tensor->Shape(), input.var_type, input.dependency, first, input.segments); } ss << ":" D("Outputs="); first = true; for (const auto& output : program.Outputs()) { - AppendTensorInfo(ss, output.use_override_shape ? output.override_shape : output.tensor->Shape(), output.var_type, output.dependency, first); + AppendTensorInfo(ss, + output.use_override_shape ? 
output.override_shape : output.tensor->Shape(), + output.var_type, + output.dependency, + first, + output.segments); } if (!program.Indices().empty()) { diff --git a/onnxruntime/core/providers/webgpu/program_manager.cc b/onnxruntime/core/providers/webgpu/program_manager.cc index dcf89d8bb06a1..33c3514f8f6d3 100644 --- a/onnxruntime/core/providers/webgpu/program_manager.cc +++ b/onnxruntime/core/providers/webgpu/program_manager.cc @@ -38,6 +38,28 @@ Status ProgramManager::NormalizeDispatchGroupSize(uint32_t& x, uint32_t& y, uint return Status::OK(); } +Status ProgramManager::CalculateSegmentsForInputsAndOutputs(ProgramBase& program) { + const uint64_t maxStorageBufferBindingSize = limits_.maxStorageBufferBindingSize; + + // Inputs + for (size_t i = 0; i < program.Inputs().size(); ++i) { + const auto& input = program.Inputs()[i]; + if (input.tensor && input.tensor->SizeInBytes() > maxStorageBufferBindingSize) { + uint32_t segments = static_cast((input.tensor->SizeInBytes() + maxStorageBufferBindingSize - 1) / maxStorageBufferBindingSize); + program.setSegmentsForInput(i, segments); + } + } + // Outputs + for (size_t i = 0; i < program.Outputs().size(); ++i) { + const auto& output = program.Outputs()[i]; + if (output.tensor && output.tensor->SizeInBytes() > maxStorageBufferBindingSize) { + uint32_t segments = static_cast((output.tensor->SizeInBytes() + maxStorageBufferBindingSize - 1) / maxStorageBufferBindingSize); + program.setSegmentsForOutput(i, segments); + } + } + return Status::OK(); +} + Status ProgramManager::Build(const ProgramBase& program, const ProgramMetadata& program_metadata, #ifndef NDEBUG // if debug build diff --git a/onnxruntime/core/providers/webgpu/program_manager.h b/onnxruntime/core/providers/webgpu/program_manager.h index feeb703b95aa2..a473051593852 100644 --- a/onnxruntime/core/providers/webgpu/program_manager.h +++ b/onnxruntime/core/providers/webgpu/program_manager.h @@ -37,6 +37,7 @@ class ProgramManager { ProgramManager(const wgpu::Device& device, const wgpu::Limits& limits) : device_(device), limits_(limits) {} Status NormalizeDispatchGroupSize(uint32_t& x, uint32_t& y, uint32_t& z) const; + Status CalculateSegmentsForInputsAndOutputs(ProgramBase& program); Status Build(const ProgramBase& program, const ProgramMetadata& metadata, diff --git a/onnxruntime/core/providers/webgpu/shader_helper.cc b/onnxruntime/core/providers/webgpu/shader_helper.cc index bdeea726a2cf5..0e4a3e08e1c13 100644 --- a/onnxruntime/core/providers/webgpu/shader_helper.cc +++ b/onnxruntime/core/providers/webgpu/shader_helper.cc @@ -91,7 +91,7 @@ const ShaderVariableHelper& ShaderHelper::AddInput(const std::string& name, Shad const auto& dims = program_.Inputs()[input_index].use_override_shape ? program_.Inputs()[input_index].override_shape : program_.Inputs()[input_index].tensor->Shape(); - return AddVariableImpl(true, name, usage, dims); + return AddVariableImpl(true, name, usage, dims, program_.Inputs()[input_index].segments); } const ShaderVariableHelper& ShaderHelper::AddOutput(const std::string& name, ShaderUsage usage) { @@ -101,7 +101,7 @@ const ShaderVariableHelper& ShaderHelper::AddOutput(const std::string& name, Sha const auto& dims = program_.Outputs()[output_index].use_override_shape ? 
program_.Outputs()[output_index].override_shape : program_.Outputs()[output_index].tensor->Shape(); - return AddVariableImpl(false, name, usage, dims); + return AddVariableImpl(false, name, usage, dims, program_.Outputs()[output_index].segments); } const ShaderIndicesHelper& ShaderHelper::AddIndices(const std::string& name, ShaderUsage usage) { @@ -263,12 +263,16 @@ Status ShaderHelper::ValidateVariable(const ProgramOutput& output, const ShaderV #endif // NDEBUG -const ShaderVariableHelper& ShaderHelper::AddVariableImpl(bool is_input, - const std::string& name, - ShaderUsage usage, - const TensorShape& dims) { - ORT_ENFORCE(input_vars_.size() + output_vars_.size() < limits_.maxStorageBuffersPerShaderStage, - "Too many storage buffers in shader. Max is ", limits_.maxStorageBuffersPerShaderStage); +ShaderVariableHelper& ShaderHelper::AddVariableImpl(bool is_input, + const std::string& name, + ShaderUsage usage, + const TensorShape& dims, + uint32_t segments) { + // Add the segments for the new variable we're about to create + numbers_storage_buffers_ += segments; + ORT_ENFORCE(numbers_storage_buffers_ <= limits_.maxStorageBuffersPerShaderStage, + "Too many storage buffers in shader. Current: ", numbers_storage_buffers_, + ", Max is ", limits_.maxStorageBuffersPerShaderStage); ProgramVariableDataType type = ProgramVariableDataType::InvalidType; auto& vars = is_input ? input_vars_ : output_vars_; @@ -276,12 +280,18 @@ const ShaderVariableHelper& ShaderHelper::AddVariableImpl(bool is_input, if (is_input) { const auto& input = program_.Inputs()[vars.size()]; type = input.var_type; + if (segments > 1) { + usage |= ShaderUsage::UseGetByOffsetSegments; + } } else { const auto& output = program_.Outputs()[vars.size()]; type = output.var_type; + if (segments > 1) { + usage |= ShaderUsage::UseSetByOffsetSegments; + } } - const auto& var = vars.emplace_back(std::make_unique(name, type, usage, dims)); + const auto& var = vars.emplace_back(std::make_unique(name, type, usage, dims, segments, limits_.maxStorageBufferBindingSize)); return *var; } @@ -418,28 +428,49 @@ Status ShaderHelper::GenerateSourceCode(std::string& code, std::vector& sha // // Input/output variables // + size_t binding_index = 0; // running binding index accounting for segmented buffers + // inputs for (size_t i = 0; i < input_vars_.size(); ++i) { const auto& input = input_vars_[i]; - ss << "@group(0) @binding(" << i << ") var " << input->name_ << ": array<" << input->StorageType() << ">;\n"; + uint32_t segments = input->segments_; + for (uint32_t seg = 0; seg < segments; ++seg) { + ss << "@group(0) @binding(" << binding_index++ << ") var "; + if (seg == 0) { + ss << input->name_; + } else { + ss << input->name_ << seg; // naming convention matches ShaderVariableHelper::Impl usage (name + index) + } + ss << ": array<" << input->StorageType() << ">;\n"; + } } + // outputs for (size_t i = 0; i < output_vars_.size(); ++i) { const auto& output = output_vars_[i]; bool is_atomic = program_.Outputs()[i].is_atomic; - ss << "@group(0) @binding(" << input_vars_.size() + i << ") var " << output->name_ << ": array<"; - if (is_atomic) { - if (output->type_ == ProgramVariableDataType::Float32) { - ss << "atomic"; - } else if (output->type_ == ProgramVariableDataType::Uint32) { - ss << "atomic"; - } else if (output->type_ == ProgramVariableDataType::Int32) { - ss << "atomic"; + uint32_t segments = output->segments_; + for (uint32_t seg = 0; seg < segments; ++seg) { + ss << "@group(0) @binding(" << binding_index++ << ") var "; + if (seg == 0) { + ss 
<< output->name_; } else { - ORT_RETURN_IF(true, "Unsupported atomic type: ", int(output->type_)); + ss << output->name_ << seg; } - } else { - ss << output->StorageType(); + ss << ": array<"; + if (is_atomic) { + if (output->type_ == ProgramVariableDataType::Float32) { + ss << "atomic"; // emulate float atomic via i32 + } else if (output->type_ == ProgramVariableDataType::Uint32) { + ss << "atomic"; + } else if (output->type_ == ProgramVariableDataType::Int32) { + ss << "atomic"; + } else { + ORT_RETURN_IF(true, "Unsupported atomic type: ", int(output->type_)); + } + } else { + ss << output->StorageType(); + } + ss << ">;\n"; } - ss << ">;\n"; } // @@ -559,7 +590,7 @@ Status ShaderHelper::GenerateSourceCode(std::string& code, std::vector& sha ss << "\n};\n" "@group(0) @binding(" - << input_vars_.size() + output_vars_.size() << ") var uniforms: Uniforms;\n"; + << binding_index << ") var uniforms: Uniforms;\n"; } // diff --git a/onnxruntime/core/providers/webgpu/shader_helper.h b/onnxruntime/core/providers/webgpu/shader_helper.h index ea19a6ae9a875..6878f5236fddf 100644 --- a/onnxruntime/core/providers/webgpu/shader_helper.h +++ b/onnxruntime/core/providers/webgpu/shader_helper.h @@ -128,10 +128,11 @@ class ShaderHelper final { } } - const ShaderVariableHelper& AddVariableImpl(bool is_input, - const std::string& name, - ShaderUsage usage, - const TensorShape& dims); + ShaderVariableHelper& AddVariableImpl(bool is_input, + const std::string& name, + ShaderUsage usage, + const TensorShape& dims, + uint32_t segments); #ifndef NDEBUG // if debug build Status ValidateVariable(const ProgramInput& input, const ShaderVariableHelper& var) const; @@ -165,6 +166,8 @@ class ShaderHelper final { const ProgramBase& program_; const ProgramMetadata& program_metadata_; + uint32_t numbers_storage_buffers_ = 0; + std::vector> input_vars_; std::vector> output_vars_; std::vector> indices_vars_; diff --git a/onnxruntime/core/providers/webgpu/shader_variable.cc b/onnxruntime/core/providers/webgpu/shader_variable.cc index c197e227e2a8c..aa1f6c9a0ec0b 100644 --- a/onnxruntime/core/providers/webgpu/shader_variable.cc +++ b/onnxruntime/core/providers/webgpu/shader_variable.cc @@ -4,6 +4,7 @@ #include #include #include +#include #include "core/providers/webgpu/shader_variable.h" @@ -94,6 +95,33 @@ constexpr static const std::string_view ELEMENT_TYPE_ARRAY[] = { }; constexpr static const auto ELEMENT_TYPE = details::_to_std_array(ELEMENT_TYPE_ARRAY); +constexpr static const uint32_t BYTES_ARRAY[] = { + 4, // Float32 + 8, // Float32x2 + 16, // Float32x4 + 2, // Float16 + 4, // Float16x2 + 8, // Float16x4 + 4, // Int32 + 8, // Int32x2 + 16, // Int32x4 + 4, // Uint32 + 8, // Uint32x2 + 16, // Uint32x4 + 8, // Int64 (vec2) + 8, // Uint64 (vec2) + 4, // Boolx4 (packed in u32) + 4, // Uint8x4 (packed in u32) + 8, // Uint8x8 (vec2) + 16, // Uint8x16 (vec4) + 4, // Int8x4 (packed in u32) + 8, // Int8x8 (vec2) + 16, // Int8x16 (vec4) + 4, // Uint4x8 (packed in u32) + 4, // Int4x8 (packed in u32) +}; +constexpr static const auto BYTES = details::_to_std_array(BYTES_ARRAY); + inline std::string GetIndicesType(int rank) { return rank < 2 ? "u32" : (rank <= 4 ? 
MakeStringWithClassicLocale("vec", rank, "") @@ -114,8 +142,10 @@ ShaderIndicesHelper::ShaderIndicesHelper(std::string_view name, ProgramVariableD element_type_alias_{name_ + "_element_t"}, indices_type_alias_{name_ + "_indices_t"} {} -ShaderVariableHelper::ShaderVariableHelper(std::string_view name, ProgramVariableDataType type, ShaderUsage usage, const TensorShape& dims) - : ShaderIndicesHelper{name, type, usage, dims} { +ShaderVariableHelper::ShaderVariableHelper(std::string_view name, ProgramVariableDataType type, ShaderUsage usage, const TensorShape& dims, uint32_t segments, uint64_t maxStorageBufferBindingSize) + : ShaderIndicesHelper{name, type, usage, dims}, + segments_{segments}, + max_storage_buffer_binding_size_{maxStorageBufferBindingSize} { ORT_ENFORCE(type_ != ProgramVariableDataType::InvalidType, "Invalid type for variable ", name_); ORT_ENFORCE(num_components_ > 0, "Invalid number of components for variable ", name_); } @@ -273,11 +303,47 @@ void ShaderVariableHelper::Impl(std::ostream& ss) const { SS_APPEND(ss, "}\n"); } } + // Implementation of "fn get_{name}_by_offset" for multi-buffer segmented inputs + if (usage_ & ShaderUsage::UseGetByOffsetSegments) { + // Multi-buffer segmented input accessor. + // Compute which physical storage buffer chunk the global linear element offset belongs to. + SS_APPEND(ss, "fn get_", name_, "_by_offset(global_offset: u32) -> ", ValueType(), " {\n"); + SS_APPEND(ss, " const CHUNK_SIZE_IN_ELEMENTS: u32 = ", max_storage_buffer_binding_size_, "u / ", BYTES[static_cast(type_)], "u;\n"); + SS_APPEND(ss, " let buffer_index: u32 = global_offset / CHUNK_SIZE_IN_ELEMENTS;\n"); + SS_APPEND(ss, " let local_offset: u32 = global_offset % CHUNK_SIZE_IN_ELEMENTS;\n"); + SS_APPEND(ss, " switch(buffer_index) {\n"); + // case 0 (base buffer name_) + SS_APPEND(ss, " case 0u: { return ", name_, "[local_offset]; }\n"); + for (uint32_t i = 1; i < segments_; ++i) { + SS_APPEND(ss, " case ", i, "u: { return ", name_, i, "[local_offset]; }\n"); + } + SS_APPEND(ss, " default: { return ", name_, "[local_offset]; }\n"); + SS_APPEND(ss, " }\n"); + SS_APPEND(ss, "}\n"); + } + // Implementation of "fn set_{name}_by_offset" for multi-buffer segmented variables + if (usage_ & ShaderUsage::UseSetByOffsetSegments) { + SS_APPEND(ss, "fn set_", name_, "_by_offset(global_offset: u32, value: ", ValueType(), ") {\n"); + SS_APPEND(ss, " const CHUNK_SIZE_IN_ELEMENTS: u32 = ", max_storage_buffer_binding_size_, "u / ", BYTES[static_cast(type_)], "u;\n"); + SS_APPEND(ss, " let buffer_index: u32 = global_offset / CHUNK_SIZE_IN_ELEMENTS;\n"); + SS_APPEND(ss, " let local_offset: u32 = global_offset % CHUNK_SIZE_IN_ELEMENTS;\n"); + SS_APPEND(ss, " switch(buffer_index) {\n"); + SS_APPEND(ss, " case 0u: { ", name_, "[local_offset] = value; return; }\n"); + for (uint32_t i = 1; i < segments_; ++i) { + SS_APPEND(ss, " case ", i, "u: { ", name_, i, "[local_offset] = value; return; }\n"); + } + SS_APPEND(ss, " default: { ", name_, "[local_offset] = value; return; }\n"); + SS_APPEND(ss, " }\n"); + SS_APPEND(ss, "}\n"); + } } std::string ShaderVariableHelper::GetByOffsetImpl(std::string_view offset) const { SS(ss, kStringInitialSizeGetByOffsetImpl); + if (usage_ & ShaderUsage::UseGetByOffsetSegments) { + return MakeStringWithClassicLocale("get_", name_, "_by_offset(", offset, ")"); + } switch (type_) { case onnxruntime::webgpu::ProgramVariableDataType::InvalidType: ORT_THROW("Invalid type"); @@ -303,12 +369,16 @@ std::string ShaderVariableHelper::GetByOffsetImpl(std::string_view offset) const 
std::string ShaderVariableHelper::SetByOffsetImpl(std::string_view offset, std::string_view value) const { SS(ss, kStringInitialSizeSetByOffsetImpl); + if (usage_ & ShaderUsage::UseSetByOffsetSegments) { + return MakeStringWithClassicLocale("set_", name_, "_by_offset(", offset, ",", value, ");"); + } + switch (type_) { case onnxruntime::webgpu::ProgramVariableDataType::InvalidType: ORT_THROW("Invalid type"); break; case onnxruntime::webgpu::ProgramVariableDataType::Int64: - ss << name_ << "[" << offset << "]=vec2(u32(" << value << "), select(0u, 0xFFFFFFFFu, " << value << " < 0));"; + ss << name_ << "[" << offset << "]=vec2(u32(" << value << "), select(0u, 0xFFFFFFFFu, i32(" << value << ") < 0));"; break; case onnxruntime::webgpu::ProgramVariableDataType::Uint64: ss << name_ << "[" << offset << "]=vec2(u32(" << value << "), 0u);"; diff --git a/onnxruntime/core/providers/webgpu/shader_variable.h b/onnxruntime/core/providers/webgpu/shader_variable.h index 78c98ab26f5b8..8e921d6deafbb 100644 --- a/onnxruntime/core/providers/webgpu/shader_variable.h +++ b/onnxruntime/core/providers/webgpu/shader_variable.h @@ -69,6 +69,8 @@ struct ShaderUsage { UseSetByIndices = 512, // use implementation of fn set_{name}_by_indices UseGet = 1024, // use implementation of fn get_{name} UseGetByIndices = 2048, // use implementation of fn get_{name}_by_indices + UseGetByOffsetSegments = 4096, // use implementation of fn get_{name}_by_offset + UseSetByOffsetSegments = 8192, // use implementation of fn set_{name}_by_offset UseUniform = 32768, // use uniform for shape and stride } usage; @@ -157,7 +159,7 @@ class ShaderIndicesHelper { // A helper class to make it easier to generate shader code related to a variable setting/getting and its indices calculation. class ShaderVariableHelper : public ShaderIndicesHelper { public: - ShaderVariableHelper(std::string_view name, ProgramVariableDataType type, ShaderUsage usage, const TensorShape& dims); + ShaderVariableHelper(std::string_view name, ProgramVariableDataType type, ShaderUsage usage, const TensorShape& dims, uint32_t segments, uint64_t maxStorageBufferBindingSize); ShaderVariableHelper(ShaderVariableHelper&&) = default; ShaderVariableHelper& operator=(ShaderVariableHelper&&) = default; @@ -203,6 +205,9 @@ class ShaderVariableHelper : public ShaderIndicesHelper { std::string_view ValueType() const; std::string_view ElementType() const; + uint32_t segments_ = 1; + uint64_t max_storage_buffer_binding_size_ = 0; + friend class ShaderHelper; }; #if defined(__GNUC__) diff --git a/onnxruntime/core/providers/webgpu/tensor/cast.cc b/onnxruntime/core/providers/webgpu/tensor/cast.cc index 313a96ba25509..daf4aa323c12e 100644 --- a/onnxruntime/core/providers/webgpu/tensor/cast.cc +++ b/onnxruntime/core/providers/webgpu/tensor/cast.cc @@ -11,75 +11,29 @@ namespace onnxruntime { namespace webgpu { namespace { -const std::vector& CastOpTypeConstraints() { - // currently support boolean, integer and float types that explicitly allowed in WGSL: +const std::vector& CastOpTypeConstraints(bool enable_graph_capture) { + // Base types that are always supported - boolean, integer and float types that explicitly allowed in WGSL: // https://gpuweb.github.io/gpuweb/wgsl/#plain-types-section - // - static std::vector types{ + static std::vector base_types{ DataTypeImpl::GetTensorType(), DataTypeImpl::GetTensorType(), DataTypeImpl::GetTensorType(), DataTypeImpl::GetTensorType(), DataTypeImpl::GetTensorType()}; - return types; + + if (enable_graph_capture) { + static std::vector 
types_with_int64 = []() { + auto types = base_types; + types.push_back(DataTypeImpl::GetTensorType()); + return types; + }(); + return types_with_int64; + } else { + return base_types; + } } } // namespace -ONNX_OPERATOR_VERSIONED_KERNEL_EX( - Cast, - kOnnxDomain, - 6, 8, - kWebGpuExecutionProvider, - (*KernelDefBuilder::Create()) - .TypeConstraint("T1", CastOpTypeConstraints()) - .TypeConstraint("T2", CastOpTypeConstraints()), - Cast); -ONNX_OPERATOR_VERSIONED_KERNEL_EX( - Cast, - kOnnxDomain, - 9, 12, - kWebGpuExecutionProvider, - (*KernelDefBuilder::Create()) - .TypeConstraint("T1", CastOpTypeConstraints()) - .TypeConstraint("T2", CastOpTypeConstraints()), - Cast); -ONNX_OPERATOR_VERSIONED_KERNEL_EX( - Cast, - kOnnxDomain, - 13, 18, - kWebGpuExecutionProvider, - (*KernelDefBuilder::Create()) - .TypeConstraint("T1", CastOpTypeConstraints()) - .TypeConstraint("T2", CastOpTypeConstraints()), - Cast); -ONNX_OPERATOR_VERSIONED_KERNEL_EX( - Cast, - kOnnxDomain, - 19, 20, - kWebGpuExecutionProvider, - (*KernelDefBuilder::Create()) - .TypeConstraint("T1", CastOpTypeConstraints()) - .TypeConstraint("T2", CastOpTypeConstraints()), - Cast); -ONNX_OPERATOR_VERSIONED_KERNEL_EX( - Cast, - kOnnxDomain, - 21, 22, - kWebGpuExecutionProvider, - (*KernelDefBuilder::Create()) - .TypeConstraint("T1", CastOpTypeConstraints()) - .TypeConstraint("T2", CastOpTypeConstraints()), - Cast); -ONNX_OPERATOR_KERNEL_EX( - Cast, - kOnnxDomain, - 23, - kWebGpuExecutionProvider, - (*KernelDefBuilder::Create()) - .TypeConstraint("T1", CastOpTypeConstraints()) - .TypeConstraint("T2", CastOpTypeConstraints()), - Cast); - Status Cast::ComputeInternal(ComputeContext& context) const { const auto* input_tensor = context.Input(0); auto* output_tensor = context.Output(0, input_tensor->Shape()); @@ -87,12 +41,17 @@ Status Cast::ComputeInternal(ComputeContext& context) const { if (size == 0) { return Status::OK(); } + bool is_from_int64 = input_tensor->DataType() == DataTypeImpl::GetType(); + const int in_components = is_from_int64 ? 1 : 4; + const int out_components = to_ == ONNX_NAMESPACE::TensorProto_DataType_INT64 ? 1 : 4; uint32_t vec_size = onnxruntime::narrow((size + 3) / 4); + uint32_t in_vec_size = onnxruntime::narrow(in_components == 1 ? size : vec_size); + uint32_t out_vec_size = onnxruntime::narrow(out_components == 1 ? 
size : vec_size); - CastProgram program{to_}; + CastProgram program{to_, is_from_int64}; program - .AddInput({input_tensor, ProgramTensorMetadataDependency::Type, {vec_size}, 4}) - .AddOutput({output_tensor, ProgramTensorMetadataDependency::None, {vec_size}, 4}) + .AddInput({input_tensor, ProgramTensorMetadataDependency::Type, {in_vec_size}, in_components}) + .AddOutput({output_tensor, ProgramTensorMetadataDependency::None, {out_vec_size}, out_components}) .SetDispatchGroupSize((vec_size + WORKGROUP_SIZE - 1) / WORKGROUP_SIZE) .AddUniformVariables({ {static_cast(vec_size)}, @@ -121,15 +80,78 @@ Status CastProgram::GenerateShaderCode(ShaderHelper& sh) const { case ONNX_NAMESPACE::TensorProto_DataType_BOOL: expression = "vec4(a)"; break; + case ONNX_NAMESPACE::TensorProto_DataType_INT64: + expression = "int32(a)"; + break; default: ORT_NOT_IMPLEMENTED("Cast to type ", to_, " is not supported."); } - sh.MainFunctionBody() << sh.GuardAgainstOutOfBoundsWorkgroupSizes("uniforms.vec_size") - << " let a = " << input.GetByOffset("global_idx") << ";\n " - << output.SetByOffset("global_idx", expression); + + sh.MainFunctionBody() << sh.GuardAgainstOutOfBoundsWorkgroupSizes("uniforms.vec_size"); + if (is_from_int64_) { + sh.MainFunctionBody() << " let a0 = " << input.GetByOffset("global_idx * 4") << ";\n" + << " let a1 = " << input.GetByOffset("global_idx * 4 + 1") << ";\n" + << " let a2 = " << input.GetByOffset("global_idx * 4 + 2") << ";\n" + << " let a3 = " << input.GetByOffset("global_idx * 4 + 3") << ";\n" + << " let a = vec4(a0, a1, a2, a3);\n"; + } else { + sh.MainFunctionBody() << " let a = " << input.GetByOffset("global_idx") << ";\n"; + } + if (to_ == ONNX_NAMESPACE::TensorProto_DataType_INT64) { + sh.MainFunctionBody() << output.SetByOffset("global_idx * 4", "a.x") << "\n" + << output.SetByOffset("global_idx * 4 + 1", "a.y") << "\n" + << output.SetByOffset("global_idx * 4 + 2", "a.z") << "\n" + << output.SetByOffset("global_idx * 4 + 3", "a.w") << "\n"; + } else { + sh.MainFunctionBody() << output.SetByOffset("global_idx", expression); + } return Status::OK(); } +template +KernelCreateInfo CreateCastKernelInfo(bool enable_graph_capture) { + const auto& type_constraints = CastOpTypeConstraints(enable_graph_capture); + + KernelCreateFn kernel_create_fn = [](FuncManager&, const OpKernelInfo& info, std::unique_ptr& out) -> Status { + out = std::make_unique(info); + return Status::OK(); + }; + + if constexpr (StartVersion == EndVersion) { + // Non-versioned kernel + return { + KernelDefBuilder() + .SetName("Cast") + .SetDomain(kOnnxDomain) + .SinceVersion(StartVersion) + .Provider(kWebGpuExecutionProvider) + .TypeConstraint("T1", type_constraints) + .TypeConstraint("T2", type_constraints) + .Build(), + kernel_create_fn}; + } else { + // Versioned kernel + return { + KernelDefBuilder() + .SetName("Cast") + .SetDomain(kOnnxDomain) + .SinceVersion(StartVersion, EndVersion) + .Provider(kWebGpuExecutionProvider) + .TypeConstraint("T1", type_constraints) + .TypeConstraint("T2", type_constraints) + .Build(), + kernel_create_fn}; + } +} + +// Explicit template instantiations +template KernelCreateInfo CreateCastKernelInfo<6, 8>(bool); +template KernelCreateInfo CreateCastKernelInfo<9, 12>(bool); +template KernelCreateInfo CreateCastKernelInfo<13, 18>(bool); +template KernelCreateInfo CreateCastKernelInfo<19, 20>(bool); +template KernelCreateInfo CreateCastKernelInfo<21, 22>(bool); +template KernelCreateInfo CreateCastKernelInfo<23>(bool); + } // namespace webgpu } // namespace onnxruntime diff --git 
a/onnxruntime/core/providers/webgpu/tensor/cast.h b/onnxruntime/core/providers/webgpu/tensor/cast.h index 925cd200f0aba..7dfb50e3241c8 100644 --- a/onnxruntime/core/providers/webgpu/tensor/cast.h +++ b/onnxruntime/core/providers/webgpu/tensor/cast.h @@ -3,6 +3,8 @@ #pragma once +#include "core/framework/kernel_registry.h" +#include "core/framework/op_kernel.h" #include "core/providers/webgpu/webgpu_kernel.h" namespace onnxruntime { @@ -10,7 +12,7 @@ namespace webgpu { class CastProgram final : public Program { public: - CastProgram(int32_t to) : Program{"Cast"}, to_{to} {} + CastProgram(int32_t to, bool is_from_int64) : Program{"Cast"}, to_{to}, is_from_int64_{is_from_int64} {} Status GenerateShaderCode(ShaderHelper& sh) const override; @@ -18,6 +20,7 @@ class CastProgram final : public Program { private: int32_t to_; + bool is_from_int64_; }; class Cast final : public WebGpuKernel { @@ -37,5 +40,9 @@ class Cast final : public WebGpuKernel { int32_t to_; }; +// Create Cast kernel info with appropriate type constraints based on graph capture support +template +KernelCreateInfo CreateCastKernelInfo(bool enable_graph_capture); + } // namespace webgpu } // namespace onnxruntime diff --git a/onnxruntime/core/providers/webgpu/tensor/gather_nd.cc b/onnxruntime/core/providers/webgpu/tensor/gather_nd.cc index 7c3aced3f0295..cab1dc03848b9 100644 --- a/onnxruntime/core/providers/webgpu/tensor/gather_nd.cc +++ b/onnxruntime/core/providers/webgpu/tensor/gather_nd.cc @@ -43,7 +43,7 @@ Status GatherNDProgram::GenerateShaderCode(ShaderHelper& shader) const { data_dim += indices_innerest_dim_; for (uint32_t i = 0; i < static_cast(data.Rank() - data_dim); i++) { - shader.MainFunctionBody() << " " << data.IndicesSet("data_indices", data_dim, output.IndicesGet("output_indices", indices.Rank() - 1 + i)) << "\n"; + shader.MainFunctionBody() << " " << data.IndicesSet("data_indices", data_dim + i, output.IndicesGet("output_indices", indices.Rank() - 1 + i)) << "\n"; } shader.MainFunctionBody() << " " << output.SetByOffset("global_idx", data.GetByIndices("data_indices")); diff --git a/onnxruntime/core/providers/webgpu/webgpu_context.cc b/onnxruntime/core/providers/webgpu/webgpu_context.cc index 985fcd03f33ac..f48b78c9adb91 100644 --- a/onnxruntime/core/providers/webgpu/webgpu_context.cc +++ b/onnxruntime/core/providers/webgpu/webgpu_context.cc @@ -178,7 +178,7 @@ Status WebGpuContext::Wait(wgpu::Future f) { return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "Failed to wait for the operation:", uint32_t(status)); } -Status WebGpuContext::Run(ComputeContext& context, const ProgramBase& program) { +Status WebGpuContext::Run(ComputeContext& context, ProgramBase& program) { const auto& inputs = program.Inputs(); const auto& outputs = program.Outputs(); @@ -263,6 +263,7 @@ Status WebGpuContext::Run(ComputeContext& context, const ProgramBase& program) { ORT_ENFORCE(x == 0 && y == 0 && z == 0, "Only one of SetIndirectDispatchTensor and SetDispatchGroupSize should be called for program", program.Name()); } + ORT_RETURN_IF_ERROR(program_mgr_->CalculateSegmentsForInputsAndOutputs(program)); bool is_1d_dispatch = (y == 1 && z == 1); @@ -437,19 +438,26 @@ Status WebGpuContext::Run(ComputeContext& context, const ProgramBase& program) { WriteTimestamp(num_pending_dispatches_ * 2); + const size_t total_buffer_count = inputs.size() + outputs.size() + (uniform_buffer ? 1 : 0); + std::vector bind_buffers; - bind_buffers.reserve(inputs.size() + outputs.size() + (uniform_buffer ? 
1 : 0)); + std::vector bind_buffers_segments; + bind_buffers.reserve(total_buffer_count); + bind_buffers_segments.reserve(total_buffer_count); for (const auto& input : inputs) { bind_buffers.push_back(reinterpret_cast(const_cast(input.tensor->DataRaw()))); + bind_buffers_segments.push_back(input.segments); } for (const auto& output : outputs) { bind_buffers.push_back(reinterpret_cast(output.tensor->MutableDataRaw())); + bind_buffers_segments.push_back(output.segments); } if (uniform_buffer) { bind_buffers.push_back(uniform_buffer); + bind_buffers_segments.push_back(1); // uniform buffer defaults to 1 segment } - LaunchComputePipeline(compute_pass_encoder, bind_buffers, *program_artifact, x, y, z, program.IndirectDispatchTensor()); + LaunchComputePipeline(compute_pass_encoder, bind_buffers, bind_buffers_segments, *program_artifact, x, y, z, program.IndirectDispatchTensor()); if (uniform_buffer) { buffer_mgr.Release(uniform_buffer); } @@ -535,7 +543,15 @@ wgpu::Limits WebGpuContext::GetRequiredLimits(const wgpu::Adapter& adapter) cons required_limits.maxBindGroups = adapter_limits.maxBindGroups; required_limits.maxComputeWorkgroupStorageSize = adapter_limits.maxComputeWorkgroupStorageSize; required_limits.maxComputeWorkgroupsPerDimension = adapter_limits.maxComputeWorkgroupsPerDimension; - required_limits.maxStorageBufferBindingSize = adapter_limits.maxStorageBufferBindingSize; + required_limits.maxStorageBuffersPerShaderStage = adapter_limits.maxStorageBuffersPerShaderStage; + + if (small_storage_buffer_binding_size_for_testing_) { + // No matter how small it is set, the minimum storage buffer binding size in WebGPU is 128 MB. + required_limits.maxStorageBufferBindingSize = 134217728; + } else { + required_limits.maxStorageBufferBindingSize = adapter_limits.maxStorageBufferBindingSize; + } + required_limits.maxBufferSize = adapter_limits.maxBufferSize; required_limits.maxComputeInvocationsPerWorkgroup = adapter_limits.maxComputeInvocationsPerWorkgroup; required_limits.maxComputeWorkgroupSizeX = adapter_limits.maxComputeWorkgroupSizeX; @@ -728,15 +744,37 @@ void WebGpuContext::OnRunEnd() { void WebGpuContext::LaunchComputePipeline(const wgpu::ComputePassEncoder& compute_pass_encoder, const std::vector& bind_buffers, + const std::vector& bind_buffers_segments, const ProgramArtifact& program_artifact, uint32_t x, uint32_t y, uint32_t z, const Tensor* indirect_dispatch_tensor) { uint32_t entry_index = 0; std::vector bind_group_entries; - for (WGPUBuffer buffer : bind_buffers) { - bind_group_entries.push_back({nullptr, entry_index++, buffer, 0, WGPU_WHOLE_SIZE, nullptr, nullptr}); + + for (size_t buffer_idx = 0; buffer_idx < bind_buffers.size(); ++buffer_idx) { + WGPUBuffer buffer = bind_buffers[buffer_idx]; + uint64_t buffer_size = wgpuBufferGetSize(buffer); + const uint64_t kMaxBufferSize = device_limits_.maxStorageBufferBindingSize; + const uint32_t total_segments = bind_buffers_segments[buffer_idx]; + // `total_segments` we used is calculated by tensor size, not actual buffer size. Because for bucketed buffer, + // the actual buffer size may be larger than the tensor size, an extreme case is that tensor size = 127MB, buffer size = 256MB, + // maxStorageBufferBindingSize = 128MB, in this case we only need to bind 1 segment instead of 2 segments because + // there is no data for the second segment. 
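+    // Example (illustrative numbers): with maxStorageBufferBindingSize = 128 MB, a 300 MB tensor backed by a
+    // 300 MB buffer is bound as 3 segments of 128 MB, 128 MB and 44 MB, while a 127 MB tensor backed by a
+    // 256 MB bucketed buffer still gets a single binding because total_segments is derived from the tensor size.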
+ if (total_segments > 1) { + uint64_t offset = 0; + for (uint32_t segment = 0; segment < total_segments; ++segment) { + uint64_t segment_size = std::min(kMaxBufferSize, buffer_size - offset); + bind_group_entries.push_back({nullptr, entry_index++, buffer, offset, segment_size, nullptr, nullptr}); + offset += segment_size; + } + } else { + bind_group_entries.push_back({nullptr, entry_index++, buffer, 0, std::min(kMaxBufferSize, buffer_size), nullptr, nullptr}); + } } + ORT_ENFORCE(entry_index < device_limits_.maxBindingsPerBindGroup, "Number of bind group entries (", entry_index, + ") exceeds device limit (", device_limits_.maxBindingsPerBindGroup, ")."); + WGPUBindGroupLayout bind_group_layout = program_artifact.compute_pipeline.GetBindGroupLayout(0).MoveToCHandle(); WGPUBindGroupDescriptor bind_group_desc{}; bind_group_desc.layout = bind_group_layout; @@ -912,7 +950,7 @@ WebGpuContext& WebGpuContextFactory::CreateContext(const WebGpuContextConfig& co auto it = contexts_.find(context_id); if (it == contexts_.end()) { GSL_SUPPRESS(r.11) - auto context = std::unique_ptr(new WebGpuContext(instance, device, config.validation_mode, config.preserve_device)); + auto context = std::unique_ptr(new WebGpuContext(instance, device, config.validation_mode, config.preserve_device, config.small_storage_buffer_binding_size_for_testing)); it = contexts_.emplace(context_id, WebGpuContextFactory::WebGpuContextInfo{std::move(context), 0}).first; } else if (context_id != 0) { ORT_ENFORCE(it->second.context->instance_.Get() == instance && diff --git a/onnxruntime/core/providers/webgpu/webgpu_context.h b/onnxruntime/core/providers/webgpu/webgpu_context.h index 0c0d116cf9394..e21a0e577311f 100644 --- a/onnxruntime/core/providers/webgpu/webgpu_context.h +++ b/onnxruntime/core/providers/webgpu/webgpu_context.h @@ -40,6 +40,7 @@ struct WebGpuContextConfig { const void* dawn_proc_table; ValidationMode validation_mode; bool preserve_device; + bool small_storage_buffer_binding_size_for_testing; }; struct WebGpuBufferCacheConfig { @@ -166,7 +167,7 @@ class WebGpuContext final { // Status PopErrorScope(); - Status Run(ComputeContext& context, const ProgramBase& program); + Status Run(ComputeContext& context, ProgramBase& program); void OnRunEnd(); private: @@ -176,12 +177,13 @@ class WebGpuContext final { AtPasses }; - WebGpuContext(WGPUInstance instance, WGPUDevice device, webgpu::ValidationMode validation_mode, bool preserve_device) - : instance_{instance}, device_{device}, validation_mode_{validation_mode}, query_type_{TimestampQueryType::None}, preserve_device_{preserve_device} {} + WebGpuContext(WGPUInstance instance, WGPUDevice device, webgpu::ValidationMode validation_mode, bool preserve_device, bool small_storage_buffer_binding_size_for_testing = false) + : instance_{instance}, device_{device}, validation_mode_{validation_mode}, query_type_{TimestampQueryType::None}, preserve_device_{preserve_device}, small_storage_buffer_binding_size_for_testing_{small_storage_buffer_binding_size_for_testing} {} ORT_DISALLOW_COPY_ASSIGNMENT_AND_MOVE(WebGpuContext); void LaunchComputePipeline(const wgpu::ComputePassEncoder& compute_pass_encoder, const std::vector& bind_buffers, + const std::vector& bind_buffers_segments, const ProgramArtifact& program_artifact, uint32_t x, uint32_t y, uint32_t z, const Tensor* indirect_dispatch_tensor = nullptr); @@ -264,6 +266,7 @@ class WebGpuContext final { uint64_t gpu_timestamp_offset_ = 0; bool is_profiling_ = false; bool preserve_device_; + bool 
small_storage_buffer_binding_size_for_testing_; GraphCaptureState graph_capture_state_{GraphCaptureState::Default}; // External vector to store captured commands, owned by EP diff --git a/onnxruntime/core/providers/webgpu/webgpu_execution_provider.cc b/onnxruntime/core/providers/webgpu/webgpu_execution_provider.cc index bbb3fbdd221d3..0f7607ac1dbfe 100644 --- a/onnxruntime/core/providers/webgpu/webgpu_execution_provider.cc +++ b/onnxruntime/core/providers/webgpu/webgpu_execution_provider.cc @@ -28,6 +28,7 @@ #include "core/providers/webgpu/data_transfer.h" #include "core/providers/webgpu/external_data_loader.h" #include "core/providers/webgpu/webgpu_profiler.h" +#include "core/providers/webgpu/tensor/cast.h" namespace onnxruntime { @@ -417,7 +418,7 @@ class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxD class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 16, 17, ScatterND); class ONNX_OPERATOR_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 18, ScatterND); -std::unique_ptr RegisterKernels() { +std::unique_ptr RegisterKernels(bool enable_graph_capture = false) { auto kernel_registry = std::make_unique(); static const BuildKernelCreateInfoFn function_table[] = { @@ -464,13 +465,6 @@ std::unique_ptr RegisterKernels() { KERNEL_CREATE_INFO(13, Tanh), KERNEL_CREATE_INFO(1, Not), - KERNEL_CREATE_INFO_VERSIONED(6, 8, Cast), - KERNEL_CREATE_INFO_VERSIONED(9, 12, Cast), - KERNEL_CREATE_INFO_VERSIONED(13, 18, Cast), - KERNEL_CREATE_INFO_VERSIONED(19, 20, Cast), - KERNEL_CREATE_INFO_VERSIONED(21, 22, Cast), - KERNEL_CREATE_INFO(23, Cast), - // // activations BuildKernelCreateInfo, BuildKernelCreateInfo, @@ -771,6 +765,14 @@ std::unique_ptr RegisterKernels() { } } + // Register Cast kernels with conditional int64 support based on graph capture + ORT_THROW_IF_ERROR(kernel_registry->Register(CreateCastKernelInfo<6, 8>(enable_graph_capture))); + ORT_THROW_IF_ERROR(kernel_registry->Register(CreateCastKernelInfo<9, 12>(enable_graph_capture))); + ORT_THROW_IF_ERROR(kernel_registry->Register(CreateCastKernelInfo<13, 18>(enable_graph_capture))); + ORT_THROW_IF_ERROR(kernel_registry->Register(CreateCastKernelInfo<19, 20>(enable_graph_capture))); + ORT_THROW_IF_ERROR(kernel_registry->Register(CreateCastKernelInfo<21, 22>(enable_graph_capture))); + ORT_THROW_IF_ERROR(kernel_registry->Register(CreateCastKernelInfo<23>(enable_graph_capture))); + #ifndef DISABLE_CONTRIB_OPS Status status = ::onnxruntime::contrib::webgpu::RegisterWebGpuContribKernels(*kernel_registry); ORT_ENFORCE(status.IsOK(), "Failed to register WebGPU contrib kernels: " + status.ErrorMessage()); @@ -869,9 +871,13 @@ std::vector> WebGpuExecutionProvider::GetCapa } std::shared_ptr WebGpuExecutionProvider::GetKernelRegistry() const { - static std::shared_ptr registry = webgpu::RegisterKernels(); - - return registry; + if (enable_graph_capture_) { + static std::shared_ptr registry = webgpu::RegisterKernels(true); + return registry; + } else { + static std::shared_ptr registry = webgpu::RegisterKernels(false); + return registry; + } } std::unique_ptr WebGpuExecutionProvider::GetDataTransfer() const { diff --git a/onnxruntime/core/providers/webgpu/webgpu_provider_factory.cc b/onnxruntime/core/providers/webgpu/webgpu_provider_factory.cc index 80b3988215c6b..60934bef574fa 100644 --- a/onnxruntime/core/providers/webgpu/webgpu_provider_factory.cc +++ b/onnxruntime/core/providers/webgpu/webgpu_provider_factory.cc @@ -155,6 +155,19 @@ std::shared_ptr WebGpuProviderFactoryCreator::Create( 
} } + std::string small_storage_buffer_binding_size_for_testing_str; + bool small_storage_buffer_binding_size_for_testing = false; + if (config_options.TryGetConfigEntry(kSmallStorageBufferBindingSizeForTesting, small_storage_buffer_binding_size_for_testing_str)) { + if (small_storage_buffer_binding_size_for_testing_str == "1" || small_storage_buffer_binding_size_for_testing_str == "true") { + small_storage_buffer_binding_size_for_testing = true; + } else if (small_storage_buffer_binding_size_for_testing_str == "0" || small_storage_buffer_binding_size_for_testing_str == "false") { + small_storage_buffer_binding_size_for_testing = false; + } else { + ORT_THROW("Invalid small storage buffer binding size for testing: ", small_storage_buffer_binding_size_for_testing_str); + } + } + LOGS_DEFAULT(VERBOSE) << "WebGPU EP small storage buffer binding size for testing: " << small_storage_buffer_binding_size_for_testing; + webgpu::WebGpuContextConfig context_config{ context_id, reinterpret_cast(webgpu_instance), @@ -162,6 +175,7 @@ std::shared_ptr WebGpuProviderFactoryCreator::Create( reinterpret_cast(dawn_proc_table), validation_mode, preserve_device, + small_storage_buffer_binding_size_for_testing, }; LOGS_DEFAULT(VERBOSE) << "WebGPU EP Device ID: " << context_id; diff --git a/onnxruntime/core/providers/webgpu/webgpu_provider_options.h b/onnxruntime/core/providers/webgpu/webgpu_provider_options.h index a3b6cca4ceaf0..761ff0d85fc98 100644 --- a/onnxruntime/core/providers/webgpu/webgpu_provider_options.h +++ b/onnxruntime/core/providers/webgpu/webgpu_provider_options.h @@ -32,6 +32,8 @@ constexpr const char* kEnablePIXCapture = "ep.webgpuexecutionprovider.enablePIXC constexpr const char* kPreserveDevice = "ep.webgpuexecutionprovider.preserveDevice"; +constexpr const char* kSmallStorageBufferBindingSizeForTesting = "ep.webgpuexecutionprovider.smallStorageBufferBindingSizeForTesting"; + // The following are the possible values for the provider options. 
constexpr const char* kDawnBackendType_D3D12 = "D3D12"; diff --git a/onnxruntime/core/providers/webgpu/wgsl_templates/wgsl_gen.cc b/onnxruntime/core/providers/webgpu/wgsl_templates/wgsl_gen.cc index c239605733df8..7208cbd5a7d8a 100644 --- a/onnxruntime/core/providers/webgpu/wgsl_templates/wgsl_gen.cc +++ b/onnxruntime/core/providers/webgpu/wgsl_templates/wgsl_gen.cc @@ -258,6 +258,15 @@ duk_ret_t ShaderVariable_SetByOffset(duk_context* ctx) { return 1; } +/** @brief JavaScript binding for ShaderVariableHelper::GetByOffset */ +duk_ret_t ShaderVariable_GetByOffset(duk_context* ctx) { + const char* offset_expr = duk_require_string(ctx, 0); + const ShaderVariableHelper* helper = GetHelperFromFunction(ctx); + std::string result = helper->GetByOffset(offset_expr); + duk_push_string(ctx, result.c_str()); + return 1; +} + /** @brief JavaScript binding for ShaderVariableHelper::Rank */ duk_ret_t ShaderVariable_Rank(duk_context* ctx) { const ShaderVariableHelper* helper = GetHelperFromFunction(ctx); @@ -363,6 +372,7 @@ Status ApplyTemplateDynamic(ShaderHelper& shader_helper, CreateShaderVariableMethod(ctx, "OffsetToIndices", ShaderVariable_OffsetToIndices, 1, var_helper); CreateShaderVariableMethod(ctx, "SetByOffset", ShaderVariable_SetByOffset, 2, var_helper); + CreateShaderVariableMethod(ctx, "GetByOffset", ShaderVariable_GetByOffset, 1, var_helper); CreateShaderVariableMethod(ctx, "Rank", ShaderVariable_Rank, 0, var_helper); duk_put_prop_string(ctx, -2, arg.name.c_str()); } diff --git a/onnxruntime/core/providers/webnn/builders/helper.h b/onnxruntime/core/providers/webnn/builders/helper.h index baedb98a34c28..fbabc23504636 100644 --- a/onnxruntime/core/providers/webnn/builders/helper.h +++ b/onnxruntime/core/providers/webnn/builders/helper.h @@ -38,7 +38,7 @@ WebnnDeviceType DeviceTypeFromString(const std::string_view& device_type); // Collects all the initializer tensors in the subGraph and its ancestor graphs. InitializedTensorSet CollectAllInitializedTensors(const GraphViewer& graph_viewer); -inline std::vector HandleNegativeAxes(const std::vector& axes, size_t input_size) { +inline std::vector HandleNegativeAxes(const gsl::span axes, size_t input_size) { std::vector new_axes(axes.size()); for (size_t i = 0; i < axes.size(); ++i) { new_axes[i] = HandleNegativeAxis(axes[i], input_size); diff --git a/onnxruntime/core/providers/webnn/builders/impl/gemm_op_builder.cc b/onnxruntime/core/providers/webnn/builders/impl/gemm_op_builder.cc index 0ea927967d989..5a80f01c17236 100644 --- a/onnxruntime/core/providers/webnn/builders/impl/gemm_op_builder.cc +++ b/onnxruntime/core/providers/webnn/builders/impl/gemm_op_builder.cc @@ -250,29 +250,6 @@ bool GemmOpBuilder::IsOpSupportedImpl(const GraphViewer&, std::vector c_shape; if (!GetShape(*input_defs[c_idx], c_shape, logger)) return false; - - size_t c_dim = c_shape.size(); - - if (c_dim > 1) { - // TODO: Supports other shape of C. - // Currently WebNN implementation in Chromium only supports 1-D C. - return false; - } - if (c_dim == 0) { - LOGS(logger, VERBOSE) << "C of Gemm is a scalar"; - } else { - auto c_size = c_shape[c_dim - 1]; - NodeAttrHelper helper(node); - const auto transB = helper.Get("transB", 0); - if (c_size != (transB == 0 ? b_shape[1] : b_shape[0])) { - LOGS(logger, VERBOSE) << "C of Gemm must be a vector of b_shape[" - << (transB == 0 ? 
"1" : "0") << "]" - << " b_shape: [" << b_shape[0] << ", " << b_shape[1] << "]" - << " c_size: " << c_size; - - return false; - } - } } } diff --git a/onnxruntime/core/providers/webnn/builders/impl/reduction_op_builder.cc b/onnxruntime/core/providers/webnn/builders/impl/reduction_op_builder.cc index 6ea9b0a440d93..d07e636d578b1 100644 --- a/onnxruntime/core/providers/webnn/builders/impl/reduction_op_builder.cc +++ b/onnxruntime/core/providers/webnn/builders/impl/reduction_op_builder.cc @@ -19,6 +19,8 @@ namespace webnn { class ReductionOpBuilder : public BaseOpBuilder { // Add operator related. public: + // Allow axes potentially being empty inputs that are ignored during processing. + ReductionOpBuilder() : BaseOpBuilder(/*allow empty inputs*/ true) {} void AddInitializersToSkip(ModelBuilder& model_builder, const Node& node) const override; // Add operator related. @@ -37,6 +39,7 @@ void ReductionOpBuilder::AddInitializersToSkip(ModelBuilder& model_builder, cons const auto& input_defs = node.InputDefs(); if (input_defs.size() > 1) { model_builder.AddInitializerToSkip(input_defs[1]->Name()); // axes + model_builder.AddInputToSkip(input_defs[1]->Name()); // axes } } @@ -53,71 +56,50 @@ Status ReductionOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, NodeAttrHelper helper(node); const auto keep_dims = helper.Get("keepdims", 1); + emscripten::val options = emscripten::val::object(); options.set("label", node.Name()); options.set("keepDimensions", keep_dims == 1); - std::vector axes_data; - - emscripten::val output = emscripten::val::object(); + std::vector axes_data; const auto opset = node.SinceVersion(); const auto& op_type = node.OpType(); if (opset >= 18 || (op_type == "ReduceSum" && opset >= 13)) { // 'axes' is an optional input. - const auto noop_with_empty_axes = helper.Get("noop_with_empty_axes", 0); - if (!GetTensorName(input_defs, 1).empty()) { - // Optional input axes is provided, use axes initializer data. - const auto& initializers(model_builder.GetInitializerTensors()); - const auto& axes_tensor = *initializers.at(input_defs[1]->Name()); - Initializer axes_initializer(axes_tensor); - const auto axes_data_span = axes_initializer.DataAsSpan(); - std::transform( - axes_data_span.begin(), axes_data_span.end(), std::back_inserter(axes_data), - [input_rank](int64_t axis) -> int32_t { return SafeInt(HandleNegativeAxis(axis, input_rank)); }); - } else { - if (noop_with_empty_axes) { - // When axes is empty and this attribute is set to true, input tensor will not be reduced. - output = input; - model_builder.AddOperand(node.OutputDefs()[0]->Name(), std::move(output)); - return Status::OK(); + std::vector axes_shape; + if (TensorExists(input_defs, 1)) { + ORT_RETURN_IF_NOT(GetShape(*input_defs[1], axes_shape, logger), "Cannot get shape of input axes"); + if (axes_shape[0] != 0) { + // Optional input axes is provided and we already ensure it is an initializer. + // Use that initializer data. 
+ const auto& initializers(model_builder.GetInitializerTensors()); + const auto& axes_tensor = *initializers.at(input_defs[1]->Name()); + Initializer axes_initializer(axes_tensor); + const auto axes_data_span = axes_initializer.DataAsSpan(); + axes_data = HandleNegativeAxes(axes_data_span, input_rank); } } } else { if (helper.HasAttr("axes")) { - auto axes = helper.Get("axes", std::vector{}); - std::transform( - axes.begin(), axes.end(), std::back_inserter(axes_data), - [input_rank](int64_t axis) -> int32_t { return SafeInt(HandleNegativeAxis(axis, input_rank)); }); + axes_data = GetResolvedAxes(helper, input_rank); } } - if (axes_data.size() > 0) { - options.set("axes", emscripten::val::array(axes_data)); - } - if (op_type == "ReduceL1") { - output = model_builder.GetBuilder().call("reduceL1", input, options); - } else if (op_type == "ReduceL2") { - output = model_builder.GetBuilder().call("reduceL2", input, options); - } else if (op_type == "ReduceLogSum") { - output = model_builder.GetBuilder().call("reduceLogSum", input, options); - } else if (op_type == "ReduceLogSumExp") { - output = model_builder.GetBuilder().call("reduceLogSumExp", input, options); - } else if (op_type == "ReduceMax") { - output = model_builder.GetBuilder().call("reduceMax", input, options); - } else if (op_type == "ReduceMean") { - output = model_builder.GetBuilder().call("reduceMean", input, options); - } else if (op_type == "ReduceMin") { - output = model_builder.GetBuilder().call("reduceMin", input, options); - } else if (op_type == "ReduceProd") { - output = model_builder.GetBuilder().call("reduceProduct", input, options); - } else if (op_type == "ReduceSum") { - output = model_builder.GetBuilder().call("reduceSum", input, options); - } else if (op_type == "ReduceSumSquare") { - output = model_builder.GetBuilder().call("reduceSumSquare", input, options); - } else { - return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "ReductionOpBuilder, unknown op: ", op_type); + // When axes is not provided or is empty, check the 'noop_with_empty_axes' attribute: + // - If it is false, perform reduction over all dimensions. + // (In WebNN, this means the 'axes' option is not set.) + // - If it is true, no reduction is applied, but other operations are still performed. + // (In WebNN, this requires setting 'axes' to an empty array.) + if (!axes_data.empty() || helper.Get("noop_with_empty_axes", 0) == 1) { + options.set("axes", emscripten::val::array(GetNarrowedIntFromInt64(axes_data))); } + const std::string_view webnn_op_type = GetWebNNOpType(op_type); + ORT_RETURN_IF(webnn_op_type.empty(), "Cannot get WebNN op type"); + + emscripten::val output = model_builder.GetBuilder().call( + std::string(webnn_op_type).c_str(), input, options); + model_builder.AddOperand(node.OutputDefs()[0]->Name(), std::move(output)); return Status::OK(); } @@ -128,11 +110,25 @@ bool ReductionOpBuilder::IsOpSupportedImpl(const GraphViewer& graph_viewer, const WebnnDeviceType /* device_type */, const logging::Logger& logger) const { const auto& input_defs = node.InputDefs(); - const std::string axes_name = GetTensorName(input_defs, 1); - // If the optional input 'axes' is provided, it must be an initializer. 
- if (!axes_name.empty() && !graph_viewer.GetConstantInitializer(axes_name)) { - LOGS(logger, VERBOSE) << "Input axes of " << node.OpType() << " must be a constant"; - return false; + + if (TensorExists(input_defs, 1)) { + std::vector axes_shape; + if (!GetShape(*input_defs[1], axes_shape, logger)) { + LOGS(logger, VERBOSE) << "Cannot get shape of input axes"; + return false; + } + + if (axes_shape.size() != 1) { + LOGS(logger, VERBOSE) << "Input axes of " << node.OpType() << " must be 1D"; + return false; + } + + const std::string axes_name = GetTensorName(input_defs, 1); + // If the optional input 'axes' is provided and not empty, it must be an initializer. + if (axes_shape[0] != 0 && !graph_viewer.GetConstantInitializer(axes_name)) { + LOGS(logger, VERBOSE) << "Input axes of " << node.OpType() << " must be a constant"; + return false; + } } return true; diff --git a/onnxruntime/core/session/custom_ops.cc b/onnxruntime/core/session/custom_ops.cc index 00f5017a55db0..9bc6c8d0a96a1 100644 --- a/onnxruntime/core/session/custom_ops.cc +++ b/onnxruntime/core/session/custom_ops.cc @@ -71,16 +71,22 @@ struct OrtShapeInferContext { auto num_inputs = ctx_.getNumInputs(); for (size_t ith_input = 0; ith_input < num_inputs; ++ith_input) { const auto* input_type = ctx_.getInputType(ith_input); - const auto& value_case = input_type->value_case(); - ORT_ENFORCE(value_case == ONNX_NAMESPACE::TypeProto::kTensorType, - "shape inference not yet supported for non-tensor types"); - const auto& shape_proto = input_type->tensor_type().shape(); - const auto& type_proto = input_type->tensor_type(); - auto elem_type = ::onnxruntime::utils::CApiElementTypeFromProtoType(type_proto.elem_type()); - auto tensor_shape = ::onnxruntime::utils::GetTensorShapeFromTensorShapeProto(shape_proto); - auto symbolic_dims = GetSymbolicDims(shape_proto); - input_type_shapes_.emplace_back( - OrtTensorTypeAndShapeInfo::GetTensorShapeAndTypeHelper(elem_type, &tensor_shape, &symbolic_dims)); + if (input_type != nullptr) { + const auto& value_case = input_type->value_case(); + ORT_ENFORCE(value_case == ONNX_NAMESPACE::TypeProto::kTensorType, + "shape inference not yet supported for non-tensor types"); + const auto& shape_proto = input_type->tensor_type().shape(); + const auto& type_proto = input_type->tensor_type(); + auto elem_type = ::onnxruntime::utils::CApiElementTypeFromProtoType(type_proto.elem_type()); + auto tensor_shape = ::onnxruntime::utils::GetTensorShapeFromTensorShapeProto(shape_proto); + auto symbolic_dims = GetSymbolicDims(shape_proto); + input_type_shapes_.emplace_back( + OrtTensorTypeAndShapeInfo::GetTensorShapeAndTypeHelper(elem_type, &tensor_shape, &symbolic_dims)); + } else { + input_type_shapes_.emplace_back( + OrtTensorTypeAndShapeInfo::GetTensorShapeAndTypeHelper( + ONNX_TENSOR_ELEMENT_DATA_TYPE_UNDEFINED, nullptr, nullptr)); + } } } diff --git a/onnxruntime/core/session/onnxruntime_c_api.cc b/onnxruntime/core/session/onnxruntime_c_api.cc index 9b258d0983570..7603397ea9cad 100644 --- a/onnxruntime/core/session/onnxruntime_c_api.cc +++ b/onnxruntime/core/session/onnxruntime_c_api.cc @@ -3758,7 +3758,7 @@ Second example, if we wanted to add and remove some members, we'd do this: In GetApi we now make it return ort_api_3 for version 3. */ -static constexpr OrtApi ort_api_1_to_23 = { +static constexpr OrtApi ort_api_1_to_24 = { // NOTE: The ordering of these fields MUST not change after that version has shipped since existing binaries depend on this ordering. 
// Shipped as version 1 - DO NOT MODIFY (see above text for more information) @@ -4266,16 +4266,16 @@ static_assert(offsetof(OrtApi, SetEpDynamicOptions) / sizeof(void*) == 284, "Siz static_assert(offsetof(OrtApi, GetEpApi) / sizeof(void*) == 317, "Size of version 22 API cannot change"); // So that nobody forgets to finish an API version, this check will serve as a reminder: -static_assert(std::string_view(ORT_VERSION) == "1.23.0", +static_assert(std::string_view(ORT_VERSION) == "1.24.0", "ORT_Version change detected, please follow below steps to ensure OrtApi is updated properly"); // 1. Update the hardcoded version string in above static_assert to silence it -// 2. If there were any APIs added to ort_api_1_to_23 above: +// 2. If there were any APIs added to ort_api_1_to_24 above: // a. Add the 'End of version #' markers (pattern above should be obvious) // b. Add a static_assert in the directly above list of version sizes to ensure nobody adds any more functions to the just shipped API version ORT_API(const OrtApi*, OrtApis::GetApi, uint32_t version) { if (version >= 1 && version <= ORT_API_VERSION) - return &ort_api_1_to_23; + return &ort_api_1_to_24; fprintf(stderr, "The requested API version [%u] is not available, only API versions [1, %u] are supported in this build." diff --git a/onnxruntime/python/onnxruntime_validation.py b/onnxruntime/python/onnxruntime_validation.py index 4a72916d3e485..6912d19897d67 100644 --- a/onnxruntime/python/onnxruntime_validation.py +++ b/onnxruntime/python/onnxruntime_validation.py @@ -23,9 +23,9 @@ def check_distro_info(): __my_distro__ = __my_system__ __my_distro_ver__ = platform.release().lower() - if __my_distro_ver__ not in ["10", "11"]: + if __my_distro_ver__ not in ["10", "11", "2016server", "2019server", "2022server", "2025server"]: warnings.warn( - f"Unsupported Windows version ({__my_distro_ver__}). ONNX Runtime supports Windows 10 and above, only." + f"Unsupported Windows version ({__my_distro_ver__}). ONNX Runtime supports Windows 10 and above, or Windows Server 2016 and above." 
) elif __my_system__ == "linux": """Although the 'platform' python module for getting Distro information works well on standard OS images diff --git a/onnxruntime/python/tools/quantization/calibrate.py b/onnxruntime/python/tools/quantization/calibrate.py index 85ac77be2af31..d1612af3d75b1 100644 --- a/onnxruntime/python/tools/quantization/calibrate.py +++ b/onnxruntime/python/tools/quantization/calibrate.py @@ -417,7 +417,14 @@ def collect_data(self, data_reader: CalibrationDataReader): inputs = data_reader.get_next() if not inputs: break - self.intermediate_outputs.append(self.infer_session.run(None, inputs)) + self.intermediate_outputs.append( + [ + value if sess_o.name not in self.model_original_outputs else None + for sess_o, value in zip( + self.infer_session.get_outputs(), self.infer_session.run(None, inputs), strict=False + ) + ] + ) if ( self.max_intermediate_outputs is not None and len(self.intermediate_outputs) == self.max_intermediate_outputs diff --git a/onnxruntime/python/tools/tensorrt/perf/benchmark.py b/onnxruntime/python/tools/tensorrt/perf/benchmark.py index d6b39a6b2aeb4..66ab0c44f8814 100644 --- a/onnxruntime/python/tools/tensorrt/perf/benchmark.py +++ b/onnxruntime/python/tools/tensorrt/perf/benchmark.py @@ -613,7 +613,7 @@ def validate(all_ref_outputs, all_outputs, rtol, atol, percent_mismatch): for ref_o, o in zip(ref_output, output, strict=False): # abs(desired-actual) < rtol * abs(desired) + atol try: - np.testing.assert_allclose(ref_o, o, rtol, atol) + np.testing.assert_allclose(o, ref_o, rtol, atol) except Exception as e: if percentage_in_allowed_threshold(e, percent_mismatch): continue diff --git a/onnxruntime/test/contrib_ops/cuda_kernels/fpA_intB_gemm_kernel_test.cc b/onnxruntime/test/contrib_ops/cuda_kernels/fpA_intB_gemm_kernel_test.cc index 3e339d86c7943..1652d16f5cb66 100644 --- a/onnxruntime/test/contrib_ops/cuda_kernels/fpA_intB_gemm_kernel_test.cc +++ b/onnxruntime/test/contrib_ops/cuda_kernels/fpA_intB_gemm_kernel_test.cc @@ -3,7 +3,7 @@ // Test can be run like the following: // ./onnxruntime_provider_test --gtest_filter=CUDA_EP_Unittest.* - +#if USE_FPA_INTB_GEMM #include #include #include @@ -620,3 +620,4 @@ TEST_F(Bf16Int4GroupwiseTest, BF16_Int4_Gemm_CudaKernel) { } } } +#endif diff --git a/onnxruntime/test/contrib_ops/gather_block_quantized_op_test.cc b/onnxruntime/test/contrib_ops/gather_block_quantized_op_test.cc index 574ec49da67ea..3bf37ea193245 100644 --- a/onnxruntime/test/contrib_ops/gather_block_quantized_op_test.cc +++ b/onnxruntime/test/contrib_ops/gather_block_quantized_op_test.cc @@ -82,7 +82,7 @@ void CheckDataAndShape(const std::vector& data, const std::vector& s ORT_ENFORCE(static_cast(data.size()) == total_elements, "Data size does not match the shape", "Data size: ", data.size(), ", Expected size: ", total_elements, - ", Shape: ", VectorToString(shape), " Name:", name, " Type:", typeid(T).name()); + ", Shape: ", VectorToString(shape), " Name:", name); } // Combinations: types, gather_axis, quantize_axis, block_size, indices, scale shape vs data shape diff --git a/onnxruntime/test/contrib_ops/matmul_4bits_test.cc b/onnxruntime/test/contrib_ops/matmul_4bits_test.cc index 3a9bd02ef8d72..cc0e3207e6795 100644 --- a/onnxruntime/test/contrib_ops/matmul_4bits_test.cc +++ b/onnxruntime/test/contrib_ops/matmul_4bits_test.cc @@ -25,6 +25,7 @@ #include "core/session/onnxruntime_cxx_api.h" #include "core/session/ort_env.h" #include "core/util/qmath.h" +#include "core/providers/webgpu/webgpu_provider_options.h" extern std::unique_ptr ort_env; 
@@ -545,7 +546,11 @@ void RunTest(int64_t M, int64_t N, int64_t K, int64_t block_size, bool has_zerop #ifdef USE_ROCM execution_providers.push_back(DefaultRocmExecutionProvider()); #endif - +#ifdef USE_WEBGPU + ConfigOptions config_options{}; + ORT_ENFORCE(config_options.AddConfigEntry(webgpu::options::kSmallStorageBufferBindingSizeForTesting, "1").IsOK()); + execution_providers.push_back(WebGpuExecutionProviderWithOptions(config_options)); +#endif RunTest(opts, std::move(execution_providers)); } } @@ -599,6 +604,23 @@ TEST(MatMulNBits, Float16_Large) { } } +#ifdef USE_WEBGPU +// Similar to Float16_Large but for float32 and crafted so that the input_b and output buffer size exceeds +// maxStorageBufferBindingSize (128MB) so it must be split into 2 segments internally (~128.00006MB). +// +// input_b size(4-bits): N * K / 2 = 8388612 * 32 / 2 = 134217792 bytes > 134217728 bytes (128MB) +// output size(float32): M * N * 4 = 4 * 8388612 * 4 = 134217792 bytes > 134217728 bytes (128MB) +TEST(MatMulNBits, Float32_Large) { + // Keep tolerance similar to Float16_Large (float path typically equal or better numerically). + constexpr float abs_error = 0.1f; + constexpr bool zp_is_4bit = true; + constexpr bool has_zeropoint = false; + constexpr auto block_size = 16; + + RunTest(4 /*M*/, 8388612 /*N*/, 32 /*K*/, block_size, has_zeropoint, zp_is_4bit, abs_error); +} +#endif + #ifdef USE_CUDA TEST(MatMulNBits, Fp16_Int4_Int4ZeroPoint) { constexpr float abs_error = 0.1f; diff --git a/onnxruntime/test/framework/shape_inference_test.cc b/onnxruntime/test/framework/shape_inference_test.cc index f5258760eb20d..2d5c3a43ee8ed 100644 --- a/onnxruntime/test/framework/shape_inference_test.cc +++ b/onnxruntime/test/framework/shape_inference_test.cc @@ -129,6 +129,9 @@ const ORTCHAR_T* const OPTIONAL_INPUT_CUSTOM_OP_MODEL_URI_2 = ORT_TSTR("testdata // that inference proceeds for all of the outputs when absent optional inputs are present TEST(ShapeInferenceCustomOpTest, custom_op_optional_input_inference_test) { MyCustomOpWithOptionalInput custom_op{onnxruntime::kCpuExecutionProvider}; + custom_op.InferOutputShapeFn = [](const OrtCustomOp* /*op*/, OrtShapeInferContext* /*ctx*/) -> OrtStatusPtr { + return nullptr; + }; const auto& env = GetEnvironment(); diff --git a/onnxruntime/test/ir/graph_test.cc b/onnxruntime/test/ir/graph_test.cc index 4fd9830440846..7371ad5cf0ded 100644 --- a/onnxruntime/test/ir/graph_test.cc +++ b/onnxruntime/test/ir/graph_test.cc @@ -2,13 +2,17 @@ // Licensed under the MIT License. 
#include +#include #include "core/common/inlined_containers.h" #include "core/common/span_utils.h" #include "core/framework/tensorprotoutils.h" #include "core/graph/graph_viewer.h" #include "core/graph/model.h" #include "core/graph/op.h" +#include "core/session/inference_session.h" +#include "core/session/environment.h" #include "test/providers/provider_test_utils.h" +#include "test/test_environment.h" #include "gtest/gtest.h" #include "gmock/gmock.h" #include "onnx/defs/function.h" @@ -2573,5 +2577,259 @@ TEST_F(GraphTest, GraphConstruction_MemoryEfficientTopologicalSort_SubgraphGener #endif +// Test for shape inference with in-memory external data (issue #26261) +// This tests the fix for a regression where Constant nodes with large tensors (>127 bytes) +// stored as in-memory external data would cause shape inference to fail +TEST_F(GraphTest, ShapeInferenceWithInMemoryExternalData) { + // Create a model with a Constant node that produces a tensor larger than kSmallTensorExternalDataThreshold (127 bytes) + // This will trigger the in-memory externalization path + ModelProto model_proto; + model_proto.set_ir_version(ONNX_NAMESPACE::Version::IR_VERSION); + auto* opset = model_proto.add_opset_import(); + opset->set_version(17); + + auto* graph_proto = model_proto.mutable_graph(); + graph_proto->set_name("test_graph"); + + // Create a Constant node with a tensor of 16 INT64 values (128 bytes, just over the 127 threshold) + auto* constant_node = graph_proto->add_node(); + constant_node->set_op_type("Constant"); + constant_node->set_name("const_node"); + constant_node->add_output("const_output"); + + // Add the value attribute with a tensor + auto* attr = constant_node->add_attribute(); + attr->set_name("value"); + attr->set_type(ONNX_NAMESPACE::AttributeProto_AttributeType_TENSOR); + auto* tensor = attr->mutable_t(); + tensor->set_data_type(ONNX_NAMESPACE::TensorProto_DataType_INT64); + tensor->add_dims(16); // 16 elements * 8 bytes = 128 bytes + // Each split will be size 1, totaling 16 + for (int64_t i = 0; i < 16; ++i) { + tensor->add_int64_data(1); + } + + // Create a Split node that uses the constant as input + // Split requires constant input for the 'split' parameter, which triggers shape inference + auto* split_node = graph_proto->add_node(); + split_node->set_op_type("Split"); + split_node->set_name("split_node"); + split_node->add_input("input_data"); + split_node->add_input("const_output"); // Use constant as split sizes + for (int i = 0; i < 16; ++i) { + split_node->add_output("split_output_" + std::to_string(i)); + } + + // Add axis attribute + auto* axis_attr = split_node->add_attribute(); + axis_attr->set_name("axis"); + axis_attr->set_type(ONNX_NAMESPACE::AttributeProto_AttributeType_INT); + axis_attr->set_i(0); + + // Add graph input + auto* input = graph_proto->add_input(); + input->set_name("input_data"); + auto* input_type = input->mutable_type()->mutable_tensor_type(); + input_type->set_elem_type(ONNX_NAMESPACE::TensorProto_DataType_FLOAT); + input_type->mutable_shape()->add_dim()->set_dim_value(16); + input_type->mutable_shape()->add_dim()->set_dim_value(10); + + // Add graph outputs + for (int i = 0; i < 16; ++i) { + auto* output = graph_proto->add_output(); + output->set_name("split_output_" + std::to_string(i)); + } + + // Load the model - this should succeed with the fix + // Before the fix, this would fail with: + // "Cannot parse data from external tensors. 
Please load external data into raw data for tensor" + std::shared_ptr model; + ASSERT_STATUS_OK(Model::Load(std::move(model_proto), model, nullptr, *logger_)); + + // Verify the graph was properly constructed + Graph& graph = model->MainGraph(); + ASSERT_STATUS_OK(graph.Resolve()); + + // Verify the constant node was converted to an initializer + const ONNX_NAMESPACE::TensorProto* initializer = nullptr; + ASSERT_TRUE(graph.GetInitializedTensor("const_output", initializer)); + ASSERT_NE(initializer, nullptr); + + // Verify the Split node can access the constant data during shape inference + const Node* split_node_ptr = nullptr; + for (const auto& node : graph.Nodes()) { + if (node.Name() == "split_node") { + split_node_ptr = &node; + break; + } + } + ASSERT_NE(split_node_ptr, nullptr); + + // Verify outputs are properly shaped + ASSERT_EQ(split_node_ptr->OutputDefs().size(), 16u); +} + +// Test for shape inference with in-memory external data using InferenceSession +// This test more accurately reproduces the issue by going through the full session initialization +// which includes graph optimizations that trigger the in-memory externalization +TEST_F(GraphTest, ShapeInferenceWithInMemoryExternalDataViaSession) { + // Create the same model as above + ModelProto model_proto; + model_proto.set_ir_version(ONNX_NAMESPACE::Version::IR_VERSION); + auto* opset = model_proto.add_opset_import(); + opset->set_version(17); + + auto* graph_proto = model_proto.mutable_graph(); + graph_proto->set_name("test_graph"); + + // Create a Constant node with a tensor of 16 INT64 values (128 bytes) + auto* constant_node = graph_proto->add_node(); + constant_node->set_op_type("Constant"); + constant_node->set_name("const_node"); + constant_node->add_output("const_output"); + + auto* attr = constant_node->add_attribute(); + attr->set_name("value"); + attr->set_type(ONNX_NAMESPACE::AttributeProto_AttributeType_TENSOR); + auto* tensor = attr->mutable_t(); + tensor->set_data_type(ONNX_NAMESPACE::TensorProto_DataType_INT64); + tensor->add_dims(16); + for (int64_t i = 0; i < 16; ++i) { + tensor->add_int64_data(1); + } + + // Create a Split node + auto* split_node = graph_proto->add_node(); + split_node->set_op_type("Split"); + split_node->set_name("split_node"); + split_node->add_input("input_data"); + split_node->add_input("const_output"); + for (int i = 0; i < 16; ++i) { + split_node->add_output("split_output_" + std::to_string(i)); + } + + auto* axis_attr = split_node->add_attribute(); + axis_attr->set_name("axis"); + axis_attr->set_type(ONNX_NAMESPACE::AttributeProto_AttributeType_INT); + axis_attr->set_i(0); + + // Add graph input + auto* input = graph_proto->add_input(); + input->set_name("input_data"); + auto* input_type = input->mutable_type()->mutable_tensor_type(); + input_type->set_elem_type(ONNX_NAMESPACE::TensorProto_DataType_FLOAT); + input_type->mutable_shape()->add_dim()->set_dim_value(16); + input_type->mutable_shape()->add_dim()->set_dim_value(10); + + // Add graph outputs + for (int i = 0; i < 16; ++i) { + auto* output = graph_proto->add_output(); + output->set_name("split_output_" + std::to_string(i)); + } + + // Save to a temporary file + const std::string model_path = "test_in_memory_external_data.onnx"; + { + std::ofstream file(model_path, std::ios::binary); + ASSERT_TRUE(file.is_open()); + ASSERT_TRUE(model_proto.SerializeToOstream(&file)); + } + + // Test with ORT_DISABLE_ALL optimization which should trigger the bug without the fix + SessionOptions so; + so.graph_optimization_level = 
TransformerLevel::Default; // This triggers the issue + so.session_logid = "GraphTest.ShapeInferenceWithInMemoryExternalDataViaSession"; + + InferenceSession session_object{so, GetEnvironment()}; + + // This should succeed with the fix, fail without it + ASSERT_STATUS_OK(session_object.Load(model_path)); + ASSERT_STATUS_OK(session_object.Initialize()); + + // Clean up + std::remove(model_path.c_str()); +} + +// Test that explicitly triggers the in-memory externalization and then shape inference +// This test directly reproduces the bug scenario +TEST_F(GraphTest, ShapeInferenceAfterInitializerExternalization) { + // Create a model with a Split node that depends on a constant initializer + ModelProto model_proto; + model_proto.set_ir_version(ONNX_NAMESPACE::Version::IR_VERSION); + auto* opset = model_proto.add_opset_import(); + opset->set_version(17); + + auto* graph_proto = model_proto.mutable_graph(); + graph_proto->set_name("test_graph"); + + // Create initializer directly (not as Constant node) with 128 bytes + auto* initializer = graph_proto->add_initializer(); + initializer->set_name("split_sizes"); + initializer->set_data_type(ONNX_NAMESPACE::TensorProto_DataType_INT64); + initializer->add_dims(16); // 16 * 8 = 128 bytes + for (int64_t i = 0; i < 16; ++i) { + initializer->add_int64_data(1); + } + + // Create a Split node that uses this initializer + auto* split_node = graph_proto->add_node(); + split_node->set_op_type("Split"); + split_node->set_name("split_node"); + split_node->add_input("input_data"); + split_node->add_input("split_sizes"); // Uses the large initializer + for (int i = 0; i < 16; ++i) { + split_node->add_output("split_output_" + std::to_string(i)); + } + + auto* axis_attr = split_node->add_attribute(); + axis_attr->set_name("axis"); + axis_attr->set_type(ONNX_NAMESPACE::AttributeProto_AttributeType_INT); + axis_attr->set_i(0); + + // Add graph input + auto* input = graph_proto->add_input(); + input->set_name("input_data"); + auto* input_type = input->mutable_type()->mutable_tensor_type(); + input_type->set_elem_type(ONNX_NAMESPACE::TensorProto_DataType_FLOAT); + input_type->mutable_shape()->add_dim()->set_dim_value(16); + input_type->mutable_shape()->add_dim()->set_dim_value(10); + + // Add graph outputs + for (int i = 0; i < 16; ++i) { + auto* output = graph_proto->add_output(); + output->set_name("split_output_" + std::to_string(i)); + } + + // Load model + std::shared_ptr model; + ASSERT_STATUS_OK(Model::Load(std::move(model_proto), model, nullptr, *logger_)); + + Graph& graph = model->MainGraph(); + // First resolve should succeed + ASSERT_STATUS_OK(graph.Resolve()); + + // Now trigger the in-memory externalization + // This converts initializers > 127 bytes to OrtValues with external data references + Status convert_status = graph.ConvertInitializersIntoOrtValues(); + ASSERT_TRUE(convert_status.IsOK()) << "ConvertInitializersIntoOrtValues failed: " << convert_status.ErrorMessage(); + + // Check if the initializer was actually externalized + const ONNX_NAMESPACE::TensorProto* initializer_after = nullptr; + ASSERT_TRUE(graph.GetInitializedTensor("split_sizes", initializer_after)); + ASSERT_NE(initializer_after, nullptr); + // Debug: verify it was externalized + ASSERT_TRUE(utils::HasExternalDataInMemory(*initializer_after)) + << "Initializer was not externalized to in-memory external data"; + + // Mark the graph as needing resolve to force shape inference to run again + graph.SetGraphResolveNeeded(); + + // Resolve again - this should trigger shape inference with 
the externalized initializer + // Without the fix, this will fail with "Cannot parse data from external tensors" + // With the fix, getInputData() materializes the external data for shape inference + Status second_resolve = graph.Resolve(); + ASSERT_TRUE(second_resolve.IsOK()) << "Second resolve failed: " << second_resolve.ErrorMessage(); +} + } // namespace test } // namespace onnxruntime diff --git a/onnxruntime/test/mlas/bench/bench_sconv.cpp b/onnxruntime/test/mlas/bench/bench_sconv.cpp index 39d135236b89c..dc37980002978 100644 --- a/onnxruntime/test/mlas/bench/bench_sconv.cpp +++ b/onnxruntime/test/mlas/bench/bench_sconv.cpp @@ -3,6 +3,7 @@ #include "mlas.h" #include "bench_util.h" +#include "core/util/thread_utils.h" #include #include @@ -138,6 +139,113 @@ void SCONV_NCHW(benchmark::State& state, const char* /*dummy*/) { } } +static MLAS_THREADPOOL* GetMlasThreadPoolForConvBenchmark(void) { + static auto threadpool = std::make_unique( + &onnxruntime::Env::Default(), onnxruntime::ThreadOptions(), nullptr, 4, true); + return threadpool.get(); +} + +void SCONV_NCHW_THREADED(benchmark::State& state, const char* /*dummy*/) { + MLAS_THREADPOOL* tp = GetMlasThreadPoolForConvBenchmark(); + + const int64_t rank = state.range(0); // Rank + const int64_t batch_size = state.range(1); // N + const int64_t groups = state.range(2); // G + const int64_t input_channels_per_group = state.range(3); // Cpg + const int64_t output_channels_per_group = state.range(4); // Fpg + + if (rank <= 0) throw std::invalid_argument("Kernel rank must greater than 0!"); + if (batch_size <= 0) throw std::invalid_argument("Batch size must greater than 0!"); + if (groups <= 0) throw std::invalid_argument("Group count must greater than 0!"); + if (input_channels_per_group <= 0) throw std::invalid_argument("input_channels_per_group must greater than 0!"); + if (output_channels_per_group <= 0) throw std::invalid_argument("output_channels_per_group must greater than 0!"); + + size_t arg_position = 5; + const auto input_shape = BenchArgsVector(state, arg_position, rank); + const auto kernel_shape = BenchArgsVector(state, arg_position, rank); + const auto paddings = BenchArgsVector(state, arg_position, rank * 2); + const auto strides = BenchArgsVector(state, arg_position, rank); + const auto dilations = BenchArgsVector(state, arg_position, rank); + + // do not check the size of each vector as they are forced from args. 
+ if (std::any_of(input_shape.begin(), input_shape.end(), [](const int64_t& dim) { return dim <= 0; })) { + throw std::invalid_argument("all input image dim must > 0"); + } + + if (std::any_of(kernel_shape.begin(), kernel_shape.end(), [](const int64_t& dim) { return dim <= 0; })) { + throw std::invalid_argument("all kernel dim must > 0"); + } + + if (std::any_of(strides.begin(), strides.end(), [](const int64_t& dim) { return dim <= 0; })) { + throw std::invalid_argument("all strides dim must > 0"); + } + + if (std::any_of(dilations.begin(), dilations.end(), [](const int64_t& dim) { return dim <= 0; })) { + throw std::invalid_argument("all dilations dim must > 0"); + } + + const int64_t GC = groups * input_channels_per_group; + const int64_t GF = groups * output_channels_per_group; + std::vector x_shape = {batch_size, GC}; + x_shape.insert(x_shape.end(), input_shape.begin(), input_shape.end()); + std::vector f_shape = {GF, input_channels_per_group}; + f_shape.insert(f_shape.end(), kernel_shape.begin(), kernel_shape.end()); + + std::vector output_shape((size_t)rank); + for (int64_t i = 0; i < rank; ++i) { + auto km = 1 + dilations[i] * (kernel_shape[i] - 1); + output_shape[i] = (paddings[i] + paddings[i + rank] + input_shape[i] - km) / strides[i] + 1; + } + std::vector y_shape = {batch_size, GF}; + y_shape.insert(y_shape.end(), output_shape.begin(), output_shape.end()); + + MLAS_ACTIVATION activation; + activation.ActivationKind = MlasIdentityActivation; + MLAS_CONV_PARAMETERS Parameters; + size_t WorkingBufferSize = 0; + MlasConvPrepare(&Parameters, + static_cast(rank), + static_cast(batch_size), + static_cast(groups), + static_cast(input_channels_per_group), + input_shape.data(), + kernel_shape.data(), + dilations.data(), + paddings.data(), + strides.data(), + output_shape.data(), + static_cast(output_channels_per_group), + &activation, + &WorkingBufferSize, + 0.0f, + tp); + + auto X = RandomVectorUniform(x_shape, -2.0, 2.0); + auto F = RandomVectorUniform(f_shape, -1.0, 1.0); + int64_t y_size = std::accumulate(y_shape.begin(), y_shape.end(), 1LL, std::multiplies()); + std::vector Y(static_cast(y_size)); + std::vector working_buffer(WorkingBufferSize); + + // warm up first round. 
+ MlasConv(&Parameters, + X.data(), + F.data(), + nullptr, + working_buffer.data(), + Y.data(), + tp); + + for (auto _ : state) { + MlasConv(&Parameters, + X.data(), + F.data(), + nullptr, + working_buffer.data(), + Y.data(), + tp); + } +} + static void ResNet50(benchmark::internal::Benchmark* b) { b->ArgNames(ArgNamesForConv(2)); @@ -221,6 +329,7 @@ static void TeamsModel(benchmark::internal::Benchmark* b) { } BENCHMARK_CAPTURE(SCONV_NCHW, TeamsModel, "")->Apply(TeamsModel)->UseRealTime(); +BENCHMARK_CAPTURE(SCONV_NCHW_THREADED, TeamsModel, "")->Apply(TeamsModel)->UseRealTime(); static void General_Conv2d(benchmark::internal::Benchmark* b) { b->ArgNames(ArgNamesForConv(2)); diff --git a/onnxruntime/test/mlas/unittest/test_dynamic_qgemm.cpp b/onnxruntime/test/mlas/unittest/test_dynamic_qgemm.cpp index a048ded8349b8..6d05e93f517ae 100644 --- a/onnxruntime/test/mlas/unittest/test_dynamic_qgemm.cpp +++ b/onnxruntime/test/mlas/unittest/test_dynamic_qgemm.cpp @@ -4,10 +4,12 @@ // SPDX-License-Identifier: MIT // -#include "test_util.h" // Currently this test only applies to KleidiAI Guard against it running in any other situation #if defined(USE_KLEIDIAI) && !defined(_MSC_VER) +#include "test_util.h" +#include "core/mlas/lib/mlasi.h" // for MLAS_CPUIDINFO + class MlasDynamicQgemmTest { private: MatrixGuardBuffer buffer_a; @@ -18,6 +20,11 @@ class MlasDynamicQgemmTest { public: void Test(size_t M, size_t N, size_t K, size_t BatchSize) { + // Currently, MlasDynamicQGemmBatch() and associated functions require SME or else they are no-ops. + if (!MLAS_CPUIDINFO::GetCPUIDInfo().HasArm_SME()) { + GTEST_SKIP() << "MlasDynamicQGemmBatch() requires ARM64 SME but it was not detected. Skipping test."; + } + // Setup buffers for holding various data float* A = buffer_a.GetBuffer(M * K * BatchSize); diff --git a/onnxruntime/test/onnx/TestCase.cc b/onnxruntime/test/onnx/TestCase.cc index 6df98ff505fa1..cbb25bb9b629e 100644 --- a/onnxruntime/test/onnx/TestCase.cc +++ b/onnxruntime/test/onnx/TestCase.cc @@ -1435,9 +1435,22 @@ std::unique_ptr> GetBrokenTests(const std::string& provider broken_tests->insert({"scatter_elements_with_negative_indices", "unknown version"}); // Fails since ONNX==1.19.0 broken_tests->insert({"l2normalization_axis_0", "unknown version"}); + broken_tests->insert({"attention_3d_gqa", "unknown version"}); + broken_tests->insert({"attention_3d_gqa_attn_mask", "unknown version"}); + broken_tests->insert({"attention_3d_gqa_causal", "unknown version"}); + broken_tests->insert({"attention_3d_gqa_scaled", "unknown version"}); + broken_tests->insert({"attention_3d_gqa_softcap", "unknown version"}); + broken_tests->insert({"attention_3d_gqa_with_past_and_present", "unknown version"}); + broken_tests->insert({"attention_4d_gqa", "unknown version"}); + broken_tests->insert({"attention_4d_gqa_attn_mask", "unknown version"}); + broken_tests->insert({"attention_4d_gqa_causal", "unknown version"}); + broken_tests->insert({"attention_4d_gqa_scaled", "unknown version"}); + broken_tests->insert({"attention_4d_gqa_softcap", "unknown version"}); + broken_tests->insert({"attention_4d_gqa_with_past_and_present", "unknown version"}); + broken_tests->insert({"attention_4d_gqa_with_past_and_present_fp16", "unknown version"}); + broken_tests->insert({"attention_4d_with_past_and_present_qk_matmul_bias_3d_mask_causal", "unknown version"}); + broken_tests->insert({"attention_4d_with_past_and_present_qk_matmul_bias_4d_mask_causal", "unknown version"}); broken_tests->insert({"attention_4d_diff_heads_mask4d_padded_kv", 
"need nonpad_kv_seqlen "}); - broken_tests->insert({"attention_4d_with_past_and_present_qk_matmul_bias_3d_mask_causal", "attention op implementation is wrong"}); - broken_tests->insert({"attention_4d_with_past_and_present_qk_matmul_bias_4d_mask_causal", "attention op implementation is wrong"}); } #ifdef DISABLE_CONTRIB_OPS diff --git a/onnxruntime/test/onnx/main.cc b/onnxruntime/test/onnx/main.cc index b6f2cb2683677..463634b370d4c 100644 --- a/onnxruntime/test/onnx/main.cc +++ b/onnxruntime/test/onnx/main.cc @@ -795,24 +795,6 @@ select from 'TF8', 'TF16', 'UINT8', 'FLOAT', 'ITENSOR'. \n)"); // Please make no more changes to the list static const ORTCHAR_T* immutable_broken_tests[] = { - // pending ONNX update - ORT_TSTR("attention_3d_gqa"), - ORT_TSTR("attention_3d_gqa_attn_mask"), - ORT_TSTR("attention_3d_gqa_causal"), - ORT_TSTR("attention_3d_gqa_scaled"), - ORT_TSTR("attention_3d_gqa_softcap"), - ORT_TSTR("attention_3d_gqa_with_past_and_present"), - ORT_TSTR("attention_4d_gqa"), - ORT_TSTR("attention_4d_gqa_attn_mask"), - ORT_TSTR("attention_4d_gqa_causal"), - ORT_TSTR("attention_4d_gqa_scaled"), - ORT_TSTR("attention_4d_gqa_softcap"), - ORT_TSTR("attention_4d_gqa_with_past_and_present"), - ORT_TSTR("attention_4d_diff_heads_mask4d_padded_kv"), - ORT_TSTR("attention_4d_gqa_with_past_and_present_fp16"), - ORT_TSTR("attention_4d_with_past_and_present_qk_matmul_bias_3d_mask_causal"), - ORT_TSTR("attention_4d_with_past_and_present_qk_matmul_bias_4d_mask_causal"), - // unsupported case ORT_TSTR("AvgPool1d"), ORT_TSTR("AvgPool1d_stride"), ORT_TSTR("AvgPool2d"), diff --git a/onnxruntime/test/perftest/command_args_parser.cc b/onnxruntime/test/perftest/command_args_parser.cc index e84c1ea583250..59f5d8333657e 100644 --- a/onnxruntime/test/perftest/command_args_parser.cc +++ b/onnxruntime/test/perftest/command_args_parser.cc @@ -171,6 +171,10 @@ ABSL_FLAG(std::string, plugin_ep_options, "", "--plugin_ep_options \"ep_1_option_1_key|ep_1_option_1_value ...;;ep_3_option_1_key|ep_3_option_1_value ...;... \""); ABSL_FLAG(bool, list_ep_devices, false, "Prints all available device indices and their properties (including metadata). This option makes the program exit early without performing inference.\n"); ABSL_FLAG(std::string, select_ep_devices, "", "Specifies a semicolon-separated list of device indices to add to the session and run with."); +ABSL_FLAG(std::string, filter_ep_devices, "", + "Specifies EP or Device metadata entries as key-value pairs to filter ep devices passed to AppendExecutionProvider_V2.\n" + "[Usage]: --filter_ep_devices \"| |\" \n" + "Devices that match any of the key-value pair will be appended to the session. --select_ep_devices will take precedence over this option.\n"); ABSL_FLAG(bool, compile_ep_context, DefaultPerformanceTestConfig().run_config.compile_ep_context, "Generate an EP context model"); ABSL_FLAG(std::string, compile_model_path, "model_ctx.onnx", "The compiled model path for saving EP context model. 
Overwrites if already exists"); ABSL_FLAG(bool, compile_binary_embed, DefaultPerformanceTestConfig().run_config.compile_binary_embed, "Embed binary blob within EP context node"); @@ -490,6 +494,22 @@ bool CommandLineParser::ParseArguments(PerformanceTestConfig& test_config, int a if (!select_ep_devices.empty()) test_config.selected_ep_device_indices = select_ep_devices; } + // --filter_ep_devices + { + const auto& filter_ep_devices = absl::GetFlag(FLAGS_filter_ep_devices); + if (!filter_ep_devices.empty()) { + ORT_TRY { + ParseEpDeviceFilterKeyValuePairs(filter_ep_devices, test_config.filter_ep_device_kv_pairs); + } + ORT_CATCH(const std::exception& ex) { + ORT_HANDLE_EXCEPTION([&]() { + fprintf(stderr, "Error parsing filter_ep_devices: %s\n", ex.what()); + }); + return false; + } + } + } + // --compile_ep_context test_config.run_config.compile_ep_context = absl::GetFlag(FLAGS_compile_ep_context); diff --git a/onnxruntime/test/perftest/ort_test_session.cc b/onnxruntime/test/perftest/ort_test_session.cc index f2a54b0335fe1..fa1725d9003d7 100644 --- a/onnxruntime/test/perftest/ort_test_session.cc +++ b/onnxruntime/test/perftest/ort_test_session.cc @@ -105,7 +105,7 @@ OnnxRuntimeTestSession::OnnxRuntimeTestSession(Ort::Env& env, std::random_device if (added_ep_device_index_set.find(index) == added_ep_device_index_set.end()) { added_ep_devices[device.EpName()].push_back(device); added_ep_device_index_set.insert(index); - fprintf(stdout, "[Plugin EP] EP Device [Index: %d, Name: %s] has been added to session.\n", index, device.EpName()); + fprintf(stdout, "[Plugin EP] EP Device [Index: %d, Name: %s, Type: %d] has been added to session.\n", static_cast(index), device.EpName(), device.Device().Type()); } } else { std::string err_msg = "[Plugin EP] [WARNING] : The EP device index and its corresponding OrtEpDevice is not created from " + @@ -113,6 +113,28 @@ OnnxRuntimeTestSession::OnnxRuntimeTestSession(Ort::Env& env, std::random_device fprintf(stderr, "%s", err_msg.c_str()); } } + } else if (!performance_test_config.filter_ep_device_kv_pairs.empty()) { + // Find and select the OrtEpDevice associated with the EP in "--filter_ep_devices". + for (size_t index = 0; index < ep_devices.size(); ++index) { + auto device = ep_devices[index]; + if (ep_set.find(std::string(device.EpName())) == ep_set.end()) + continue; + + // Check both EP metadata and device metadata for a match + auto ep_metadata_kv_pairs = device.EpMetadata().GetKeyValuePairs(); + auto device_metadata_kv_pairs = device.Device().Metadata().GetKeyValuePairs(); + for (const auto& kv : performance_test_config.filter_ep_device_kv_pairs) { + auto ep_metadata_itr = ep_metadata_kv_pairs.find(kv.first); + auto device_metadata_itr = device_metadata_kv_pairs.find(kv.first); + + if ((ep_metadata_itr != ep_metadata_kv_pairs.end() && kv.second == ep_metadata_itr->second) || + (device_metadata_itr != device_metadata_kv_pairs.end() && kv.second == device_metadata_itr->second)) { + added_ep_devices[device.EpName()].push_back(device); + fprintf(stdout, "[Plugin EP] EP Device [Index: %d, Name: %s, Type: %d] has been added to session.\n", static_cast(index), device.EpName(), device.Device().Type()); + break; + } + } + } } else { // Find and select the OrtEpDevice associated with the EP in "--plugin_eps". 
for (size_t index = 0; index < ep_devices.size(); ++index) { diff --git a/onnxruntime/test/perftest/strings_helper.cc b/onnxruntime/test/perftest/strings_helper.cc index 5743346f8edf1..d9fd2a2a55c09 100644 --- a/onnxruntime/test/perftest/strings_helper.cc +++ b/onnxruntime/test/perftest/strings_helper.cc @@ -137,5 +137,22 @@ void ParseEpDeviceIndexList(const std::string& input, std::vector& result) } } } + +void ParseEpDeviceFilterKeyValuePairs(const std::string& input, std::vector<std::pair<std::string, std::string>>& result) { + std::stringstream ss(input); + std::string token; + + while (std::getline(ss, token, ' ')) { + if (!token.empty()) { + size_t delimiter_location = token.find("|"); + if (delimiter_location == std::string::npos || delimiter_location == 0 || delimiter_location == token.size() - 1) { + ORT_THROW("Use a '|' to separate the key and value for the device filter you are trying to use.\n"); + } + std::string key = token.substr(0, delimiter_location); + std::string value = token.substr(delimiter_location + 1); + result.emplace_back(std::make_pair(std::move(key), std::move(value))); + } + } +} } // namespace perftest } // namespace onnxruntime diff --git a/onnxruntime/test/perftest/strings_helper.h b/onnxruntime/test/perftest/strings_helper.h index a33b3d5089c9b..d6c6f6112ab6c 100644 --- a/onnxruntime/test/perftest/strings_helper.h +++ b/onnxruntime/test/perftest/strings_helper.h @@ -24,5 +24,7 @@ void ParseEpList(const std::string& input, std::vector& result); void ParseEpOptions(const std::string& input, std::vector>& result); void ParseEpDeviceIndexList(const std::string& input, std::vector& result); + +void ParseEpDeviceFilterKeyValuePairs(const std::string& input, std::vector<std::pair<std::string, std::string>>& result); } // namespace perftest } // namespace onnxruntime diff --git a/onnxruntime/test/perftest/test_configuration.h b/onnxruntime/test/perftest/test_configuration.h index 8d0b65d3158f5..1be09917e1a45 100644 --- a/onnxruntime/test/perftest/test_configuration.h +++ b/onnxruntime/test/perftest/test_configuration.h @@ -81,6 +81,7 @@ struct PerformanceTestConfig { std::basic_string plugin_ep_names_and_libs; std::vector registered_plugin_eps; std::string selected_ep_device_indices; + std::vector<std::pair<std::string, std::string>> filter_ep_device_kv_pairs; bool list_available_ep_devices = false; }; diff --git a/onnxruntime/test/platform/device_discovery_test.cc b/onnxruntime/test/platform/device_discovery_test.cc index 6b43ccbc8f670..bd0110748b098 100644 --- a/onnxruntime/test/platform/device_discovery_test.cc +++ b/onnxruntime/test/platform/device_discovery_test.cc @@ -5,8 +5,8 @@ #include "gtest/gtest.h" +#if !defined(ORT_MINIMAL_BUILD) && !defined(_GAMING_XBOX) namespace onnxruntime::test { - namespace { std::vector GetDevicesByType(OrtHardwareDeviceType device_type) { @@ -31,3 +31,4 @@ TEST(DeviceDiscoveryTest, HasCpuDevice) { } } // namespace onnxruntime::test +#endif // !defined(ORT_MINIMAL_BUILD) && !defined(_GAMING_XBOX) diff --git a/onnxruntime/test/platform/file_io_test.cc b/onnxruntime/test/platform/file_io_test.cc index a1a863d2442d1..924f9da41abef 100644 --- a/onnxruntime/test/platform/file_io_test.cc +++ b/onnxruntime/test/platform/file_io_test.cc @@ -19,6 +19,7 @@ #include "gtest/gtest.h" #include "core/common/span_utils.h" +#include "test/util/include/asserts.h" #include "test/util/include/file_util.h" namespace onnxruntime { diff --git a/onnxruntime/test/providers/cpu/model_tests.cc b/onnxruntime/test/providers/cpu/model_tests.cc index cf49601e6c671..ca1a3104e0bed 100644 --- a/onnxruntime/test/providers/cpu/model_tests.cc +++ 
b/onnxruntime/test/providers/cpu/model_tests.cc @@ -678,7 +678,14 @@ ::std::vector<::std::basic_string> GetParameterStrings() { ORT_TSTR("fp16_coreml_FNS-Candy"), ORT_TSTR("fp16_test_tiny_yolov2"), ORT_TSTR("fp16_test_shufflenet"), - ORT_TSTR("keras2coreml_SimpleRNN_ImageNet")}; + ORT_TSTR("keras2coreml_SimpleRNN_ImageNet"), + // models from model zoo. #26274: cuDNN frontend no valid engine + ORT_TSTR("YOLOv3"), + ORT_TSTR("YOLOv3-12"), + ORT_TSTR("YOLOv4"), + ORT_TSTR("SSD-MobilenetV1"), + ORT_TSTR("SSD-MobilenetV1-12")}; + // For ROCm EP, also disable the following tests due to flakiness, // mainly with precision issue and random memory access fault. static const ORTCHAR_T* rocm_disabled_tests[] = {ORT_TSTR("bvlc_alexnet"), diff --git a/onnxruntime/test/providers/cpu/nn/conv_op_test.cc b/onnxruntime/test/providers/cpu/nn/conv_op_test.cc index 0b8624ad6c67f..7c84aefa1c01f 100644 --- a/onnxruntime/test/providers/cpu/nn/conv_op_test.cc +++ b/onnxruntime/test/providers/cpu/nn/conv_op_test.cc @@ -339,6 +339,61 @@ TEST(ConvTest, Conv2D_2) { TestConvOp(attrs, {X, W}, {X_shape, W_shape}, expected_vals, Y_shape, true); } +TEST(ConvTest, Conv2D_3) { + ConvOpAndTestAttributes attrs = { + "", // auto_pad + vector{1, 1}, // dilations + 2, // group + vector{2, 2}, // kernel_shape + vector{0, 0, 0, 0}, // pads + vector{1, 1}, // strides + {} // excluded EPs + }; + + vector X_shape = {2, 2, 3, 3}; + vector X = {1.f, 2.f, 3.f, + 4.f, 5.f, 6.f, + 7.f, 8.f, 9.f, + + 10.f, 11.f, 12.f, + 13.f, 14.f, 15.f, + 16.f, 17.f, 18.f, + + 1.f, 2.f, 3.f, + 7.f, 8.f, 9.f, + 4.f, 5.f, 6.f, + + 13.f, 14.f, 15.f, + 10.f, 11.f, 12.f, + 16.f, 17.f, 18.f}; + + vector W_shape = {2, 1, 2, 2}; + vector W = {1.f, 2.f, 3.f, 4.f, 2.f, 4.f, 6.f, 8.f}; + + vector Y_shape = {2, 2, 2, 2}; + auto Y = { + 37.f, + 47.f, + 67.f, + 77.f, + 254.f, + 274.f, + 314.f, + 334.f, + 58.f, + 68.f, + 55.f, + 65.f, + 230.f, + 250.f, + 296.f, + 316.f, + }; + + TestConvOp(attrs, {X, W}, {X_shape, W_shape}, Y, Y_shape); + TestConvOp(attrs, {X, W}, {X_shape, W_shape}, Y, Y_shape, true); +} + TEST(ConvTest, Conv2D_Bias_1) { ConvOpAndTestAttributes attrs = { "", // auto_pad diff --git a/onnxruntime/test/providers/cpu/tensor/cast_op_test.cc b/onnxruntime/test/providers/cpu/tensor/cast_op_test.cc index 8f4c4ff0896ba..289e94397fb39 100644 --- a/onnxruntime/test/providers/cpu/tensor/cast_op_test.cc +++ b/onnxruntime/test/providers/cpu/tensor/cast_op_test.cc @@ -1477,7 +1477,7 @@ template void CastOpTestFloatFloat4(std::vector shape, std::vector float_data, bool is_fp4_input = false) { - size_t num_pairs = float_data.size() / 2; + int num_pairs = static_cast(float_data.size()) / 2; int num_fp4_elements = static_cast((float_data.size() + 1) / 2); bool is_odd_count = (float_data.size() % 2 != 0); diff --git a/onnxruntime/test/providers/provider_test_utils.h b/onnxruntime/test/providers/provider_test_utils.h index 1d8a50dc2fa04..5bd9ee2ceb826 100644 --- a/onnxruntime/test/providers/provider_test_utils.h +++ b/onnxruntime/test/providers/provider_test_utils.h @@ -5,6 +5,10 @@ #include "test/unittest_util/checkers.h" #include "test/unittest_util/conversion.h" + +#if !defined(ORT_MINIMAL_BUILD) #include "test/unittest_util/model_tester.h" #include "test/unittest_util/op_tester.h" +#endif // !defined(ORT_MINIMAL_BUILD) + #include "test/unittest_util/run_options_config_keys.h" diff --git a/onnxruntime/test/providers/qnn/qnn_node_group/reshape_transpose_rank5_test.cc b/onnxruntime/test/providers/qnn/qnn_node_group/reshape_transpose_rank5_test.cc new file mode 100644 index 
0000000000000..d167898e6a3b9 --- /dev/null +++ b/onnxruntime/test/providers/qnn/qnn_node_group/reshape_transpose_rank5_test.cc @@ -0,0 +1,73 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#if !defined(ORT_MINIMAL_BUILD) + +#include "core/graph/graph.h" +#include "core/graph/node_attr_utils.h" + +#include "test/providers/qnn/qnn_test_utils.h" +#include "test/unittest_util/qdq_test_utils.h" +#include "gtest/gtest.h" + +namespace onnxruntime { +namespace test { + +namespace { + +// Build float test: Add -> Reshape(rank-6) -> Transpose -> Reshape -> Add +// Uses smaller dimensions for testing +GetTestModelFn BuildRank6ToRank5FloatTestCase() { + return [](ModelTestBuilder& builder) -> void { + auto input_def = TestInputDef({256, 64}, false, -10.0f, 10.0f); + NodeArg* input = MakeTestInput(builder, input_def); + + NodeArg* add_const1 = builder.MakeScalarInitializer(1.0f); + NodeArg* add1_out = builder.MakeIntermediate(); + builder.AddNode("Add", {input, add_const1}, {add1_out}); + + // Reshape: (256, 64) -> (1, 4, 4, 4, 4, 64) + NodeArg* reshape1_shape = builder.Make1DInitializer({1, 4, 4, 4, 4, 64}); + NodeArg* reshape1_out = builder.MakeIntermediate(); + builder.AddNode("Reshape", {add1_out, reshape1_shape}, {reshape1_out}); + + // Transpose: perm [0, 2, 1, 3, 4, 5] + NodeArg* transpose_out = builder.MakeIntermediate(); + Node& transpose = builder.AddNode("Transpose", {reshape1_out}, {transpose_out}); + transpose.AddAttribute("perm", std::vector{0, 2, 1, 3, 4, 5}); + + // Reshape: (1, 4, 4, 4, 4, 64) -> (1, 256, 64) + NodeArg* reshape2_shape = builder.Make1DInitializer({1, 256, 64}); + NodeArg* reshape2_out = builder.MakeIntermediate(); + builder.AddNode("Reshape", {transpose_out, reshape2_shape}, {reshape2_out}); + + NodeArg* add_const2 = builder.MakeScalarInitializer(1.0f); + NodeArg* output = builder.MakeOutput(); + builder.AddNode("Add", {reshape2_out, add_const2}, {output}); + }; +} + +ProviderOptions GetProviderOptions() { + ProviderOptions provider_options; + provider_options["backend_type"] = "htp"; + return provider_options; +} + +} // namespace + +#if defined(__aarch64__) || defined(_M_ARM64) || defined(__linux__) + +TEST_F(QnnHTPBackendTests, Rank6ToRank5Fusion_Float) { + RunQnnModelTest(BuildRank6ToRank5FloatTestCase(), + GetProviderOptions(), + 13, + ExpectedEPNodeAssignment::All, + 1e-2f); +} + +#endif // defined(__aarch64__) || defined(_M_ARM64) || defined(__linux__) + +} // namespace test +} // namespace onnxruntime + +#endif // !defined(ORT_MINIMAL_BUILD) diff --git a/onnxruntime/test/providers/tensorrt/tensorrt_basic_test.cc b/onnxruntime/test/providers/tensorrt/tensorrt_basic_test.cc index 327dfab96c2d1..a746493d779f8 100644 --- a/onnxruntime/test/providers/tensorrt/tensorrt_basic_test.cc +++ b/onnxruntime/test/providers/tensorrt/tensorrt_basic_test.cc @@ -713,6 +713,52 @@ TEST(TensorrtExecutionProviderTest, TRTPluginsCustomOpTest) { ASSERT_TRUE(status.IsOK()); } +TEST(TensorrtExecutionProviderTest, DDSOutputTest) { + PathString model_name = ORT_TSTR("testdata/ort_github_issue_26272_dds.onnx"); + SessionOptions so; + so.session_logid = "TensorrtExecutionProviderRunWithDDSOutput"; + RunOptions run_options; + run_options.run_tag = so.session_logid; + InferenceSession session_object{so, GetEnvironment()}; + auto cuda_provider = DefaultCudaExecutionProvider(); + auto cuda_allocator = cuda_provider->CreatePreferredAllocators()[1]; + std::vector dims_op_x = {3, 4}; + std::vector values_op_x(12, 0.f); // 12=3*4 + OrtValue 
ml_value_x; + CreateMLValue<float>(cuda_allocator, dims_op_x, values_op_x, &ml_value_x); + + NameMLValMap feeds; + feeds.insert(std::make_pair("data", ml_value_x)); + + // prepare outputs + std::vector<std::string> output_names; + output_names.push_back("output"); + std::vector<OrtValue> fetches; + + OrtTensorRTProviderOptionsV2 params; + std::unique_ptr<IExecutionProvider> execution_provider = TensorrtExecutionProviderWithOptions(&params); + EXPECT_TRUE(session_object.RegisterExecutionProvider(std::move(execution_provider)).IsOK()); + auto status = session_object.Load(model_name); + ASSERT_TRUE(status.IsOK()); + status = session_object.Initialize(); + ASSERT_TRUE(status.IsOK()); + + // First pass run + status = session_object.Run(run_options, feeds, output_names, &fetches); + ASSERT_TRUE(status.IsOK()); + + // Second pass run with new shape + dims_op_x = {6, 4}; + values_op_x.resize(24, 0.f); // 24=6*4 + CreateMLValue<float>(cuda_allocator, dims_op_x, values_op_x, &ml_value_x); + feeds.clear(); + + feeds.insert(std::make_pair("data", ml_value_x)); + + status = session_object.Run(run_options, feeds, output_names, &fetches); + ASSERT_TRUE(status.IsOK()); +} + TEST_P(TensorrtExecutionProviderCacheTest, Run) { // GetParam() returns the parameter of following format: // ##cache type##_##input shape type## diff --git a/onnxruntime/test/python/onnx_backend_test_series.py b/onnxruntime/test/python/onnx_backend_test_series.py index 72c6a5664f395..d2e9557f633b0 100644 --- a/onnxruntime/test/python/onnx_backend_test_series.py +++ b/onnxruntime/test/python/onnx_backend_test_series.py @@ -43,13 +43,13 @@ def assert_similar_outputs(cls, ref_outputs, outputs, rtol, atol, model_dir=None """ def assert_similar_array(ref_output, output): - np.testing.assert_equal(ref_output.dtype, output.dtype) + np.testing.assert_equal(output.dtype, ref_output.dtype) if ref_output.dtype == object: - np.testing.assert_array_equal(ref_output, output) + np.testing.assert_array_equal(output, ref_output) else: - np.testing.assert_allclose(ref_output, output, rtol=rtol, atol=atol) + np.testing.assert_allclose(output, ref_output, rtol=rtol, atol=atol) - np.testing.assert_equal(len(ref_outputs), len(outputs)) + np.testing.assert_equal(len(outputs), len(ref_outputs)) for i in range(len(outputs)): # pylint: disable=consider-using-enumerate if isinstance(outputs[i], list): for j in range(len(outputs[i])): diff --git a/onnxruntime/test/python/onnxruntime_test_python.py b/onnxruntime/test/python/onnxruntime_test_python.py index e44adcdb9827f..7f003453add89 100644 --- a/onnxruntime/test/python/onnxruntime_test_python.py +++ b/onnxruntime/test/python/onnxruntime_test_python.py @@ -54,7 +54,7 @@ def run_model(self, session_object, run_options): input_name = session_object.get_inputs()[0].name res = session_object.run([], {input_name: x}, run_options=run_options) output_expected = np.array([[1.0, 4.0], [9.0, 16.0], [25.0, 36.0]], dtype=np.float32) - np.testing.assert_allclose(output_expected, res[0], rtol=1e-05, atol=1e-08) + np.testing.assert_allclose(res[0], output_expected, rtol=1e-05, atol=1e-08) def run_model_with_input(self, session_object, input_name, input_value, iter_num, queue): for _ in range(iter_num): @@ -714,7 +714,7 @@ def test_run_model(self): res = sess.run([outputs[0].name], {inputs[0].name: x}) output_expected = np.array([[1.0, 4.0], [9.0, 16.0], [25.0, 36.0]], dtype=np.float32) - np.testing.assert_allclose(output_expected, res[0], rtol=1e-05, atol=1e-08) + np.testing.assert_allclose(res[0], output_expected, rtol=1e-05, atol=1e-08) def test_run_async(self): event = 
threading.Event() @@ -733,7 +733,7 @@ def callback(res: np.ndarray, data: MyData, err: str) -> None: self.assertEqual(len(err), 0) self.assertEqual(len(res), 1) self.assertEqual(data.get_id(), 123456) - np.testing.assert_allclose(output_expected, res[0], rtol=1e-05, atol=1e-08) + np.testing.assert_allclose(res[0], output_expected, rtol=1e-05, atol=1e-08) event.set() so = onnxrt.SessionOptions() @@ -762,7 +762,7 @@ def test_run_model_from_bytes(self): self.assertEqual(output_shape, [3, 2]) res = sess.run([output_name], {input_name: x}) output_expected = np.array([[1.0, 4.0], [9.0, 16.0], [25.0, 36.0]], dtype=np.float32) - np.testing.assert_allclose(output_expected, res[0], rtol=1e-05, atol=1e-08) + np.testing.assert_allclose(res[0], output_expected, rtol=1e-05, atol=1e-08) def test_run_model2(self): sess = onnxrt.InferenceSession(get_name("matmul_1.onnx"), providers=onnxrt.get_available_providers()) @@ -777,7 +777,7 @@ def test_run_model2(self): self.assertEqual(output_shape, [3, 1]) res = sess.run([output_name], {input_name: x}) output_expected = np.array([[5.0], [11.0], [17.0]], dtype=np.float32) - np.testing.assert_allclose(output_expected, res[0], rtol=1e-05, atol=1e-08) + np.testing.assert_allclose(res[0], output_expected, rtol=1e-05, atol=1e-08) def test_run_model2_contiguous(self): sess = onnxrt.InferenceSession(get_name("matmul_1.onnx"), providers=onnxrt.get_available_providers()) @@ -792,10 +792,10 @@ def test_run_model2_contiguous(self): self.assertEqual(output_shape, [3, 1]) res = sess.run([output_name], {input_name: x}) output_expected = np.array([[5.0], [11.0], [17.0]], dtype=np.float32) - np.testing.assert_allclose(output_expected, res[0], rtol=1e-05, atol=1e-08) + np.testing.assert_allclose(res[0], output_expected, rtol=1e-05, atol=1e-08) xcontiguous = np.ascontiguousarray(x) rescontiguous = sess.run([output_name], {input_name: xcontiguous}) - np.testing.assert_allclose(output_expected, rescontiguous[0], rtol=1e-05, atol=1e-08) + np.testing.assert_allclose(rescontiguous[0], output_expected, rtol=1e-05, atol=1e-08) def test_run_model_multiple_threads(self): # Skip this test for a "pure" DML onnxruntime python wheel. 
@@ -860,14 +860,14 @@ def test_list_as_input(self): input_name = sess.get_inputs()[0].name res = sess.run([], {input_name: x.tolist()}) output_expected = np.array([[1.0, 4.0], [9.0, 16.0], [25.0, 36.0]], dtype=np.float32) - np.testing.assert_allclose(output_expected, res[0], rtol=1e-05, atol=1e-08) + np.testing.assert_allclose(res[0], output_expected, rtol=1e-05, atol=1e-08) def test_string_list_as_input(self): sess = onnxrt.InferenceSession(get_name("identity_string.onnx"), providers=available_providers_without_tvm) x = np.array(["this", "is", "identity", "test"], dtype=str).reshape((2, 2)) x_name = sess.get_inputs()[0].name res = sess.run([], {x_name: x.tolist()}) - np.testing.assert_equal(x, res[0]) + np.testing.assert_equal(res[0], x) def test_run_device(self): device = onnxrt.get_device() @@ -888,7 +888,7 @@ def test_run_model_symbolic_input(self): self.assertEqual(output_shape, ["None", 1]) res = sess.run([output_name], {input_name: x}) output_expected = np.array([[5.0], [11.0], [17.0]], dtype=np.float32) - np.testing.assert_allclose(output_expected, res[0], rtol=1e-05, atol=1e-08) + np.testing.assert_allclose(res[0], output_expected, rtol=1e-05, atol=1e-08) def test_boolean_inputs(self): sess = onnxrt.InferenceSession(get_name("logicaland.onnx"), providers=available_providers) @@ -920,7 +920,7 @@ def test_boolean_inputs(self): output_expected = np.array([[True, False], [False, False]], dtype=bool) res = sess.run([output_name], {a_name: a, b_name: b}) - np.testing.assert_equal(output_expected, res[0]) + np.testing.assert_equal(res[0], output_expected) def test_string_input1(self): sess = onnxrt.InferenceSession(get_name("identity_string.onnx"), providers=available_providers_without_tvm) @@ -941,7 +941,7 @@ def test_string_input1(self): self.assertEqual(output_type, "tensor(string)") res = sess.run([output_name], {x_name: x}) - np.testing.assert_equal(x, res[0]) + np.testing.assert_equal(res[0], x) def test_string_input2(self): sess = onnxrt.InferenceSession(get_name("identity_string.onnx"), providers=available_providers_without_tvm) @@ -962,7 +962,7 @@ def test_string_input2(self): self.assertEqual(output_type, "tensor(string)") res = sess.run([output_name], {x_name: x}) - np.testing.assert_equal(x, res[0]) + np.testing.assert_equal(res[0], x) def test_input_bytes(self): sess = onnxrt.InferenceSession(get_name("identity_string.onnx"), providers=available_providers_without_tvm) @@ -983,7 +983,7 @@ def test_input_bytes(self): self.assertEqual(output_type, "tensor(string)") res = sess.run([output_name], {x_name: x}) - np.testing.assert_equal(x, res[0].astype("|S8")) + np.testing.assert_equal(res[0].astype("|S8"), x) def test_input_object(self): sess = onnxrt.InferenceSession(get_name("identity_string.onnx"), providers=available_providers_without_tvm) @@ -1004,7 +1004,7 @@ def test_input_object(self): self.assertEqual(output_type, "tensor(string)") res = sess.run([output_name], {x_name: x}) - np.testing.assert_equal(x, res[0]) + np.testing.assert_equal(res[0], x) def test_input_void(self): sess = onnxrt.InferenceSession(get_name("identity_string.onnx"), providers=available_providers_without_tvm) @@ -1029,7 +1029,7 @@ def test_input_void(self): res = sess.run([output_name], {x_name: x}) expr = np.array([["must", "have"], ["same", "size"]], dtype=object) - np.testing.assert_equal(expr, res[0]) + np.testing.assert_equal(res[0], expr) def test_raise_wrong_num_inputs(self): with self.assertRaises(ValueError) as context: @@ -1164,7 +1164,7 @@ def test_sequence_construct(self): }, ) - 
np.testing.assert_array_equal(output_expected, res[0]) + np.testing.assert_array_equal(res[0], output_expected) def test_sequence_insert(self): opt = onnxrt.SessionOptions() @@ -1194,7 +1194,7 @@ def test_sequence_insert(self): "input_seq": [], }, ) - np.testing.assert_array_equal(output_expected, res[0]) + np.testing.assert_array_equal(res[0], output_expected) def test_ort_execution_mode(self): opt = onnxrt.SessionOptions() @@ -1375,7 +1375,7 @@ def test_register_custom_ops_library(self): input_1 = np.zeros((3, 5)).astype(np.float32) res = sess1.run([output_name], {input_name_0: input_0, input_name_1: input_1}) output_expected = np.ones((3, 5)).astype(np.float32) - np.testing.assert_allclose(output_expected, res[0], rtol=1e-05, atol=1e-08) + np.testing.assert_allclose(res[0], output_expected, rtol=1e-05, atol=1e-08) # Create an alias of SessionOptions instance # We will use this alias to construct another InferenceSession @@ -1969,7 +1969,7 @@ def test_adater_export_read(self): self.assertTrue(value.is_tensor()) self.assertEqual(expected_val.element_type(), value.element_type()) self.assertEqual(expected_val.shape(), value.shape()) - np.testing.assert_allclose(expected_val.numpy(), value.numpy()) + np.testing.assert_allclose(value.numpy(), expected_val.numpy()) def test_run_with_adapter(self): model_path = get_name("lora/two_params_lora_model.onnx") diff --git a/onnxruntime/test/python/onnxruntime_test_python_autoep.py b/onnxruntime/test/python/onnxruntime_test_python_autoep.py index d66951bd66f3d..a24269a312e9b 100644 --- a/onnxruntime/test/python/onnxruntime_test_python_autoep.py +++ b/onnxruntime/test/python/onnxruntime_test_python_autoep.py @@ -66,7 +66,7 @@ def test_cuda_ep_register_and_inference(self): input_name = sess.get_inputs()[0].name res = sess.run([], {input_name: x}) output_expected = np.array([[1.0, 4.0], [9.0, 16.0], [25.0, 36.0]], dtype=np.float32) - np.testing.assert_allclose(output_expected, res[0], rtol=1e-05, atol=1e-08) + np.testing.assert_allclose(res[0], output_expected, rtol=1e-05, atol=1e-08) del sess # Delete session before unregistering library self.unregister_execution_provider_library(ep_name) @@ -98,7 +98,7 @@ def test_cuda_prefer_gpu_and_inference(self): input_name = sess.get_inputs()[0].name res = sess.run([], {input_name: x}) output_expected = np.array([[1.0, 4.0], [9.0, 16.0], [25.0, 36.0]], dtype=np.float32) - np.testing.assert_allclose(output_expected, res[0], rtol=1e-05, atol=1e-08) + np.testing.assert_allclose(res[0], output_expected, rtol=1e-05, atol=1e-08) del sess # Delete session before unregistering library self.unregister_execution_provider_library(ep_name) @@ -146,7 +146,7 @@ def my_delegate( input_name = sess.get_inputs()[0].name res = sess.run([], {input_name: x}) output_expected = np.array([[1.0, 4.0], [9.0, 16.0], [25.0, 36.0]], dtype=np.float32) - np.testing.assert_allclose(output_expected, res[0], rtol=1e-05, atol=1e-08) + np.testing.assert_allclose(res[0], output_expected, rtol=1e-05, atol=1e-08) del sess # Delete session before unregistering library self.unregister_execution_provider_library(ep_name) @@ -249,7 +249,7 @@ def test_example_plugin_ep_devices(self): input_name = sess.get_inputs()[0].name res = sess.run([], {input_name: x}) output_expected = np.array([[1.0, 4.0], [9.0, 16.0], [25.0, 36.0]], dtype=np.float32) - np.testing.assert_allclose(output_expected, res[0], rtol=1e-05, atol=1e-08) + np.testing.assert_allclose(res[0], output_expected, rtol=1e-05, atol=1e-08) del sess # Delete session before unregistering library 
self.unregister_execution_provider_library(ep_name) @@ -282,11 +282,11 @@ def test_example_plugin_ep_data_transfer(self): gpu_value = onnxrt.OrtValue.ortvalue_from_numpy(data, "gpu", 0, 0xBE57) # copy back to CPU cpu_data = gpu_value.numpy() - np.testing.assert_equal(data, cpu_data) + np.testing.assert_equal(cpu_data, data) gpu_value.update_inplace(data2) # update the fake GPU data cpu_data_2 = gpu_value.numpy() # copy back to CPU - np.testing.assert_equal(data2, cpu_data_2) + np.testing.assert_equal(cpu_data_2, data2) gpu_value = None # Delete OrtValue before unregistering library as the allocator will be destroyed. @@ -336,8 +336,8 @@ def test_copy_tensors(self): del b_device # Verify the contents - np.testing.assert_array_equal(a, a_cpu_copy.numpy()) - np.testing.assert_array_equal(b, b_cpu_copy.numpy()) + np.testing.assert_array_equal(a_cpu_copy.numpy(), a) + np.testing.assert_array_equal(b_cpu_copy.numpy(), b) self.unregister_execution_provider_library(ep_name) diff --git a/onnxruntime/test/python/onnxruntime_test_python_backend.py b/onnxruntime/test/python/onnxruntime_test_python_backend.py index 6ed7dfe59b1f6..416d9b6edecd1 100644 --- a/onnxruntime/test/python/onnxruntime_test_python_backend.py +++ b/onnxruntime/test/python/onnxruntime_test_python_backend.py @@ -19,7 +19,7 @@ def test_run_model(self): x = np.array([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]], dtype=np.float32) res = rep.run(x) output_expected = np.array([[1.0, 4.0], [9.0, 16.0], [25.0, 36.0]], dtype=np.float32) - np.testing.assert_allclose(output_expected, res[0], rtol=1e-05, atol=1e-08) + np.testing.assert_allclose(res[0], output_expected, rtol=1e-05, atol=1e-08) def test_allocation_plan_works_with_only_execute_path_to_fetches_option(self): """ diff --git a/onnxruntime/test/python/onnxruntime_test_python_backend_mlops.py b/onnxruntime/test/python/onnxruntime_test_python_backend_mlops.py index c245699e211d4..9e3c1acbc923b 100644 --- a/onnxruntime/test/python/onnxruntime_test_python_backend_mlops.py +++ b/onnxruntime/test/python/onnxruntime_test_python_backend_mlops.py @@ -23,8 +23,8 @@ def check_list_of_map_to_float(testcase, expected_rows, actual_rows): for i in range(num_rows): # use np.testing.assert_allclose so we can specify the tolerance np.testing.assert_allclose( - [expected_rows[i][key] for key in sorted_keys], [actual_rows[i][key] for key in sorted_keys], + [expected_rows[i][key] for key in sorted_keys], rtol=1e-05, atol=1e-07, ) @@ -37,7 +37,7 @@ def test_run_model_non_tensor(self): x = {0: 25.0, 1: 5.13, 2: 0.0, 3: 0.453, 4: 5.966} res = rep.run(x) output_expected = np.array([[49.752754]], dtype=np.float32) - np.testing.assert_allclose(output_expected, res[0], rtol=1e-05, atol=1e-08) + np.testing.assert_allclose(res[0], output_expected, rtol=1e-05, atol=1e-08) def test_run_model_proto(self): name = datasets.get_example("logreg_iris.onnx") @@ -47,7 +47,7 @@ def test_run_model_proto(self): x = np.array([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]], dtype=np.float32) res = rep.run(x) output_expected = np.array([0, 0, 0], dtype=np.float32) - np.testing.assert_allclose(output_expected, res[0], rtol=1e-05, atol=1e-08) + np.testing.assert_allclose(res[0], output_expected, rtol=1e-05, atol=1e-08) output_expected = [ {0: 0.950599730014801, 1: 0.027834169566631317, 2: 0.02156602405011654}, { @@ -72,7 +72,7 @@ def test_run_model_proto_api(self): outputs = ort_backend.run_model(model, inputs) output_expected = np.array([0, 0, 0], dtype=np.float32) - np.testing.assert_allclose(output_expected, outputs[0], rtol=1e-05, atol=1e-08) + 
np.testing.assert_allclose(outputs[0], output_expected, rtol=1e-05, atol=1e-08) output_expected = [ {0: 0.950599730014801, 1: 0.027834169566631317, 2: 0.02156602405011654}, { diff --git a/onnxruntime/test/python/onnxruntime_test_python_cudagraph.py b/onnxruntime/test/python/onnxruntime_test_python_cudagraph.py index 5ab2fe8939f6a..d6c1dd9cff3f3 100644 --- a/onnxruntime/test/python/onnxruntime_test_python_cudagraph.py +++ b/onnxruntime/test/python/onnxruntime_test_python_cudagraph.py @@ -63,18 +63,18 @@ class TestInferenceSessionWithCudaGraph(unittest.TestCase): def test_ort_value_update_in_place(self): x0 = np.array([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]], dtype=np.float32) ortvalue_cpu = onnxrt.OrtValue.ortvalue_from_numpy(x0) - np.testing.assert_allclose(x0, ortvalue_cpu.numpy()) + np.testing.assert_allclose(ortvalue_cpu.numpy(), x0) x1 = np.array([[10.0, 20.0], [30.0, 40.0], [50.0, 60.0]], dtype=np.float32) ortvalue_cpu.update_inplace(x1) - np.testing.assert_allclose(x1, ortvalue_cpu.numpy()) + np.testing.assert_allclose(ortvalue_cpu.numpy(), x1) if "CUDAExecutionProvider" in onnxrt.get_available_providers(): ortvalue_gpu = onnxrt.OrtValue.ortvalue_from_numpy(x0, "cuda", 0) - np.testing.assert_allclose(x0, ortvalue_gpu.numpy()) + np.testing.assert_allclose(ortvalue_gpu.numpy(), x0) ortvalue_gpu.update_inplace(x1) - np.testing.assert_allclose(x1, ortvalue_gpu.numpy()) + np.testing.assert_allclose(ortvalue_gpu.numpy(), x1) def test_select_ep_to_run_cuda_graph(self): if "TensorrtExecutionProvider" in onnxrt.get_available_providers(): @@ -105,11 +105,11 @@ def run_model_with_cuda_graph(self, providers): # One regular run for the necessary memory allocation and cuda graph capturing session.run_with_iobinding(io_binding, ro) expected_y = np.array([[5.0], [11.0], [17.0]] * INPUT_SIZE, dtype=np.float32) - np.testing.assert_allclose(expected_y, y_ortvalue.numpy(), rtol=1e-05, atol=1e-05) + np.testing.assert_allclose(y_ortvalue.numpy(), expected_y, rtol=1e-05, atol=1e-05) # After capturing, CUDA graph replay happens from this Run onwards session.run_with_iobinding(io_binding, ro) - np.testing.assert_allclose(expected_y, y_ortvalue.numpy(), rtol=1e-05, atol=1e-05) + np.testing.assert_allclose(y_ortvalue.numpy(), expected_y, rtol=1e-05, atol=1e-05) # Update input and then replay CUDA graph x_ortvalue.update_inplace( @@ -120,8 +120,8 @@ def run_model_with_cuda_graph(self, providers): ) session.run_with_iobinding(io_binding, ro) np.testing.assert_allclose( - np.array([[50.0], [110.0], [170.0]] * INPUT_SIZE, dtype=np.float32), y_ortvalue.numpy(), + np.array([[50.0], [110.0], [170.0]] * INPUT_SIZE, dtype=np.float32), rtol=1e-05, atol=1e-05, ) @@ -162,7 +162,7 @@ def run_model_with_cuda_graph_annotation(self, providers): session.run_with_iobinding(io_bindings[i], ro) io_bindings[i].synchronize_outputs() expected_y = np.array(expected_y_base[: i + 1][:] * INPUT_SIZE, dtype=np.float32) - np.testing.assert_allclose(expected_y, y_ortvalues[i].numpy(), rtol=1e-05, atol=1e-05) + np.testing.assert_allclose(y_ortvalues[i].numpy(), expected_y, rtol=1e-05, atol=1e-05) del ro ro = onnxrt.RunOptions() @@ -176,7 +176,7 @@ def run_model_with_cuda_graph_annotation(self, providers): session.run_with_iobinding(io_bindings[i], ro) io_bindings[i].synchronize_outputs() expected_y = np.array(expected_y_base_mul_10[: i + 1][:] * INPUT_SIZE, dtype=np.float32) - np.testing.assert_allclose(expected_y, y_ortvalues[i].numpy(), rtol=1e-05, atol=1e-05) + np.testing.assert_allclose(y_ortvalues[i].numpy(), expected_y, rtol=1e-05, 
atol=1e-05) def test_arena_with_cuda_graph(self): if "CUDAExecutionProvider" in onnxrt.get_available_providers(): @@ -214,7 +214,7 @@ def test_arena_with_cuda_graph(self): session.run_with_iobinding(io_binding) output = cuda_graph_helper.get_output("softmaxout_1") - np.testing.assert_allclose(expected_output, output, rtol=1e-02, atol=1e-02) + np.testing.assert_allclose(output, expected_output, rtol=1e-02, atol=1e-02) if __name__ == "__main__": diff --git a/onnxruntime/test/python/onnxruntime_test_python_dmlgraph.py b/onnxruntime/test/python/onnxruntime_test_python_dmlgraph.py index 033eae1cb4c8d..4a6aa7b63d9c3 100644 --- a/onnxruntime/test/python/onnxruntime_test_python_dmlgraph.py +++ b/onnxruntime/test/python/onnxruntime_test_python_dmlgraph.py @@ -63,18 +63,18 @@ class TestInferenceSessionWithDmlGraph(unittest.TestCase): def test_ort_value_update_in_place(self): x0 = np.array([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]], dtype=np.float32) ortvalue_cpu = onnxrt.OrtValue.ortvalue_from_numpy(x0) - np.testing.assert_allclose(x0, ortvalue_cpu.numpy()) + np.testing.assert_allclose(ortvalue_cpu.numpy(), x0) x1 = np.array([[10.0, 20.0], [30.0, 40.0], [50.0, 60.0]], dtype=np.float32) ortvalue_cpu.update_inplace(x1) - np.testing.assert_allclose(x1, ortvalue_cpu.numpy()) + np.testing.assert_allclose(ortvalue_cpu.numpy(), x1) if "DmlExecutionProvider" in onnxrt.get_available_providers(): ortvalue_gpu = onnxrt.OrtValue.ortvalue_from_numpy(x0, "dml", 0) - np.testing.assert_allclose(x0, ortvalue_gpu.numpy()) + np.testing.assert_allclose(ortvalue_gpu.numpy(), x0) ortvalue_gpu.update_inplace(x1) - np.testing.assert_allclose(x1, ortvalue_gpu.numpy()) + np.testing.assert_allclose(ortvalue_gpu.numpy(), x1) def test_select_ep_to_run_dml_graph(self): if "DmlExecutionProvider" in onnxrt.get_available_providers(): @@ -104,11 +104,11 @@ def run_model_with_dml_graph(self, providers): # One regular run for the necessary memory allocation and dml graph capturing session.run_with_iobinding(io_binding, ro) expected_y = np.array([[5.0], [11.0], [17.0]] * INPUT_SIZE, dtype=np.float32) - np.testing.assert_allclose(expected_y, y_ortvalue.numpy(), rtol=1e-05, atol=1e-05) + np.testing.assert_allclose(y_ortvalue.numpy(), expected_y, rtol=1e-05, atol=1e-05) # After capturing, DML graph replay happens from this Run onwards session.run_with_iobinding(io_binding, ro) - np.testing.assert_allclose(expected_y, y_ortvalue.numpy(), rtol=1e-05, atol=1e-05) + np.testing.assert_allclose(y_ortvalue.numpy(), expected_y, rtol=1e-05, atol=1e-05) # Update input and then replay DML graph x_ortvalue.update_inplace( @@ -119,8 +119,8 @@ def run_model_with_dml_graph(self, providers): ) session.run_with_iobinding(io_binding, ro) np.testing.assert_allclose( - np.array([[50.0], [110.0], [170.0]] * INPUT_SIZE, dtype=np.float32), y_ortvalue.numpy(), + np.array([[50.0], [110.0], [170.0]] * INPUT_SIZE, dtype=np.float32), rtol=1e-05, atol=1e-05, ) @@ -163,7 +163,7 @@ def run_model_with_dml_graph_annotation(self, providers): session.run_with_iobinding(io_bindings[i], ro) io_bindings[i].synchronize_outputs() expected_y = np.array(expected_y_base[: i + 1][:] * INPUT_SIZE, dtype=np.float32) - np.testing.assert_allclose(expected_y, y_ortvalues[i].numpy(), rtol=1e-05, atol=1e-05) + np.testing.assert_allclose(y_ortvalues[i].numpy(), expected_y, rtol=1e-05, atol=1e-05) del ro ro = onnxrt.RunOptions() @@ -177,7 +177,7 @@ def run_model_with_dml_graph_annotation(self, providers): session.run_with_iobinding(io_bindings[i], ro) io_bindings[i].synchronize_outputs() 
expected_y = np.array(expected_y_base_mul_10[: i + 1][:] * INPUT_SIZE, dtype=np.float32) - np.testing.assert_allclose(expected_y, y_ortvalues[i].numpy(), rtol=1e-05, atol=1e-05) + np.testing.assert_allclose(y_ortvalues[i].numpy(), expected_y, rtol=1e-05, atol=1e-05) if __name__ == "__main__": diff --git a/onnxruntime/test/python/onnxruntime_test_python_mlops.py b/onnxruntime/test/python/onnxruntime_test_python_mlops.py index 8b6b029c57752..70b8c0fc0b980 100644 --- a/onnxruntime/test/python/onnxruntime_test_python_mlops.py +++ b/onnxruntime/test/python/onnxruntime_test_python_mlops.py @@ -80,7 +80,7 @@ def test_dict_vectorizer(self): x = {0: 25.0, 1: 5.13, 2: 0.0, 3: 0.453, 4: 5.966} res = sess.run([output_name], {input_name: x}) output_expected = np.array([[49.752754]], dtype=np.float32) - np.testing.assert_allclose(output_expected, res[0], rtol=1e-05, atol=1e-08) + np.testing.assert_allclose(res[0], output_expected, rtol=1e-05, atol=1e-08) xwrong = x.copy() xwrong["a"] = 5.6 @@ -96,17 +96,17 @@ def test_dict_vectorizer(self): x = {np.int64(k): np.float32(v) for k, v in x.items()} res = sess.run([output_name], {input_name: x}) output_expected = np.array([[49.752754]], dtype=np.float32) - np.testing.assert_allclose(output_expected, res[0], rtol=1e-05, atol=1e-08) + np.testing.assert_allclose(res[0], output_expected, rtol=1e-05, atol=1e-08) x = {np.int64(k): np.float64(v) for k, v in x.items()} res = sess.run([output_name], {input_name: x}) output_expected = np.array([[49.752754]], dtype=np.float32) - np.testing.assert_allclose(output_expected, res[0], rtol=1e-05, atol=1e-08) + np.testing.assert_allclose(res[0], output_expected, rtol=1e-05, atol=1e-08) x = {np.int32(k): np.float64(v) for k, v in x.items()} res = sess.run([output_name], {input_name: x}) output_expected = np.array([[49.752754]], dtype=np.float32) - np.testing.assert_allclose(output_expected, res[0], rtol=1e-05, atol=1e-08) + np.testing.assert_allclose(res[0], output_expected, rtol=1e-05, atol=1e-08) def test_label_encoder(self): sess = onnxrt.InferenceSession(get_name("LabelEncoder.onnx"), providers=onnxrt.get_available_providers()) @@ -127,18 +127,18 @@ def test_label_encoder(self): x = np.array([["4"]]) res = sess.run([output_name], {input_name: x}) output_expected = np.array([[3]], dtype=np.int64) - np.testing.assert_allclose(output_expected, res[0], rtol=1e-05, atol=1e-08) + np.testing.assert_allclose(res[0], output_expected, rtol=1e-05, atol=1e-08) # Python type x = np.array(["4"], ndmin=2) res = sess.run([output_name], {input_name: x}) output_expected = np.array([3], ndmin=2, dtype=np.int64) - np.testing.assert_allclose(output_expected, res[0], rtol=1e-05, atol=1e-08) + np.testing.assert_allclose(res[0], output_expected, rtol=1e-05, atol=1e-08) x = np.array(["4"], ndmin=2, dtype=object) res = sess.run([output_name], {input_name: x}) output_expected = np.array([3], ndmin=2, dtype=np.int64) - np.testing.assert_allclose(output_expected, res[0], rtol=1e-05, atol=1e-08) + np.testing.assert_allclose(res[0], output_expected, rtol=1e-05, atol=1e-08) def test_run_model_mlnet(self): available_providers = onnxrt.get_available_providers() diff --git a/onnxruntime/test/python/onnxruntime_test_python_nv_tensorrt_rtx_ep_tests.py b/onnxruntime/test/python/onnxruntime_test_python_nv_tensorrt_rtx_ep_tests.py index d5c80a4a1f4ba..034f0288e2508 100644 --- a/onnxruntime/test/python/onnxruntime_test_python_nv_tensorrt_rtx_ep_tests.py +++ b/onnxruntime/test/python/onnxruntime_test_python_nv_tensorrt_rtx_ep_tests.py @@ -99,7 +99,7 @@ def 
test_nv_tensorrt_rtx_ep_register_and_inference(self): input_name = sess.get_inputs()[0].name res = sess.run([], {input_name: x}) output_expected = np.array([[1.0, 4.0], [9.0, 16.0], [25.0, 36.0]], dtype=np.float32) - np.testing.assert_allclose(output_expected, res[0], rtol=1e-05, atol=1e-08) + np.testing.assert_allclose(res[0], output_expected, rtol=1e-05, atol=1e-08) def test_nv_tensorrt_rtx_ep_prefer_gpu_and_inference(self): """ @@ -117,7 +117,7 @@ def test_nv_tensorrt_rtx_ep_prefer_gpu_and_inference(self): input_name = sess.get_inputs()[0].name res = sess.run([], {input_name: x}) output_expected = np.array([[1.0, 4.0], [9.0, 16.0], [25.0, 36.0]], dtype=np.float32) - np.testing.assert_allclose(output_expected, res[0], rtol=1e-05, atol=1e-08) + np.testing.assert_allclose(res[0], output_expected, rtol=1e-05, atol=1e-08) def test_nv_tensorrt_rtx_ep_selection_delegate_and_inference(self): """ @@ -152,7 +152,7 @@ def my_delegate( input_name = sess.get_inputs()[0].name res = sess.run([], {input_name: x}) output_expected = np.array([[1.0, 4.0], [9.0, 16.0], [25.0, 36.0]], dtype=np.float32) - np.testing.assert_allclose(output_expected, res[0], rtol=1e-05, atol=1e-08) + np.testing.assert_allclose(res[0], output_expected, rtol=1e-05, atol=1e-08) def test_bind_input_only(self): """ diff --git a/onnxruntime/test/python/quantization/test_fusions.py b/onnxruntime/test/python/quantization/test_fusions.py index bea110e566fb9..f02f4da4eb0fb 100644 --- a/onnxruntime/test/python/quantization/test_fusions.py +++ b/onnxruntime/test/python/quantization/test_fusions.py @@ -34,8 +34,8 @@ def check_fused_model_correctness(self, orig_model, fused_model, inputs, rtol=1e for idx, expected_output in enumerate(orig_results): actual_output = fused_results[idx] np.testing.assert_allclose( - expected_output, actual_output, + expected_output, rtol=rtol, atol=atol, err_msg=f"Fused model output {idx} differs", diff --git a/onnxruntime/test/python/quantization/test_qdq_loss_debug.py b/onnxruntime/test/python/quantization/test_qdq_loss_debug.py index 5d70641547eae..20b40fc157c16 100644 --- a/onnxruntime/test/python/quantization/test_qdq_loss_debug.py +++ b/onnxruntime/test/python/quantization/test_qdq_loss_debug.py @@ -156,7 +156,7 @@ def test_saved_tensors_match_internal_tensors(self): for expected, actual in zip(model_outputs, test_outputs, strict=False): exp = expected.reshape(-1) act = actual.reshape(-1) - np.testing.assert_equal(exp, act) + np.testing.assert_equal(act, exp) def test_create_activation_matching_present(self): float_model_path = str(Path(self._tmp_model_dir.name) / "float_model2.onnx") diff --git a/onnxruntime/test/python/quantization/test_quantizeblockwise_bnb4.py b/onnxruntime/test/python/quantization/test_quantizeblockwise_bnb4.py index a8f7591186766..906bf7aab8698 100644 --- a/onnxruntime/test/python/quantization/test_quantizeblockwise_bnb4.py +++ b/onnxruntime/test/python/quantization/test_quantizeblockwise_bnb4.py @@ -131,8 +131,8 @@ def test_quantize_blockwise_bnb4(self): matrix_float = np.random.uniform(-1, 1, (k, n)).astype(type) quant_value_ref, absmax_ref = quantize_blockwise_bnb4_ref(matrix_float, block_size, quant_type) quant_value, absmax = quantize_blockwise_bnb4_target(matrix_float, block_size, quant_type) - np.testing.assert_allclose(quant_value_ref, quant_value) - np.testing.assert_allclose(absmax_ref, absmax) + np.testing.assert_allclose(quant_value, quant_value_ref) + np.testing.assert_allclose(absmax, absmax_ref) if __name__ == "__main__": diff --git 
a/onnxruntime/test/testdata/custom_op_local_function/custom_op_test_local_function.py b/onnxruntime/test/testdata/custom_op_local_function/custom_op_test_local_function.py index 7916d93c3e531..1dedc475c9962 100644 --- a/onnxruntime/test/testdata/custom_op_local_function/custom_op_test_local_function.py +++ b/onnxruntime/test/testdata/custom_op_local_function/custom_op_test_local_function.py @@ -40,7 +40,7 @@ def test_basic_all(self): x = np.arange(2**2).reshape((2,) * 2).astype(np.float32) t = np.arange(8).reshape((2, 4)).astype(np.float32) got = sess.run(None, {"X": x})[0] - np.testing.assert_allclose(t, got, atol=1e-5) + np.testing.assert_allclose(got, t, atol=1e-5) if __name__ == "__main__": diff --git a/onnxruntime/test/testdata/onnx_backend_test_series_filters.jsonc b/onnxruntime/test/testdata/onnx_backend_test_series_filters.jsonc index f5f6a3ae3bc39..0558d008a2275 100644 --- a/onnxruntime/test/testdata/onnx_backend_test_series_filters.jsonc +++ b/onnxruntime/test/testdata/onnx_backend_test_series_filters.jsonc @@ -31,23 +31,12 @@ "current_failing_tests": [ "^test_adagrad", "^test_adagrad_multiple", - "^test_attention_4d_diff_heads_mask4d_padded_kv*", // pending onnx update - "^test_attention_3d_gqa*", // pending onnx update - "^test_attention_3d_gqa_causal", // pending onnx update - "^test_attention_3d_gqa_scaled", // pending onnx update - "^test_attention_3d_gqa_softcap", // pending onnx update - "^test_attention_3d_gqa_with_past_and_present", // pending onnx update - "^test_attention_4d_gqa*", // pending onnx update - "^test_attention_4d_gqa_causal", // pending onnx update - "^test_attention_4d_gqa_scaled", // pending onnx update - "^test_attention_4d_gqa_softcap", // pending onnx update - "^test_attention_4d_gqa_with_past_and_present", // pending onnx update - "^test_attention_*causal*", // pending onnx update - "^test_attention_4d_with_past_and_present_qk_matmul_bias_3d_mask_causal*", // pending onnx update - "^test_attention_4d_with_past_and_present_qk_matmul_bias_4d_mask_causal*", // pending onnx update - "^test_attention_4d_attn_mask_3d_causal_expanded*", // pending onnx update "^test_attention_4d_fp16*", // precision issue: 1 / 192 mismatched elements "^test_attention_4d_fp16_expanded*", // precision issue: 3 / 192 mismatched elements + "^test_attention_4d_gqa_with_past_and_present_fp16_expanded*", // webgpu mismatched elements 38 / 576 + "^test_attention_4d_with_past_and_present_qk_matmul_bias_3d_mask_causal_expanded*", // webgpu + "^test_attention_4d_attn_mask_3d_causal_expanded*", // webgpu + "^test_attention_4d_diff_heads_mask4d_padded_kv*", // Need nonpad_kv_seqlen "^test_l2normalization*", // LpNormalization(22) not implemented "^test_l1normalization*", // LpNormalization(22) not implemented "^test_lpnormalization*", // LpNormalization(22) not implemented @@ -123,13 +112,9 @@ "^test_if_opt", "^test_loop16_seq_none", "^test_identity_opt", - // rotary dim should be fixed in onnx==1.19.1 - "^test_rotary_embedding_no_position_ids_rotary_dim", - "^test_rotary_embedding_with_interleaved_rotary_dim", - "^test_rotary_embedding_with_rotary_dim", - "^test_rotary_embedding_3d_input_expanded", - "^test_rotary_embedding_interleaved_expanded", - "^test_rotary_embedding_no_position_ids_interleaved_expanded", + "^test_rotary_embedding_3d_input_expanded", // win cuda fail + "^test_rotary_embedding_interleaved_expanded", // win cuda fail + "^test_rotary_embedding_no_position_ids_interleaved_expanded", // win cuda fail "^test_rotary_embedding_expanded", //webgpu 
"^test_rotary_embedding_no_position_ids_expanded", //webgpu // Following tests are for opset 16 ops and are not yet implemented in ORT diff --git a/onnxruntime/test/testdata/ort_github_issue_26272.py b/onnxruntime/test/testdata/ort_github_issue_26272.py new file mode 100644 index 0000000000000..fa381e5df1094 --- /dev/null +++ b/onnxruntime/test/testdata/ort_github_issue_26272.py @@ -0,0 +1,26 @@ +import onnx +from onnx import TensorProto, helper + +# Create a simple ONNX model with DDS output +input = helper.make_tensor_value_info("data", TensorProto.FLOAT, ["d1", "d2"]) +output = helper.make_tensor_value_info("output", TensorProto.FLOAT, ["nzr"]) + +nonzeros_node = helper.make_node("NonZero", ["data"], ["nonzeros"], "nonzeros_node") +transpose_node = helper.make_node("Transpose", ["nonzeros"], ["nonzeros_t"], "transpose_node") +gathernd_node = helper.make_node("GatherND", ["data", "nonzeros_t"], ["output"], "gathernd_node") + +value_info = [ + helper.make_tensor_value_info("nonzeros", TensorProto.INT64, [2, "nzr"]), + helper.make_tensor_value_info("nonzeros_t", TensorProto.INT64, ["nzr", 2]), +] + +graph = helper.make_graph( + [nonzeros_node, transpose_node, gathernd_node], + "test_graph", + [input], + [output], + value_info=value_info, +) + +model = helper.make_model(graph) +onnx.save(model, "ort_github_issue_26272_dds.onnx") diff --git a/onnxruntime/test/testdata/ort_github_issue_26272_dds.onnx b/onnxruntime/test/testdata/ort_github_issue_26272_dds.onnx new file mode 100644 index 0000000000000..371f99c537898 --- /dev/null +++ b/onnxruntime/test/testdata/ort_github_issue_26272_dds.onnx @@ -0,0 +1,28 @@ + +:“ +( +datanonzeros nonzeros_node"NonZero +1 +nonzeros +nonzeros_ttranspose_node" Transpose +3 +data + +nonzeros_toutput gathernd_node"GatherND +test_graphZ +data + +d1 +d2b +output +  +nzrj +nonzeros + + +nzrj + +nonzeros_t + +nzr +B \ No newline at end of file diff --git a/tools/ci_build/build.py b/tools/ci_build/build.py index 327caf83c7850..591be538ac873 100644 --- a/tools/ci_build/build.py +++ b/tools/ci_build/build.py @@ -1015,6 +1015,9 @@ def generate_build_tree( if path_to_protoc_exe: cmake_args += [f"-DONNX_CUSTOM_PROTOC_EXECUTABLE={path_to_protoc_exe}"] + if args.cmake_deps_mirror_dir: + cmake_args += [f"-Donnxruntime_CMAKE_DEPS_MIRROR_DIR={args.cmake_deps_mirror_dir}"] + if args.fuzz_testing: if not ( args.build_shared_lib @@ -1330,7 +1333,7 @@ def build_targets(args, cmake_path, build_dir, configs, num_parallel_jobs, targe cmd_args.extend(["--target", *targets]) build_tool_args = [] - if num_parallel_jobs != 1: + if num_parallel_jobs != 0: if is_windows() and args.cmake_generator != "Ninja" and not args.build_wasm: # https://github.com/Microsoft/checkedc-clang/wiki/Parallel-builds-of-clang-on-Windows suggests # not maxing out CL_MPCount @@ -1748,7 +1751,7 @@ def run_onnxruntime_tests(args, source_dir, ctest_path, build_dir, configs): # Install cpu only version of torch when cuda is not enabled in Linux. 
extra = [] if args.use_cuda and is_linux() else ["--index-url", "https://download.pytorch.org/whl/cpu"] run_subprocess( - [sys.executable, "-m", "pip", "install", "torch", *extra], + [sys.executable, "-m", "pip", "install", "torch==2.8.0", "torchvision==0.23.0", *extra], cwd=cwd, dll_path=dll_path, python_path=python_path, diff --git a/tools/ci_build/build_args.py b/tools/ci_build/build_args.py index c5454903474d1..05d5052067b2e 100644 --- a/tools/ci_build/build_args.py +++ b/tools/ci_build/build_args.py @@ -204,6 +204,7 @@ def add_testing_args(parser: argparse.ArgumentParser) -> None: help="Run onnx_test_runner against test data. Only used in ONNX Runtime's CI pipelines", ) parser.add_argument("--path_to_protoc_exe", help="Path to protoc executable.") + parser.add_argument("--cmake_deps_mirror_dir", help="Path to the local mirror of cmake dependencies.") parser.add_argument("--fuzz_testing", action="store_true", help="Enable Fuzz testing.") parser.add_argument( "--enable_symbolic_shape_infer_tests", diff --git a/tools/ci_build/github/azure-pipelines/build-perf-test-binaries-pipeline.yml b/tools/ci_build/github/azure-pipelines/build-perf-test-binaries-pipeline.yml index 53b62762319ba..e54216fe4ef4e 100644 --- a/tools/ci_build/github/azure-pipelines/build-perf-test-binaries-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/build-perf-test-binaries-pipeline.yml @@ -31,5 +31,5 @@ stages: machine_pool: 'onnxruntime-Ubuntu2404-AMD-CPU' extra_build_arg: '' cmake_build_type: Release - cuda_version: 12.2 + cuda_version: 12.8 docker_base_image: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda12_x64_almalinux8_gcc12:20250714.2 \ No newline at end of file diff --git a/tools/ci_build/github/azure-pipelines/c-api-noopenmp-packaging-pipelines.yml b/tools/ci_build/github/azure-pipelines/c-api-noopenmp-packaging-pipelines.yml index 91736752e22d4..086d65c93062b 100644 --- a/tools/ci_build/github/azure-pipelines/c-api-noopenmp-packaging-pipelines.yml +++ b/tools/ci_build/github/azure-pipelines/c-api-noopenmp-packaging-pipelines.yml @@ -73,12 +73,12 @@ variables: - name: ReleaseVersionSuffix value: '' - name: win_trt_version - value: 12.2 + value: 12.8 - name: win_trt_home value: $(Agent.TempDirectory)\${{ variables.win_trt_folder_cuda12 }} - name: win_cuda_home - value: $(Agent.TempDirectory)\v12.2 + value: $(Agent.TempDirectory)\v12.8 extends: # The pipeline extends the 1ES PT which will inject different SDL and compliance tasks. # For non-production pipelines, use "Unofficial" as defined below. 
@@ -142,7 +142,7 @@ extends: - template: stages/nuget-combine-cuda-stage.yml parameters: - CudaVersion: 12.2 + CudaVersion: 12.8 RunOnnxRuntimeTests: ${{ parameters.RunOnnxRuntimeTests }} UseIncreasedTimeoutForTests: ${{ parameters.UseIncreasedTimeoutForTests }} win_trt_home: ${{ variables.win_trt_home }} diff --git a/tools/ci_build/github/azure-pipelines/c-api-noopenmp-test-pipelines.yml b/tools/ci_build/github/azure-pipelines/c-api-noopenmp-test-pipelines.yml index 46363c07b3e3e..7e107c33ed8c0 100644 --- a/tools/ci_build/github/azure-pipelines/c-api-noopenmp-test-pipelines.yml +++ b/tools/ci_build/github/azure-pipelines/c-api-noopenmp-test-pipelines.yml @@ -127,7 +127,7 @@ stages: NugetPackageName: 'Microsoft.ML.OnnxRuntime.Gpu' ArtifactSuffix: 'GPU' StageSuffix: 'GPU' - CudaVersion: 12.2 + CudaVersion: 12.8 - template: nuget/templates/test_win.yml parameters: @@ -136,7 +136,7 @@ stages: ArtifactSuffix: 'GPU' StageSuffix: 'GPU' MoreSuffix: '_Windows' - CudaVersion: 12.2 + CudaVersion: 12.8 - template: nuget/templates/test_linux.yml parameters: @@ -144,7 +144,7 @@ stages: ArtifactSuffix: 'GPU' StageSuffix: 'GPU' NugetPackageName: 'Microsoft.ML.OnnxRuntime.Gpu' - CudaVersion: 12.2 + CudaVersion: 12.8 - template: nuget/templates/test_linux.yml parameters: @@ -153,7 +153,7 @@ stages: StageSuffix: 'GPU' MoreSuffix: '_Linux' NugetPackageName: 'Microsoft.ML.OnnxRuntime.Gpu.Linux' - CudaVersion: 12.2 + CudaVersion: 12.8 @@ -202,7 +202,7 @@ stages: - template: templates/jobs/download_win_gpu_library.yml parameters: - CudaVersion: 12.2 + CudaVersion: 12.8 DownloadCUDA: true DownloadTRT: true @@ -257,7 +257,7 @@ stages: - template: templates/jobs/download_win_gpu_library.yml parameters: - CudaVersion: 12.2 + CudaVersion: 12.8 DownloadCUDA: true DownloadTRT: true diff --git a/tools/ci_build/github/azure-pipelines/cuda-packaging-pipeline.yml b/tools/ci_build/github/azure-pipelines/cuda-packaging-pipeline.yml index 5535d7b4f264d..d7fc0efbf45ea 100644 --- a/tools/ci_build/github/azure-pipelines/cuda-packaging-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/cuda-packaging-pipeline.yml @@ -48,9 +48,9 @@ parameters: - name: CudaVersion displayName: CUDA version type: string - default: '12.2' + default: '12.8' values: - - 12.2 + - 12.8 variables: - template: templates/common-variables.yml @@ -59,13 +59,13 @@ variables: - name: win_trt_home ${{ if eq(parameters.CudaVersion, '11.8') }}: value: $(Agent.TempDirectory)\${{ variables.win_trt_folder_cuda11 }} - ${{ if eq(parameters.CudaVersion, '12.2') }}: + ${{ if eq(parameters.CudaVersion, '12.8') }}: value: $(Agent.TempDirectory)\${{ variables.win_trt_folder_cuda12 }} - name: win_cuda_home ${{ if eq(parameters.CudaVersion, '11.8') }}: value: $(Agent.TempDirectory)\v11.8 - ${{ if eq(parameters.CudaVersion, '12.2') }}: - value: $(Agent.TempDirectory)\v12.2 + ${{ if eq(parameters.CudaVersion, '12.8') }}: + value: $(Agent.TempDirectory)\v12.8 resources: repositories: diff --git a/tools/ci_build/github/azure-pipelines/custom-nuget-packaging-pipeline.yml b/tools/ci_build/github/azure-pipelines/custom-nuget-packaging-pipeline.yml index 1ad6f411d9848..5ce6ec278b1e7 100644 --- a/tools/ci_build/github/azure-pipelines/custom-nuget-packaging-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/custom-nuget-packaging-pipeline.yml @@ -1,7 +1,7 @@ parameters: - name: CudaVersion type: string - default: '12.2' + default: '12.8' - name: QnnSdk displayName: QNN SDK Version @@ -40,8 +40,8 @@ variables: - name: win_cuda_home ${{ if eq(parameters.CudaVersion, '11.8') 
}}: value: $(Agent.TempDirectory)\v11.8 - ${{ if eq(parameters.CudaVersion, '12.2') }}: - value: $(Agent.TempDirectory)\v12.2 + ${{ if eq(parameters.CudaVersion, '12.8') }}: + value: $(Agent.TempDirectory)\v12.8 resources: repositories: @@ -178,9 +178,6 @@ extends: inputs: targetType: 'inline' script: | - mkdir -p $(Build.BinariesDirectory)/osx-x64 - Move-Item -Path $(Build.BinariesDirectory)/osx/onnxruntime-osx-x86_64* -Destination $(Build.BinariesDirectory)/osx-x64 - mkdir -p $(Build.BinariesDirectory)/osx-arm64 Move-Item -Path $(Build.BinariesDirectory)/osx/onnxruntime-osx-arm64* -Destination $(Build.BinariesDirectory)/osx-arm64 @@ -200,12 +197,6 @@ extends: foreach ($dir in $dirs) { Write-Host "Directory: $($dir.FullName)" } - $osx_x64_archive = Get-ChildItem -Path $(Build.BinariesDirectory)/osx-x64 -Filter onnxruntime-osx-x86_64* - if ($osx_x64_archive.Count -eq 0) { - Write-Host "No osx-x64 archive found." - } else { - Write-Host "osx-x64 archive found: $($osx_x64_archive[0].FullName)" - } $osx_arm64_archive = Get-ChildItem -Path $(Build.BinariesDirectory)/osx-arm64 -Filter onnxruntime-osx-arm64* if ($osx_arm64_archive.Count -eq 0) { Write-Host "No osx-arm64 archive found." @@ -233,13 +224,10 @@ extends: script: | Expand-Archive -Path $(Build.BinariesDirectory)/win-x64/onnxruntime-win-x64-cuda*.zip -DestinationPath $(Build.BinariesDirectory)/win-x64 Expand-Archive -Path $(Build.BinariesDirectory)/win-arm64/onnxruntime-win-arm64x-qnn*.zip -DestinationPath $(Build.BinariesDirectory)/win-arm64 - $osx_x64_archive = (Get-ChildItem -Path $(Build.BinariesDirectory)/osx-x64 -Filter onnxruntime-osx-x86_64*)[0].FullName $osx_arm64_archive = (Get-ChildItem -Path $(Build.BinariesDirectory)/osx-arm64 -Filter onnxruntime-osx-arm64*)[0].FullName - tar -xzf $osx_x64_archive -C $(Build.BinariesDirectory)/osx-x64 2>$null tar -xzf $osx_arm64_archive -C $(Build.BinariesDirectory)/osx-arm64 2>$null $win_x64 = (Get-ChildItem -Path $(Build.BinariesDirectory)/win-x64 -Filter onnxruntime-win-x64-cuda*)[0].FullName $win_arm64 = (Get-ChildItem -Path $(Build.BinariesDirectory)/win-arm64 -Filter onnxruntime-win-arm64x-qnn*)[0].FullName - $osx_x64 = (Get-ChildItem -Path $(Build.BinariesDirectory)/osx-x64 -Filter onnxruntime-osx-x86_64*)[0].FullName $osx_arm64 = (Get-ChildItem -Path $(Build.BinariesDirectory)/osx-arm64 -Filter onnxruntime-osx-arm64*)[0].FullName Write-Host "##vso[task.setvariable variable=win_x64;]$win_x64" Write-Host "##vso[task.setvariable variable=win_arm64;]$win_arm64" diff --git a/tools/ci_build/github/azure-pipelines/jar_package_testing.yml b/tools/ci_build/github/azure-pipelines/jar_package_testing.yml index d387c07d6dc6e..463c02203e21a 100644 --- a/tools/ci_build/github/azure-pipelines/jar_package_testing.yml +++ b/tools/ci_build/github/azure-pipelines/jar_package_testing.yml @@ -40,7 +40,7 @@ stages: - template: templates/jobs/download_win_gpu_library.yml parameters: - CudaVersion: 12.2 + CudaVersion: 12.8 DownloadCUDA: true DownloadTRT: true @@ -105,7 +105,7 @@ stages: - name: runCodesignValidationInjection value: false - name: docker_base_image - value: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda12_x64_almalinux8_gcc12:20250724.1 + value: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda12_x64_almalinux8_gcc12:20251008.2 timeoutInMinutes: 60 steps: - checkout: self diff --git a/tools/ci_build/github/azure-pipelines/linux-gpu-tensorrt-cuda-minimal-ci-pipeline.yml 
b/tools/ci_build/github/azure-pipelines/linux-gpu-tensorrt-cuda-minimal-ci-pipeline.yml index 0410001d77d13..5e6671e3797ce 100644 --- a/tools/ci_build/github/azure-pipelines/linux-gpu-tensorrt-cuda-minimal-ci-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/linux-gpu-tensorrt-cuda-minimal-ci-pipeline.yml @@ -31,21 +31,21 @@ parameters: - name: CudaVersion displayName: CUDA version type: string - default: '12.2' + default: '12.8' values: - - 12.2 + - 12.8 variables: - template: templates/common-variables.yml - name: docker_base_image ${{ if eq(parameters.CudaVersion, '11.8') }}: - value: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda11_x64_almalinux8_gcc11:20250724.1 - ${{ if eq(parameters.CudaVersion, '12.2') }}: - value: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda12_x64_almalinux8_gcc12:20250724.1 + value: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda11_x64_almalinux8_gcc11:20251008.2 + ${{ if eq(parameters.CudaVersion, '12.8') }}: + value: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda12_x64_almalinux8_gcc12:20251008.2 - name: linux_trt_version ${{ if eq(parameters.CudaVersion, '11.8') }}: value: ${{ variables.linux_trt_version_cuda11 }} - ${{ if eq(parameters.CudaVersion, '12.2') }}: + ${{ if eq(parameters.CudaVersion, '12.8') }}: value: ${{ variables.linux_trt_version_cuda12 }} jobs: diff --git a/tools/ci_build/github/azure-pipelines/nuget/templates/test_linux.yml b/tools/ci_build/github/azure-pipelines/nuget/templates/test_linux.yml index 89ce3f3c86727..b60ef7576184e 100644 --- a/tools/ci_build/github/azure-pipelines/nuget/templates/test_linux.yml +++ b/tools/ci_build/github/azure-pipelines/nuget/templates/test_linux.yml @@ -58,9 +58,9 @@ stages: parameters: Dockerfile: tools/ci_build/github/linux/docker/Dockerfile.package_ubuntu_2004_gpu Context: tools/ci_build/github/linux/docker/ - ${{ if eq(parameters.CudaVersion, '12.2') }}: + ${{ if eq(parameters.CudaVersion, '12.8') }}: DockerBuildArgs: " - --build-arg BASEIMAGE=nvidia/cuda:12.2.2-devel-ubuntu20.04 + --build-arg BASEIMAGE=nvidia/cuda:12.8.1-cudnn-devel-ubuntu20.04 --build-arg TRT_VERSION=${{ replace(variables.linux_trt_version_cuda12, '-1.', '-1+') }} --build-arg BUILD_UID=$( id -u ) " @@ -107,4 +107,4 @@ stages: DisableContribOps: $(DisableContribOps) DisableMlOps: $(DisableMlOps) IsReleaseBuild: $(IsReleaseBuild) - PACKAGENAME: ${{ parameters.NugetPackageName }} \ No newline at end of file + PACKAGENAME: ${{ parameters.NugetPackageName }} diff --git a/tools/ci_build/github/azure-pipelines/post-merge-jobs.yml b/tools/ci_build/github/azure-pipelines/post-merge-jobs.yml index deb8b84bf19b8..fdfafd4d9a179 100644 --- a/tools/ci_build/github/azure-pipelines/post-merge-jobs.yml +++ b/tools/ci_build/github/azure-pipelines/post-merge-jobs.yml @@ -2,16 +2,16 @@ parameters: - name: CudaVersion displayName: CUDA version type: string - default: '12.2' + default: '12.8' values: - - 12.2 + - 12.8 variables: - template: templates/common-variables.yml - name: win_trt_folder ${{ if eq(parameters.CudaVersion, '11.8') }}: value: ${{ variables.win_trt_folder_cuda11 }} - ${{ if eq(parameters.CudaVersion, '12.2') }}: + ${{ if eq(parameters.CudaVersion, '12.8') }}: value: ${{ variables.win_trt_folder_cuda12 }} stages: diff --git a/tools/ci_build/github/azure-pipelines/py-cuda-package-test-pipeline.yml b/tools/ci_build/github/azure-pipelines/py-cuda-package-test-pipeline.yml index c2c89686a077e..02b6a6df76611 100644 --- 
a/tools/ci_build/github/azure-pipelines/py-cuda-package-test-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/py-cuda-package-test-pipeline.yml @@ -18,8 +18,8 @@ stages: machine_pool: 'Onnxruntime-Linux-GPU' python_wheel_suffix: '_gpu' timeout: 480 - docker_base_image: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda12_x64_almalinux8_gcc12:20250724.1 - cuda_version: '12.2' + docker_base_image: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda12_x64_almalinux8_gcc12:20251008.2 + cuda_version: '12.8' - stage: Republish_Wheels dependsOn: diff --git a/tools/ci_build/github/azure-pipelines/py-cuda-packaging-pipeline.yml b/tools/ci_build/github/azure-pipelines/py-cuda-packaging-pipeline.yml index 4c536bad45368..290af4a3e4449 100644 --- a/tools/ci_build/github/azure-pipelines/py-cuda-packaging-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/py-cuda-packaging-pipeline.yml @@ -49,4 +49,4 @@ extends: - template: stages/py-gpu-packaging-stage.yml parameters: cmake_build_type: ${{ parameters.cmake_build_type }} - cuda_version: '12.2' + cuda_version: '12.8' diff --git a/tools/ci_build/github/azure-pipelines/stages/jobs/py-linux-cuda-package-test-job.yml b/tools/ci_build/github/azure-pipelines/stages/jobs/py-linux-cuda-package-test-job.yml index 858de4d173484..b53aee639372d 100644 --- a/tools/ci_build/github/azure-pipelines/stages/jobs/py-linux-cuda-package-test-job.yml +++ b/tools/ci_build/github/azure-pipelines/stages/jobs/py-linux-cuda-package-test-job.yml @@ -2,9 +2,9 @@ parameters: - name: CudaVersion displayName: 'CUDA version' type: string - default: '12.2' + default: '12.8' values: - - 12.2 + - 12.8 - name: machine_pool type: string @@ -44,13 +44,13 @@ jobs: - template: ../../templates/common-variables.yml - name: docker_base_image ${{ if eq(parameters.CudaVersion, '11.8') }}: - value: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda11_x64_almalinux8_gcc11:20250724.1 - ${{ if eq(parameters.CudaVersion, '12.2') }}: - value: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda12_x64_almalinux8_gcc12:20250724.1 + value: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda11_x64_almalinux8_gcc11:20251008.2 + ${{ if eq(parameters.CudaVersion, '12.8') }}: + value: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda12_x64_almalinux8_gcc12:20251008.2 - name: linux_trt_version ${{ if eq(parameters.CudaVersion, '11.8') }}: value: ${{ variables.linux_trt_version_cuda11 }} - ${{ if eq(parameters.CudaVersion, '12.2') }}: + ${{ if eq(parameters.CudaVersion, '12.8') }}: value: ${{ variables.linux_trt_version_cuda12 }} pool: ${{ parameters.machine_pool }} steps: @@ -105,4 +105,4 @@ jobs: inputs: targetType: filePath filePath: tools/ci_build/github/linux/run_python_dockertest.sh - arguments: -d GPU -c ${{parameters.cmake_build_type}} -i onnxruntimecuda${{ replace(parameters.CudaVersion, '.', '') }}xtrt86buildx86_64 -u 12.2 + arguments: -d GPU -c ${{parameters.cmake_build_type}} -i onnxruntimecuda${{ replace(parameters.CudaVersion, '.', '') }}xtrt86buildx86_64 -u 12.8 diff --git a/tools/ci_build/github/azure-pipelines/stages/jobs/react-natvie-andriod-e2e-test-job.yml b/tools/ci_build/github/azure-pipelines/stages/jobs/react-natvie-andriod-e2e-test-job.yml index 0a11ba80fb5df..7b120fa06190b 100644 --- a/tools/ci_build/github/azure-pipelines/stages/jobs/react-natvie-andriod-e2e-test-job.yml +++ 
b/tools/ci_build/github/azure-pipelines/stages/jobs/react-natvie-andriod-e2e-test-job.yml @@ -12,9 +12,7 @@ parameters: displayName: 'NPM packages publish configuration' type: string default: 'dev' -- name: is1ES - type: boolean - default: false + jobs: - job: ReactNative_CI_Android pool: @@ -153,30 +151,16 @@ jobs: targetFolder: $(Build.ArtifactStagingDirectory) displayName: Create Artifacts onnxruntime-react-native - - ${{ if eq(parameters.is1ES, true) }}: - - task: 1ES.PublishPipelineArtifact@1 - inputs: - artifact: android_e2e_test_logs_$(Build.BuildId)_$(Build.BuildNumber)_$(System.JobAttempt) - targetPath: '$(Build.SourcesDirectory)/js/react_native/e2e/artifacts' - condition: succeededOrFailed() - displayName: Publish React Native Detox E2E test logs - - task: 1ES.PublishPipelineArtifact@1 - inputs: - artifactName: '${{parameters.PackageName}}' - targetPath: '$(Build.ArtifactStagingDirectory)' - displayName: Publish Pipeline Artifact - - - ${{ if eq(parameters.is1ES, false) }}: - - task: PublishPipelineArtifact@1 - inputs: - artifact: android_e2e_test_logs_$(Build.BuildId)_$(Build.BuildNumber)_$(System.JobAttempt) - targetPath: '$(Build.SourcesDirectory)/js/react_native/e2e/artifacts' - condition: succeededOrFailed() - displayName: Publish React Native Detox E2E test logs - - task: PublishPipelineArtifact@1 - inputs: - artifactName: '${{parameters.PackageName}}' - targetPath: '$(Build.ArtifactStagingDirectory)' - displayName: Publish Pipeline Artifact + - task: 1ES.PublishPipelineArtifact@1 + inputs: + artifact: android_e2e_test_logs_$(Build.BuildId)_$(Build.BuildNumber)_$(System.JobAttempt) + targetPath: '$(Build.SourcesDirectory)/js/react_native/e2e/artifacts' + condition: succeededOrFailed() + displayName: Publish React Native Detox E2E test logs + - task: 1ES.PublishPipelineArtifact@1 + inputs: + artifactName: '${{parameters.PackageName}}' + targetPath: '$(Build.ArtifactStagingDirectory)' + displayName: Publish Pipeline Artifact - template: ../../templates/explicitly-defined-final-tasks.yml \ No newline at end of file diff --git a/tools/ci_build/github/azure-pipelines/stages/nodejs-linux-packaging-stage.yml b/tools/ci_build/github/azure-pipelines/stages/nodejs-linux-packaging-stage.yml index bca95a4a2fd02..8cbb81ba89c12 100644 --- a/tools/ci_build/github/azure-pipelines/stages/nodejs-linux-packaging-stage.yml +++ b/tools/ci_build/github/azure-pipelines/stages/nodejs-linux-packaging-stage.yml @@ -1,7 +1,7 @@ parameters: - name: CudaVersion type: string - default: '12.2' + default: '12.8' stages: - stage: Linux_Nodejs_Packaging_x64 @@ -20,14 +20,14 @@ stages: - name: CUDA_VERSION_MAJOR ${{ if eq(parameters.CudaVersion, '11.8') }}: value: '11' - ${{ if eq(parameters.CudaVersion, '12.2') }}: + ${{ if eq(parameters.CudaVersion, '12.8') }}: value: '12' - name: CUDA_VERSION value: ${{ parameters.CudaVersion }} - name: linux_trt_version ${{ if eq(parameters.CudaVersion, '11.8') }}: value: ${{ variables.linux_trt_version_cuda11 }} - ${{ if eq(parameters.CudaVersion, '12.2') }}: + ${{ if eq(parameters.CudaVersion, '12.8') }}: value: ${{ variables.linux_trt_version_cuda12 }} steps: - checkout: self diff --git a/tools/ci_build/github/azure-pipelines/stages/nuget-linux-cuda-packaging-stage.yml b/tools/ci_build/github/azure-pipelines/stages/nuget-linux-cuda-packaging-stage.yml index 121e80fca1021..b1e5f541b90e0 100644 --- a/tools/ci_build/github/azure-pipelines/stages/nuget-linux-cuda-packaging-stage.yml +++ b/tools/ci_build/github/azure-pipelines/stages/nuget-linux-cuda-packaging-stage.yml @@ 
-1,7 +1,7 @@ parameters: - name: CudaVersion type: string - default: '12.2' + default: '12.8' - name: buildJava type: boolean - name: buildNodejs @@ -22,7 +22,7 @@ stages: - name: CUDA_VERSION_MAJOR ${{ if eq(parameters.CudaVersion, '11.8') }}: value: '11' - ${{ if eq(parameters.CudaVersion, '12.2') }}: + ${{ if eq(parameters.CudaVersion, '12.8') }}: value: '12' - name: CUDA_VERSION value: ${{ parameters.CudaVersion }} @@ -74,14 +74,14 @@ stages: - name: CUDA_VERSION_MAJOR ${{ if eq(parameters.CudaVersion, '11.8') }}: value: '11' - ${{ if eq(parameters.CudaVersion, '12.2') }}: + ${{ if eq(parameters.CudaVersion, '12.8') }}: value: '12' - name: CUDA_VERSION value: ${{ parameters.CudaVersion }} - name: linux_trt_version ${{ if eq(parameters.CudaVersion, '11.8') }}: value: ${{ variables.linux_trt_version_cuda11 }} - ${{ if eq(parameters.CudaVersion, '12.2') }}: + ${{ if eq(parameters.CudaVersion, '12.8') }}: value: ${{ variables.linux_trt_version_cuda12 }} steps: - checkout: self @@ -140,12 +140,12 @@ stages: - name: CUDA_VERSION_MAJOR ${{ if eq(parameters.CudaVersion, '11.8') }}: value: '11' - ${{ if eq(parameters.CudaVersion, '12.2') }}: + ${{ if eq(parameters.CudaVersion, '12.8') }}: value: '12' - name: linux_trt_version ${{ if eq(parameters.CudaVersion, '11.8') }}: value: ${{ variables.linux_trt_version_cuda11 }} - ${{ if eq(parameters.CudaVersion, '12.2') }}: + ${{ if eq(parameters.CudaVersion, '12.8') }}: value: ${{ variables.linux_trt_version_cuda12 }} steps: - checkout: self # due to checkout multiple repos, the root directory is $(Build.SourcesDirectory)/onnxruntime diff --git a/tools/ci_build/github/azure-pipelines/stages/nuget-win-cuda-packaging-stage.yml b/tools/ci_build/github/azure-pipelines/stages/nuget-win-cuda-packaging-stage.yml index 61afeba2d302b..e7e541205ba0a 100644 --- a/tools/ci_build/github/azure-pipelines/stages/nuget-win-cuda-packaging-stage.yml +++ b/tools/ci_build/github/azure-pipelines/stages/nuget-win-cuda-packaging-stage.yml @@ -60,7 +60,7 @@ stages: msbuildPlatform: x64 packageName: x64-cuda CudaVersion: ${{ parameters.CudaVersion }} - buildparameter: --use_cuda --cuda_home=${{ parameters.win_cuda_home }} --enable_onnx_tests --enable_wcos --cmake_extra_defines "CMAKE_CUDA_ARCHITECTURES=75-real;86-real;89-real;90a-virtual" + buildparameter: --use_cuda --cuda_home=${{ parameters.win_cuda_home }} --enable_onnx_tests --enable_wcos --cmake_extra_defines "CMAKE_CUDA_ARCHITECTURES=75-real;86-real;89-real;90-virtual" runTests: ${{ parameters.RunOnnxRuntimeTests }} buildJava: ${{ parameters.buildJava }} java_artifact_id: onnxruntime_gpu @@ -80,7 +80,7 @@ stages: msbuildPlatform: x64 CudaVersion: ${{ parameters.CudaVersion }} packageName: x64-tensorrt - buildparameter: --use_tensorrt --tensorrt_home=${{ parameters.win_trt_home }} --cuda_home=${{ parameters.win_cuda_home }} --enable_onnx_tests --enable_wcos --cmake_extra_defines "CMAKE_CUDA_ARCHITECTURES=75-real;86-real;89-real;90a-virtual" + buildparameter: --use_tensorrt --tensorrt_home=${{ parameters.win_trt_home }} --cuda_home=${{ parameters.win_cuda_home }} --enable_onnx_tests --enable_wcos --cmake_extra_defines "CMAKE_CUDA_ARCHITECTURES=75-real;86-real;89-real;90-virtual" runTests: ${{ parameters.RunOnnxRuntimeTests }} buildJava: ${{ parameters.buildJava }} java_artifact_id: onnxruntime_gpu diff --git a/tools/ci_build/github/azure-pipelines/stages/py-gpu-packaging-stage.yml b/tools/ci_build/github/azure-pipelines/stages/py-gpu-packaging-stage.yml index d8bb51b5ef79d..3c5cf591039e0 100644 --- 
a/tools/ci_build/github/azure-pipelines/stages/py-gpu-packaging-stage.yml +++ b/tools/ci_build/github/azure-pipelines/stages/py-gpu-packaging-stage.yml @@ -19,9 +19,9 @@ parameters: - name: cuda_version type: string displayName: 'CUDA version. Windows Only.' - default: '12.2' + default: '12.8' values: - - 12.2 + - 12.8 - name: PythonVersions type: object @@ -38,7 +38,7 @@ stages: PYTHON_VERSION: ${{ python_version }} EP_NAME: gpu CudaVersion: ${{ parameters.cuda_version }} - EP_BUILD_FLAGS: --enable_lto --use_cuda --cuda_home=$(Agent.TempDirectory)\v${{ parameters.cuda_version }} --cmake_extra_defines "CMAKE_CUDA_ARCHITECTURES=52-real;61-real;75-real;86-real;89-real;90a-virtual" + EP_BUILD_FLAGS: --enable_lto --use_cuda --cuda_home=$(Agent.TempDirectory)\v${{ parameters.cuda_version }} --cmake_extra_defines "CMAKE_CUDA_ARCHITECTURES=52-real;61-real;75-real;86-real;89-real;90-virtual" use_tensorrt: True - template: py-linux-gpu-stage.yml @@ -48,4 +48,4 @@ stages: extra_build_arg: ${{ parameters.build_py_parameters }} cmake_build_type: ${{ parameters.cmake_build_type }} cuda_version: ${{ parameters.cuda_version }} - docker_base_image: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda12_x64_almalinux8_gcc12:20250724.1 + docker_base_image: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda12_x64_almalinux8_gcc12:20251008.2 diff --git a/tools/ci_build/github/azure-pipelines/stages/py-linux-gpu-stage.yml b/tools/ci_build/github/azure-pipelines/stages/py-linux-gpu-stage.yml index 715470eb9f012..ab1fb919af413 100644 --- a/tools/ci_build/github/azure-pipelines/stages/py-linux-gpu-stage.yml +++ b/tools/ci_build/github/azure-pipelines/stages/py-linux-gpu-stage.yml @@ -22,9 +22,9 @@ parameters: - name: cuda_version type: string - default: '12.2' + default: '12.8' values: - - 12.2 + - 12.8 stages: - stage: Linux_py_GPU_Wheels_${{ parameters.arch }} @@ -55,7 +55,7 @@ stages: - name: trt_version ${{ if eq(parameters.cuda_version, '11.8') }}: value: ${{ variables.linux_trt_version_cuda11 }} - ${{ if eq(parameters.cuda_version, '12.2') }}: + ${{ if eq(parameters.cuda_version, '12.8') }}: value: ${{ variables.linux_trt_version_cuda12 }} steps: - checkout: self diff --git a/tools/ci_build/github/azure-pipelines/stages/py-win-gpu-stage.yml b/tools/ci_build/github/azure-pipelines/stages/py-win-gpu-stage.yml index e2683c04f21f2..c3957fc8341de 100644 --- a/tools/ci_build/github/azure-pipelines/stages/py-win-gpu-stage.yml +++ b/tools/ci_build/github/azure-pipelines/stages/py-win-gpu-stage.yml @@ -20,9 +20,9 @@ parameters: default: '' - name: CudaVersion type: string - default: '12.2' + default: '12.8' values: - - 12.2 + - 12.8 - name: cmake_build_type type: string @@ -47,7 +47,7 @@ stages: workspace: clean: all pool: - name: onnxruntime-Win-CPU-2022 + name: onnxruntime-Win-CPU-VS2022-Latest os: windows templateContext: sdl: @@ -76,7 +76,7 @@ stages: - name: win_trt_folder ${{ if eq(parameters.CudaVersion, '11.8') }}: value: ${{ variables.win_trt_folder_cuda11 }} - ${{ if eq(parameters.CudaVersion, '12.2') }}: + ${{ if eq(parameters.CudaVersion, '12.8') }}: value: ${{ variables.win_trt_folder_cuda12 }} - name: trt_build_flag ${{ if eq(parameters.use_tensorrt, true) }}: @@ -119,7 +119,7 @@ stages: --cmake_generator "$(VSGenerator)" --enable_pybind --enable_onnx_tests - --parallel 8 --use_vcpkg --use_vcpkg_ms_internal_asset_cache --use_binskim_compliant_compile_flags --update --build --msvc_toolset 14.40 + --parallel 8 --use_vcpkg --use_vcpkg_ms_internal_asset_cache 
--use_binskim_compliant_compile_flags --update --build $(TelemetryOption) ${{ parameters.BUILD_PY_PARAMETERS }} ${{ parameters.EP_BUILD_FLAGS }} ${{ variables.trt_build_flag }} workingDirectory: '$(Build.BinariesDirectory)' diff --git a/tools/ci_build/github/azure-pipelines/templates/c-api-cpu.yml b/tools/ci_build/github/azure-pipelines/templates/c-api-cpu.yml index 2a6f8461d773c..338789a8da9e3 100644 --- a/tools/ci_build/github/azure-pipelines/templates/c-api-cpu.yml +++ b/tools/ci_build/github/azure-pipelines/templates/c-api-cpu.yml @@ -107,20 +107,15 @@ stages: workspace: clean: all pool: - name: 'Azure Pipelines' - image: 'macOS-15' - os: 'macOS' + name: AcesShared + os: macOS + demands: + - ImageOverride -equals ACES_VM_SharedPool_Sequoia timeoutInMinutes: 300 steps: - template: set-version-number-variables-step.yml - - task: JavaToolInstaller@0 - inputs: - versionSpec: "17" - jdkArchitectureOption: "x64" - jdkSourceOption: 'PreInstalled' - - template: use-xcode-version.yml parameters: xcodeVersion: 16.4 @@ -143,7 +138,6 @@ stages: - script: | python3 tools/ci_build/github/apple/test_apple_packages.py \ - --fail_if_cocoapods_missing \ --framework_info_file "$(Build.BinariesDirectory)/ios_framework/xcframework_info.json" \ --c_framework_dir "$(Build.BinariesDirectory)/ios_framework/framework_out" \ --skip_macos_test \ @@ -209,13 +203,6 @@ stages: - input: pipelineArtifact artifactName: drop-onnxruntime-java-linux-aarch64 targetPath: '$(Build.BinariesDirectory)\java-artifact\onnxruntime-java-linux-aarch64' - - - input: pipelineArtifact - artifactName: drop-onnxruntime-java-osx-x86_64 - targetPath: '$(Build.BinariesDirectory)\java-artifact\onnxruntime-java-osx-x86_64' - - input: pipelineArtifact - artifactName: drop-onnxruntime-java-osx-arm64 - targetPath: '$(Build.BinariesDirectory)\java-artifact\onnxruntime-java-osx-arm64' outputs: - output: pipelineArtifact targetPath: $(Build.BinariesDirectory)\java-artifact\onnxruntime-java-win-x64 @@ -448,7 +435,6 @@ stages: # - Windows arm64 (CPU, DML, WebGPU) # - Linux x64 (CPU, CUDA, TensorRT, WebGPU) # - Linux arm64 (CPU only) - # - macOS x64 (CPU, CoreML, WebGPU) # - macOS arm64 (CPU, CoreML, WebGPU) # # File manifest: @@ -485,12 +471,6 @@ stages: # - onnxruntime_binding.node # - libonnxruntime.so.1 # - # - macOS x64 (CPU, CoreML, WebGPU): - # dependency: MacOS_C_API_Packaging_CPU_x86_64 (drop-onnxruntime-nodejs-osx-x86_64) - # files: - # - onnxruntime_binding.node - # - libonnxruntime.{version}.dylib - # # - macOS arm64 (CPU, CoreML, WebGPU): # dependency: MacOS_C_API_Packaging_CPU_arm64 (drop-onnxruntime-nodejs-osx-arm64) # files: @@ -518,12 +498,6 @@ stages: artifactName: 'drop-onnxruntime-nodejs-win-arm64' targetPath: '$(Build.BinariesDirectory)/nodejs-artifacts/win32/arm64/' - - task: DownloadPipelineArtifact@0 - displayName: 'Download Pipeline Artifact - Nodejs (macOS x86_64)' - inputs: - artifactName: 'drop-onnxruntime-nodejs-osx-x86_64' - targetPath: '$(Build.BinariesDirectory)/nodejs-artifacts/darwin/x64/' - - task: DownloadPipelineArtifact@0 displayName: 'Download Pipeline Artifact - Nodejs (macOS arm64)' inputs: @@ -600,16 +574,6 @@ stages: *.node TargetFolder: '$(Build.SourcesDirectory)\js\node\bin\napi-v6\linux\arm64' - # Node.js binding darwin/x64 - - task: CopyFiles@2 - displayName: 'Copy nodejs binaries to: $(Build.SourcesDirectory)\js\node\bin\napi-v6\darwin\x64\' - inputs: - SourceFolder: '$(Build.BinariesDirectory)\nodejs-artifacts\darwin\x64' - Contents: | - libonnxruntime.*.dylib - *.node - TargetFolder: 
'$(Build.SourcesDirectory)\js\node\bin\napi-v6\darwin\x64' - # Node.js binding darwin/arm64 - task: CopyFiles@2 displayName: 'Copy nodejs binaries to: $(Build.SourcesDirectory)\js\node\bin\napi-v6\darwin\arm64\' diff --git a/tools/ci_build/github/azure-pipelines/templates/jobs/download_win_gpu_library.yml b/tools/ci_build/github/azure-pipelines/templates/jobs/download_win_gpu_library.yml index 681138a5ab3d1..be213337091e8 100644 --- a/tools/ci_build/github/azure-pipelines/templates/jobs/download_win_gpu_library.yml +++ b/tools/ci_build/github/azure-pipelines/templates/jobs/download_win_gpu_library.yml @@ -7,10 +7,10 @@ parameters: default: false - name: CudaVersion type: string - default: '12.2' + default: '12.8' values: - 11.8 - - 12.2 + - 12.8 - name: TrtVersion type: string default: '10.9.0.34' @@ -46,11 +46,11 @@ steps: - powershell: | Write-Host "##vso[task.setvariable variable=trtCudaVersion;]11.8" displayName: Set trtCudaVersion - - ${{ if and(eq(parameters.CudaVersion, '12.2'), eq(parameters.TrtVersion, '8.6.1.6')) }}: + - ${{ if and(eq(parameters.CudaVersion, '12.8'), eq(parameters.TrtVersion, '8.6.1.6')) }}: - powershell: | Write-Host "##vso[task.setvariable variable=trtCudaVersion;]12.0" displayName: Set trtCudaVersion - - ${{ if and(eq(parameters.CudaVersion, '12.2'), eq(parameters.TrtVersion, '10.9.0.34')) }}: + - ${{ if and(eq(parameters.CudaVersion, '12.8'), eq(parameters.TrtVersion, '10.9.0.34')) }}: - powershell: | Write-Host "##vso[task.setvariable variable=trtCudaVersion;]12.8" displayName: Set trtCudaVersion diff --git a/tools/ci_build/github/azure-pipelines/templates/jobs/set-winenv.yml b/tools/ci_build/github/azure-pipelines/templates/jobs/set-winenv.yml index 96436883fb8b8..d7c940cda30f4 100644 --- a/tools/ci_build/github/azure-pipelines/templates/jobs/set-winenv.yml +++ b/tools/ci_build/github/azure-pipelines/templates/jobs/set-winenv.yml @@ -9,7 +9,7 @@ parameters: default: false - name: PrimaryCUDAVersion type: string - default: '12.2' + default: '12.8' # - name: SecondaryCUDAVersion # type: string # default: '11.8' diff --git a/tools/ci_build/github/azure-pipelines/templates/mac-cpu-packaging-pipeline.yml b/tools/ci_build/github/azure-pipelines/templates/mac-cpu-packaging-pipeline.yml index 56cc84a90dc68..907563cb77242 100644 --- a/tools/ci_build/github/azure-pipelines/templates/mac-cpu-packaging-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/templates/mac-cpu-packaging-pipeline.yml @@ -26,13 +26,6 @@ stages: jobs: - template: mac-cpu-packing-jobs.yml parameters: - MacosArch: 'x86_64' - AllowReleasedOpsetOnly: ${{ parameters.AllowReleasedOpsetOnly }} - AdditionalBuildFlags: ${{ parameters.AdditionalBuildFlags }} - - - template: mac-cpu-packing-jobs.yml - parameters: - MacosArch: 'arm64' AllowReleasedOpsetOnly: ${{ parameters.AllowReleasedOpsetOnly }} AdditionalBuildFlags: ${{ parameters.AdditionalBuildFlags }} @@ -41,14 +34,12 @@ stages: jobs: - job: MacOS_C_API_Package_Publish pool: - name: 'Azure Pipelines' - image: 'macOS-14' - os: 'macOS' + name: AcesShared + os: macOS + demands: + - ImageOverride -equals ACES_VM_SharedPool_Sequoia templateContext: inputs: - - input: pipelineArtifact - artifactName: onnxruntime-osx-x86_64 # The files in this artifact are not signed - targetPath: $(Build.ArtifactStagingDirectory) - input: pipelineArtifact artifactName: onnxruntime-osx-arm64 # The files in this artifact are not signed targetPath: $(Build.ArtifactStagingDirectory) @@ -64,12 +55,16 @@ stages: versionSpec: '3.13' addToPath: true - - task: PythonScript@0 
- displayName: 'Prepare, Create Universal Binary, and Zip with Python' - inputs: - scriptSource: 'filePath' - scriptPath: 'tools/ci_build/prepare_macos_package.py' - arguments: '--staging_dir $(Build.ArtifactStagingDirectory)' + - script: | + set -ex + cd $(Build.ArtifactStagingDirectory) + # Find and extract the arm64 tarball + find . -name 'onnxruntime-osx-arm64*.tgz' -exec tar -xzf {} \; + # Remove _manifest directories if they exist + find . -type d -name '_manifest' -exec rm -rf {} + || true + # Find the extracted directory and zip it + find . -maxdepth 1 -type d -name 'onnxruntime-osx-arm64*' -exec zip -FSr --symlinks {}.zip {} \; + displayName: 'Prepare ARM64 Package for Signing' - template: mac-esrp-dylib.yml parameters: diff --git a/tools/ci_build/github/azure-pipelines/templates/mac-cpu-packaging-steps.yml b/tools/ci_build/github/azure-pipelines/templates/mac-cpu-packaging-steps.yml index c43bfe2886f22..8e454f2137ce8 100644 --- a/tools/ci_build/github/azure-pipelines/templates/mac-cpu-packaging-steps.yml +++ b/tools/ci_build/github/azure-pipelines/templates/mac-cpu-packaging-steps.yml @@ -1,9 +1,7 @@ parameters: - name: MacosArch type: string - values: - - 'x86_64' - - 'arm64' + default: 'arm64' - name: AdditionalBuildFlags displayName: Additional build flags for build.py @@ -21,11 +19,6 @@ steps: make install DESTDIR=$(Build.BinariesDirectory)/installed displayName: 'Build ${{ parameters.MacosArch }}' -- ${{ if eq(parameters.MacosArch, 'x86_64') }}: - - script: | - python3 $(Build.SourcesDirectory)/tools/ci_build/build.py --test ${{ parameters.AdditionalBuildFlags }} --build_dir $(Build.BinariesDirectory) --skip_submodule_sync --parallel --use_binskim_compliant_compile_flags --build_shared_lib --config Release --use_vcpkg --use_vcpkg_ms_internal_asset_cache - displayName: 'Running Tests' - - task: ShellScript@2 displayName: 'Copy build artifacts for zipping' inputs: @@ -42,31 +35,13 @@ steps: archiveFile: '$(Build.ArtifactStagingDirectory)/onnxruntime-osx-${{ parameters.MacosArch }}-$(OnnxRuntimeVersion).tgz' replaceExistingArchive: true -- script: | - set -e -x - mkdir -p $(Build.ArtifactStagingDirectory)/testdata - cp $(Build.BinariesDirectory)/Release/libcustom_op_library.dylib $(Build.ArtifactStagingDirectory)/testdata - displayName: 'Copy libcustom_op_library.dylib to ArtifactStagingDirectory' - condition: and(succeeded(), eq('${{ parameters.MacosArch }}', 'x86_64')) - - task: 1ES.PublishPipelineArtifact@1 inputs: targetPath: '$(Build.ArtifactStagingDirectory)' artifactName: 'onnxruntime-osx-${{ parameters.MacosArch }}' -- template: java-api-artifacts-package-and-publish-steps-posix.yml - parameters: - arch: 'osx-${{ parameters.MacosArch }}' - buildConfig: 'Release' - artifactName: 'onnxruntime-java-osx-${{ parameters.MacosArch }}' - libraryName: 'libonnxruntime.dylib' - nativeLibraryName: 'libonnxruntime4j_jni.dylib' - - template: nodejs-artifacts-package-and-publish-steps-posix.yml parameters: - ${{ if eq(parameters.MacosArch, 'x86_64') }}: - arch: x64 - ${{ if eq(parameters.MacosArch, 'arm64') }}: - arch: arm64 + arch: arm64 os: 'darwin' artifactName: 'drop-onnxruntime-nodejs-osx-${{ parameters.MacosArch }}' diff --git a/tools/ci_build/github/azure-pipelines/templates/mac-cpu-packing-jobs.yml b/tools/ci_build/github/azure-pipelines/templates/mac-cpu-packing-jobs.yml index 3ae07ebffdb8c..bfccaef1c9852 100644 --- a/tools/ci_build/github/azure-pipelines/templates/mac-cpu-packing-jobs.yml +++ b/tools/ci_build/github/azure-pipelines/templates/mac-cpu-packing-jobs.yml @@ -1,10 
+1,4 @@ parameters: -- name: MacosArch - type: string - values: - - 'x86_64' - - 'arm64' - - name: AdditionalBuildFlags displayName: Additional build flags for build.py type: string @@ -20,35 +14,30 @@ parameters: - 0 jobs: -- job: MacOS_C_API_Packaging_CPU_${{ parameters.MacosArch }} +- job: MacOS_C_API_Packaging_CPU_arm64 workspace: clean: all variables: MACOSX_DEPLOYMENT_TARGET: '14.0' ALLOW_RELEASED_ONNX_OPSET_ONLY: ${{ parameters.AllowReleasedOpsetOnly }} pool: - name: "Azure Pipelines" - image: 'macOS-15' - os: macOS + name: AcesShared + os: macOS + demands: + - ImageOverride -equals ACES_VM_SharedPool_Sequoia timeoutInMinutes: 300 steps: - checkout: self clean: true submodules: none - - task: JavaToolInstaller@0 - inputs: - versionSpec: "17" - jdkArchitectureOption: "x64" - jdkSourceOption: 'PreInstalled' - - template: use-xcode-version.yml parameters: xcodeVersion: 16.4 - template: setup-build-tools.yml parameters: - host_cpu_arch: ${{ parameters.MacosArch }} + host_cpu_arch: arm64 - template: set-version-number-variables-step.yml @@ -58,14 +47,7 @@ jobs: export CMAKE_ARGS="-DONNX_GEN_PB_TYPE_STUBS=ON -DONNX_WERROR=OFF" python3 -m pip install -r '$(Build.SourcesDirectory)/tools/ci_build/github/linux/docker/scripts/requirements.txt' - - ${{ if eq(parameters.MacosArch, 'arm64') }}: - - template: mac-cpu-packaging-steps.yml - parameters: - MacosArch: ${{ parameters.MacosArch }} - AdditionalBuildFlags: ${{ parameters.AdditionalBuildFlags }} --build_nodejs --build_java --use_coreml --use_webgpu --cmake_extra_defines CMAKE_OSX_ARCHITECTURES=arm64 - - - ${{ if eq(parameters.MacosArch, 'x86_64') }}: - - template: mac-cpu-packaging-steps.yml - parameters: - MacosArch: ${{ parameters.MacosArch }} - AdditionalBuildFlags: ${{ parameters.AdditionalBuildFlags }} --build_nodejs --build_java --use_coreml --use_webgpu --cmake_extra_defines CMAKE_OSX_ARCHITECTURES=x86_64 \ No newline at end of file + - template: mac-cpu-packaging-steps.yml + parameters: + MacosArch: arm64 + AdditionalBuildFlags: ${{ parameters.AdditionalBuildFlags }} --build_nodejs --use_coreml --use_webgpu --cmake_extra_defines CMAKE_OSX_ARCHITECTURES=arm64 diff --git a/tools/ci_build/github/azure-pipelines/templates/py-macos.yml b/tools/ci_build/github/azure-pipelines/templates/py-macos.yml index 6fb560d7fec7e..b59de879e2984 100644 --- a/tools/ci_build/github/azure-pipelines/templates/py-macos.yml +++ b/tools/ci_build/github/azure-pipelines/templates/py-macos.yml @@ -24,9 +24,10 @@ jobs: workspace: clean: all pool: - name: "Azure Pipelines" - image: "macOS-15" + name: AcesShared os: macOS + demands: + - ImageOverride -equals ACES_VM_SharedPool_Sequoia templateContext: outputs: - output: pipelineArtifact @@ -44,7 +45,7 @@ jobs: - template: use-xcode-version.yml parameters: - xcodeVersion: '16.4.0' + xcodeVersion: '16.4' - template: setup-build-tools.yml diff --git a/tools/ci_build/github/azure-pipelines/templates/py-packaging-linux-test-cuda.yml b/tools/ci_build/github/azure-pipelines/templates/py-packaging-linux-test-cuda.yml index 1415586521f30..263f73a9e29b0 100644 --- a/tools/ci_build/github/azure-pipelines/templates/py-packaging-linux-test-cuda.yml +++ b/tools/ci_build/github/azure-pipelines/templates/py-packaging-linux-test-cuda.yml @@ -18,9 +18,9 @@ parameters: - name: cuda_version type: string - default: '12.2' + default: '12.8' values: - - 12.2 + - 12.8 # TODO: Ideally it should fetch information from the build that triggers it - name: cmake_build_type @@ -46,7 +46,7 @@ jobs: - name: trt_version ${{ if 
eq(parameters.cuda_version, '11.8') }}: value: ${{ variables.linux_trt_version_cuda11 }} - ${{ if eq(parameters.cuda_version, '12.2') }}: + ${{ if eq(parameters.cuda_version, '12.8') }}: value: ${{ variables.linux_trt_version_cuda12 }} workspace: clean: all diff --git a/tools/ci_build/github/azure-pipelines/templates/react-native-ci.yml b/tools/ci_build/github/azure-pipelines/templates/react-native-ci.yml index f20172e1c70a6..8018da41fbc2d 100644 --- a/tools/ci_build/github/azure-pipelines/templates/react-native-ci.yml +++ b/tools/ci_build/github/azure-pipelines/templates/react-native-ci.yml @@ -26,9 +26,6 @@ parameters: - name: enable_code_sign displayName: Use GPG to sign the jars type: boolean -- name: is1ES - type: boolean - default: false stages: - stage: Build_Android_Packages @@ -44,7 +41,7 @@ stages: enable_code_sign: '${{parameters.enable_code_sign}}' pool_name: '${{parameters.PoolName}}' packageName: 'onnxruntime-android' - is1ES: '${{parameters.is1ES}}' + is1ES: true - stage: ReactNative_CI_Android displayName: ReactNative_CI_Android @@ -55,34 +52,21 @@ stages: PackageName: '${{parameters.PackageName}}' ArtifactName: 'onnxruntime-android-full-aar' NpmPackagingMode: '${{parameters.NpmPackagingMode}}' - is1ES: '${{parameters.is1ES}}' - stage: ReactNative_CI_iOS displayName: ReactNative_CI_iOS dependsOn: '${{parameters.InitialStageDependsOn}}' variables: - - name: publishPipelineArtifactTask - ${{ if eq(parameters.is1ES, true) }}: - value: 1ES.PublishPipelineArtifact@1 - ${{ else }}: - value: PublishPipelineArtifact@1 jobs: - job: ReactNative_CI_iOS_build - - ${{ if eq(parameters.is1ES, false) }}: - pool: - vmImage: 'macOS-14' - ${{ if eq(parameters.is1ES, true) }}: - pool: - name: 'Azure Pipelines' - image: 'macOS-14' - os: 'macOS' - + pool: + name: AcesShared + os: macOS + demands: + - ImageOverride -equals ACES_VM_SharedPool_Sequoia timeoutInMinutes: 120 - variables: runCodesignValidationInjection: false - steps: - template: use-xcode-version.yml @@ -106,7 +90,7 @@ stages: --build-settings-file $(Build.SourcesDirectory)/tools/ci_build/github/js/react_native_e2e_full_ios_framework_build_settings.json displayName: Build iOS package and assemble pods - - task: ${{ variables.publishPipelineArtifactTask }} + - task: 1ES.PublishPipelineArtifact@1 inputs: artifactName: 'ios_pod' targetPath: '$(Build.BinariesDirectory)/ios_pod' @@ -114,16 +98,11 @@ stages: - job: ReactNative_CI_iOS_unit_tests dependsOn: 'ReactNative_CI_iOS_build' - - ${{ if eq(parameters.is1ES, false) }}: - pool: - vmImage: 'macOS-14' - ${{ if eq(parameters.is1ES, true) }}: - pool: - name: 'Azure Pipelines' - image: 'macOS-14' - os: 'macOS' - + pool: + name: AcesShared + os: macOS + demands: + - ImageOverride -equals ACES_VM_SharedPool_Sequoia timeoutInMinutes: 90 steps: diff --git a/tools/ci_build/github/azure-pipelines/templates/setup-build-tools.yml b/tools/ci_build/github/azure-pipelines/templates/setup-build-tools.yml index df7fea537ce6f..548ff8a54a854 100644 --- a/tools/ci_build/github/azure-pipelines/templates/setup-build-tools.yml +++ b/tools/ci_build/github/azure-pipelines/templates/setup-build-tools.yml @@ -16,13 +16,12 @@ parameters: steps: - template: telemetry-steps.yml -# Currently all ADO macOS machines are x64 machines - task: UsePythonVersion@0 displayName: 'Use Python ${{ parameters.host_cpu_arch }} (macOS)' condition: and(succeeded(), eq(variables['Agent.OS'], 'Darwin')) inputs: versionSpec: ${{ parameters.python_version }} - architecture: 'x64' + architecture: ${{ parameters.host_cpu_arch }} - task: 
UsePythonVersion@0 displayName: 'Use Python ${{ parameters.host_cpu_arch }} (non-macOS)' diff --git a/tools/ci_build/github/azure-pipelines/templates/stages/mac-ios-packaging-build-stage.yml b/tools/ci_build/github/azure-pipelines/templates/stages/mac-ios-packaging-build-stage.yml index e0b58e68e24cb..e9f170ff60301 100644 --- a/tools/ci_build/github/azure-pipelines/templates/stages/mac-ios-packaging-build-stage.yml +++ b/tools/ci_build/github/azure-pipelines/templates/stages/mac-ios-packaging-build-stage.yml @@ -17,8 +17,8 @@ stages: # Note: Keep the Xcode version and iOS simulator version compatible. # Check the table here to see what iOS simulator versions are supported by a particular Xcode version: # https://developer.apple.com/support/xcode/ - xcodeVersion: "15.3.0" - iosSimulatorRuntimeVersion: "17.4" + xcodeVersion: "16.4" + iosSimulatorRuntimeVersion: "18.5" buildSettingsFile: "tools/ci_build/github/apple/default_full_apple_framework_build_settings.json" cPodName: onnxruntime-c objcPodName: onnxruntime-objc diff --git a/tools/ci_build/github/azure-pipelines/templates/win-ci.yml b/tools/ci_build/github/azure-pipelines/templates/win-ci.yml index 0310735d94b2e..ca698123a04e7 100644 --- a/tools/ci_build/github/azure-pipelines/templates/win-ci.yml +++ b/tools/ci_build/github/azure-pipelines/templates/win-ci.yml @@ -78,7 +78,7 @@ parameters: default: '11.8' values: - 11.8 - - 12.2 + - 12.8 - name: SpecificArtifact displayName: Use Specific Artifact @@ -136,7 +136,7 @@ stages: ${{ if contains(parameters.ort_build_pool_name, 'GPU') }}: pool: - name: onnxruntime-Win-CPU-2022 + name: onnxruntime-Win-CPU-VS2022-Latest os: windows ${{ else }}: pool: diff --git a/tools/ci_build/github/azure-pipelines/win-gpu-doc-gen-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/win-gpu-doc-gen-ci-pipeline.yml index c20f4a2c1bd19..8b320b0ceb4ac 100644 --- a/tools/ci_build/github/azure-pipelines/win-gpu-doc-gen-ci-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/win-gpu-doc-gen-ci-pipeline.yml @@ -32,10 +32,10 @@ parameters: - name: CudaVersion displayName: CUDA version type: string - default: '12.2' + default: '12.8' values: - 11.8 - - 12.2 + - 12.8 stages: - stage: kernelDocumentation diff --git a/tools/ci_build/github/azure-pipelines/win-gpu-tensorrt-cuda-minimal-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/win-gpu-tensorrt-cuda-minimal-ci-pipeline.yml index c12bb3552920c..08953749f6527 100644 --- a/tools/ci_build/github/azure-pipelines/win-gpu-tensorrt-cuda-minimal-ci-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/win-gpu-tensorrt-cuda-minimal-ci-pipeline.yml @@ -31,16 +31,16 @@ parameters: - name: CudaVersion displayName: CUDA version type: string - default: '12.2' + default: '12.8' values: - - 12.2 + - 12.8 variables: - template: templates/common-variables.yml - name: win_trt_folder ${{ if eq(parameters.CudaVersion, '11.8') }}: value: ${{ variables.win_trt_folder_cuda11 }} - ${{ if eq(parameters.CudaVersion, '12.2') }}: + ${{ if eq(parameters.CudaVersion, '12.8') }}: value: ${{ variables.win_trt_folder_cuda12 }} jobs: diff --git a/tools/ci_build/github/linux/build_cuda_c_api_package.sh b/tools/ci_build/github/linux/build_cuda_c_api_package.sh index 9cc140f41cf91..2f3ac991aee9c 100755 --- a/tools/ci_build/github/linux/build_cuda_c_api_package.sh +++ b/tools/ci_build/github/linux/build_cuda_c_api_package.sh @@ -2,4 +2,4 @@ set -e -x docker run -e SYSTEM_COLLECTIONURI --rm --volume \ $BUILD_SOURCESDIRECTORY:/onnxruntime_src --volume $BUILD_BINARIESDIRECTORY:/build -e 
NIGHTLY_BUILD onnxruntimecuda${CUDA_VERSION_MAJOR}build \ -/bin/bash -c "/usr/bin/python3 /onnxruntime_src/tools/ci_build/build.py --enable_lto --build_java --build_nodejs --build_dir /build --config Release --skip_submodule_sync --parallel --use_binskim_compliant_compile_flags --build_shared_lib --use_cuda --cuda_version=$CUDA_VERSION --cuda_home=/usr/local/cuda-$CUDA_VERSION --cudnn_home=/usr/local/cuda-$CUDA_VERSION --skip_tests --use_vcpkg --use_vcpkg_ms_internal_asset_cache --cmake_extra_defines 'CMAKE_CUDA_ARCHITECTURES=60-real;70-real;75-real;80-real;90a-real;90a-virtual' 'onnxruntime_USE_FPA_INTB_GEMM=OFF' && cd /build/Release && make install DESTDIR=/build/installed" +/bin/bash -c "/usr/bin/python3 /onnxruntime_src/tools/ci_build/build.py --enable_lto --build_java --build_nodejs --build_dir /build --config Release --skip_submodule_sync --parallel --use_binskim_compliant_compile_flags --build_shared_lib --use_cuda --cuda_version=$CUDA_VERSION --cuda_home=/usr/local/cuda-$CUDA_VERSION --cudnn_home=/usr/local/cuda-$CUDA_VERSION --skip_tests --use_vcpkg --use_vcpkg_ms_internal_asset_cache --cmake_extra_defines 'CMAKE_CUDA_ARCHITECTURES=60-real;70-real;75-real;80-real;90a-real;90-virtual' 'onnxruntime_USE_FPA_INTB_GEMM=OFF' && cd /build/Release && make install DESTDIR=/build/installed" diff --git a/tools/ci_build/github/linux/build_linux_python_package.sh b/tools/ci_build/github/linux/build_linux_python_package.sh index 65be0c7b60ead..62bf8b4a245bb 100755 --- a/tools/ci_build/github/linux/build_linux_python_package.sh +++ b/tools/ci_build/github/linux/build_linux_python_package.sh @@ -69,7 +69,7 @@ fi if [ "$BUILD_DEVICE" == "GPU" ]; then SHORT_CUDA_VERSION=$(echo $CUDA_VERSION | sed 's/\([[:digit:]]\+\.[[:digit:]]\+\)\.[[:digit:]]\+/\1/') #Enable CUDA and TRT EPs. 
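The sed expression above trims a full CUDA version string such as 12.8.1 down to the major.minor form (12.8) that the cuda_home and cudnn_home paths expect, while a value that is already major.minor passes through unchanged. A rough, illustrative Python equivalent of that pattern (not part of the patch):

import re

def short_cuda_version(cuda_version: str) -> str:
    # Keep the leading "major.minor" and drop a trailing ".patch" component if present.
    return re.sub(r"^(\d+\.\d+)\.\d+$", r"\1", cuda_version)

assert short_cuda_version("12.8.1") == "12.8"   # full version is shortened
assert short_cuda_version("11.8") == "11.8"     # already short, left as-is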
- BUILD_ARGS+=("--use_cuda" "--use_tensorrt" "--cuda_version=$SHORT_CUDA_VERSION" "--tensorrt_home=/usr" "--cuda_home=/usr/local/cuda-$SHORT_CUDA_VERSION" "--cudnn_home=/usr/local/cuda-$SHORT_CUDA_VERSION" "--nvcc_threads=1" "--cmake_extra_defines" "CMAKE_CUDA_ARCHITECTURES=60-real;70-real;75-real;80-real;86-real;90a-real;90a-virtual" "onnxruntime_USE_FPA_INTB_GEMM=OFF") + BUILD_ARGS+=("--use_cuda" "--use_tensorrt" "--cuda_version=$SHORT_CUDA_VERSION" "--tensorrt_home=/usr" "--cuda_home=/usr/local/cuda-$SHORT_CUDA_VERSION" "--cudnn_home=/usr/local/cuda-$SHORT_CUDA_VERSION" "--nvcc_threads=1" "--cmake_extra_defines" "CMAKE_CUDA_ARCHITECTURES=60-real;70-real;75-real;80-real;86-real;90a-real;90-virtual" "onnxruntime_USE_FPA_INTB_GEMM=OFF") fi if [ "$BUILD_DEVICE" == "NPU" ]; then diff --git a/tools/ci_build/github/linux/build_nodejs_package.sh b/tools/ci_build/github/linux/build_nodejs_package.sh index cc6443cc7fab6..ff5c504376d1d 100755 --- a/tools/ci_build/github/linux/build_nodejs_package.sh +++ b/tools/ci_build/github/linux/build_nodejs_package.sh @@ -3,4 +3,4 @@ set -e -x mkdir -p $HOME/.onnx docker run -e SYSTEM_COLLECTIONURI --rm --volume /data/onnx:/data/onnx:ro --volume $BUILD_SOURCESDIRECTORY:/onnxruntime_src --volume $BUILD_BINARIESDIRECTORY:/build \ --volume /data/models:/build/models:ro --volume $HOME/.onnx:/home/onnxruntimedev/.onnx -e NIGHTLY_BUILD onnxruntimecuda${CUDA_VERSION_MAJOR}xtrt86build \ -/bin/bash -c "/usr/bin/python3 /onnxruntime_src/tools/ci_build/build.py --build_dir /build --config Release --skip_tests --skip_submodule_sync --parallel --use_binskim_compliant_compile_flags --build_shared_lib --build_nodejs --use_webgpu --use_tensorrt --cuda_version=$CUDA_VERSION --cuda_home=/usr/local/cuda-$CUDA_VERSION --cudnn_home=/usr --tensorrt_home=/usr --cmake_extra_defines 'CMAKE_CUDA_ARCHITECTURES=60-real;70-real;75-real;80-real;90a-real;90a-virtual' --use_vcpkg --use_vcpkg_ms_internal_asset_cache && cd /build/Release && make install DESTDIR=/build/installed" +/bin/bash -c "/usr/bin/python3 /onnxruntime_src/tools/ci_build/build.py --build_dir /build --config Release --skip_tests --skip_submodule_sync --parallel --use_binskim_compliant_compile_flags --build_shared_lib --build_nodejs --use_webgpu --use_tensorrt --cuda_version=$CUDA_VERSION --cuda_home=/usr/local/cuda-$CUDA_VERSION --cudnn_home=/usr --tensorrt_home=/usr --cmake_extra_defines 'CMAKE_CUDA_ARCHITECTURES=60-real;70-real;75-real;80-real;90a-real;90-virtual' --use_vcpkg --use_vcpkg_ms_internal_asset_cache && cd /build/Release && make install DESTDIR=/build/installed" diff --git a/tools/ci_build/github/linux/build_tensorrt_c_api_package.sh b/tools/ci_build/github/linux/build_tensorrt_c_api_package.sh index b8d968c82d002..c0849bf0ace73 100755 --- a/tools/ci_build/github/linux/build_tensorrt_c_api_package.sh +++ b/tools/ci_build/github/linux/build_tensorrt_c_api_package.sh @@ -3,4 +3,4 @@ set -e -x mkdir -p $HOME/.onnx docker run -e SYSTEM_COLLECTIONURI --rm --volume /data/onnx:/data/onnx:ro --volume $BUILD_SOURCESDIRECTORY:/onnxruntime_src --volume $BUILD_BINARIESDIRECTORY:/build \ --volume /data/models:/build/models:ro --volume $HOME/.onnx:/home/onnxruntimedev/.onnx -e NIGHTLY_BUILD onnxruntimecuda${CUDA_VERSION_MAJOR}xtrt86build \ -/bin/bash -c "/usr/bin/python3 /onnxruntime_src/tools/ci_build/build.py --build_dir /build --config Release --skip_tests --skip_submodule_sync --parallel --use_binskim_compliant_compile_flags --build_shared_lib --build_java --build_nodejs --use_tensorrt --cuda_version=$CUDA_VERSION 
--cuda_home=/usr/local/cuda-$CUDA_VERSION --cudnn_home=/usr --tensorrt_home=/usr --cmake_extra_defines 'CMAKE_CUDA_ARCHITECTURES=60-real;70-real;75-real;80-real;90a-real;90a-virtual' 'onnxruntime_USE_FPA_INTB_GEMM=OFF' --use_vcpkg --use_vcpkg_ms_internal_asset_cache && cd /build/Release && make install DESTDIR=/build/installed" +/bin/bash -c "/usr/bin/python3 /onnxruntime_src/tools/ci_build/build.py --build_dir /build --config Release --skip_tests --skip_submodule_sync --parallel --use_binskim_compliant_compile_flags --build_shared_lib --build_java --build_nodejs --use_tensorrt --cuda_version=$CUDA_VERSION --cuda_home=/usr/local/cuda-$CUDA_VERSION --cudnn_home=/usr --tensorrt_home=/usr --cmake_extra_defines 'CMAKE_CUDA_ARCHITECTURES=60-real;70-real;75-real;80-real;90a-real;90-virtual' 'onnxruntime_USE_FPA_INTB_GEMM=OFF' --use_vcpkg --use_vcpkg_ms_internal_asset_cache && cd /build/Release && make install DESTDIR=/build/installed" diff --git a/tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_cpu b/tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_cpu index 2a65e7c26b20b..a277286866e41 100644 --- a/tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_cpu +++ b/tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_cpu @@ -1,4 +1,4 @@ -ARG BASEIMAGE=onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cpu_x64_almalinux8_gcc14:20250724.1 +ARG BASEIMAGE=onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cpu_x64_almalinux8_gcc14:20251008.2 FROM $BASEIMAGE ENV JAVA_HOME=/usr/lib/jvm/msopenjdk-17 diff --git a/tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_rocm b/tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_rocm index 3337af3be6074..5410bd64036ce 100644 --- a/tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_rocm +++ b/tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_rocm @@ -1,4 +1,4 @@ -ARG BASEIMAGE=onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cpu_x64_almalinux8_gcc14:20250724.1 +ARG BASEIMAGE=onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cpu_x64_almalinux8_gcc14:20251008.2 FROM $BASEIMAGE ARG ROCM_VERSION=6.2.3 diff --git a/tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_webgpu b/tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_webgpu index 0007a4e06f7c0..07ad8e933baf0 100644 --- a/tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_webgpu +++ b/tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_webgpu @@ -1,4 +1,4 @@ -ARG BASEIMAGE=onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cpu_x64_almalinux8_gcc14:20250724.1 +ARG BASEIMAGE=onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cpu_x64_almalinux8_gcc14:20251008.2 FROM $BASEIMAGE ENV JAVA_HOME=/usr/lib/jvm/msopenjdk-17 diff --git a/tools/ci_build/github/linux/docker/Dockerfile.package_ubuntu_2004_gpu b/tools/ci_build/github/linux/docker/Dockerfile.package_ubuntu_2004_gpu index 8a84b9b940306..5d98c25b535af 100644 --- a/tools/ci_build/github/linux/docker/Dockerfile.package_ubuntu_2004_gpu +++ b/tools/ci_build/github/linux/docker/Dockerfile.package_ubuntu_2004_gpu @@ -5,7 +5,7 @@ # Dockerfile to run ONNXRuntime with TensorRT integration # Build base image with required system packages -ARG BASEIMAGE=nvidia/cuda:12.2.2-cudnn8-devel-ubuntu20.04 +ARG BASEIMAGE=nvidia/cuda:12.8.1-cudnn-devel-ubuntu20.04 ARG TRT_VERSION=10.9.0.34-1+cuda12.8 ARG LD_LIBRARY_PATH_ARG=/usr/local/lib64:/usr/local/cuda/lib64 FROM $BASEIMAGE AS 
base diff --git a/tools/ci_build/github/linux/docker/inference/aarch64/default/cpu/Dockerfile b/tools/ci_build/github/linux/docker/inference/aarch64/default/cpu/Dockerfile index 8b2083c2ccfc1..cef2d11780969 100644 --- a/tools/ci_build/github/linux/docker/inference/aarch64/default/cpu/Dockerfile +++ b/tools/ci_build/github/linux/docker/inference/aarch64/default/cpu/Dockerfile @@ -2,7 +2,7 @@ # Licensed under the MIT License. # This file is used by Zip-Nuget Packaging NoContribOps Pipeline,Zip-Nuget-Java Packaging Pipeline -ARG BASEIMAGE=onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cpu_aarch64_almalinux8_gcc14_dotnet:20250724.1 +ARG BASEIMAGE=onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cpu_aarch64_almalinux8_gcc14_dotnet:20251008.2 FROM $BASEIMAGE ENV LANG=en_US.UTF-8 diff --git a/tools/ci_build/github/linux/docker/inference/aarch64/python/cpu/Dockerfile b/tools/ci_build/github/linux/docker/inference/aarch64/python/cpu/Dockerfile index f5143d5ac9ab9..79d99d08dcc4e 100644 --- a/tools/ci_build/github/linux/docker/inference/aarch64/python/cpu/Dockerfile +++ b/tools/ci_build/github/linux/docker/inference/aarch64/python/cpu/Dockerfile @@ -1,4 +1,4 @@ -ARG BASEIMAGE=onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cpu_aarch64_almalinux8_gcc14:20250724.1 +ARG BASEIMAGE=onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cpu_aarch64_almalinux8_gcc14:20251008.2 FROM $BASEIMAGE ADD scripts /tmp/scripts diff --git a/tools/ci_build/github/linux/docker/inference/aarch64/python/cpu/scripts/requirements.txt b/tools/ci_build/github/linux/docker/inference/aarch64/python/cpu/scripts/requirements.txt index bae6f4cb51816..1b1dadeaf8db2 100644 --- a/tools/ci_build/github/linux/docker/inference/aarch64/python/cpu/scripts/requirements.txt +++ b/tools/ci_build/github/linux/docker/inference/aarch64/python/cpu/scripts/requirements.txt @@ -3,7 +3,7 @@ mypy pytest setuptools>=68.2.2 wheel -onnx==1.19.0 +onnx==1.19.1 protobuf==4.25.8 sympy==1.14 flatbuffers diff --git a/tools/ci_build/github/linux/docker/inference/x86_64/default/cpu/Dockerfile b/tools/ci_build/github/linux/docker/inference/x86_64/default/cpu/Dockerfile index cfc2ce7079148..72d98206f9205 100644 --- a/tools/ci_build/github/linux/docker/inference/x86_64/default/cpu/Dockerfile +++ b/tools/ci_build/github/linux/docker/inference/x86_64/default/cpu/Dockerfile @@ -2,7 +2,7 @@ # Licensed under the MIT License. # This file is used by Zip-Nuget Packaging NoContribOps Pipeline,Zip-Nuget-Java Packaging Pipeline -ARG BASEIMAGE=onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cpu_x64_almalinux8_gcc14_dotnet:20250724.1 +ARG BASEIMAGE=onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cpu_x64_almalinux8_gcc14_dotnet:20251008.2 FROM $BASEIMAGE ENV LANG=en_US.UTF-8 diff --git a/tools/ci_build/github/linux/docker/inference/x86_64/default/cuda12/Dockerfile b/tools/ci_build/github/linux/docker/inference/x86_64/default/cuda12/Dockerfile index 8401393a661b1..85f4a074e30bf 100644 --- a/tools/ci_build/github/linux/docker/inference/x86_64/default/cuda12/Dockerfile +++ b/tools/ci_build/github/linux/docker/inference/x86_64/default/cuda12/Dockerfile @@ -2,7 +2,7 @@ # Licensed under the MIT License. 
# This file is used by Zip-Nuget Packaging NoContribOps Pipeline,Zip-Nuget-Java Packaging Pipeline -ARG BASEIMAGE=onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda12_x64_almalinux8_gcc12_dotnet:20250724.1 +ARG BASEIMAGE=onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda12_x64_almalinux8_gcc12_dotnet:20251008.2 FROM $BASEIMAGE ARG TRT_VERSION diff --git a/tools/ci_build/github/linux/docker/inference/x86_64/python/cpu/Dockerfile b/tools/ci_build/github/linux/docker/inference/x86_64/python/cpu/Dockerfile index b923febc1227f..81ba47f397f91 100644 --- a/tools/ci_build/github/linux/docker/inference/x86_64/python/cpu/Dockerfile +++ b/tools/ci_build/github/linux/docker/inference/x86_64/python/cpu/Dockerfile @@ -1,4 +1,4 @@ -ARG BASEIMAGE=onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cpu_x64_almalinux8_gcc14:20250724.1 +ARG BASEIMAGE=onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cpu_x64_almalinux8_gcc14:20251008.2 FROM $BASEIMAGE ADD scripts /tmp/scripts diff --git a/tools/ci_build/github/linux/docker/inference/x86_64/python/openvino/Dockerfile b/tools/ci_build/github/linux/docker/inference/x86_64/python/openvino/Dockerfile index f3341f32a768d..5ad1023bfb5b2 100644 --- a/tools/ci_build/github/linux/docker/inference/x86_64/python/openvino/Dockerfile +++ b/tools/ci_build/github/linux/docker/inference/x86_64/python/openvino/Dockerfile @@ -1,5 +1,5 @@ # Use the specified UBI8 base image with GCC 14 -ARG BASEIMAGE="onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cpu_x64_almalinux8_gcc14:20250724.1" +ARG BASEIMAGE="onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cpu_x64_almalinux8_gcc14:20251008.2" FROM ${BASEIMAGE} ARG BUILD_UID=1000 diff --git a/tools/ci_build/github/linux/docker/scripts/lort/requirements.txt b/tools/ci_build/github/linux/docker/scripts/lort/requirements.txt index 2871f5cab2ea2..dc394ff50f4f9 100644 --- a/tools/ci_build/github/linux/docker/scripts/lort/requirements.txt +++ b/tools/ci_build/github/linux/docker/scripts/lort/requirements.txt @@ -3,7 +3,7 @@ beartype==0.15.0 flatbuffers cerberus h5py -onnx==1.19.0 +onnx==1.19.1 # Python dependencies required for pytorch development astunparse expecttest!=0.2.0 diff --git a/tools/ci_build/github/linux/docker/scripts/manylinux/requirements.txt b/tools/ci_build/github/linux/docker/scripts/manylinux/requirements.txt index 381d42831e715..2d89aece56340 100644 --- a/tools/ci_build/github/linux/docker/scripts/manylinux/requirements.txt +++ b/tools/ci_build/github/linux/docker/scripts/manylinux/requirements.txt @@ -3,7 +3,7 @@ mypy pytest setuptools>=68.2.2 wheel -onnx==1.19.0 +onnx==1.19.1 protobuf==4.25.1 sympy==1.14 flatbuffers diff --git a/tools/ci_build/github/linux/docker/scripts/requirements.txt b/tools/ci_build/github/linux/docker/scripts/requirements.txt index 4cc94f9148656..c19c0170291e6 100644 --- a/tools/ci_build/github/linux/docker/scripts/requirements.txt +++ b/tools/ci_build/github/linux/docker/scripts/requirements.txt @@ -4,11 +4,11 @@ mypy pytest setuptools==78.1.1 wheel==0.45.1 -onnx==1.19.0 +onnx==1.19.1 argparse sympy==1.14 flatbuffers protobuf==4.25.1 packaging -onnxscript==0.3.2 -onnx-ir +onnxscript==0.5.3 +onnx-ir==0.1.10 diff --git a/tools/ci_build/github/linux/python/requirements.txt b/tools/ci_build/github/linux/python/requirements.txt index d48fb66194f2a..3ddce9cc0ec31 100644 --- a/tools/ci_build/github/linux/python/requirements.txt +++ b/tools/ci_build/github/linux/python/requirements.txt @@ 
-3,12 +3,12 @@ mypy pytest setuptools>=68.2.2 wheel -onnx==1.19.0 +onnx==1.19.1 protobuf==4.25.1 sympy==1.14 flatbuffers psutil -onnxscript==0.3.2 -onnx-ir +onnxscript==0.5.3 +onnx-ir==0.1.10 jinja2 markupsafe diff --git a/tools/ci_build/github/windows/jar_packaging.py b/tools/ci_build/github/windows/jar_packaging.py index 2354363610251..b399782e9410f 100644 --- a/tools/ci_build/github/windows/jar_packaging.py +++ b/tools/ci_build/github/windows/jar_packaging.py @@ -33,7 +33,12 @@ def find_7z_executable(): if seven_zip_exe: return seven_zip_exe - # 2. Check the default installation directory under Program Files + # 2. Check if '7za' is in the PATH (common on Linux systems) + seven_zip_exe = shutil.which("7za") + if seven_zip_exe: + return seven_zip_exe + + # 3. Check the default installation directory under Program Files program_files = os.environ.get("ProgramFiles") # noqa: SIM112 if program_files: default_path = Path(program_files) / "7-Zip" / "7z.exe" @@ -226,9 +231,7 @@ def run_packaging(package_type: str, build_dir: str): "cpu": { "platforms": [ {"path": "onnxruntime-java-linux-x64", "lib": "libcustom_op_library.so", "archive_lib": True}, - {"path": "onnxruntime-java-osx-x86_64", "lib": "libcustom_op_library.dylib", "archive_lib": True}, {"path": "onnxruntime-java-linux-aarch64", "lib": "libcustom_op_library.so", "archive_lib": False}, - {"path": "onnxruntime-java-osx-arm64", "lib": "libcustom_op_library.dylib", "archive_lib": False}, ] }, "gpu": { diff --git a/tools/ci_build/github/windows/jar_packaging_test.py b/tools/ci_build/github/windows/jar_packaging_test.py index 91b68728dad15..2dd61cf9c3088 100644 --- a/tools/ci_build/github/windows/jar_packaging_test.py +++ b/tools/ci_build/github/windows/jar_packaging_test.py @@ -31,7 +31,6 @@ def _setup_test_directory(package_type: str, version_string: str): java_artifact_dir = tmp_path / "java-artifact" win_dir = java_artifact_dir / "onnxruntime-java-win-x64" linux_dir = java_artifact_dir / "onnxruntime-java-linux-x64" - osx_dir = java_artifact_dir / "onnxruntime-java-osx-x86_64" # --- Main artifact directory (Windows) --- win_dir.mkdir(parents=True, exist_ok=True) @@ -53,26 +52,14 @@ def _setup_test_directory(package_type: str, version_string: str): create_empty_file(linux_native_dir / "libonnxruntime_providers_cuda.so") (linux_dir / "_manifest" / "spdx_2.2").mkdir(parents=True, exist_ok=True) - # --- macOS and other platforms (for CPU test) --- + # --- Additional platforms (for CPU test) --- if package_type == "cpu": - osx_native_dir = osx_dir / "ai" / "onnxruntime" / "native" / "osx-x86_64" - osx_native_dir.mkdir(parents=True, exist_ok=True) - create_empty_file(osx_dir / "libcustom_op_library.dylib") - create_empty_file(osx_native_dir / "libonnxruntime.dylib") - create_empty_file(osx_native_dir / "libonnxruntime4j_jni.dylib") - (osx_dir / "_manifest" / "spdx_2.2").mkdir(parents=True, exist_ok=True) - - # Add linux-aarch64 and osx-arm64 for CPU test + # Add linux-aarch64 for CPU test linux_aarch64_dir = java_artifact_dir / "onnxruntime-java-linux-aarch64" linux_aarch64_native_dir = linux_aarch64_dir / "ai" / "onnxruntime" / "native" / "linux-aarch64" linux_aarch64_native_dir.mkdir(parents=True, exist_ok=True) create_empty_file(linux_aarch64_dir / "libcustom_op_library.so") - osx_arm64_dir = java_artifact_dir / "onnxruntime-java-osx-arm64" - osx_arm64_native_dir = osx_arm64_dir / "ai" / "onnxruntime" / "native" / "osx-arm64" - osx_arm64_native_dir.mkdir(parents=True, exist_ok=True) - create_empty_file(osx_arm64_dir / 
"libcustom_op_library.dylib") - return tmp_path return _setup_test_directory @@ -134,9 +121,6 @@ def test_cpu_packaging(directory_setup_factory, version_string): # Linux libs assert "ai/onnxruntime/native/linux-x64/libonnxruntime.so" in jar_contents assert "ai/onnxruntime/native/linux-x64/libonnxruntime4j_jni.so" in jar_contents - # macOS libs - assert "ai/onnxruntime/native/osx-x86_64/libonnxruntime.dylib" in jar_contents - assert "ai/onnxruntime/native/osx-x86_64/libonnxruntime4j_jni.dylib" in jar_contents # GPU libs should NOT be present assert "ai/onnxruntime/native/linux-x64/libonnxruntime_providers_cuda.so" not in jar_contents @@ -144,14 +128,9 @@ def test_cpu_packaging(directory_setup_factory, version_string): with zipfile.ZipFile(testing_jar_path, "r") as zf: jar_contents = zf.namelist() assert "libcustom_op_library.so" in jar_contents - assert "libcustom_op_library.dylib" in jar_contents # 3. Verify the custom op libraries were removed from the source directories linux_dir = temp_build_dir / "java-artifact" / "onnxruntime-java-linux-x64" - osx_dir = temp_build_dir / "java-artifact" / "onnxruntime-java-osx-x86_64" linux_aarch64_dir = temp_build_dir / "java-artifact" / "onnxruntime-java-linux-aarch64" - osx_arm64_dir = temp_build_dir / "java-artifact" / "onnxruntime-java-osx-arm64" assert not (linux_dir / "libcustom_op_library.so").exists() - assert not (osx_dir / "libcustom_op_library.dylib").exists() assert not (linux_aarch64_dir / "libcustom_op_library.so").exists() - assert not (osx_arm64_dir / "libcustom_op_library.dylib").exists() diff --git a/tools/ci_build/github/windows/python/requirements.txt b/tools/ci_build/github/windows/python/requirements.txt index 6ab2ab2b7b61f..bb307a20d7f18 100644 --- a/tools/ci_build/github/windows/python/requirements.txt +++ b/tools/ci_build/github/windows/python/requirements.txt @@ -3,13 +3,13 @@ mypy pytest setuptools>=68.2.2 wheel -onnx==1.19.0 +onnx==1.19.1 protobuf==4.25.1 sympy==1.14 flatbuffers psutil -onnxscript==0.3.2 -onnx-ir +onnxscript==0.5.3 +onnx-ir==0.1.10 jinja2 markupsafe semver diff --git a/tools/ci_build/github/windows/setup_env_cuda.bat b/tools/ci_build/github/windows/setup_env_cuda.bat index f93938e2a9009..f095f58f9920e 100644 --- a/tools/ci_build/github/windows/setup_env_cuda.bat +++ b/tools/ci_build/github/windows/setup_env_cuda.bat @@ -1,13 +1,13 @@ REM Copyright (c) Microsoft Corporation. All rights reserved. REM Licensed under the MIT License. 
-if exist PATH=%AGENT_TEMPDIRECTORY%\v12.2\ ( -set PATH=%AGENT_TEMPDIRECTORY%\v12.2\bin;%AGENT_TEMPDIRECTORY%\v12.2\extras\CUPTI\lib64;%PATH% +if exist PATH=%AGENT_TEMPDIRECTORY%\v12.8\ ( + set PATH=%AGENT_TEMPDIRECTORY%\v12.8\bin;%AGENT_TEMPDIRECTORY%\v12.8\extras\CUPTI\lib64;%PATH% ) else ( - set PATH=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.2\bin;C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.2\extras\CUPTI\lib64;%PATH% + set PATH=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.8\bin;C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.8\extras\CUPTI\lib64;%PATH% ) -@REM The default version is still cuda v12.2, because set cuda v11.8 after it +@REM The default version is still cuda v12.8, because set cuda v11.8 after it if exist PATH=%AGENT_TEMPDIRECTORY%\v11.8\ ( set PATH=%PATH%;%AGENT_TEMPDIRECTORY%\v11.8\bin;%AGENT_TEMPDIRECTORY%\v11.8\extras\CUPTI\lib64 ) else ( diff --git a/tools/ci_build/github/windows/setup_env_gpu.bat b/tools/ci_build/github/windows/setup_env_gpu.bat index ecadab5d3f8a3..115a19b6f3a01 100644 --- a/tools/ci_build/github/windows/setup_env_gpu.bat +++ b/tools/ci_build/github/windows/setup_env_gpu.bat @@ -1,14 +1,14 @@ REM Copyright (c) Microsoft Corporation. All rights reserved. REM Licensed under the MIT License. -if exist PATH=%AGENT_TEMPDIRECTORY%\v12.2\ ( - set PATH=%AGENT_TEMPDIRECTORY%\v12.2\bin;%AGENT_TEMPDIRECTORY%\v12.2\extras\CUPTI\lib64;%PATH% +if exist PATH=%AGENT_TEMPDIRECTORY%\v12.8\ ( + set PATH=%AGENT_TEMPDIRECTORY%\v12.8\bin;%AGENT_TEMPDIRECTORY%\v12.8\extras\CUPTI\lib64;%PATH% ) else ( - set PATH=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.2\bin;C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.2\extras\CUPTI\lib64;%PATH% + set PATH=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.8\bin;C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.8\extras\CUPTI\lib64;%PATH% ) set PATH=%AGENT_TEMPDIRECTORY%\TensorRT-10.9.0.34.Windows10.x86_64.cuda-12.8\lib;%PATH% -@REM The default version is still cuda v12.2, because set cuda v11.8 after it +@REM The default version is still cuda v12.8, because set cuda v11.8 after it set PATH=%PATH%;%AGENT_TEMPDIRECTORY%\TensorRT-10.9.0.34.Windows10.x86_64.cuda-11.8\lib if exist PATH=%AGENT_TEMPDIRECTORY%\v11.8\ ( set PATH=%PATH%;%AGENT_TEMPDIRECTORY%\v11.8\bin;%AGENT_TEMPDIRECTORY%\v11.8\extras\CUPTI\lib64 diff --git a/tools/ci_build/github/windows/setup_env_trt.bat b/tools/ci_build/github/windows/setup_env_trt.bat index 45e0d970fb541..6110249a9cde6 100644 --- a/tools/ci_build/github/windows/setup_env_trt.bat +++ b/tools/ci_build/github/windows/setup_env_trt.bat @@ -1,10 +1,10 @@ REM Copyright (c) Microsoft Corporation. All rights reserved. REM Licensed under the MIT License. 
-if exist PATH=%AGENT_TEMPDIRECTORY%\v12.2\ ( - set PATH=%PATH%;%AGENT_TEMPDIRECTORY%\v12.2\bin;%AGENT_TEMPDIRECTORY%\v12.2\extras\CUPTI\lib64 +if exist PATH=%AGENT_TEMPDIRECTORY%\v12.8\ ( + set PATH=%PATH%;%AGENT_TEMPDIRECTORY%\v12.8\bin;%AGENT_TEMPDIRECTORY%\v12.8\extras\CUPTI\lib64 ) else ( - set PATH=%PATH%;C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.2\bin;C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.2\extras\CUPTI\lib64 + set PATH=%PATH%;C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.8\bin;C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.8\extras\CUPTI\lib64 ) set PATH=%AGENT_TEMPDIRECTORY%\TensorRT-10.9.0.34.Windows10.x86_64.cuda-12.8\lib;%PATH% set GRADLE_OPTS=-Dorg.gradle.daemon=false diff --git a/tools/ci_build/requirements/transformers-test/requirements.txt b/tools/ci_build/requirements/transformers-test/requirements.txt index bcd5a434c58e8..21894c2ba003d 100644 --- a/tools/ci_build/requirements/transformers-test/requirements.txt +++ b/tools/ci_build/requirements/transformers-test/requirements.txt @@ -3,12 +3,13 @@ packaging # protobuf and numpy is same as tools/ci_build/github/linux/docker/scripts/manylinux/requirements.txt protobuf==4.25.1 numpy==2.2.6 -torch>=2.6.0 +torch==2.8.0 +torchvision==0.23.0 coloredlogs==15.0 transformers==4.52.1 parameterized>=0.8.1 sentencepiece psutil einops -onnxscript==0.3.2 -onnx-ir +onnxscript==0.5.3 +onnx-ir==0.1.10 diff --git a/tools/python/update_version.py b/tools/python/update_version.py index 6d040ea90947f..7807441285d4c 100755 --- a/tools/python/update_version.py +++ b/tools/python/update_version.py @@ -1,122 +1,217 @@ -import os +#!/usr/bin/env python3 +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. + +import re +import shutil +import sys +from pathlib import Path + +# --- Helper Functions for Updating Files --- + + +def update_versioning_md(file_path: Path, new_version: str): + """Updates the version table in Versioning.md.""" + print(f"Checking '{file_path.name}' for version updates...") + if not file_path.exists(): + print(f"Warning: File not found at '{file_path}'. Skipping.") + return + content = file_path.read_text() + + # Find the first version number in the markdown table + match = re.search(r"^\| ([\d.]+) \|", content, re.MULTILINE) + if not match: + print(f"Warning: Could not find current version in '{file_path.name}'. Skipping.") + return + + current_version = match.group(1) + print(f"Found current version: {current_version}") + + if new_version != current_version: + print(f"Updating version in '{file_path.name}' to {new_version}...") + # Prepare the new row by duplicating the header separator line's structure + header_separator_match = re.search(r"(\r\n?|\n)(\|---\|.*)", content) + if not header_separator_match: + print(f"Warning: Could not find table header separator in '{file_path.name}'. 
Skipping.") + return + + header_separator = header_separator_match.group(2) + # Create a new row based on the separator, replacing dashes with spaces and adding the version + new_row_parts = [" " + part.replace("-", " ") + " " for part in header_separator.split("|")] + new_row_parts[1] = f" {new_version} " # Set the new version + new_row = "|".join(new_row_parts) + + # Insert the new row right after the header separator line + insertion_point = header_separator_match.end(0) + new_content = content[:insertion_point] + "\n" + new_row + content[insertion_point:] + file_path.write_text(new_content) + print("Update complete.") + else: + print("Version is already up to date.") + + +def update_readme_rst(file_path: Path, new_version: str): + """Updates the release history in the Python README.rst.""" + print(f"Checking '{file_path.name}' for version updates...") + if not file_path.exists(): + print(f"Warning: File not found at '{file_path}'. Skipping.") + return + content = file_path.read_text() + + # Find the first version header in the file + match = re.search(r"^([\d.]+)", content, re.MULTILINE) + if not match: + print(f"Warning: Could not find current version in '{file_path.name}'. Skipping.") + return + + current_version = match.group(1) + print(f"Found current version: {current_version}") + + if new_version != current_version: + print(f"Updating version in '{file_path.name}' to {new_version}...") + new_header = f"{new_version}\n{'^' * len(new_version)}" + release_notes = f"Release Notes : https://github.com/Microsoft/onnxruntime/releases/tag/v{new_version}" + new_section = f"{new_header}\n\n{release_notes}\n\n" + + # Insert the new section before the first version header found + insertion_point = match.start(0) + new_content = content[:insertion_point] + new_section + content[insertion_point:] + file_path.write_text(new_content) + print("Update complete.") + else: + print("Version is already up to date.") + + +def update_init_py(file_path: Path, new_version: str): + """Updates the __version__ variable in the project's __init__.py.""" + print(f"Checking '{file_path.name}' for version updates...") + if not file_path.exists(): + print(f"Warning: File not found at '{file_path}'. Skipping.") + return + content = file_path.read_text() + + # Find the __version__ line + match = re.search(r"__version__\s*=\s*[\"']([\d.]+)[\"']", content) + if not match: + print(f"Warning: Could not find __version__ in '{file_path.name}'. Skipping.") + return + + current_version = match.group(1) + print(f"Found current version: {current_version}") + + if new_version != current_version: + print(f"Updating version in '{file_path.name}' to {new_version}...") + new_content = re.sub(r"__version__\s*=\s*[\"'][\d.]+[\"']", f'__version__ = "{new_version}"', content) + file_path.write_text(new_content) + print("Update complete.") + else: + print("Version is already up to date.") + + +def update_npm_packages(js_root: Path, new_version: str): + """Updates versions for all NPM packages in the js directory.""" + print("\nUpdating NPM package versions...") + + # This script assumes a 'util' module is available in the search path. 
+ try: + from util import is_windows # noqa: PLC0415 + from util import run as run_command # noqa: PLC0415 + except ImportError: + print("Error: Could not import 'is_windows' and 'run' from a 'util' module.", file=sys.stderr) + print("Please ensure the 'util' module is in Python's search path.", file=sys.stderr) + return + + command_prefix = [] + # Check if node and npm are directly available in the system's PATH. + if shutil.which("node") and shutil.which("npm"): + print("Found node and npm in PATH.") + # If not, and if on Linux, check if 'fnm' is available. + elif shutil.which("fnm"): + print("node/npm not in PATH. Found 'fnm' on Linux, will use it to run commands.") + nvmrc_path = js_root / ".nvmrc" + # Check for .nvmrc file. + if not nvmrc_path.exists(): + print(f"Error: 'fnm' is being used, but the version file '{nvmrc_path}' was not found.", file=sys.stderr) + print( + "Please create a .nvmrc file in the 'js' directory with the desired Node.js version.", file=sys.stderr + ) + return + + node_version = nvmrc_path.read_text().strip() + print(f"Found node version '{node_version}' in .nvmrc.") + + # Ensure the required node version is installed by fnm. + print(f"Ensuring Node.js version '{node_version}' is installed via fnm...") + run_command("fnm", "install", node_version, cwd=js_root) + + print(f"Using Node.js version '{node_version}' with fnm.") + command_prefix = ["fnm", "exec", f"--using={node_version}", "--"] + # If neither is available, skip the NPM updates. + else: + print("Error: Could not find 'node' and 'npm' in your PATH.", file=sys.stderr) + if sys.platform.startswith("linux"): + print("Hint: Install 'fnm' (Fast Node Manager) to manage Node.js versions.", file=sys.stderr) + print("Skipping NPM package updates.", file=sys.stderr) + return + + def run_npm(args, cwd): + """Helper to run npm commands, prepending fnm if necessary.""" + full_command = command_prefix + list(args) + print(full_command) + run_command(*full_command, cwd=cwd) + + npm_exe = "npm.cmd" if is_windows() else "npm" + packages = ["common", "node", "web", "react_native"] + + for package in packages: + print(f"\n--- Updating package: {package} ---") + # Use npm's --prefix argument and run from js_root. + # --allow-same-version prevents an error if the version is already correct. 
+        run_npm([npm_exe, "--prefix", package, "version", new_version, "--allow-same-version"], cwd=js_root)
+        run_npm([npm_exe, "--prefix", package, "install", "--package-lock-only", "--ignore-scripts"], cwd=js_root)
+
+    print("\n--- Finalizing JS versions and formatting ---")
+    run_npm([npm_exe, "ci"], cwd=js_root)
+    for package in packages:
+        run_npm([npm_exe, "run", "update-version", package], cwd=js_root)
+
+    run_npm([npm_exe, "run", "format"], cwd=js_root)
+    print("NPM package updates complete.")
+
+
+# Define repository root relative to the script's location
+SCRIPT_DIR = Path(__file__).resolve().parent
+REPO_DIR = SCRIPT_DIR.parent.parent
 
 
 def update_version():
-    version = ""
-    cwd = os.path.dirname(os.path.realpath(__file__))
-    with open(os.path.join(cwd, "..", "..", "VERSION_NUMBER")) as f:
-        version = f.readline().strip()
-    lines = []
-    current_version = ""
-    file_path = os.path.join(cwd, "..", "..", "docs", "Versioning.md")
-    with open(file_path) as f:
-        lines = f.readlines()
-        for line in lines:
-            if line.startswith("|"):
-                sections = line.split("|")
-                if len(sections) == 8 and sections[1].strip()[0].isdigit():
-                    current_version = sections[1].strip()
-                    break
-    print("Current version of ORT seems to be: " + current_version)
-    if version != current_version:
-        with open(file_path, "w") as f:
-            for i, line in enumerate(lines):
-                f.write(line)
-                if line.startswith("|--"):
-                    sections = lines[i + 1].split("|")
-                    # Make sure there are no 'False Positive' version additions
-                    # by making sure the line we are building a new line from
-                    # contains the current_version
-                    if len(sections) > 1 and sections[1].strip() == current_version:
-                        sections[1] = " " + version + " "
-                        new_line = "|".join(sections)
-                        f.write(new_line)
-    lines = []
-    current_version = ""
-    file_path = os.path.join(cwd, "..", "..", "docs", "python", "README.rst")
-    with open(file_path) as f:
-        lines = f.readlines()
-        for line in lines:
-            sections = line.strip().split(".")
-            if len(sections) == 3 and sections[0].isdigit() and sections[1].isdigit() and sections[2].isdigit():
-                current_version = line.strip()
-                break
-    if version != current_version:
-        inserted = False
-        with open(file_path, "w") as f:
-            for line in lines:
-                sections = line.strip().split(".")
-                if (
-                    inserted is False
-                    and len(sections) == 3
-                    and sections[0].isdigit()
-                    and sections[1].isdigit()
-                    and sections[2].isdigit()
-                ):
-                    f.write(version + "\n")
-                    f.write("^" * len(version) + "\n\n")
-                    f.write(
-                        "Release Notes : https://github.com/Microsoft/onnxruntime/releases/tag/v"
-                        + version.strip()
-                        + "\n\n"
-                    )
-                    inserted = True
-                f.write(line)
-    lines = []
-    current_version = ""
-    file_path = os.path.join(cwd, "..", "..", "onnxruntime", "__init__.py")
-    with open(file_path) as f:
-        lines = f.readlines()
-        for line in lines:
-            if line.startswith("__version__"):
-                current_version = line.split("=")[1].strip()[1:-1]
-                break
-    if version != current_version:
-        with open(file_path, "w") as f:
-            for line in lines:
-                if line.startswith("__version__"):
-                    f.write('__version__ = "' + version + '"\n')
-                    continue
-                f.write(line)
-
-    # update version for NPM packages
-    current_version = ""
-    js_root = os.path.join(cwd, "..", "..", "js")
-
-    def run(args, cwd):
-        from util import is_windows, run  # noqa: PLC0415
-
-        if is_windows():
-            args = ["cmd", "/c", *args]
-        run(*args, cwd=cwd)
-
-    # check if node and npm are installed
-    run(["node", "--version"], cwd=js_root)
-    run(["npm", "--version"], cwd=js_root)
-
-    # upgrade version for onnxruntime-common
-    run(["npm", "version", version], cwd=os.path.join(js_root, "common"))
-    run(["npm", "install", "--package-lock-only", "--ignore-scripts"], cwd=os.path.join(js_root, "common"))
-
-    # upgrade version for onnxruntime-node
-    run(["npm", "version", version], cwd=os.path.join(js_root, "node"))
-    run(["npm", "install", "--package-lock-only", "--ignore-scripts"], cwd=os.path.join(js_root, "node"))
-
-    # upgrade version for onnxruntime-web
-    run(["npm", "version", version], cwd=os.path.join(js_root, "web"))
-    run(["npm", "install", "--package-lock-only", "--ignore-scripts"], cwd=os.path.join(js_root, "web"))
-
-    # upgrade version for onnxruntime-react-native
-    run(["npm", "version", version], cwd=os.path.join(js_root, "react_native"))
-    run(["npm", "install", "--package-lock-only", "--ignore-scripts"], cwd=os.path.join(js_root, "react_native"))
-
-    # upgrade version.ts in each package
-    run(["npm", "ci"], cwd=js_root)
-    run(["npm", "run", "update-version", "common"], cwd=js_root)
-    run(["npm", "run", "update-version", "node"], cwd=js_root)
-    run(["npm", "run", "update-version", "web"], cwd=js_root)
-    run(["npm", "run", "update-version", "react_native"], cwd=js_root)
-    run(["npm", "run", "format"], cwd=js_root)
+    """Main function to read the new version and orchestrate updates across the project."""
+    # Read and validate the new version from VERSION_NUMBER
+    version_file = REPO_DIR / "VERSION_NUMBER"
+    print(f"Reading new version from '{version_file}'...")
+    try:
+        new_version = version_file.read_text().strip()
+    except FileNotFoundError:
+        print(f"Error: '{version_file}' not found.", file=sys.stderr)
+        sys.exit(1)
+
+    # Validate that the version is in x.y.z format
+    if not re.fullmatch(r"\d+\.\d+\.\d+", new_version):
+        print(
+            f"Error: Version '{new_version}' from '{version_file.name}' is not a valid x.y.z semantic version.",
+            file=sys.stderr,
+        )
+        sys.exit(1)
+
+    print(f"Target version to set: {new_version}\n")
+
+    # Update files using absolute paths from REPO_DIR
+    update_versioning_md(REPO_DIR / "docs" / "Versioning.md", new_version)
+    update_readme_rst(REPO_DIR / "docs" / "python" / "README.rst", new_version)
+    update_init_py(REPO_DIR / "onnxruntime" / "__init__.py", new_version)
+
+    # Update all NPM packages
+    update_npm_packages(REPO_DIR / "js", new_version)
 
 
 if __name__ == "__main__":
diff --git a/tools/python/util/android/android.py b/tools/python/util/android/android.py
index e8dda5cc592b9..c7fbde2aac584 100644
--- a/tools/python/util/android/android.py
+++ b/tools/python/util/android/android.py
@@ -130,7 +130,7 @@ def start_emulator(
     extra_args: typing.Sequence[str] | None = None,
     timeout_minutes: int = 20,
 ) -> subprocess.Popen:
-    if check_emulator_running_using_avd_name(avd_name=avd_name):
+    if check_emulator_running_using_avd_name(sdk_tool_paths=sdk_tool_paths, avd_name=avd_name):
         raise RuntimeError(
             f"An emulator with avd_name{avd_name} is already running. Please close it before starting a new one."
         )
@@ -234,12 +234,12 @@ def start_emulator(
         time.sleep(sleep_interval_seconds)
 
     # Verify if the emulator is now running
-    if not check_emulator_running_using_avd_name(avd_name=avd_name):
+    if not check_emulator_running_using_avd_name(sdk_tool_paths=sdk_tool_paths, avd_name=avd_name):
         raise RuntimeError("Emulator failed to start.")
     return emulator_process
 
 
-def check_emulator_running_using_avd_name(avd_name: str) -> bool:
+def check_emulator_running_using_avd_name(sdk_tool_paths: SdkToolPaths, avd_name: str) -> bool:
     """
     Check if an emulator is running based on the provided AVD name.
     :param avd_name: Name of the Android Virtual Device (AVD) to check.
@@ -247,7 +247,7 @@ def check_emulator_running_using_avd_name(avd_name: str) -> bool:
     """
     try:
         # Step 1: List running devices
-        result = subprocess.check_output(["adb", "devices"], text=True).strip()
+        result = subprocess.check_output([sdk_tool_paths.adb, "devices"], text=True).strip()
         _log.info(f"adb devices output:\n{result}")
 
         running_emulators = [line.split("\t")[0] for line in result.splitlines()[1:] if "emulator" in line]
@@ -259,7 +259,7 @@ def check_emulator_running_using_avd_name(avd_name: str) -> bool:
         for emulator in running_emulators:
             try:
                 avd_info = (
-                    subprocess.check_output(["adb", "-s", emulator, "emu", "avd", "name"], text=True)
+                    subprocess.check_output([sdk_tool_paths.adb, "-s", emulator, "emu", "avd", "name"], text=True)
                     .strip()
                     .split("\n")[0]
                )
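
For reference, a minimal standalone sketch of the pattern the android.py hunks above adopt: call a concrete adb executable (the role played by sdk_tool_paths.adb in the real helper) instead of relying on a bare "adb" being resolvable from PATH, then map running emulator serials to their AVD names. This is not the repository helper; the adb_path argument and the example path are hypothetical stand-ins, assumed only for illustration.

# --- Illustrative sketch only; not part of the patch above. ---
import subprocess


def list_running_avd_names(adb_path: str) -> dict[str, str]:
    """Map running emulator serials (e.g. 'emulator-5554') to their AVD names."""
    # 'adb devices' lists one device per line after a header line.
    devices_output = subprocess.check_output([adb_path, "devices"], text=True).strip()
    serials = [line.split("\t")[0] for line in devices_output.splitlines()[1:] if "emulator" in line]

    avd_names = {}
    for serial in serials:
        # 'adb -s <serial> emu avd name' prints the AVD name on the first line.
        raw = subprocess.check_output([adb_path, "-s", serial, "emu", "avd", "name"], text=True)
        avd_names[serial] = raw.strip().split("\n")[0]
    return avd_names


if __name__ == "__main__":
    # Hypothetical SDK location; the real helper obtains this from sdk_tool_paths.adb.
    print(list_running_avd_names("/opt/android-sdk/platform-tools/adb"))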