6 changes: 6 additions & 0 deletions .buildkite/pipeline.yml
@@ -333,6 +333,10 @@ steps:
      build.message !~ /\[skip tests\]/ &&
      build.message !~ /\[skip special\]/
    timeout_in_minutes: 30
    matrix:
      setup:
        cuda:
          - "12.9"

  - label: "CuArray with {{matrix.memory}} memory"
    plugins:
@@ -360,6 +364,8 @@ steps:
      memory:
        - "unified"
        - "host"
      cuda:
        - "12.9"
    commands: |
      echo -e "[CUDA]\ndefault_memory = \"{{matrix.memory}}\"" >LocalPreferences.toml

39 changes: 38 additions & 1 deletion lib/cupti/wrappers.jl
@@ -1,7 +1,44 @@
# https://docs.nvidia.com/cupti/api/group__CUPTI__VERSION__API.html
const cupti_versions = [
v"4.0",
v"4.1",
v"5.0",
v"5.5",
v"6.0",
v"6.5",
v"6.5.1", # with sm_52 support
v"7.0",
v"8.0",
v"9.0",
v"9.1",
v"10.0", # and v10.1 and v10.2
v"11.0",
v"11.1",
v"11.2", # and v11.3 and v11.4
v"11.5",
v"11.6",
v"11.8",
v"12.0",
v"12.2",
v"12.3",
v"12.4",
v"12.5",
v"12.6",
v"12.7",
v"12.8",
v"12.9",
v"12.9.1"]

function version()
    version_ref = Ref{Cuint}()
    cuptiGetVersion(version_ref)
    VersionNumber(version_ref[])
    if CUDA.runtime_version() < v"13"
        cupti_versions[version_ref[]]
    else
        major, ver = divrem(version_ref[], 10000)
        minor, patch = divrem(ver, 100)
        VersionNumber(major, minor, patch)
    end
end

# XXX: `cuptiGetVersion` returns something more like the API version, and doesn't change
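The two branches of `version()` above decode the raw `cuptiGetVersion` value differently: below CUDA 13 the value is used as a 1-based index into the `cupti_versions` table, while from CUDA 13 onward it is treated as an encoded number. A minimal sketch of that decoding, assuming the `major*10000 + minor*100 + patch` encoding implied by the `divrem` arithmetic above; the raw input values are made up for illustration:

```julia
# Sketch of the decoding logic in `version()` (hypothetical raw inputs).
const cupti_versions_example = [v"4.0", v"4.1", v"5.0"]  # abbreviated table

# Pre-CUDA 13: the raw value indexes the version table (1-based).
cupti_versions_example[3]             # v"5.0"

# CUDA 13+: assume the raw value encodes major*10000 + minor*100 + patch.
function decode_cupti_version(raw::Integer)
    major, rest  = divrem(raw, 10000)
    minor, patch = divrem(rest, 100)
    return VersionNumber(major, minor, patch)
end

decode_cupti_version(130001)          # v"13.0.1"
```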
8 changes: 2 additions & 6 deletions src/CUDA.jl
@@ -54,12 +54,8 @@ using Printf
# - Base.aligned_sizeof is the size of an object in an array/inline allocation
# Both of them are equivalent for immutable objects, but differ for mutable singletons and Symbol
# We use `aligned_sizeof` since we care about the size of a type in an array
@static if VERSION < v"1.11.0"
    @generated function aligned_sizeof(::Type{T}) where T
        return :($(Base.aligned_sizeof(T)))
    end
else
    import Base: aligned_sizeof
@generated function aligned_sizeof(::Type{T}) where T
    return :($(Base.aligned_sizeof(T)))
end

## source code includes
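The comments above motivate using the internal `Base.aligned_sizeof` rather than `sizeof` when sizing array storage. As a rough illustration of the `@generated` wrapper pattern used in this hunk (a sketch, not the PR's final code; the helper name is made up), the size query is constant-folded into the compiled method:

```julia
# Sketch: bake Base.aligned_sizeof(T) into the generated method body so the
# query is a compile-time constant rather than a runtime call.
@generated function my_aligned_sizeof(::Type{T}) where T
    return :($(Base.aligned_sizeof(T)))
end

my_aligned_sizeof(Float32)   # 4
my_aligned_sizeof(String)    # 8 on 64-bit: mutable objects are stored as pointers in arrays
```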
2 changes: 1 addition & 1 deletion src/initialization.jl
@@ -160,7 +160,7 @@ function __init__()
    end

    # if we're not running under an external profiler, let CUPTI handle NVTX events
    if !NVTX.isactive() && toolkit_version < v"13" # NVIDIA/NVTX#125
    if !NVTX.isactive() && CUPTI.version() != v"13.0" # NVIDIA/NVTX#125
        ENV["NVTX_INJECTION64_PATH"] = CUDA_Runtime.libcupti
        NVTX.activate()
    end
43 changes: 31 additions & 12 deletions src/mapreduce.jl
@@ -1,7 +1,6 @@
## COV_EXCL_START

# TODO
# - serial version for lower latency
# - block-stride loop to delay need for second kernel launch

# Reduce a value across a warp
@@ -134,7 +133,7 @@ function partial_mapreduce_grid(f, op, neutral, Rreduce, Rother, shuffle, R::Abs
return
end

function big_mapreduce_kernel(f, op, neutral, Rreduce, Rother, R, As)
function serial_mapreduce_kernel(f, op, neutral, Rreduce, Rother, R, As)
    grid_idx = threadIdx().x + (blockIdx().x - 1i32) * blockDim().x
    @inbounds if grid_idx <= length(Rother)
        Iother = Rother[grid_idx]
@@ -160,7 +159,7 @@ end
## COV_EXCL_STOP

# factored out for use in tests
function big_mapreduce_threshold(dev)
function serial_mapreduce_threshold(dev)
    max_concurrency = attribute(dev, DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK) *
                      attribute(dev, DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT)
    return max_concurrency
@@ -197,9 +196,9 @@ function GPUArrays.mapreducedim!(f::F, op::OP, R::AnyCuArray{T},
    @assert length(Rother) > 0

    # If `Rother` is large enough, then a naive loop is more efficient than partial reductions.
    if length(Rother) >= big_mapreduce_threshold(dev)
    if length(Rother) >= serial_mapreduce_threshold(dev)
        args = (f, op, init, Rreduce, Rother, R, A)
        kernel = @cuda launch=false big_mapreduce_kernel(args...)
        kernel = @cuda launch=false serial_mapreduce_kernel(args...)
        kernel_config = launch_configuration(kernel.fun)
        threads = kernel_config.threads
        blocks = cld(length(Rother), threads)
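To make the dispatch above concrete: once `Rother` has at least as many elements as the device can run threads concurrently (the threshold computed from max threads per block times multiprocessor count), one thread per output slice already saturates the GPU, so each slice can be reduced with a plain loop instead of cross-block partial reductions. A hedged CPU-side sketch of that strategy, not the actual kernel:

```julia
# CPU analogue of the serial strategy: one "thread" (loop iteration) per kept
# index, reducing the corresponding slice serially. Rother and Rreduce are
# full-rank CartesianIndices with singleton extents on complementary dims, so
# max(Iother, Ireduce) recombines them into an index into A.
function serial_mapreduce_sketch!(f, op, neutral, Rreduce, Rother, R, A)
    for Iother in Rother               # on the GPU: one thread per Iother
        acc = neutral
        for Ireduce in Rreduce
            acc = op(acc, f(A[max(Iother, Ireduce)]))
        end
        R[Iother] = acc
    end
    return R
end

A = rand(Float32, 4, 3)
R = zeros(Float32, 4, 1)               # reduce along the second dimension
serial_mapreduce_sketch!(abs2, +, 0f0,
                         CartesianIndices((1, 3)),   # Rreduce
                         CartesianIndices((4, 1)),   # Rother
                         R, A)
R ≈ sum(abs2, A; dims=2)               # true
```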
@@ -255,18 +254,38 @@ function GPUArrays.mapreducedim!(f::F, op::OP, R::AnyCuArray{T},
        # we can cover the dimensions to reduce using a single block
        kernel(f, op, init, Rreduce, Rother, Val(shuffle), R, A; threads, blocks, shmem)
    else
        # we need multiple steps to cover all values to reduce
        partial = similar(R, (size(R)..., reduce_blocks))
        # TODO: provide a version that atomically reduces from different blocks

        # temporary empty array whose type will match the final partial array
        partial = similar(R, ntuple(_ -> 0, Val(ndims(R)+1)))

        # NOTE: we can't use the previously-compiled kernel, or its launch configuration,
        # since the type of `partial` might not match the original output container
        # (e.g. if that was a view).
        partial_kernel = @cuda launch=false partial_mapreduce_grid(f, op, init, Rreduce, Rother, Val(shuffle), partial, A)
        partial_kernel_config = launch_configuration(partial_kernel.fun; shmem=compute_shmem∘compute_threads)
        partial_reduce_threads = compute_threads(partial_kernel_config.threads)
        partial_reduce_shmem = compute_shmem(partial_reduce_threads)
        partial_reduce_blocks = if other_blocks >= partial_kernel_config.blocks
            1
        else
            min(cld(length(Rreduce), partial_reduce_threads),
                cld(partial_kernel_config.blocks, other_blocks))
        end
        partial_threads = partial_reduce_threads
        partial_shmem = partial_reduce_shmem
        partial_blocks = partial_reduce_blocks*other_blocks

        partial = similar(R, (size(R)..., partial_blocks))
        if init === nothing
            # without an explicit initializer we need to copy from the output container
            partial .= R
        end
        # NOTE: we can't use the previously-compiled kernel, since the type of `partial`
        # might not match the original output container (e.g. if that was a view).
        @cuda(threads, blocks, shmem,
            partial_mapreduce_grid(f, op, init, Rreduce, Rother, Val(shuffle), partial, A))

        GPUArrays.mapreducedim!(identity, op, R, partial; init=init)
        partial_kernel(f, op, init, Rreduce, Rother, Val(shuffle), partial, A;
                       threads=partial_threads, blocks=partial_blocks, shmem=partial_shmem)

        GPUArrays.mapreducedim!(identity, op, R, partial; init)
    end

    return R
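Two details of the multi-block path above are worth unpacking. The zero-extent `similar` call builds a placeholder with one extra trailing dimension purely so the kernel can be compiled against the eventual `partial` array type before its block count is known, and that type can differ from `R`'s, for example when `R` is a view. A small sketch of both points using plain `Array`s for brevity (the same reasoning applies to `CuArray`):

```julia
# Placeholder shape: ndims(R)+1 zero extents, i.e. a 0-element array with one
# extra trailing dimension that will later hold the per-block partials.
R = view(zeros(Float32, 4, 2), :, 1:1)          # output container that is a view
dims = ntuple(_ -> 0, Val(ndims(R) + 1))        # (0, 0, 0)
partial_probe = similar(R, dims)
size(partial_probe)                             # (0, 0, 0)

# Type mismatch: `similar` on a view yields a plain array, so a kernel
# compiled for typeof(R) would not match typeof(partial_probe).
typeof(R)              # SubArray{Float32, 2, Matrix{Float32}, …}
typeof(partial_probe)  # Array{Float32, 3}
```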
2 changes: 1 addition & 1 deletion test/base/array.jl
@@ -718,7 +718,7 @@ end
@testset "large map reduce" begin
    dev = device()

    big_size = CUDA.big_mapreduce_threshold(dev) + 5
    big_size = CUDA.serial_mapreduce_threshold(dev) + 5
    a = rand(Float32, big_size, 31)
    c = CuArray(a)
