6 changes: 6 additions & 0 deletions .buildkite/pipeline.yml
@@ -333,6 +333,10 @@ steps:
      build.message !~ /\[skip tests\]/ &&
      build.message !~ /\[skip special\]/
    timeout_in_minutes: 30
    matrix:
      setup:
        cuda:
          - "12.9"

  - label: "CuArray with {{matrix.memory}} memory"
    plugins:
@@ -360,6 +364,8 @@ steps:
      memory:
        - "unified"
        - "host"
      cuda:
        - "12.9"
    commands: |
      echo -e "[CUDA]\ndefault_memory = \"{{matrix.memory}}\"" >LocalPreferences.toml

39 changes: 38 additions & 1 deletion lib/cupti/wrappers.jl
@@ -1,7 +1,44 @@
# https://docs.nvidia.com/cupti/api/group__CUPTI__VERSION__API.html
const cupti_versions = [
v"4.0",
v"4.1",
v"5.0",
v"5.5",
v"6.0",
v"6.5",
v"6.5.1", # with sm_52 support
v"7.0",
v"8.0",
v"9.0",
v"9.1",
v"10.0", # and v10.1 and v10.2
v"11.0",
v"11.1",
v"11.2", # and v11.3 and v11.4
v"11.5",
v"11.6",
v"11.8",
v"12.0",
v"12.2",
v"12.3",
v"12.4",
v"12.5",
v"12.6",
v"12.7",
v"12.8",
v"12.9",
v"12.9.1"]

function version()
    version_ref = Ref{Cuint}()
    cuptiGetVersion(version_ref)
    VersionNumber(version_ref[])
    if CUDA.runtime_version() < v"13"
        cupti_versions[version_ref[]]
    else
        major, ver = divrem(version_ref[], 10000)
        minor, patch = divrem(ver, 100)
        VersionNumber(major, minor, patch)
    end
end

# XXX: `cuptiGetVersion` returns something more like the API version, and doesn't change
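The two branches of `version()` above decode the raw `cuptiGetVersion` value differently: below CUDA 13 the value is used as a 1-based index into the `cupti_versions` table, while from CUDA 13 onward it is treated as an encoded number. A minimal sketch of that decoding, assuming the `major*10000 + minor*100 + patch` encoding implied by the `divrem` arithmetic above; the raw input values are made up for illustration:

```julia
# Sketch of the decoding logic in `version()` (hypothetical raw inputs).
const cupti_versions_example = [v"4.0", v"4.1", v"5.0"]  # abbreviated table

# Pre-CUDA 13: the raw value indexes the version table (1-based).
cupti_versions_example[3]             # v"5.0"

# CUDA 13+: assume the raw value encodes major*10000 + minor*100 + patch.
function decode_cupti_version(raw::Integer)
    major, rest  = divrem(raw, 10000)
    minor, patch = divrem(rest, 100)
    return VersionNumber(major, minor, patch)
end

decode_cupti_version(130001)          # v"13.0.1"
```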
8 changes: 2 additions & 6 deletions src/CUDA.jl
@@ -54,12 +54,8 @@ using Printf
# - Base.aligned_sizeof is the size of an object in an array/inline allocation
# Both of them are equivalent for immutable objects, but differ for mutable singletons and Symbol
# We use `aligned_sizeof` since we care about the size of a type in an array
@static if VERSION < v"1.11.0"
    @generated function aligned_sizeof(::Type{T}) where T
        return :($(Base.aligned_sizeof(T)))
    end
else
    import Base: aligned_sizeof
@generated function aligned_sizeof(::Type{T}) where T
    return :($(Base.aligned_sizeof(T)))
end

## source code includes
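The comments above motivate using the internal `Base.aligned_sizeof` rather than `sizeof` when sizing array storage. As a rough illustration of the `@generated` wrapper pattern used in this hunk (a sketch, not the PR's final code; the helper name is made up), the size query is constant-folded into the compiled method:

```julia
# Sketch: bake Base.aligned_sizeof(T) into the generated method body so the
# query is a compile-time constant rather than a runtime call.
@generated function my_aligned_sizeof(::Type{T}) where T
    return :($(Base.aligned_sizeof(T)))
end

my_aligned_sizeof(Float32)   # 4
my_aligned_sizeof(String)    # 8 on 64-bit: mutable objects are stored as pointers in arrays
```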
2 changes: 1 addition & 1 deletion src/initialization.jl
@@ -160,7 +160,7 @@ function __init__()
    end

    # if we're not running under an external profiler, let CUPTI handle NVTX events
    if !NVTX.isactive() && toolkit_version < v"13" # NVIDIA/NVTX#125
    if !NVTX.isactive() && CUPTI.version() != v"13.0" # NVIDIA/NVTX#125
        ENV["NVTX_INJECTION64_PATH"] = CUDA_Runtime.libcupti
        NVTX.activate()
    end
43 changes: 31 additions & 12 deletions src/mapreduce.jl
@@ -1,7 +1,6 @@
## COV_EXCL_START

# TODO
# - serial version for lower latency
# - block-stride loop to delay need for second kernel launch

# Reduce a value across a warp
@@ -134,7 +133,7 @@ function partial_mapreduce_grid(f, op, neutral, Rreduce, Rother, shuffle, R::Abs
return
end

function big_mapreduce_kernel(f, op, neutral, Rreduce, Rother, R, As)
function serial_mapreduce_kernel(f, op, neutral, Rreduce, Rother, R, As)
    grid_idx = threadIdx().x + (blockIdx().x - 1i32) * blockDim().x
    @inbounds if grid_idx <= length(Rother)
        Iother = Rother[grid_idx]
@@ -160,7 +159,7 @@ end
## COV_EXCL_STOP

# factored out for use in tests
function big_mapreduce_threshold(dev)
function serial_mapreduce_threshold(dev)
    max_concurrency = attribute(dev, DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK) *
                      attribute(dev, DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT)
    return max_concurrency
@@ -197,9 +196,9 @@ function GPUArrays.mapreducedim!(f::F, op::OP, R::AnyCuArray{T},
    @assert length(Rother) > 0

    # If `Rother` is large enough, then a naive loop is more efficient than partial reductions.
    if length(Rother) >= big_mapreduce_threshold(dev)
    if length(Rother) >= serial_mapreduce_threshold(dev)
        args = (f, op, init, Rreduce, Rother, R, A)
        kernel = @cuda launch=false big_mapreduce_kernel(args...)
        kernel = @cuda launch=false serial_mapreduce_kernel(args...)
        kernel_config = launch_configuration(kernel.fun)
        threads = kernel_config.threads
        blocks = cld(length(Rother), threads)
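To make the dispatch above concrete: once `Rother` has at least as many elements as the device can run threads concurrently (the threshold computed from max threads per block times multiprocessor count), one thread per output slice already saturates the GPU, so each slice can be reduced with a plain loop instead of cross-block partial reductions. A hedged CPU-side sketch of that strategy, not the actual kernel:

```julia
# CPU analogue of the serial strategy: one "thread" (loop iteration) per kept
# index, reducing the corresponding slice serially. Rother and Rreduce are
# full-rank CartesianIndices with singleton extents on complementary dims, so
# max(Iother, Ireduce) recombines them into an index into A.
function serial_mapreduce_sketch!(f, op, neutral, Rreduce, Rother, R, A)
    for Iother in Rother               # on the GPU: one thread per Iother
        acc = neutral
        for Ireduce in Rreduce
            acc = op(acc, f(A[max(Iother, Ireduce)]))
        end
        R[Iother] = acc
    end
    return R
end

A = rand(Float32, 4, 3)
R = zeros(Float32, 4, 1)               # reduce along the second dimension
serial_mapreduce_sketch!(abs2, +, 0f0,
                         CartesianIndices((1, 3)),   # Rreduce
                         CartesianIndices((4, 1)),   # Rother
                         R, A)
R ≈ sum(abs2, A; dims=2)               # true
```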
@@ -255,18 +254,38 @@ function GPUArrays.mapreducedim!(f::F, op::OP, R::AnyCuArray{T},
        # we can cover the dimensions to reduce using a single block
        kernel(f, op, init, Rreduce, Rother, Val(shuffle), R, A; threads, blocks, shmem)
    else
        # we need multiple steps to cover all values to reduce
        partial = similar(R, (size(R)..., reduce_blocks))
        # TODO: provide a version that atomically reduces from different blocks

        # temporary empty array whose type will match the final partial array
        partial = similar(R, ntuple(_ -> 0, Val(ndims(R)+1)))

        # NOTE: we can't use the previously-compiled kernel, or its launch configuration,
        # since the type of `partial` might not match the original output container
        # (e.g. if that was a view).
        partial_kernel = @cuda launch=false partial_mapreduce_grid(f, op, init, Rreduce, Rother, Val(shuffle), partial, A)
        partial_kernel_config = launch_configuration(partial_kernel.fun; shmem=compute_shmem∘compute_threads)
        partial_reduce_threads = compute_threads(partial_kernel_config.threads)
        partial_reduce_shmem = compute_shmem(partial_reduce_threads)
        partial_reduce_blocks = if other_blocks >= partial_kernel_config.blocks
            1
        else
            min(cld(length(Rreduce), partial_reduce_threads),
                cld(partial_kernel_config.blocks, other_blocks))
        end
        partial_threads = partial_reduce_threads
        partial_shmem = partial_reduce_shmem
        partial_blocks = partial_reduce_blocks*other_blocks

        partial = similar(R, (size(R)..., partial_blocks))
        if init === nothing
            # without an explicit initializer we need to copy from the output container
            partial .= R
        end
        # NOTE: we can't use the previously-compiled kernel, since the type of `partial`
        # might not match the original output container (e.g. if that was a view).
        @cuda(threads, blocks, shmem,
            partial_mapreduce_grid(f, op, init, Rreduce, Rother, Val(shuffle), partial, A))

        GPUArrays.mapreducedim!(identity, op, R, partial; init=init)
        partial_kernel(f, op, init, Rreduce, Rother, Val(shuffle), partial, A;
                       threads=partial_threads, blocks=partial_blocks, shmem=partial_shmem)

        GPUArrays.mapreducedim!(identity, op, R, partial; init)
    end

    return R
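Two details of the multi-block path above are worth unpacking. The zero-extent `similar` call builds a placeholder with one extra trailing dimension purely so the kernel can be compiled against the eventual `partial` array type before its block count is known, and that type can differ from `R`'s, for example when `R` is a view. A small sketch of both points using plain `Array`s for brevity (the same reasoning applies to `CuArray`):

```julia
# Placeholder shape: ndims(R)+1 zero extents, i.e. a 0-element array with one
# extra trailing dimension that will later hold the per-block partials.
R = view(zeros(Float32, 4, 2), :, 1:1)          # output container that is a view
dims = ntuple(_ -> 0, Val(ndims(R) + 1))        # (0, 0, 0)
partial_probe = similar(R, dims)
size(partial_probe)                             # (0, 0, 0)

# Type mismatch: `similar` on a view yields a plain array, so a kernel
# compiled for typeof(R) would not match typeof(partial_probe).
typeof(R)              # SubArray{Float32, 2, Matrix{Float32}, …}
typeof(partial_probe)  # Array{Float32, 3}
```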
2 changes: 1 addition & 1 deletion test/base/array.jl
@@ -718,7 +718,7 @@ end
@testset "large map reduce" begin
    dev = device()

    big_size = CUDA.big_mapreduce_threshold(dev) + 5
    big_size = CUDA.serial_mapreduce_threshold(dev) + 5
    a = rand(Float32, big_size, 31)
    c = CuArray(a)
