Skip to content

Commit 4a23081

Browse files
authored
Merge pull request #107 from omlins/lr/hip
Add AMDGPU v0.5 support
2 parents a76bb75 + c921de9 commit 4a23081

File tree

15 files changed, with 165 additions and 185 deletions.

15 files changed, with 165 additions and 185 deletions.

.github/workflows/ci.yml

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -12,7 +12,7 @@ jobs:
1212
fail-fast: false
1313
matrix:
1414
version:
15-
- '1.8' # Minimum required Julia version (due to CellArrays' AMDGPU dependency 1.7 and due to Enzyme 1.8).
15+
# - '1.8' # Minimum required Julia version (due to CellArrays' AMDGPU dependency 1.7 and due to Enzyme 1.8).
1616
- '1' # Latest stable 1.x release of Julia
1717
# - 'nightly'
1818
os:

Project.toml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
name = "ParallelStencil"
22
uuid = "94395366-693c-11ea-3b26-d9b7aac5d958"
33
authors = ["Samuel Omlin", "Ludovic Räss"]
4-
version = "0.8.0"
4+
version = "0.8.1"
55

66
[deps]
77
AMDGPU = "21141c5a-9bdb-4563-92ae-f87d6854732e"
@@ -13,7 +13,7 @@ Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
1313
StaticArrays = "90137ffa-7385-5640-81b9-e52037218182"
1414

1515
[compat]
16-
AMDGPU = "0.4.14"
16+
AMDGPU = "0.5"
1717
CUDA = "3.12, 4"
1818
CellArrays = "0.1"
1919
Enzyme = "0.11"

src/ParallelKernel/allocators.jl

Lines changed: 7 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -233,13 +233,13 @@ macro falses_cuda(args...) check_initialized(); esc(_falses(args...; package=
233233
macro trues_cuda(args...) check_initialized(); esc(_trues(args...; package=PKG_CUDA)); end
234234
macro fill_cuda(args...) check_initialized(); esc(_fill(args...; package=PKG_CUDA)); end
235235
macro fill!_cuda(args...) check_initialized(); esc(_fill!(args...; package=PKG_CUDA)); end
236-
macro zeros_amdgpu(args...) check_initialized(); esc(_zeros(args...; package=PKG_AMDGPU)); end
237-
macro ones_amdgpu(args...) check_initialized(); esc(_ones(args...; package=PKG_AMDGPU)); end
238-
macro rand_amdgpu(args...) check_initialized(); esc(_rand(args...; package=PKG_AMDGPU)); end
239-
macro falses_amdgpu(args...) check_initialized(); esc(_falses(args...; package=PKG_AMDGPU)); end
240-
macro trues_amdgpu(args...) check_initialized(); esc(_trues(args...; package=PKG_AMDGPU)); end
241-
macro fill_amdgpu(args...) check_initialized(); esc(_fill(args...; package=PKG_AMDGPU)); end
242-
macro fill!_amdgpu(args...) check_initialized(); esc(_fill!(args...; package=PKG_AMDGPU)); end
236+
macro zeros_amdgpu(args...) check_initialized(); esc(_zeros(args...; package=PKG_AMDGPU)); end
237+
macro ones_amdgpu(args...) check_initialized(); esc(_ones(args...; package=PKG_AMDGPU)); end
238+
macro rand_amdgpu(args...) check_initialized(); esc(_rand(args...; package=PKG_AMDGPU)); end
239+
macro falses_amdgpu(args...) check_initialized(); esc(_falses(args...; package=PKG_AMDGPU)); end
240+
macro trues_amdgpu(args...) check_initialized(); esc(_trues(args...; package=PKG_AMDGPU)); end
241+
macro fill_amdgpu(args...) check_initialized(); esc(_fill(args...; package=PKG_AMDGPU)); end
242+
macro fill!_amdgpu(args...) check_initialized(); esc(_fill!(args...; package=PKG_AMDGPU)); end
243243
macro zeros_threads(args...) check_initialized(); esc(_zeros(args...; package=PKG_THREADS)); end
244244
macro ones_threads(args...) check_initialized(); esc(_ones(args...; package=PKG_THREADS)); end
245245
macro rand_threads(args...) check_initialized(); esc(_rand(args...; package=PKG_THREADS)); end

src/ParallelKernel/kernel_language.jl

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -101,7 +101,7 @@ end
101101

102102
function gridDim(args...; package::Symbol=get_package())
103103
if (package == PKG_CUDA) return :(CUDA.gridDim($(args...)))
104-
elseif (package == PKG_AMDGPU) return :(AMDGPU.gridDimWG($(args...)))
104+
elseif (package == PKG_AMDGPU) return :(AMDGPU.gridGroupDim($(args...)))
105105
elseif (package == PKG_THREADS) return :(ParallelStencil.ParallelKernel.@gridDim_cpu($(args...)))
106106
else @KeywordArgumentError("$ERRMSG_UNSUPPORTED_PACKAGE (obtained: $package).")
107107
end
@@ -170,7 +170,7 @@ end
170170

171171
function pk_println(args...; package::Symbol=get_package())
172172
if (package == PKG_CUDA) return :(CUDA.@cuprintln($(args...)))
173-
elseif (package == PKG_AMDGPU) @KeywordArgumentError("this functionality is not yet supported in AMDGPU.jl.")
173+
elseif (package == PKG_AMDGPU) return :(AMDGPU.@rocprintln($(args...)))
174174
elseif (package == PKG_THREADS) return :(Base.println($(args...)))
175175
else @KeywordArgumentError("$ERRMSG_UNSUPPORTED_PACKAGE (obtained: $package).")
176176
end

src/ParallelKernel/parallel.jl

Lines changed: 12 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -83,16 +83,16 @@ macro synchronize(args...) check_initialized(); esc(synchronize(args...)); end
8383
## MACROS FORCING PACKAGE, IGNORING INITIALIZATION
8484

8585
macro parallel_cuda(args...) check_initialized(); checkargs_parallel(args...); esc(parallel(__module__, args...; package=PKG_CUDA)); end
86-
macro parallel_amdgpu(args...) check_initialized(); checkargs_parallel(args...); esc(parallel(__module__, args...; package=PKG_AMDGPU)); end
86+
macro parallel_amdgpu(args...) check_initialized(); checkargs_parallel(args...); esc(parallel(__module__, args...; package=PKG_AMDGPU)); end
8787
macro parallel_threads(args...) check_initialized(); checkargs_parallel(args...); esc(parallel(__module__, args...; package=PKG_THREADS)); end
8888
macro parallel_indices_cuda(args...) check_initialized(); checkargs_parallel_indices(args...); esc(parallel_indices(args...; package=PKG_CUDA)); end
89-
macro parallel_indices_amdgpu(args...) check_initialized(); checkargs_parallel_indices(args...); esc(parallel_indices(args...; package=PKG_AMDGPU)); end
89+
macro parallel_indices_amdgpu(args...) check_initialized(); checkargs_parallel_indices(args...); esc(parallel_indices(args...; package=PKG_AMDGPU)); end
9090
macro parallel_indices_threads(args...) check_initialized(); checkargs_parallel_indices(args...); esc(parallel_indices(args...; package=PKG_THREADS)); end
9191
macro parallel_async_cuda(args...) check_initialized(); checkargs_parallel(args...); esc(parallel_async(__module__, args...; package=PKG_CUDA)); end
92-
macro parallel_async_amdgpu(args...) check_initialized(); checkargs_parallel(args...); esc(parallel_async(__module__, args...; package=PKG_AMDGPU)); end
92+
macro parallel_async_amdgpu(args...) check_initialized(); checkargs_parallel(args...); esc(parallel_async(__module__, args...; package=PKG_AMDGPU)); end
9393
macro parallel_async_threads(args...) check_initialized(); checkargs_parallel(args...); esc(parallel_async(__module__, args...; package=PKG_THREADS)); end
9494
macro synchronize_cuda(args...) check_initialized(); esc(synchronize(args...; package=PKG_CUDA)); end
95-
macro synchronize_amdgpu(args...) check_initialized(); esc(synchronize(args...; package=PKG_AMDGPU)); end
95+
macro synchronize_amdgpu(args...) check_initialized(); esc(synchronize(args...; package=PKG_AMDGPU)); end
9696
macro synchronize_threads(args...) check_initialized(); esc(synchronize(args...; package=PKG_THREADS)); end
9797

9898

@@ -158,11 +158,11 @@ function parallel_kernel(package::Symbol, numbertype::DataType, indices::Union{S
158158
body = get_body(kernel)
159159
body = remove_return(body)
160160
if isgpu(package)
161-
kernel = substitute(kernel, :(Data.Array), :(Data.DeviceArray))
162-
kernel = substitute(kernel, :(Data.Cell), :(Data.DeviceCell))
161+
kernel = substitute(kernel, :(Data.Array), :(Data.DeviceArray))
162+
kernel = substitute(kernel, :(Data.Cell), :(Data.DeviceCell))
163163
kernel = substitute(kernel, :(Data.CellArray), :(Data.DeviceCellArray))
164-
kernel = substitute(kernel, :(Data.TArray), :(Data.DeviceTArray))
165-
kernel = substitute(kernel, :(Data.TCell), :(Data.DeviceTCell))
164+
kernel = substitute(kernel, :(Data.TArray), :(Data.DeviceTArray))
165+
kernel = substitute(kernel, :(Data.TCell), :(Data.DeviceTCell))
166166
kernel = substitute(kernel, :(Data.TCellArray), :(Data.DeviceTCellArray))
167167
end
168168
kernel = push_to_signature!(kernel, :($RANGES_VARNAME::$RANGES_TYPE))
@@ -297,7 +297,7 @@ end
297297
## @SYNCHRONIZE FUNCTIONS
298298

299299
synchronize_cuda(args::Union{Symbol,Expr}...) = :(CUDA.synchronize($(args...)))
300-
synchronize_amdgpu(args::Union{Symbol,Expr}...) = :(ParallelStencil.ParallelKernel.synchronize_rocstream($(args...))) #TODO: this supports currently only stream synchronization. Whole GPU synchronization (all streams) should also be supported.
300+
synchronize_amdgpu(args::Union{Symbol,Expr}...) = :(AMDGPU.synchronize($(args...)))
301301
synchronize_threads(args::Union{Symbol,Expr}...) = :(begin end)
302302

303303

@@ -518,13 +518,13 @@ function create_gpu_call(package::Symbol, nblocks::Union{Symbol,Expr}, nthreads:
518518
if launch
519519
if !isnothing(shmem)
520520
if (package == PKG_CUDA) shmem_expr = :(shmem = $shmem)
521-
elseif (package == PKG_AMDGPU) shmem_expr = :(localmem = $shmem)
521+
elseif (package == PKG_AMDGPU) shmem_expr = :(shmem = $shmem)
522522
else @ModuleInternalError("unsupported GPU package (obtained: $package).")
523523
end
524524
backend_kwargs_expr = (backend_kwargs_expr..., shmem_expr)
525525
end
526526
if (package == PKG_CUDA) return :( CUDA.@cuda blocks=$nblocks threads=$nthreads stream=$stream $(backend_kwargs_expr...) $kernelcall; $synccall )
527-
elseif (package == PKG_AMDGPU) return :( ParallelStencil.ParallelKernel.push_signal!($stream, AMDGPU.@roc gridsize=($nblocks .* $nthreads) groupsize=$nthreads $(backend_kwargs_expr...) queue=$stream.queue $kernelcall); $synccall )
527+
elseif (package == PKG_AMDGPU) return :( AMDGPU.@roc gridsize=$nblocks groupsize=$nthreads stream=$stream $(backend_kwargs_expr...) $kernelcall; $synccall )
528528
else @ModuleInternalError("unsupported GPU package (obtained: $package).")
529529
end
530530
else
@@ -544,7 +544,7 @@ end
544544

545545
function default_stream(package)
546546
if (package == PKG_CUDA) return :(CUDA.stream()) # Use the default stream of the task.
547-
elseif (package == PKG_AMDGPU) return :(ParallelStencil.ParallelKernel.get_default_rocstream())
547+
elseif (package == PKG_AMDGPU) return :(AMDGPU.HIPStream()) # Use the default stream of the task.
548548
else @ModuleInternalError("unsupported GPU package (obtained: $package).")
549549
end
550550
end

src/ParallelKernel/shared.jl

Lines changed: 6 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -103,50 +103,20 @@ macro rangelengths() esc(:(($(RANGELENGTHS_VARNAMES...),))) end
103103
end
104104

105105
@static if ENABLE_AMDGPU
106-
## Stream implementation for AMDGPU. It is the responsibility of the package developers to keep the ROCStreams consistent by pushing each signal received from a kernel launch on queue=stream.queue to the ROCStream using push_signal!. If ROCQueues are to be exposed to the users, then a macro should be implemented to automatize this (e.g. overwrite @roc to accept the kwarg stream...).
107-
mutable struct ROCStream
108-
queue::AMDGPU.ROCQueue
109-
last_signal::Union{Nothing, AMDGPU.ROCKernelSignal}
110-
111-
function ROCStream(device::ROCDevice; priority::Union{Nothing,Symbol}=nothing)
112-
queue = ROCQueue(device; priority=priority)
113-
new(queue, nothing)
114-
end
115-
function ROCStream(queue::ROCQueue)
116-
new(queue, nothing)
117-
end
118-
end
119-
120-
function push_signal!(stream::ROCStream, signal::AMDGPU.ROCKernelSignal)
121-
AMDGPU.barrier_and!(stream.queue, [signal])
122-
stream.last_signal = signal
123-
end
124-
125-
function synchronize_rocstream(stream::ROCStream)
126-
AMDGPU.wait(stream.last_signal)
127-
end
128-
129106
let
130-
global get_priority_rocstream, get_rocstream, get_default_rocstream
131-
priority_rocstreams = Array{ROCStream}(undef, 0)
132-
rocstreams = Array{ROCStream}(undef, 0)
133-
default_rocstreams = Array{ROCStream}(undef, 0)
107+
global get_priority_rocstream, get_rocstream
108+
priority_rocstreams = Array{AMDGPU.HIPStream}(undef, 0)
109+
rocstreams = Array{AMDGPU.HIPStream}(undef, 0)
134110

135111
function get_priority_rocstream(id::Integer)
136-
while (id > length(priority_rocstreams)) push!(priority_rocstreams, ROCStream(AMDGPU.default_device(); priority=:high)) end # :high is max priority.
112+
while (id > length(priority_rocstreams)) push!(priority_rocstreams, AMDGPU.HIPStream(:high)) end
137113
return priority_rocstreams[id]
138114
end
139115

140-
#TODO: check if set priority to normal!
141116
function get_rocstream(id::Integer)
142-
while (id > length(rocstreams)) push!(rocstreams, ROCStream(AMDGPU.default_device(); priority=:low)) end # :low min priority.
117+
while (id > length(rocstreams)) push!(rocstreams, AMDGPU.HIPStream(:low)) end
143118
return rocstreams[id]
144119
end
145-
146-
function get_default_rocstream()
147-
if (length(default_rocstreams)==0) push!(default_rocstreams, ROCStream(AMDGPU.default_queue())) end # NOTE: this implementation is extensible to multiple defaults as available in CUDA for streams.
148-
return default_rocstreams[1]
149-
end
150120
end
151121
end
152122

@@ -267,7 +237,7 @@ function split_parallel_args(args; is_call=true)
267237
posargs, kwargs = split_args(args[1:end-1])
268238
kernelarg = args[end]
269239
if (is_call && any([x.args[1] in [:blocks, :threads] for x in kwargs])) @KeywordArgumentError("Invalid keyword argument in @parallel <kernelcall>: blocks / threads. They must be passed as positional arguments or been omited.") end
270-
if (is_call && any([x.args[1] in [:groupsize, :gridsize, :queue] for x in kwargs])) @KeywordArgumentError("Invalid keyword argument in @parallel <kernelcall>: groupsize / gridsize / queue. CUDA nomenclature and concepts are to be used for @parallel calls (and kernels).") end
240+
if (is_call && any([x.args[1] in [:groupsize, :gridsize] for x in kwargs])) @KeywordArgumentError("Invalid keyword argument in @parallel <kernelcall>: groupsize / gridsize. CUDA nomenclature and concepts are to be used for @parallel calls (and kernels).") end
271241
return posargs, kwargs, kernelarg
272242
end
273243

src/parallel.jl

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -67,14 +67,14 @@ macro parallel_async(args...) check_initialized(); checkargs_parallel(args...);
6767

6868
## MACROS FORCING PACKAGE, IGNORING INITIALIZATION
6969

70-
macro parallel_cuda(args...) check_initialized(); checkargs_parallel(args...); esc(parallel(__source__, __module__, args...; package=PKG_CUDA)); end
71-
macro parallel_amdgpu(args...) check_initialized(); checkargs_parallel(args...); esc(parallel(__source__, __module__, args...; package=PKG_AMDGPU)); end
72-
macro parallel_threads(args...) check_initialized(); checkargs_parallel(args...); esc(parallel(__source__, __module__, args...; package=PKG_THREADS)); end
70+
macro parallel_cuda(args...) check_initialized(); checkargs_parallel(args...); esc(parallel(__source__, __module__, args...; package=PKG_CUDA)); end
71+
macro parallel_amdgpu(args...) check_initialized(); checkargs_parallel(args...); esc(parallel(__source__, __module__, args...; package=PKG_AMDGPU)); end
72+
macro parallel_threads(args...) check_initialized(); checkargs_parallel(args...); esc(parallel(__source__, __module__, args...; package=PKG_THREADS)); end
7373
macro parallel_indices_cuda(args...) check_initialized(); checkargs_parallel_indices(args...); esc(parallel_indices(__source__, __module__, args...; package=PKG_CUDA)); end
74-
macro parallel_indices_amdgpu(args...) check_initialized(); checkargs_parallel_indices(args...); esc(parallel_indices(__source__, __module__, args...; package=PKG_AMDGPU)); end
74+
macro parallel_indices_amdgpu(args...) check_initialized(); checkargs_parallel_indices(args...); esc(parallel_indices(__source__, __module__, args...; package=PKG_AMDGPU)); end
7575
macro parallel_indices_threads(args...) check_initialized(); checkargs_parallel_indices(args...); esc(parallel_indices(__source__, __module__, args...; package=PKG_THREADS)); end
7676
macro parallel_async_cuda(args...) check_initialized(); checkargs_parallel(args...); esc(parallel_async(__source__, __module__, args...; package=PKG_CUDA)); end
77-
macro parallel_async_amdgpu(args...) check_initialized(); checkargs_parallel(args...); esc(parallel_async(__source__, __module__, args...; package=PKG_AMDGPU)); end
77+
macro parallel_async_amdgpu(args...) check_initialized(); checkargs_parallel(args...); esc(parallel_async(__source__, __module__, args...; package=PKG_AMDGPU)); end
7878
macro parallel_async_threads(args...) check_initialized(); checkargs_parallel(args...); esc(parallel_async(__source__, __module__, args...; package=PKG_THREADS)); end
7979

8080

@@ -268,7 +268,7 @@ end
268268
function parallel_call_memopt(caller::Module, kernelcall::Expr, backend_kwargs_expr::Array, async::Bool; memopt::Bool=false, configcall::Expr=kernelcall)
269269
metadata_call = create_metadata_call(configcall)
270270
metadata_module = metadata_call
271-
loopdim = :($(metadata_module).loopdim)
271+
loopdim = :($(metadata_module).loopdim)
272272
is_parallel_kernel = :($(metadata_module).is_parallel_kernel)
273273
ranges = :( ($is_parallel_kernel) ? ParallelStencil.get_ranges_memopt($loopdim, $(configcall.args[2:end]...)) : ParallelStencil.ParallelKernel.get_ranges($(configcall.args[2:end]...)))
274274
parallel_call_memopt(caller, ranges, kernelcall, backend_kwargs_expr, async; memopt=memopt, configcall=configcall)

test/ParallelKernel/test_kernel_language.jl

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -32,14 +32,14 @@ end
3232
# @test @prettystring(1, @pk_show()) == "CUDA.@cushow"
3333
# @test @prettystring(1, @pk_println()) == "CUDA.@cuprintln"
3434
elseif $package == $AMDGPU
35-
@test @prettystring(1, @gridDim()) == "AMDGPU.gridDimWG()"
35+
@test @prettystring(1, @gridDim()) == "AMDGPU.gridGroupDim()"
3636
@test @prettystring(1, @blockIdx()) == "AMDGPU.workgroupIdx()"
3737
@test @prettystring(1, @blockDim()) == "AMDGPU.workgroupDim()"
3838
@test @prettystring(1, @threadIdx()) == "AMDGPU.workitemIdx()"
3939
@test @prettystring(1, @sync_threads()) == "AMDGPU.sync_workgroup()"
40-
#@test @prettystring(1, @sharedMem(Float32, (2,3))) == "" #TODO: not yet supported for AMDGPU
40+
# @test @prettystring(1, @sharedMem(Float32, (2,3))) == "" #TODO: not yet supported for AMDGPU
4141
# @test @prettystring(1, @pk_show()) == "CUDA.@cushow" #TODO: not yet supported for AMDGPU
42-
# @test @prettystring(1, @pk_println()) == "CUDA.@rocprintln"
42+
# @test @prettystring(1, @pk_println()) == "AMDGPU.@rocprintln"
4343
elseif $package == $PKG_THREADS
4444
@test @prettystring(1, @gridDim()) == "ParallelStencil.ParallelKernel.@gridDim_cpu"
4545
@test @prettystring(1, @blockIdx()) == "ParallelStencil.ParallelKernel.@blockIdx_cpu"

0 commit comments

Comments
 (0)