1
+ // MIT License
2
+ //
3
+ // Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved.
4
+ //
5
+ // Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ // of this software and associated documentation files (the "Software"), to deal
7
+ // in the Software without restriction, including without limitation the rights
8
+ // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ // copies of the Software, and to permit persons to whom the Software is
10
+ // furnished to do so, subject to the following conditions:
11
+ //
12
+ // The above copyright notice and this permission notice shall be included in all
13
+ // copies or substantial portions of the Software.
14
+ //
15
+ // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ // SOFTWARE.
22
+
23
#include <hip/hip_runtime.h>

#include <cstdint>
#include <cstdlib>
#include <iostream>
#include <random>
#include <type_traits>
#include <vector>
28
+
29
// Checks the result of a HIP runtime call. On failure, prints the error code,
// its human-readable description and the call site, then terminates so that
// later HIP_CHECKs cannot mask the first failing call.
// Wrapped in do { } while(0) so `HIP_CHECK(x);` is a single statement that is
// safe inside unbraced if/else branches.
#define HIP_CHECK(expression)                                  \
    do {                                                       \
        const hipError_t status = (expression);                \
        if (status != hipSuccess) {                            \
            std::cerr << "HIP error "                          \
                      << status << ": "                        \
                      << hipGetErrorString(status)             \
                      << " at " << __FILE__ << ":"             \
                      << __LINE__ << std::endl;                \
            std::exit(EXIT_FAILURE);                           \
        }                                                      \
    } while (0)
40
+
41
+ // [Sphinx template warp size block reduction kernel start]
42
// Lane-mask integer type matching the warp width: one bit per lane, so
// 32-lane warps use uint32_t and 64-lane warps use uint64_t.
template<uint32_t WarpSize>
using lane_mask_t = std::conditional_t<WarpSize == 32, uint32_t, uint64_t>;
44
+
45
// Sums `input` per block into `output[blockIdx.x]`, counting only elements
// whose lane bit is set in the per-warp `mask`. WarpSize is a compile-time
// constant (32 or 64) selected on the host to match the device.
template<uint32_t WarpSize>
__global__ void block_reduce(int* input, lane_mask_t<WarpSize>* mask, int* output, size_t size) {
    // Dynamically sized shared scratch buffer; the launch supplies its size
    // (one int per thread of the block).
    extern __shared__ int shared[];

    // Read of input with bounds check: out-of-range elements, and lanes whose
    // bit is cleared in this warp's mask, contribute 0 to the sum.
    auto read_global_safe = [&](const uint32_t i, const uint32_t lane_id, const uint32_t mask_id)
    {
        lane_mask_t<WarpSize> warp_mask = lane_mask_t<WarpSize>(1) << lane_id;
        return (i < size) && (mask[mask_id] & warp_mask) ? input[i] : 0;
    };

    const uint32_t tid = threadIdx.x,                // thread index within the block
                   lid = threadIdx.x % WarpSize,     // lane index within the warp
                   wid = threadIdx.x / WarpSize,     // warp index within the block
                   bid = blockIdx.x,                 // block index within the grid
                   gid = bid * blockDim.x + tid;     // global element index

    // Read input buffer to shared. Masks are stored one per warp, laid out
    // block-major: block * warps_per_block + warp.
    shared[tid] = read_global_safe(gid, lid, bid * (blockDim.x / WarpSize) + wid);
    __syncthreads();

    // Shared-memory tree reduction; stops once the stride drops below
    // WarpSize, leaving one partial sum per lane of the first warp.
    for (uint32_t i = blockDim.x / 2; i >= WarpSize; i /= 2)
    {
        if (tid < i)
            shared[tid] = shared[tid] + shared[tid + i];
        __syncthreads();
    }

    // Use local variable in warp reduction
    int result = shared[tid];
    __syncthreads();

    // Final reduction via warp shuffles. Because WarpSize is a compile-time
    // constant, this loop would be unrolled the same with the runtime warpSize.
    #pragma unroll
    for (uint32_t i = WarpSize / 2; i >= 1; i /= 2) {
        result = result + __shfl_down(result, i);
    }

    // Thread 0 now holds the block-wide sum; write result to output buffer.
    if (tid == 0)
        output[bid] = result;
};
88
+ // [Sphinx template warp size block reduction kernel end]
89
+
90
+ // [Sphinx template warp size mask generation start]
91
+ template <uint32_t WarpSize>
92
+ void generate_and_copy_mask (
93
+ void *d_mask,
94
+ std::vector<int >& vectorExpected,
95
+ int numOfBlocks,
96
+ int numberOfWarp,
97
+ int mask_size,
98
+ int mask_element_size) {
99
+
100
+ std::random_device rd;
101
+ std::mt19937_64 eng (rd ());
102
+
103
+ // Host side mask vector
104
+ std::vector<lane_mask_t <WarpSize>> mask (mask_size);
105
+ // Define uniform unsigned int distribution
106
+ std::uniform_int_distribution<lane_mask_t <WarpSize>> distr;
107
+ // Fill up the mask
108
+ for (int i=0 ; i < numOfBlocks; i++) {
109
+ int count = 0 ;
110
+ for (int j=0 ; j < numberOfWarp; j++) {
111
+ int mask_index = i * numberOfWarp + j;
112
+ mask[mask_index] = distr (eng);
113
+ if constexpr (WarpSize == 32 )
114
+ count += __builtin_popcount (mask[mask_index]);
115
+ else
116
+ count += __builtin_popcountll (mask[mask_index]);
117
+ }
118
+ vectorExpected[i]= count;
119
+ }
120
+
121
+ // Copy the mask array
122
+ HIP_CHECK (hipMemcpy (d_mask, mask.data (), mask_size * mask_element_size, hipMemcpyHostToDevice));
123
+ }
124
+ // [Sphinx template warp size mask generation end]
125
+
126
+ int main () {
127
+
128
+ int deviceId = 0 ;
129
+ int warpSizeHost;
130
+ HIP_CHECK (hipDeviceGetAttribute (&warpSizeHost, hipDeviceAttributeWarpSize, deviceId));
131
+ std::cout << " Warp size: " << warpSizeHost << std::endl;
132
+
133
+ constexpr int numOfBlocks = 16 ;
134
+ constexpr int threadsPerBlock = 1024 ;
135
+ const int numberOfWarp = threadsPerBlock / warpSizeHost;
136
+ const int mask_element_size = warpSizeHost == 32 ? sizeof (uint32_t ) : sizeof (uint64_t );
137
+ const int mask_size = numOfBlocks * numberOfWarp;
138
+ constexpr size_t arraySize = numOfBlocks * threadsPerBlock;
139
+
140
+ int *d_data, *d_results;
141
+ void *d_mask;
142
+ int initValue = 1 ;
143
+ std::vector<int > vectorInput (arraySize, initValue);
144
+ std::vector<int > vectorOutput (numOfBlocks);
145
+ std::vector<int > vectorExpected (numOfBlocks);
146
+ // Allocate device memory
147
+ HIP_CHECK (hipMalloc (&d_data, arraySize * sizeof (*d_data)));
148
+ HIP_CHECK (hipMalloc (&d_mask, mask_size * mask_element_size));
149
+ HIP_CHECK (hipMalloc (&d_results, numOfBlocks * sizeof (*d_results)));
150
+ // Host to Device copy of the input array
151
+ HIP_CHECK (hipMemcpy (d_data, vectorInput.data (), arraySize * sizeof (*d_data), hipMemcpyHostToDevice));
152
+
153
+ // [Sphinx template warp size select kernel start]
154
+ // Fill up the mask variable, copy to device and select the right kernel.
155
+ if (warpSizeHost == 32 ) {
156
+ // Generate and copy mask arrays
157
+ generate_and_copy_mask<32 >(d_mask, vectorExpected, numOfBlocks, numberOfWarp, mask_size, mask_element_size);
158
+
159
+ // Start the kernel
160
+ block_reduce<32 ><<<dim3 (numOfBlocks), dim3 (threadsPerBlock), threadsPerBlock * sizeof (*d_data)>>>(
161
+ d_data,
162
+ static_cast <uint32_t *>(d_mask),
163
+ d_results,
164
+ arraySize);
165
+ } else if (warpSizeHost == 64 ) {
166
+ // Generate and copy mask arrays
167
+ generate_and_copy_mask<64 >(d_mask, vectorExpected, numOfBlocks, numberOfWarp, mask_size, mask_element_size);
168
+
169
+ // Start the kernel
170
+ block_reduce<64 ><<<dim3 (numOfBlocks), dim3 (threadsPerBlock), threadsPerBlock * sizeof (*d_data)>>>(
171
+ d_data,
172
+ static_cast <uint64_t *>(d_mask),
173
+ d_results,
174
+ arraySize);
175
+ } else {
176
+ std::cerr << " Unsupported warp size." << std::endl;
177
+ return 0 ;
178
+ }
179
+ // [Sphinx template warp size select kernel end]
180
+
181
+ // Check the kernel launch
182
+ HIP_CHECK (hipGetLastError ());
183
+ // Check for kernel execution error
184
+ HIP_CHECK (hipDeviceSynchronize ());
185
+ // Device to Host copy of the result
186
+ HIP_CHECK (hipMemcpy (vectorOutput.data (), d_results, numOfBlocks * sizeof (*d_results), hipMemcpyDeviceToHost));
187
+
188
+ // Verify results
189
+ bool passed = true ;
190
+ for (size_t i = 0 ; i < numOfBlocks; ++i) {
191
+ if (vectorOutput[i] != vectorExpected[i]) {
192
+ passed = false ;
193
+ std::cerr << " Validation failed! Expected " << vectorExpected[i] << " got " << vectorOutput[i] << " at index: " << i << std::endl;
194
+ }
195
+ }
196
+ if (passed){
197
+ std::cout << " Execution completed successfully." << std::endl;
198
+ }else {
199
+ std::cerr << " Execution failed." << std::endl;
200
+ }
201
+
202
+ // Cleanup
203
+ HIP_CHECK (hipFree (d_data));
204
+ HIP_CHECK (hipFree (d_mask));
205
+ HIP_CHECK (hipFree (d_results));
206
+ return 0 ;
207
+ }