@@ -49,15 +49,16 @@ ucx_perf_cuda_update_report(ucx_perf_cuda_context &ctx,
     }
 }
 
-UCS_F_DEVICE uint64_t *ucx_perf_cuda_get_sn(const void *address, size_t length)
+static UCS_F_ALWAYS_INLINE uint64_t *
+ucx_perf_cuda_get_sn(const void *address, size_t length)
 {
-    return (uint64_t*)UCS_PTR_BYTE_OFFSET(address, length - sizeof(uint64_t));
+    return (uint64_t*)UCS_PTR_BYTE_OFFSET(address, length);
 }
 
-UCS_F_DEVICE void ucx_perf_cuda_wait_sn(volatile uint64_t *sn, uint64_t value)
+UCS_F_DEVICE void ucx_perf_cuda_wait_sn(const uint64_t *sn, uint64_t value)
 {
     if (threadIdx.x == 0) {
-        while (*sn < value);
+        while (ucs_device_atomic64_read(sn) < value);
     }
     __syncthreads();
 }
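A minimal device-side sketch of how these two helpers pair up (the kernel name and buffer layout below are assumptions, not part of this header): the 64-bit sequence number sits right after the payload, and the receiver spins on it before touching the data.

/* Illustrative only: block until the sender publishes `expected_sn` in the
 * counter stored right after the `length`-byte payload. */
__global__ void example_recv_wait(const void *rx_buffer, size_t length,
                                  uint64_t expected_sn)
{
    uint64_t *sn = ucx_perf_cuda_get_sn(rx_buffer, length);

    /* Thread 0 polls via ucs_device_atomic64_read(); the __syncthreads()
     * inside releases the rest of the block once the value arrives. */
    ucx_perf_cuda_wait_sn(sn, expected_sn);

    /* ... the first `length` bytes of rx_buffer are now safe to read ... */
}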
@@ -79,8 +80,8 @@ UCS_F_DEVICE size_t ucx_bitset_popcount(const uint8_t *set, size_t bits) {
     return count;
 }
 
-UCS_F_DEVICE size_t ucx_bitset_ffns(const uint8_t *set, size_t bits,
-                                    size_t from)
+UCS_F_DEVICE size_t
+ucx_bitset_ffns(const uint8_t *set, size_t bits, size_t from)
 {
     for (size_t i = from; i < bits; i++) {
         if (!UCX_BIT_GET(set, i)) {
@@ -90,6 +91,55 @@ UCS_F_DEVICE size_t ucx_bitset_ffns(const uint8_t *set, size_t bits,
     return bits;
 }
 
+#define UCX_KERNEL_CMD(level, cmd, blocks, threads, shared_size, func, ...) \
+    do { \
+        switch (cmd) { \
+        case UCX_PERF_CMD_PUT_SINGLE: \
+            func<level, UCX_PERF_CMD_PUT_SINGLE><<<blocks, threads, shared_size>>>(__VA_ARGS__); \
+            break; \
+        case UCX_PERF_CMD_PUT_MULTI: \
+            func<level, UCX_PERF_CMD_PUT_MULTI><<<blocks, threads, shared_size>>>(__VA_ARGS__); \
+            break; \
+        case UCX_PERF_CMD_PUT_PARTIAL: \
+            func<level, UCX_PERF_CMD_PUT_PARTIAL><<<blocks, threads, shared_size>>>(__VA_ARGS__); \
+            break; \
+        default: \
+            ucs_error("Unsupported cmd: %d", cmd); \
+            break; \
+        } \
+    } while (0)
+
+#define UCX_KERNEL_DISPATCH(perf, func, ...) \
+    do { \
+        ucs_device_level_t _level = perf.params.device_level; \
+        ucx_perf_cmd_t _cmd       = perf.params.command; \
+        unsigned _blocks          = perf.params.device_block_count; \
+        unsigned _threads         = perf.params.device_thread_count; \
+        size_t _shared_size       = _threads * perf.params.max_outstanding * \
+                                    sizeof(ucp_device_request_t); \
+        switch (_level) { \
+        case UCS_DEVICE_LEVEL_THREAD: \
+            UCX_KERNEL_CMD(UCS_DEVICE_LEVEL_THREAD, _cmd, _blocks, _threads, \
+                           _shared_size, func, __VA_ARGS__); \
+            break; \
+        case UCS_DEVICE_LEVEL_WARP: \
+            UCX_KERNEL_CMD(UCS_DEVICE_LEVEL_WARP, _cmd, _blocks, _threads, \
+                           _shared_size, func, __VA_ARGS__); \
+            break; \
+        case UCS_DEVICE_LEVEL_BLOCK: \
+            UCX_KERNEL_CMD(UCS_DEVICE_LEVEL_BLOCK, _cmd, _blocks, _threads, \
+                           _shared_size, func, __VA_ARGS__); \
+            break; \
+        case UCS_DEVICE_LEVEL_GRID: \
+            UCX_KERNEL_CMD(UCS_DEVICE_LEVEL_GRID, _cmd, _blocks, _threads, \
+                           _shared_size, func, __VA_ARGS__); \
+            break; \
+        default: \
+            ucs_error("Unsupported level: %d", _level); \
+            break; \
+        } \
+    } while (0)
+
 class ucx_perf_cuda_test_runner {
 public:
     ucx_perf_cuda_test_runner(ucx_perf_context_t &perf) : m_perf(perf)
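A possible call site for the two macros above, sketched with an assumed kernel name: UCX_KERNEL_DISPATCH switches on the requested device level, UCX_KERNEL_CMD switches on the command, and the matching template instantiation is launched with the block/thread counts and dynamic shared-memory size taken from perf.params.

/* Hypothetical kernel and launcher; only the two macros come from the header. */
template <ucs_device_level_t level, ucx_perf_cmd_t cmd>
__global__ void example_put_kernel(ucx_perf_cuda_context *gpu_ctx)
{
    /* a real kernel would issue device-side puts and bump completed_iters */
}

static ucs_status_t example_launch(ucx_perf_context_t &perf,
                                   ucx_perf_cuda_context *gpu_ctx)
{
    /* Expands to example_put_kernel<level, cmd><<<blocks, threads, shmem>>>(gpu_ctx)
     * for the level/command combination found in perf.params. */
    UCX_KERNEL_DISPATCH(perf, example_put_kernel, gpu_ctx);
    return UCS_OK;
}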
@@ -110,17 +160,17 @@ public:
         CUDA_CALL_WARN(cudaFreeHost, m_cpu_ctx);
     }
 
-    ucx_perf_cuda_context &gpu_ctx() const { return *m_gpu_ctx; }
-
-    void wait_for_kernel(size_t msg_length)
+    void wait_for_kernel()
     {
+        size_t msg_length = ucx_perf_get_message_size(&m_perf.params);
         ucx_perf_counter_t last_completed = 0;
         ucx_perf_counter_t completed = m_cpu_ctx->completed_iters;
-        while (1) {
+        unsigned thread_count = m_perf.params.device_thread_count;
+        while (true) {
             ucx_perf_counter_t delta = completed - last_completed;
             if (delta > 0) {
                 // TODO: calculate latency percentile on kernel
-                ucx_perf_update(&m_perf, delta, msg_length);
+                ucx_perf_update(&m_perf, delta, delta * thread_count, msg_length);
             } else if (completed >= m_perf.max_iter) {
                 break;
             }
@@ -133,6 +183,8 @@ public:
 
 protected:
     ucx_perf_context_t &m_perf;
+    ucx_perf_cuda_context *m_cpu_ctx;
+    ucx_perf_cuda_context *m_gpu_ctx;
 
 private:
     void init_ctx()
@@ -142,17 +194,16 @@ private:
         CUDA_CALL(, UCS_LOG_LEVEL_FATAL, cudaHostGetDevicePointer,
                   &m_gpu_ctx, m_cpu_ctx, 0);
     }
-
-    ucx_perf_cuda_context *m_cpu_ctx;
-    ucx_perf_cuda_context *m_gpu_ctx;
 };
 
 
 template <typename Runner> ucs_status_t
 ucx_perf_cuda_dispatch(ucx_perf_context_t *perf)
 {
     Runner runner(*perf);
-    if (perf->params.command == UCX_PERF_CMD_PUT_MULTI) {
+    if ((perf->params.command == UCX_PERF_CMD_PUT_MULTI) ||
+        (perf->params.command == UCX_PERF_CMD_PUT_SINGLE) ||
+        (perf->params.command == UCX_PERF_CMD_PUT_PARTIAL)) {
         if (perf->params.test_type == UCX_PERF_TEST_TYPE_PINGPONG) {
             return runner.run_pingpong();
         } else if (perf->params.test_type == UCX_PERF_TEST_TYPE_STREAM_UNI) {
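For completeness, a hypothetical wiring of this entry point (the runner name is assumed): a backend instantiates the template with a class derived from ucx_perf_cuda_test_runner that provides run_pingpong() and the stream-uni path.

/* Hypothetical: example_cuda_runner stands in for a concrete runner class. */
static ucs_status_t ucx_perf_cuda_example_run(ucx_perf_context_t *perf)
{
    return ucx_perf_cuda_dispatch<example_cuda_runner>(perf);
}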