Skip to content

Commit aaea0a8

Browse files
authored
UCP/PERF: UCP tests with configurable level/api/batch (#10893)
1 parent c73ee1a commit aaea0a8

File tree

18 files changed

+505
-219
lines changed

18 files changed

+505
-219
lines changed

contrib/test_jenkins.sh

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -657,6 +657,7 @@ run_ucx_perftest_cuda_device() {
657657
ucx_perftest="$ucx_inst/bin/ucx_perftest"
658658
ucp_test_args="-b $ucx_inst_ptest/test_types_ucp_device_cuda"
659659

660+
# TODO: Run on all GPUs & NICs combinations
660661
# TODO: Run on all GPUs & NICs combinations
661662
ucp_client_args="-a cuda:0 $(hostname)"
662663
gda_tls="cuda_copy,rc,rc_gda"
Lines changed: 16 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,19 @@
11
#
22
# UCP basic device cuda tests
33
#
4-
ucp_device_cuda_bw_1k_1thread -t ucp_put_multi_bw -m cuda -s 1024 -n 10000
5-
# TODO - Increase number of threads after adjusting perftest.
6-
ucp_device_cuda_bw_1k_128threads -t ucp_put_multi_bw -m cuda -s 1024 -n 10000 -T 32
7-
ucp_device_cuda_lat_1k_1thread -t ucp_put_multi_lat -m cuda -s 1024 -n 10000
8-
# TODO - Increase number of threads after adjusting perftest.
9-
ucp_device_cuda_lat_1k_128threads -t ucp_put_multi_lat -m cuda -s 1024 -n 10000 -T 32
4+
ucp_device_cuda_single_bw_1k_1thread -t ucp_put_single_bw -m cuda -s 1024 -n 10000
5+
ucp_device_cuda_single_lat_1k_1thread -t ucp_put_single_lat -m cuda -s 1024 -n 10000
6+
ucp_device_cuda_multi_bw_1k_1thread -t ucp_put_multi_bw -m cuda -s 256:8 -n 10000
7+
ucp_device_cuda_multi_lat_1k_1thread -t ucp_put_multi_lat -m cuda -s 256:8 -n 10000
8+
ucp_device_cuda_partial_bw_1k_1thread -t ucp_put_partial_bw -m cuda -s 256:8 -n 10000
9+
ucp_device_cuda_partial_lat_1k_1thread -t ucp_put_partial_lat -m cuda -s 256:8 -n 10000
10+
11+
# Increase the number of threads after the following fixes:
12+
# - Use thread-local memory instead of shared memory for requests (shared memory limit is 48K)
13+
# - Fix WQE size limit of 1024
14+
ucp_device_cuda_single_bw_1k_32threads -t ucp_put_single_bw -m cuda -s 1024 -n 10000 -T 32
15+
ucp_device_cuda_single_lat_1k_32threads -t ucp_put_single_lat -m cuda -s 1024 -n 10000 -T 32
16+
ucp_device_cuda_multi_bw_1k_32threads -t ucp_put_multi_bw -m cuda -s 256:8 -n 10000 -T 32 -O 2
17+
ucp_device_cuda_multi_lat_1k_32threads -t ucp_put_multi_lat -m cuda -s 256:8 -n 10000 -T 32 -O 2
18+
ucp_device_cuda_partial_bw_1k_32threads -t ucp_put_partial_bw -m cuda -s 256:8 -n 10000 -T 32 -O 2
19+
ucp_device_cuda_partial_lat_1k_32threads -t ucp_put_partial_lat -m cuda -s 256:8 -n 10000 -T 32 -O 2

src/tools/perf/api/libperf.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
#define UCX_LIBPERF_H
1212

1313
#include <ucs/sys/compiler.h>
14+
#include <ucs/sys/device_code.h>
1415

1516
BEGIN_C_DECLS
1617

@@ -30,7 +31,9 @@ typedef enum {
3031
typedef enum {
3132
UCX_PERF_CMD_AM,
3233
UCX_PERF_CMD_PUT,
34+
UCX_PERF_CMD_PUT_SINGLE,
3335
UCX_PERF_CMD_PUT_MULTI,
36+
UCX_PERF_CMD_PUT_PARTIAL,
3437
UCX_PERF_CMD_GET,
3538
UCX_PERF_CMD_ADD,
3639
UCX_PERF_CMD_FADD,
@@ -265,6 +268,7 @@ typedef struct ucx_perf_params {
265268
ucs_memory_type_t recv_mem_type; /* Recv memory type */
266269
ucx_perf_accel_dev_t send_device; /* Send memory device for gdaki */
267270
ucx_perf_accel_dev_t recv_device; /* Recv memory device for gdaki */
271+
ucs_device_level_t device_level; /* Device level for gdaki */
268272
unsigned flags; /* See ucx_perf_test_flags. */
269273

270274
size_t *msg_size_list; /* Test message sizes list. The size
@@ -284,6 +288,7 @@ typedef struct ucx_perf_params {
284288
double percentile_rank; /* The percentile rank of the percentile reported
285289
in latency tests */
286290
unsigned device_thread_count; /* Number of device threads */
291+
unsigned device_block_count; /* Number of device blocks */
287292

288293
void *rte_group; /* Opaque RTE group handle */
289294
ucx_perf_rte_t *rte; /* RTE functions used to exchange data */

src/tools/perf/cuda/cuda_kernel.cuh

Lines changed: 66 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -49,15 +49,16 @@ ucx_perf_cuda_update_report(ucx_perf_cuda_context &ctx,
4949
}
5050
}
5151

52-
UCS_F_DEVICE uint64_t *ucx_perf_cuda_get_sn(const void *address, size_t length)
52+
static UCS_F_ALWAYS_INLINE uint64_t *
53+
ucx_perf_cuda_get_sn(const void *address, size_t length)
5354
{
54-
return (uint64_t*)UCS_PTR_BYTE_OFFSET(address, length - sizeof(uint64_t));
55+
return (uint64_t*)UCS_PTR_BYTE_OFFSET(address, length);
5556
}
5657

57-
UCS_F_DEVICE void ucx_perf_cuda_wait_sn(volatile uint64_t *sn, uint64_t value)
58+
UCS_F_DEVICE void ucx_perf_cuda_wait_sn(const uint64_t *sn, uint64_t value)
5859
{
5960
if (threadIdx.x == 0) {
60-
while (*sn < value);
61+
while (ucs_device_atomic64_read(sn) < value);
6162
}
6263
__syncthreads();
6364
}
@@ -79,8 +80,8 @@ UCS_F_DEVICE size_t ucx_bitset_popcount(const uint8_t *set, size_t bits) {
7980
return count;
8081
}
8182

82-
UCS_F_DEVICE size_t ucx_bitset_ffns(const uint8_t *set, size_t bits,
83-
size_t from)
83+
UCS_F_DEVICE size_t
84+
ucx_bitset_ffns(const uint8_t *set, size_t bits, size_t from)
8485
{
8586
for (size_t i = from; i < bits; i++) {
8687
if (!UCX_BIT_GET(set, i)) {
@@ -90,6 +91,55 @@ UCS_F_DEVICE size_t ucx_bitset_ffns(const uint8_t *set, size_t bits,
9091
return bits;
9192
}
9293

94+
#define UCX_KERNEL_CMD(level, cmd, blocks, threads, shared_size, func, ...) \
95+
do { \
96+
switch (cmd) { \
97+
case UCX_PERF_CMD_PUT_SINGLE: \
98+
func<level, UCX_PERF_CMD_PUT_SINGLE><<<blocks, threads, shared_size>>>(__VA_ARGS__); \
99+
break; \
100+
case UCX_PERF_CMD_PUT_MULTI: \
101+
func<level, UCX_PERF_CMD_PUT_MULTI><<<blocks, threads, shared_size>>>(__VA_ARGS__); \
102+
break; \
103+
case UCX_PERF_CMD_PUT_PARTIAL: \
104+
func<level, UCX_PERF_CMD_PUT_PARTIAL><<<blocks, threads, shared_size>>>(__VA_ARGS__); \
105+
break; \
106+
default: \
107+
ucs_error("Unsupported cmd: %d", cmd); \
108+
break; \
109+
} \
110+
} while (0)
111+
112+
#define UCX_KERNEL_DISPATCH(perf, func, ...) \
113+
do { \
114+
ucs_device_level_t _level = perf.params.device_level; \
115+
ucx_perf_cmd_t _cmd = perf.params.command; \
116+
unsigned _blocks = perf.params.device_block_count; \
117+
unsigned _threads = perf.params.device_thread_count; \
118+
size_t _shared_size = _threads * perf.params.max_outstanding * \
119+
sizeof(ucp_device_request_t); \
120+
switch (_level) { \
121+
case UCS_DEVICE_LEVEL_THREAD: \
122+
UCX_KERNEL_CMD(UCS_DEVICE_LEVEL_THREAD, _cmd, _blocks, _threads,\
123+
_shared_size, func, __VA_ARGS__); \
124+
break; \
125+
case UCS_DEVICE_LEVEL_WARP: \
126+
UCX_KERNEL_CMD(UCS_DEVICE_LEVEL_WARP, _cmd, _blocks, _threads,\
127+
_shared_size, func, __VA_ARGS__); \
128+
break; \
129+
case UCS_DEVICE_LEVEL_BLOCK: \
130+
UCX_KERNEL_CMD(UCS_DEVICE_LEVEL_BLOCK, _cmd, _blocks, _threads,\
131+
_shared_size, func, __VA_ARGS__); \
132+
break; \
133+
case UCS_DEVICE_LEVEL_GRID: \
134+
UCX_KERNEL_CMD(UCS_DEVICE_LEVEL_GRID, _cmd, _blocks, _threads,\
135+
_shared_size, func, __VA_ARGS__); \
136+
break; \
137+
default: \
138+
ucs_error("Unsupported level: %d", _level); \
139+
break; \
140+
} \
141+
} while (0)
142+
93143
class ucx_perf_cuda_test_runner {
94144
public:
95145
ucx_perf_cuda_test_runner(ucx_perf_context_t &perf) : m_perf(perf)
@@ -110,17 +160,17 @@ public:
110160
CUDA_CALL_WARN(cudaFreeHost, m_cpu_ctx);
111161
}
112162

113-
ucx_perf_cuda_context &gpu_ctx() const { return *m_gpu_ctx; }
114-
115-
void wait_for_kernel(size_t msg_length)
163+
void wait_for_kernel()
116164
{
165+
size_t msg_length = ucx_perf_get_message_size(&m_perf.params);
117166
ucx_perf_counter_t last_completed = 0;
118167
ucx_perf_counter_t completed = m_cpu_ctx->completed_iters;
119-
while (1) {
168+
unsigned thread_count = m_perf.params.device_thread_count;
169+
while (true) {
120170
ucx_perf_counter_t delta = completed - last_completed;
121171
if (delta > 0) {
122172
// TODO: calculate latency percentile on kernel
123-
ucx_perf_update(&m_perf, delta, msg_length);
173+
ucx_perf_update(&m_perf, delta, delta * thread_count, msg_length);
124174
} else if (completed >= m_perf.max_iter) {
125175
break;
126176
}
@@ -133,6 +183,8 @@ public:
133183

134184
protected:
135185
ucx_perf_context_t &m_perf;
186+
ucx_perf_cuda_context *m_cpu_ctx;
187+
ucx_perf_cuda_context *m_gpu_ctx;
136188

137189
private:
138190
void init_ctx()
@@ -142,17 +194,16 @@ private:
142194
CUDA_CALL(, UCS_LOG_LEVEL_FATAL, cudaHostGetDevicePointer,
143195
&m_gpu_ctx, m_cpu_ctx, 0);
144196
}
145-
146-
ucx_perf_cuda_context *m_cpu_ctx;
147-
ucx_perf_cuda_context *m_gpu_ctx;
148197
};
149198

150199

151200
template<typename Runner> ucs_status_t
152201
ucx_perf_cuda_dispatch(ucx_perf_context_t *perf)
153202
{
154203
Runner runner(*perf);
155-
if (perf->params.command == UCX_PERF_CMD_PUT_MULTI) {
204+
if ((perf->params.command == UCX_PERF_CMD_PUT_MULTI) ||
205+
(perf->params.command == UCX_PERF_CMD_PUT_SINGLE) ||
206+
(perf->params.command == UCX_PERF_CMD_PUT_PARTIAL)) {
156207
if (perf->params.test_type == UCX_PERF_TEST_TYPE_PINGPONG) {
157208
return runner.run_pingpong();
158209
} else if (perf->params.test_type == UCX_PERF_TEST_TYPE_STREAM_UNI) {

0 commit comments

Comments
 (0)