Skip to content

Commit c73ee1a

Browse files
authored
UCP/PROTO: Added option to use single network device. (#10870)
1 parent 0c6268a commit c73ee1a

File tree

9 files changed

+477
-16
lines changed

9 files changed

+477
-16
lines changed

src/ucp/api/ucp.h

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -125,7 +125,8 @@ enum ucp_params_field {
125125
UCP_PARAM_FIELD_MT_WORKERS_SHARED = UCS_BIT(5), /**< mt_workers_shared */
126126
UCP_PARAM_FIELD_ESTIMATED_NUM_EPS = UCS_BIT(6), /**< estimated_num_eps */
127127
UCP_PARAM_FIELD_ESTIMATED_NUM_PPN = UCS_BIT(7), /**< estimated_num_ppn */
128-
UCP_PARAM_FIELD_NAME = UCS_BIT(8) /**< name */
128+
UCP_PARAM_FIELD_NAME = UCS_BIT(8), /**< name */
129+
UCP_PARAM_FIELD_NODE_LOCAL_ID = UCS_BIT(9)
129130
};
130131

131132

@@ -1144,6 +1145,16 @@ typedef struct ucp_params {
11441145
* unique name will be created for you.
11451146
*/
11461147
const char *name;
1148+
1149+
/**
1150+
* An optimization hint for a single node. For example, when used from MPI or
1151+
* OpenSHMEM libraries, this number will specify the local identificator on
1152+
* a single node in the job. Does not affect semantics, only transport
1153+
* selection criteria and the resulting performance.
1154+
* The value can be also set by the UCX_LOCAL_NODE_ID environment variable,
1155+
* which will override the id set by @e node_local_id
1156+
*/
1157+
size_t node_local_id;
11471158
} ucp_params_t;
11481159

11491160

src/ucp/core/ucp_context.c

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -577,6 +577,16 @@ static ucs_config_field_t ucp_context_config_table[] = {
577577
ucs_offsetof(ucp_context_config_t, connect_all_to_all),
578578
UCS_CONFIG_TYPE_BOOL},
579579

580+
{"SINGLE_NET_DEVICE", "n", "Use only one network device for all protocols.",
581+
ucs_offsetof(ucp_context_config_t, proto_use_single_net_device),
582+
UCS_CONFIG_TYPE_BOOL},
583+
584+
{"NODE_LOCAL_ID", "auto",
585+
"An optimization hint for the local identificator on a single node. Does \n"
586+
"not affect semantics, only transport selection criteria and the \n"
587+
"resulting performance.",
588+
ucs_offsetof(ucp_context_config_t, node_local_id), UCS_CONFIG_TYPE_ULUNITS},
589+
580590
{NULL}
581591
};
582592

@@ -1999,6 +2009,9 @@ static void ucp_apply_params(ucp_context_h context, const ucp_params_t *params,
19992009
estimated_num_ppn,
20002010
ESTIMATED_NUM_PPN, 1);
20012011

2012+
context->config.node_local_id = UCP_PARAM_FIELD_VALUE(params, node_local_id,
2013+
NODE_LOCAL_ID, 0);
2014+
20022015
if ((params->field_mask & UCP_PARAM_FIELD_MT_WORKERS_SHARED) &&
20032016
params->mt_workers_shared) {
20042017
context->mt_lock.mt_type = mt_type;
@@ -2108,6 +2121,12 @@ static ucs_status_t ucp_fill_config(ucp_context_h context,
21082121
ucs_debug("estimated number of endpoints per node is %d",
21092122
context->config.est_num_ppn);
21102123

2124+
if (context->config.ext.node_local_id != UCS_ULUNITS_AUTO) {
2125+
/* node_local_id was set via the env variable. Override current value */
2126+
context->config.node_local_id = context->config.ext.node_local_id;
2127+
}
2128+
ucs_debug("node local id is %lu", context->config.node_local_id);
2129+
21112130
if (UCS_CONFIG_DBL_IS_AUTO(context->config.ext.bcopy_bw)) {
21122131
/* bcopy_bw wasn't set via the env variable. Calculate the value */
21132132
if (context->config.ext.proto_enable) {

src/ucp/core/ucp_context.h

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -215,6 +215,10 @@ typedef struct ucp_context_config {
215215
/** Extend endpoint lanes connections of each local device to all remote
216216
* devices */
217217
int connect_all_to_all;
218+
/** Use only one network device for all protocols */
219+
int proto_use_single_net_device;
220+
/** Local identificator on a single node */
221+
unsigned long node_local_id;
218222
} ucp_context_config_t;
219223

220224

@@ -415,6 +419,9 @@ typedef struct ucp_context {
415419
/* How many endpoints are expected to be created on single node */
416420
int est_num_ppn;
417421

422+
/* Local identificator on a single node */
423+
unsigned long node_local_id;
424+
418425
struct {
419426
size_t size; /* Request size for user */
420427
ucp_request_init_callback_t init; /* Initialization user callback */

src/ucp/proto/proto_common.c

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -152,6 +152,37 @@ ucp_proto_common_get_sys_dev(const ucp_proto_init_params_t *params,
152152
return params->worker->context->tl_rscs[rsc_index].tl_rsc.sys_device;
153153
}
154154

155+
int ucp_proto_common_add_unique_sys_dev(ucs_sys_device_t sys_dev,
156+
ucs_sys_device_t *sys_devs,
157+
ucp_lane_index_t *num_sys_devs,
158+
ucp_lane_index_t max_sys_devs)
159+
{
160+
ucp_lane_index_t i;
161+
162+
for (i = 0; i < *num_sys_devs; ++i) {
163+
if (sys_dev == sys_devs[i]) {
164+
return 0; /* Already exists */
165+
}
166+
}
167+
168+
if (*num_sys_devs < max_sys_devs) {
169+
sys_devs[(*num_sys_devs)++] = sys_dev;
170+
return 1; /* Added */
171+
}
172+
173+
return 0; /* No space */
174+
}
175+
176+
ucp_lane_index_t
177+
ucp_proto_common_select_sys_dev_by_node_id(const ucp_proto_init_params_t *params,
178+
ucp_lane_index_t num_sys_devs)
179+
{
180+
if (num_sys_devs == 0) {
181+
return 0;
182+
}
183+
return params->worker->context->config.node_local_id % num_sys_devs;
184+
}
185+
155186
/* Pack/unpack local distance to make it equal to the remote one */
156187
static void
157188
ucp_proto_common_fp8_pack_unpack_distance(ucs_sys_dev_distance_t *distance)

src/ucp/proto/proto_common.h

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -255,6 +255,15 @@ ucs_sys_device_t
255255
ucp_proto_common_get_sys_dev(const ucp_proto_init_params_t *params,
256256
ucp_lane_index_t lane);
257257

258+
int ucp_proto_common_add_unique_sys_dev(ucs_sys_device_t sys_dev,
259+
ucs_sys_device_t *sys_devs,
260+
ucp_lane_index_t *num_sys_devs,
261+
ucp_lane_index_t max_sys_devs);
262+
263+
ucp_lane_index_t
264+
ucp_proto_common_select_sys_dev_by_node_id(const ucp_proto_init_params_t *params,
265+
ucp_lane_index_t num_sys_devs);
266+
258267

259268
void ucp_proto_common_get_lane_distance(const ucp_proto_init_params_t *params,
260269
ucp_lane_index_t lane,

src/ucp/proto/proto_common.inl

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -407,4 +407,26 @@ ucp_proto_common_get_dev_index(const ucp_proto_init_params_t *params,
407407
return params->worker->context->tl_rscs[rsc_index].dev_index;
408408
}
409409

410+
static UCS_F_ALWAYS_INLINE const uct_tl_resource_desc_t *
411+
ucp_proto_common_get_tl_rsc(const ucp_proto_init_params_t *params,
412+
ucp_lane_index_t lane)
413+
{
414+
ucp_rsc_index_t rsc_index = ucp_proto_common_get_rsc_index(params, lane);
415+
return &params->worker->context->tl_rscs[rsc_index].tl_rsc;
416+
}
417+
418+
static UCS_F_ALWAYS_INLINE int
419+
ucp_proto_common_is_net_dev(const ucp_proto_init_params_t *params,
420+
ucp_lane_index_t lane)
421+
{
422+
return ucp_proto_common_get_tl_rsc(params, lane)->dev_type ==
423+
UCT_DEVICE_TYPE_NET;
424+
}
425+
426+
static UCS_F_ALWAYS_INLINE int
427+
ucp_proto_common_bandwidth_equal(double bw1, double bw2)
428+
{
429+
return fabs(bw1 - bw2) <= UCP_PROTO_PERF_EPSILON;
430+
}
431+
410432
#endif

src/ucp/proto/proto_multi.c

Lines changed: 64 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -151,6 +151,62 @@ ucp_proto_multi_init_flush_sys_dev_mask(const ucp_rkey_config_key_t *key)
151151
return UCS_BIT(key->sys_dev & ~UCP_SYS_DEVICE_FLUSH_BIT);
152152
}
153153

154+
static ucp_lane_index_t ucp_proto_multi_filter_net_devices(
155+
ucp_lane_index_t num_lanes, const ucp_proto_init_params_t *params,
156+
const ucp_proto_common_tl_perf_t *tl_perfs, int fixed_first_lane,
157+
ucp_lane_index_t *lanes, ucp_proto_perf_node_t **perf_nodes)
158+
{
159+
ucp_lane_index_t num_max_bw_devs = 0;
160+
double max_bandwidth;
161+
ucp_lane_index_t i, lane, seed, num_filtered_lanes;
162+
ucp_lane_map_t lane_map;
163+
ucs_sys_device_t sys_dev;
164+
ucs_sys_device_t sys_devs[UCP_PROTO_MAX_LANES];
165+
const uct_tl_resource_desc_t *tl_rsc;
166+
167+
for (lane_map = 0, max_bandwidth = 0, i = 0; i < num_lanes; ++i) {
168+
lane = lanes[i];
169+
if (!ucp_proto_common_is_net_dev(params, lane)) {
170+
continue;
171+
}
172+
173+
lane_map |= UCS_BIT(lane);
174+
max_bandwidth = ucs_max(max_bandwidth, tl_perfs[lane].bandwidth);
175+
}
176+
177+
ucs_for_each_bit(lane, lane_map) {
178+
if (!ucp_proto_common_bandwidth_equal(tl_perfs[lane].bandwidth,
179+
max_bandwidth)) {
180+
continue;
181+
}
182+
183+
sys_dev = ucp_proto_common_get_sys_dev(params, lane);
184+
ucp_proto_common_add_unique_sys_dev(sys_dev, sys_devs, &num_max_bw_devs,
185+
UCP_PROTO_MAX_LANES);
186+
}
187+
188+
if (num_max_bw_devs == 0) {
189+
return num_lanes;
190+
}
191+
192+
seed = ucp_proto_common_select_sys_dev_by_node_id(params, num_max_bw_devs);
193+
194+
for (i = !!fixed_first_lane, num_filtered_lanes = i; i < num_lanes; ++i) {
195+
lane = lanes[i];
196+
tl_rsc = ucp_proto_common_get_tl_rsc(params, lane);
197+
if ((tl_rsc->dev_type == UCT_DEVICE_TYPE_NET) &&
198+
(tl_rsc->sys_device != sys_devs[seed])) {
199+
ucp_proto_perf_node_deref(&perf_nodes[lane]);
200+
ucs_trace("filtered out " UCP_PROTO_LANE_FMT,
201+
UCP_PROTO_LANE_ARG(params, lane, &tl_perfs[lane]));
202+
} else {
203+
lanes[num_filtered_lanes++] = lane;
204+
}
205+
}
206+
207+
return num_filtered_lanes;
208+
}
209+
154210
ucs_status_t ucp_proto_multi_init(const ucp_proto_multi_init_params_t *params,
155211
const char *perf_name,
156212
ucp_proto_perf_t **perf_p,
@@ -252,6 +308,14 @@ ucs_status_t ucp_proto_multi_init(const ucp_proto_multi_init_params_t *params,
252308
}
253309

254310
num_lanes = num_fast_lanes;
311+
if (context->config.ext.proto_use_single_net_device) {
312+
num_lanes = ucp_proto_multi_filter_net_devices(num_lanes,
313+
&params->super.super,
314+
lanes_perf,
315+
fixed_first_lane, lanes,
316+
lanes_perf_nodes);
317+
}
318+
255319
ucp_proto_multi_select_bw_lanes(&params->super.super, lanes, num_lanes,
256320
params->max_lanes, lanes_perf,
257321
fixed_first_lane, &selection);

src/ucp/proto/proto_single.c

Lines changed: 76 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,13 +10,87 @@
1010

1111
#include "proto_single.h"
1212
#include "proto_common.h"
13+
#include "proto_common.inl"
1314
#include "proto_init.h"
1415
#include "proto_debug.h"
1516

1617
#include <ucs/debug/assert.h>
1718
#include <ucs/debug/log.h>
1819
#include <ucs/sys/math.h>
1920

21+
static double
22+
ucp_proto_single_get_bandwidth(const ucp_proto_common_init_params_t *params,
23+
ucp_lane_index_t lane)
24+
{
25+
ucp_proto_common_tl_perf_t tl_perf;
26+
ucp_proto_perf_node_t *perf_node;
27+
ucs_status_t status;
28+
29+
status = ucp_proto_common_get_lane_perf(params, lane, &tl_perf, &perf_node);
30+
if (status != UCS_OK) {
31+
return 0;
32+
}
33+
34+
ucp_proto_perf_node_deref(&perf_node);
35+
return tl_perf.bandwidth;
36+
}
37+
38+
static void
39+
ucp_proto_single_update_lane(const ucp_proto_single_init_params_t *params,
40+
ucp_lane_index_t *lane_p)
41+
{
42+
const ucp_proto_common_init_params_t *common_params = &params->super;
43+
const ucp_proto_init_params_t *init_params = &common_params->super;
44+
const ucp_context_h context = init_params->worker->context;
45+
double bandwidth;
46+
ucp_lane_index_t lanes[UCP_PROTO_MAX_LANES];
47+
ucs_sys_device_t sys_devs[UCP_PROTO_MAX_LANES];
48+
ucp_lane_index_t num_lanes, num_same_bw_devs, i, lane;
49+
ucs_sys_device_t sys_dev;
50+
51+
if (!context->config.ext.proto_use_single_net_device ||
52+
/* skip lane update for node_local_id 0 since the original lane would be
53+
* selected anyway */
54+
(context->config.node_local_id == 0)) {
55+
return;
56+
}
57+
58+
if (!ucp_proto_common_is_net_dev(init_params, *lane_p)) {
59+
return;
60+
}
61+
62+
bandwidth = ucp_proto_single_get_bandwidth(common_params, *lane_p);
63+
lanes[0] = *lane_p;
64+
sys_devs[0] = ucp_proto_common_get_sys_dev(init_params, lanes[0]);
65+
66+
num_lanes = ucp_proto_common_find_lanes(
67+
init_params, common_params->flags, params->lane_type,
68+
params->tl_cap_flags, UCP_PROTO_MAX_LANES - 1,
69+
(common_params->exclude_map | UCS_BIT(lanes[0])),
70+
ucp_proto_common_filter_min_frag, lanes + 1);
71+
72+
for (num_same_bw_devs = 1, i = 1; i < num_lanes; ++i) {
73+
lane = lanes[i];
74+
if (!ucp_proto_common_is_net_dev(init_params, lane)) {
75+
continue;
76+
}
77+
78+
if (!ucp_proto_common_bandwidth_equal(
79+
ucp_proto_single_get_bandwidth(common_params, lane), bandwidth)) {
80+
continue;
81+
}
82+
83+
sys_dev = ucp_proto_common_get_sys_dev(init_params, lane);
84+
if (ucp_proto_common_add_unique_sys_dev(sys_dev, sys_devs,
85+
&num_same_bw_devs,
86+
UCP_PROTO_MAX_LANES)) {
87+
lanes[num_same_bw_devs - 1] = lane;
88+
}
89+
}
90+
91+
*lane_p = lanes[ucp_proto_common_select_sys_dev_by_node_id(init_params,
92+
num_same_bw_devs)];
93+
}
2094

2195
ucs_status_t ucp_proto_single_init(const ucp_proto_single_init_params_t *params,
2296
ucp_proto_perf_t **perf_p,
@@ -47,6 +121,8 @@ ucs_status_t ucp_proto_single_init(const ucp_proto_single_init_params_t *params,
47121

48122
ucs_assert(num_lanes == 1);
49123

124+
ucp_proto_single_update_lane(params, &lane);
125+
50126
reg_md_map = ucp_proto_common_reg_md_map(&params->super, UCS_BIT(lane));
51127
if (reg_md_map == 0) {
52128
spriv->reg_md = UCP_NULL_RESOURCE;

0 commit comments

Comments
 (0)