@@ -138,6 +138,61 @@ async def init_rdma_on_mesh(self, proc_mesh: ProcMesh) -> None:
        )


+ # Cached so that we don't have to call out to the root client every time,
+ # which may be on a different host.
+ @functools.cache
+ def _ensure_init_rdma_manager() -> Shared[None]:
+     async def task() -> None:
+         await (
+             await get_or_spawn_controller("rdma_controller", RdmaController)
+         ).init_rdma_on_mesh.call_one(none_throws(context().actor_instance.proc_mesh))
+
+     return PythonTask.from_coroutine(task()).spawn()
+
+
+ @functools.cache
+ def _check_cuda_expandable_segments_enabled() -> bool:
+     """
+     Check if PyTorch CUDA caching allocator is using expandable segments.
+
+     Uses the Rust extension which calls the C++ implementation from rdmaxcel-sys
+     that directly accesses the PyTorch C10 CUDA allocator configuration.
+
+     Returns:
+         bool: True if expandable segments are enabled, False otherwise
+
+     Raises:
+         RuntimeError: If expandable segments are not enabled but required for RDMA
+     """
+     try:
+         # Use the new Rust utility function that calls the C++ pt_cuda_allocator_compatibility()
+         pt_cuda_compat = _RdmaBuffer.pt_cuda_allocator_compatibility()
+
+         if not pt_cuda_compat:
+             raise RuntimeError(
+                 "CUDA caching allocator is not using expandable segments.\n"
+                 "This is required for RDMA to work correctly with CUDA tensors.\n\n"
+                 "To fix this, set the environment variable BEFORE importing PyTorch:\n"
+                 "1. In shell:\n"
+                 '   export PYTORCH_CUDA_ALLOC_CONF="expandable_segments:True"\n'
+                 "2. Or in Python script (BEFORE any PyTorch imports):\n"
+                 "   import os\n"
+                 '   os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"\n'
+                 "   import torch  # Must come after setting the env var\n\n"
+                 "Note: This setting must be configured before PyTorch's CUDA allocator is initialized."
+             )
+         return True
+
+     except Exception as e:
+         logging.error(f"Failed to check CUDA allocator configuration: {e}")
+         raise RuntimeError(
+             "Unable to verify CUDA allocator configuration.\n"
+             "Please ensure expandable segments are enabled:\n"
+             '   export PYTORCH_CUDA_ALLOC_CONF="expandable_segments:True"\n'
+             "Set this environment variable before importing PyTorch."
+         )
+
+
class RDMABuffer:
    def __init__(
        self,
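The new check boils down to the allocator configuration its error message describes. As a minimal sketch (not part of the diff), the process-level setup it expects looks like this; the only assumption is that the variable is set before the first `import torch`:

```python
# Sketch: enable expandable segments before PyTorch's CUDA allocator initializes.
import os

os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

import torch  # must come after setting the env var

# With this in place, _RdmaBuffer.pt_cuda_allocator_compatibility() is expected
# to report True, so _check_cuda_expandable_segments_enabled() returns instead
# of raising.
```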
@@ -159,15 +214,9 @@ def __init__(

        TODO: Create TensorBuffer, which will be main user API supporting non-contiguous tensors
        """
-         if isinstance(data, torch.Tensor) and data.device.type != "cpu":
-             # TODO - CUDA support for RDMABuffer exists at the Rust layer, but
-             # runs into issues with MR creation. For now, only support CPU tensors.
-             # Remove this once GPU support is added.
-             raise ValueError(
-                 "RDMABuffer currently only supports CPU tensors (got device {})".format(
-                     data.device
-                 )
-             )
+         if isinstance(data, torch.Tensor) and data.device.type == "cuda":
+             # Check if CUDA caching allocator is using expandable segments
+             _check_cuda_expandable_segments_enabled()

        assert (
            is_available()
@@ -221,16 +270,6 @@ def read_into(
        Currently only CPU tensors are fully supported. GPU tensors will be temporarily
        copied to CPU, which may impact performance.
        """
-         dst_gpu = None
-         if isinstance(dst, torch.Tensor) and dst.device.type != "cpu":
-             warnings.warn(
-                 "note: read_into only supports CPU tensors, so `dst` is being copied to CPU.",
-                 RDMAReadTransferWarning,
-                 stacklevel=2,
-             )
-             dst_gpu = dst
-             dst = dst.cpu()
-
        dst_addr, dst_size = _get_addr_and_size(dst)

        if self.size() > dst_size:
@@ -251,9 +290,6 @@ async def read_into_nonblocking() -> Optional[int]:
                client=client,
                timeout=timeout,
            )
-             # TODO - remove this once GPU support is added.
-             if dst_gpu is not None:
-                 dst_gpu.copy_(dst)
            return res

        return Future(coro=read_into_nonblocking())
@@ -285,16 +321,6 @@ def write_from(
        Currently only CPU tensors are fully supported. GPU tensors will be temporarily
        copied to CPU, which may impact performance.
        """
-         src_gpu = None
-         if isinstance(src, torch.Tensor) and src.device.type != "cpu":
-             # TODO - remove this once GPU support is added.
-             warnings.warn(
-                 "note: write_from only supports CPU tensors, so we will write to CPU first, then transfer to `src` in place.",
-                 RDMAWriteTransferWarning,
-                 stacklevel=2,
-             )
-             src_gpu = src  # Save the original GPU tensor reference
-             src = src.cpu()  # Convert to CPU for RDMA operation

        src_addr, src_size = _get_addr_and_size(src)

@@ -315,9 +341,6 @@ async def write_from_nonblocking() -> None:
                client=client,
                timeout=timeout,
            )
-             # TODO - remove this once GPU support is added.
-             if src_gpu is not None:
-                 src_gpu.copy_(src)
            return res

        return Future(coro=write_from_nonblocking())
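Net effect of the remaining hunks: `read_into` and `write_from` no longer stage GPU tensors through the CPU, and `RDMABuffer` accepts CUDA tensors once expandable segments are enabled. A rough usage sketch, assuming the buffer is created inside an actor context where RDMA is initialized, that the returned `Future` can be awaited, and that the `RDMABuffer` import path below is correct for this repo:

```python
# Sketch only: tensor names and the import path are illustrative assumptions.
import torch

from monarch.rdma import RDMABuffer  # assumed import path


async def gpu_transfer_example() -> None:
    src = torch.ones(1024, device="cuda")   # tensor backing the RDMA buffer
    dst = torch.zeros(1024, device="cuda")  # destination for a one-sided read

    buf = RDMABuffer(src)      # with expandable segments enabled, no ValueError
    await buf.read_into(dst)   # copies the buffer's bytes into dst, no CPU hop
    await buf.write_from(dst)  # copies dst's bytes back into the buffer
```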