Commit 9b03dda

dstaay-fb authored and facebook-github-bot committed
Enable CUDA path in Python FE, update grpo example (#1224)
Summary: Enable device='cuda' for RDMABuffer

Reviewed By: zdevito

Differential Revision: D82331433
1 parent 350deb6 commit 9b03dda
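
This change allows an RDMABuffer to be backed directly by a CUDA tensor, provided PyTorch's CUDA caching allocator was initialized with expandable segments. A minimal sketch of the flow this enables (the import path is inferred from the files touched below; running it needs a CUDA device plus the monarch RDMA stack, so treat it as illustrative):

    import os

    # Must be set before importing torch so the CUDA caching allocator
    # comes up with expandable segments (required for CUDA RDMA).
    os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

    import torch
    from monarch._src.tensor_engine.rdma import RDMABuffer

    weights = torch.randn(1024, device="cuda")
    # Registers the RDMA memory region over the tensor's own storage;
    # before this commit, passing a non-CPU tensor raised ValueError.
    buf = RDMABuffer(weights.view(torch.uint8).flatten())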

7 files changed (+90, -51 lines)

docs/source/examples/grpo_actor.py — 3 additions, 10 deletions

@@ -292,11 +292,10 @@ async def weights_handle(self) -> Dict[str, Tuple[torch.Tensor, RDMABuffer]]:
         Returns:
             Dictionary mapping parameter names to RDMA buffers
         """
-        cpu_tensors = {
-            k: v.cpu().view(torch.uint8).flatten()
+        self._weights_handle = {
+            k: (v, RDMABuffer(v.view(torch.uint8).flatten()))
             for k, v in self.model.state_dict().items()
         }
-        self._weights_handle = {k: (v, RDMABuffer(v)) for k, v in cpu_tensors.items()}
         return self._weights_handle

     def _compute_advantages(self, rewards: torch.Tensor) -> torch.Tensor:
@@ -372,11 +371,6 @@ def _apply_policy_update(
         self.optim.step()
         self.policy_version += 1

-        # update buffers
-        sd = self.model.state_dict()
-        for n, (t, _) in self._weights_handle.items():
-            t.copy_(sd[n].view(torch.uint8).flatten())
-
         # Return loss value
         return loss.detach()

@@ -486,9 +480,8 @@ async def update(self, version: int) -> None:
         async with self.cond:
            # Copy weights from RDMA buffers
            sd = self.model.state_dict()
-           cpu_sd = {k: torch.zeros_like(v, device="cpu") for k, v in sd.items()}
            for n, (_, b) in self.weight_buffers.items():
-               await b.read_into(cpu_sd[n].view(torch.uint8).flatten())
+               await b.read_into(sd[n].view(torch.uint8).flatten())
            self.model.load_state_dict(sd)
            # Update version and state
            self.policy_version = version
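
The copy-back loop removed from _apply_policy_update is redundant in the new scheme: each RDMABuffer is now registered over the parameter's own storage, so in-place optimizer updates are visible to remote readers with no extra copy. A hypothetical illustration of that invariant, with a plain tensor standing in for a model parameter:

    import torch

    param = torch.zeros(4, device="cuda")       # stands in for a model parameter
    view = param.view(torch.uint8).flatten()    # the bytes an RDMABuffer would register
    param.add_(1.0)                             # in-place update, as optim.step() performs
    assert view.data_ptr() == param.data_ptr()  # same storage, so no copy-back is needed
    # (device="cpu" demonstrates the same aliasing if no GPU is present)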

monarch_rdma/extension/lib.rs — 5 additions, 0 deletions

@@ -116,6 +116,11 @@ impl PyRdmaBuffer {
         ibverbs_supported()
     }

+    #[classmethod]
+    fn pt_cuda_allocator_compatibility<'py>(_cls: &Bound<'_, PyType>, _py: Python<'py>) -> bool {
+        monarch_rdma::pt_cuda_allocator_compatibility()
+    }
+
     #[pyo3(name = "__repr__")]
     fn repr(&self) -> String {
         format!("<RdmaBuffer'{:?}'>", self.buffer)

monarch_rdma/src/rdma_components.rs — 15 additions, 0 deletions

@@ -1245,6 +1245,21 @@ pub fn get_registered_cuda_segments() -> Vec<rdmaxcel_sys::rdma_segment_info_t>
     }
 }

+/// Check if PyTorch CUDA caching allocator has expandable segments enabled.
+///
+/// This function calls the C++ implementation that directly accesses the
+/// PyTorch C10 CUDA allocator configuration to check if expandable segments
+/// are enabled, which is required for RDMA operations with CUDA tensors.
+///
+/// # Returns
+///
+/// `true` if both the CUDA caching allocator is enabled AND expandable segments are enabled,
+/// `false` otherwise.
+pub fn pt_cuda_allocator_compatibility() -> bool {
+    // SAFETY: We are calling a C++ function from rdmaxcel that accesses PyTorch C10 APIs.
+    unsafe { rdmaxcel_sys::pt_cuda_allocator_compatibility() }
+}
+
 #[cfg(test)]
 mod tests {
     use super::*;

monarch_rdma/src/rdma_manager_actor.rs — 1 addition, 1 deletion

@@ -275,7 +275,7 @@ impl Actor for RdmaManagerActor {
     async fn new(_params: Self::Params) -> Result<Self, anyhow::Error> {
         let mut config = _params;

-        let pt_cuda_alloc = unsafe { rdmaxcel_sys::pt_cuda_allocator_compatibility() };
+        let pt_cuda_alloc = crate::rdma_components::pt_cuda_allocator_compatibility();

         // check config and hardware support align
         if config.use_gpu_direct {

python/monarch/_rust_bindings/rdma.pyi — 2 additions, 0 deletions

@@ -59,3 +59,5 @@ class _RdmaBuffer:
     def new_from_json(json: str) -> _RdmaBuffer: ...
     @classmethod
     def rdma_supported(cls) -> bool: ...
+    @classmethod
+    def pt_cuda_allocator_compatibility(cls) -> bool: ...
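
The new stub makes the allocator check callable from Python. A small sketch of how a caller might gate device selection on it (assuming the extension module is importable at the stub's path):

    from monarch._rust_bindings.rdma import _RdmaBuffer

    if _RdmaBuffer.pt_cuda_allocator_compatibility():
        device = "cuda"  # expandable segments enabled; CUDA RDMA is usable
    else:
        device = "cpu"   # fall back rather than fail memory registration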

python/monarch/_src/tensor_engine/rdma.py — 58 additions, 35 deletions

@@ -138,6 +138,61 @@ async def init_rdma_on_mesh(self, proc_mesh: ProcMesh) -> None:
         )


+# Cached so that we don't have to call out to the root client every time,
+# which may be on a different host.
+@functools.cache
+def _ensure_init_rdma_manager() -> Shared[None]:
+    async def task() -> None:
+        await (
+            await get_or_spawn_controller("rdma_controller", RdmaController)
+        ).init_rdma_on_mesh.call_one(none_throws(context().actor_instance.proc_mesh))
+
+    return PythonTask.from_coroutine(task()).spawn()
+
+
+@functools.cache
+def _check_cuda_expandable_segments_enabled() -> bool:
+    """
+    Check if the PyTorch CUDA caching allocator is using expandable segments.
+
+    Uses the Rust extension, which calls the C++ implementation from rdmaxcel-sys
+    that directly accesses the PyTorch C10 CUDA allocator configuration.
+
+    Returns:
+        bool: True if expandable segments are enabled, False otherwise
+
+    Raises:
+        RuntimeError: If expandable segments are not enabled but required for RDMA
+    """
+    try:
+        # Use the new Rust utility function that calls the C++ pt_cuda_allocator_compatibility()
+        pt_cuda_compat = _RdmaBuffer.pt_cuda_allocator_compatibility()
+
+        if not pt_cuda_compat:
+            raise RuntimeError(
+                "CUDA caching allocator is not using expandable segments.\n"
+                "This is required for RDMA to work correctly with CUDA tensors.\n\n"
+                "To fix this, set the environment variable BEFORE importing PyTorch:\n"
+                "1. In shell:\n"
+                '   export PYTORCH_CUDA_ALLOC_CONF="expandable_segments:True"\n'
+                "2. Or in Python script (BEFORE any PyTorch imports):\n"
+                "   import os\n"
+                '   os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"\n'
+                "   import torch  # Must come after setting the env var\n\n"
+                "Note: This setting must be configured before PyTorch's CUDA allocator is initialized."
+            )
+        return True
+
+    except Exception as e:
+        logging.error(f"Failed to check CUDA allocator configuration: {e}")
+        raise RuntimeError(
+            "Unable to verify CUDA allocator configuration.\n"
+            "Please ensure expandable segments are enabled:\n"
+            '    export PYTORCH_CUDA_ALLOC_CONF="expandable_segments:True"\n'
+            "Set this environment variable before importing PyTorch."
+        )
+
+
 class RDMABuffer:
     def __init__(
         self,
@@ -159,15 +214,9 @@ def __init__(

         TODO: Create TensorBuffer, which will be main user API supporting non-contiguous tensors
         """
-        if isinstance(data, torch.Tensor) and data.device.type != "cpu":
-            # TODO - CUDA support for RDMABuffer exists at the Rust layer, but
-            # runs into issues with MR creation. For now, only support CPU tensors.
-            # Remove this once GPU support is added.
-            raise ValueError(
-                "RDMABuffer currently only supports CPU tensors (got device {})".format(
-                    data.device
-                )
-            )
+        if isinstance(data, torch.Tensor) and data.device.type == "cuda":
+            # Check if CUDA caching allocator is using expandable segments
+            _check_cuda_expandable_segments_enabled()

         assert (
             is_available()
@@ -221,16 +270,6 @@ def read_into(
         Currently only CPU tensors are fully supported. GPU tensors will be temporarily
         copied to CPU, which may impact performance.
         """
-        dst_gpu = None
-        if isinstance(dst, torch.Tensor) and dst.device.type != "cpu":
-            warnings.warn(
-                "note: read_into only supports CPU tensors, so `dst` is being copied to CPU.",
-                RDMAReadTransferWarning,
-                stacklevel=2,
-            )
-            dst_gpu = dst
-            dst = dst.cpu()
-
         dst_addr, dst_size = _get_addr_and_size(dst)

         if self.size() > dst_size:
@@ -251,9 +290,6 @@ async def read_into_nonblocking() -> Optional[int]:
                 client=client,
                 timeout=timeout,
             )
-            # TODO - remove this once GPU support is added.
-            if dst_gpu is not None:
-                dst_gpu.copy_(dst)
             return res

         return Future(coro=read_into_nonblocking())
@@ -285,16 +321,6 @@ def write_from(
         Currently only CPU tensors are fully supported. GPU tensors will be temporarily
         copied to CPU, which may impact performance.
         """
-        src_gpu = None
-        if isinstance(src, torch.Tensor) and src.device.type != "cpu":
-            # TODO - remove this once GPU support is added.
-            warnings.warn(
-                "note: write_from only supports CPU tensors, so we will write to CPU first, then transfer to `src` in place.",
-                RDMAWriteTransferWarning,
-                stacklevel=2,
-            )
-            src_gpu = src  # Save the original GPU tensor reference
-            src = src.cpu()  # Convert to CPU for RDMA operation

         src_addr, src_size = _get_addr_and_size(src)

@@ -315,9 +341,6 @@ async def write_from_nonblocking() -> None:
                 client=client,
                 timeout=timeout,
             )
-            # TODO - remove this once GPU support is added.
-            if src_gpu is not None:
-                src_gpu.copy_(src)
             return res

         return Future(coro=write_from_nonblocking())
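
With the CPU staging paths deleted, read_into and write_from now take CUDA tensors directly. A sketch of the resulting call pattern inside an async context (buf stands for an existing RDMABuffer, and size() returning a byte count follows the comparison against dst_size above; names are illustrative):

    import torch

    async def pull(buf) -> torch.Tensor:
        # The destination stays on the GPU; previously this path emitted
        # RDMAReadTransferWarning and bounced the data through a CPU copy.
        dst = torch.empty(buf.size(), dtype=torch.uint8, device="cuda")
        await buf.read_into(dst)
        return dst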

python/tests/test_rdma.py — 6 additions, 5 deletions

@@ -5,6 +5,10 @@
 # LICENSE file in the root directory of this source tree.

 # pyre-unsafe
+import os
+
+# required to enable RDMA support
+os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

 import pytest
 import torch
@@ -105,23 +109,20 @@ async def test_proc_mesh_rdma():
     x = await client_gpu.get_buffer.call_one()
     buffer_gpu = x.view(torch.float32).view(10, 10)
     assert torch.sum(buffer_gpu) == 0
-    # copying a tensor across hosts moves it to CPU
-    assert buffer_gpu.device.type == "cpu"

     # Modify server state again
     await server.update.call_one()
     await client_gpu.download.call_one()
     x = await client_gpu.get_buffer.call_one()
     buffer_gpu = x.view(torch.float32).view(10, 10)
     remote_grad = await server.get_grad_buffer.call_one()
-    assert torch.allclose(buffer_gpu.cpu(), remote_grad)
+    assert torch.allclose(buffer_gpu.cpu(), remote_grad.cpu())


 class TrainerActor(Actor):
     def __init__(self):
         super().__init__()
-        # TODO - switch to CUDA once GPU support is added
-        self.trainer = torch.nn.Linear(10, 10).to("cpu")
+        self.trainer = torch.nn.Linear(10, 10).to("cuda")
         self.trainer.weight.data.zero_()

     @endpoint
