Enable RDMA tests in OSS CI (meta-pytorch#1255)

dstaay-fb · facebook-github-bot · commit c6d6dda60188 · 2025-09-19T07:51:23.000-07:00
Summary: Pull Request resolved: meta-pytorch#1255. Ensure `cargo test -p monarch_rdma` runs on OSS CI. Reviewed By: dulinriley. Differential Revision: D82699014.
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -22,30 +22,46 @@ jobs:
     name: Build CPU
     uses: ./.github/workflows/build-cpu.yml
 
-  test-cuda:
-    name: Test CUDA
-    needs: build-cuda
-    uses: ./.github/workflows/test-cuda.yml
+  test-cpu-python:
+    name: Test CPU Python
+    needs: build-cpu
+    uses: ./.github/workflows/test-cpu-python.yml
     with:
-      artifact-name: monarch-cuda-${{ github.sha }}
+      artifact-name: monarch-cpu-${{ github.sha }}
 
-  test-cpu:
-    name: Test CPU
+  test-cpu-rust:
+    name: Test CPU Rust
     needs: build-cpu
-    uses: ./.github/workflows/test-cpu.yml
+    uses: ./.github/workflows/test-cpu-rust.yml
     with:
       artifact-name: monarch-cpu-${{ github.sha }}
 
+  test-gpu-python:
+    name: Test GPU Python
+    needs: build-cuda  # starts only after the build-cuda job has finished
+    uses: ./.github/workflows/test-gpu-python.yml  # reusable workflow (workflow_call)
+    with:
+      artifact-name: monarch-cuda-${{ github.sha }}  # per-commit artifact name passed to the called workflow
+
+  test-gpu-rust:
+    name: Test GPU Rust
+    needs: build-cuda  # starts only after the build-cuda job has finished
+    uses: ./.github/workflows/test-gpu-rust.yml  # reusable workflow; runs `cargo test -p monarch_rdma` on a GPU runner
+    with:
+      artifact-name: monarch-cuda-${{ github.sha }}  # per-commit artifact name passed to the called workflow
+
   status-check:
     name: Status Check
     runs-on: ubuntu-latest
-    needs: [test-cuda, test-cpu]
+    needs: [test-cpu-python, test-cpu-rust, test-gpu-python, test-gpu-rust]
     if: always()
     steps:
       - name: Check all jobs status
         run: |
-          if [[ "${{ needs.test-cuda.result }}" != "success" ]] ||
-             [[ "${{ needs.test-cpu.result }}" != "success" ]]; then
+          if [[ "${{ needs.test-cpu-python.result }}" != "success" ]] ||
+             [[ "${{ needs.test-cpu-rust.result }}" != "success" ]] ||
+             [[ "${{ needs.test-gpu-python.result }}" != "success" ]] ||
+             [[ "${{ needs.test-gpu-rust.result }}" != "success" ]]; then
             echo "One or more jobs failed"
             exit 1
           else
diff --git a/.github/workflows/test-cpu-python.yml b/.github/workflows/test-cpu-python.yml
@@ -1,4 +1,4 @@
-name: Test CPU
+name: Test CPU Python
 
 on:
   workflow_call:
@@ -9,12 +9,12 @@ on:
         type: string
 
 concurrency:
-  group: test-cpu-${{ github.workflow }}-${{ github.ref == 'refs/heads/main' && github.run_number || github.ref }}
+  group: test-cpu-python-${{ github.workflow }}-${{ github.ref == 'refs/heads/main' && github.run_number || github.ref }}
   cancel-in-progress: true
 
 jobs:
-  test-cpu-no-tensor-engine:
-    name: Test CPU - No Tensor Engine
+  test-cpu-python:
+    name: Test CPU Python - No Tensor Engine
     uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
     with:
       timeout: 60
@@ -38,3 +38,7 @@ jobs:
         # Tests requiring tensor engine / GPU need to be identified and flagged to skip.
         # We will just ensure monarch can be imported successfully.
         python -c "import monarch; print('Monarch imported successfully')"
+
+        # Placeholder for CPU Python tests (excluding GPU/tensor engine tests); nothing is run yet
+        # TODO: Add appropriate test filters for CPU-only Python tests
+        echo "CPU Python tests would run here - currently placeholder"
diff --git a/.github/workflows/test-cpu-rust.yml b/.github/workflows/test-cpu-rust.yml
@@ -0,0 +1,43 @@
+# Reusable workflow: runs `cargo test -p monarch_rdma` on a CPU runner with the tensor engine disabled.
+name: Test CPU Rust
+
+on:
+  workflow_call:
+    inputs:
+      artifact-name:
+        description: 'Wheel artifact name from build workflow'
+        required: true
+        type: string
+
+concurrency:
+  group: test-cpu-rust-${{ github.workflow }}-${{ github.ref == 'refs/heads/main' && github.run_number || github.ref }}
+  cancel-in-progress: true
+
+jobs:
+  test-cpu-rust:
+    name: Test CPU Rust - No Tensor Engine
+    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
+    with:
+      timeout: 60
+      runner: linux.4xlarge
+      submodules: recursive
+      download-artifact: ${{ inputs.artifact-name }}
+      script: |
+        # Load the shared setup helpers used below
+        source scripts/common-setup.sh
+
+        # Prepare the conda environment for the test run
+        setup_conda_environment
+
+        # Install the pinned Rust toolchain via rustup, then load cargo's environment
+        curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y --default-toolchain 1.88.0
+        # Quoted so a HOME containing spaces or glob characters is not word-split
+        source "$HOME/.cargo/env"
+        rustc --version
+
+        # Disable tensor engine
+        export USE_TENSOR_ENGINE=0
+
+        # Run the monarch_rdma crate's Rust tests on the CPU runner
+        echo "Running OSS Rust tests..."
+        cargo test -p monarch_rdma
diff --git a/.github/workflows/test-gpu-python.yml b/.github/workflows/test-gpu-python.yml
@@ -1,4 +1,4 @@
-name: Test CUDA
+name: Test GPU Python
 
 on:
   workflow_call:
@@ -9,12 +9,12 @@ on:
         type: string
 
 concurrency:
-  group: test-cuda-${{ github.workflow }}-${{ github.ref == 'refs/heads/main' && github.run_number || github.ref }}
+  group: test-gpu-python-${{ github.workflow }}-${{ github.ref == 'refs/heads/main' && github.run_number || github.ref }}
   cancel-in-progress: true
 
 jobs:
-  test-cuda:
-    name: Test CUDA (cuda12.6-py3.10)
+  test-gpu-python:
+    name: Test GPU Python (cuda12.6-py3.10)
     uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
     strategy:
       fail-fast: true
@@ -51,7 +51,7 @@ jobs:
         # pyre currently does not check these assertions
         pyright python/tests/test_python_actors.py
 
-        # Run CUDA tests
+        # Run GPU Python tests
         LC_ALL=C pytest python/tests/ -s -v -m "not oss_skip"
         # TODO(meriksen): temporarily disabled to unblock lands while debugging
         # mock CUDA issues on the OSS setup
diff --git a/.github/workflows/test-gpu-rust.yml b/.github/workflows/test-gpu-rust.yml
@@ -0,0 +1,52 @@
+name: Test GPU Rust
+
+on:
+  workflow_call:
+    inputs:
+      artifact-name:
+        description: 'Wheel artifact name from build workflow'
+        required: true
+        type: string
+
+concurrency:
+  group: test-gpu-rust-${{ github.workflow }}-${{ github.ref == 'refs/heads/main' && github.run_number || github.ref }}
+  cancel-in-progress: true
+
+jobs:
+  test-gpu-rust:
+    name: Test GPU Rust (cuda12.6)
+    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
+    strategy:
+      fail-fast: true
+      matrix:
+        include:
+          - name: 4xlargegpu  # NOTE(review): `name` and `torch-spec` are not referenced by `with:` or the script below
+            runs-on: linux.g5.4xlarge.nvidia.gpu
+            torch-spec: '--pre torch --index-url https://download.pytorch.org/whl/nightly/cu126'
+            gpu-arch-type: "cuda"
+            gpu-arch-version: "12.6"
+    with:
+      timeout: 120
+      runner: ${{ matrix.runs-on }}
+      gpu-arch-type: ${{ matrix.gpu-arch-type }}
+      gpu-arch-version: ${{ matrix.gpu-arch-version }}
+      submodules: recursive
+      download-artifact: ${{ inputs.artifact-name }}
+      script: |
+        # Load the shared setup helpers used below
+        source scripts/common-setup.sh
+
+        # Prepare the test environment (helper from the sourced setup script, presumably - TODO confirm)
+        setup_test_environment
+
+        # Set up the Tensor Engine dependencies
+        setup_tensor_engine
+
+        export CUDA_LIB_DIR=/usr/lib64  # presumably read by the CUDA-dependent crate builds - TODO confirm
+
+        # Install the pinned Rust/Cargo 1.88.0 packages (a version that supports edition 2024) from rolling_rolling_AppStream
+        dnf install -y rust-1.88.0-1.el9 cargo-1.88.0-1.el9
+
+        # Run the monarch_rdma crate's Rust tests on the GPU runner
+        echo "Running OSS Rust tests..."
+        cargo test -p monarch_rdma
diff --git a/hyperactor_telemetry/Cargo.toml b/hyperactor_telemetry/Cargo.toml
@@ -14,14 +14,14 @@ edition = "2024"
 anyhow = "1.0.98"
 chrono = { version = "0.4.41", features = ["clock", "serde", "std"], default-features = false }
 dashmap = { version = "5.5.3", features = ["rayon", "serde"] }
-fbinit = { version = "0.2.0", git = "https://github.com/facebookexperimental/rust-shed.git", branch = "main", optional = true }
+fbinit = { version = "0.2.0", git = "https://github.com/facebookexperimental/rust-shed.git", branch = "main" }
 hdrhistogram = "7.5"
 lazy_static = "1.5"
 opentelemetry = "0.29"
 opentelemetry_sdk = { version = "0.29.0", features = ["rt-tokio"] }
 rand = { version = "0.8", features = ["small_rng"] }
 rusqlite = { version = "0.36.0", features = ["backup", "blob", "bundled", "column_decltype", "functions", "limits", "modern_sqlite", "serde_json"] }
-scuba = { version = "0.1.0", git = "https://github.com/facebookexperimental/rust-shed.git", branch = "main", optional = true }
+scuba = { version = "0.1.0", git = "https://github.com/facebookexperimental/rust-shed.git", branch = "main" }
 serde = { version = "1.0.219", features = ["derive", "rc"] }
 serde_json = { version = "1.0.140", features = ["alloc", "float_roundtrip", "raw_value", "unbounded_depth"] }
 serde_rusqlite = "0.39.3"
@@ -35,15 +35,9 @@ tracing-subscriber = { version = "0.3.20", features = ["chrono", "env-filter", "
 whoami = "1.5"
 
 [dev-dependencies]
-fbinit = { version = "0.2.0", git = "https://github.com/facebookexperimental/rust-shed.git", branch = "main" }
 quickcheck = "1.0"
 quickcheck_macros = "1.0"
-scuba = { version = "0.1.0", git = "https://github.com/facebookexperimental/rust-shed.git", branch = "main" }
 tracing-test = { version = "0.2.3", features = ["no-env-filter"] }
 
-[features]
-default = []
-fbcode_build = ["fbinit", "scuba"]
-
 [lints]
 rust = { unexpected_cfgs = { check-cfg = ["cfg(fbcode_build)"], level = "warn" } }