Skip to content

Commit c6d6dda

Browse files
dstaay-fb and facebook-github-bot
authored and committed
Enable RDMA tests in OSS CI (meta-pytorch#1255)
Summary: Pull Request resolved: meta-pytorch#1255. Ensure `cargo test -p monarch_rdma` runs on OSS CI. Reviewed By: dulinriley. Differential Revision: D82699014.
1 parent 747c163 commit c6d6dda

File tree

6 files changed

+137
-28
lines changed

6 files changed

+137
-28
lines changed

.github/workflows/ci.yml

Lines changed: 27 additions & 11 deletions
Original file line number | Diff line number | Diff line change
@@ -22,30 +22,46 @@ jobs:
2222
name: Build CPU
2323
uses: ./.github/workflows/build-cpu.yml
2424

25-
test-cuda:
26-
name: Test CUDA
27-
needs: build-cuda
28-
uses: ./.github/workflows/test-cuda.yml
25+
test-cpu-python:
26+
name: Test CPU Python
27+
needs: build-cpu
28+
uses: ./.github/workflows/test-cpu-python.yml
2929
with:
30-
artifact-name: monarch-cuda-${{ github.sha }}
30+
artifact-name: monarch-cpu-${{ github.sha }}
3131

32-
test-cpu:
33-
name: Test CPU
32+
test-cpu-rust:
33+
name: Test CPU Rust
3434
needs: build-cpu
35-
uses: ./.github/workflows/test-cpu.yml
35+
uses: ./.github/workflows/test-cpu-rust.yml
3636
with:
3737
artifact-name: monarch-cpu-${{ github.sha }}
3838

39+
test-gpu-python:
40+
name: Test GPU Python
41+
needs: build-cuda
42+
uses: ./.github/workflows/test-gpu-python.yml
43+
with:
44+
artifact-name: monarch-cuda-${{ github.sha }}
45+
46+
test-gpu-rust:
47+
name: Test GPU Rust
48+
needs: build-cuda
49+
uses: ./.github/workflows/test-gpu-rust.yml
50+
with:
51+
artifact-name: monarch-cuda-${{ github.sha }}
52+
3953
status-check:
4054
name: Status Check
4155
runs-on: ubuntu-latest
42-
needs: [test-cuda, test-cpu]
56+
needs: [test-cpu-python, test-cpu-rust, test-gpu-python, test-gpu-rust]
4357
if: always()
4458
steps:
4559
- name: Check all jobs status
4660
run: |
47-
if [[ "${{ needs.test-cuda.result }}" != "success" ]] ||
48-
[[ "${{ needs.test-cpu.result }}" != "success" ]]; then
61+
if [[ "${{ needs.test-cpu-python.result }}" != "success" ]] ||
62+
[[ "${{ needs.test-cpu-rust.result }}" != "success" ]] ||
63+
[[ "${{ needs.test-gpu-python.result }}" != "success" ]] ||
64+
[[ "${{ needs.test-gpu-rust.result }}" != "success" ]]; then
4965
echo "One or more jobs failed"
5066
exit 1
5167
else

.github/workflows/test-cpu.yml renamed to .github/workflows/test-cpu-python.yml

Lines changed: 8 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -1,4 +1,4 @@
1-
name: Test CPU
1+
name: Test CPU Python
22

33
on:
44
workflow_call:
@@ -9,12 +9,12 @@ on:
99
type: string
1010

1111
concurrency:
12-
group: test-cpu-${{ github.workflow }}-${{ github.ref == 'refs/heads/main' && github.run_number || github.ref }}
12+
group: test-cpu-python-${{ github.workflow }}-${{ github.ref == 'refs/heads/main' && github.run_number || github.ref }}
1313
cancel-in-progress: true
1414

1515
jobs:
16-
test-cpu-no-tensor-engine:
17-
name: Test CPU - No Tensor Engine
16+
test-cpu-python:
17+
name: Test CPU Python - No Tensor Engine
1818
uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
1919
with:
2020
timeout: 60
@@ -38,3 +38,7 @@ jobs:
3838
# Tests requiring tensor engine / GPU need to be identified and flagged to skip.
3939
# We will just ensure monarch can be imported successfully.
4040
python -c "import monarch; print('Monarch imported successfully')"
41+
42+
# Run CPU Python tests (excluding GPU/tensor engine tests)
43+
# TODO: Add appropriate test filters for CPU-only Python tests
44+
echo "CPU Python tests would run here - currently placeholder"
.github/workflows/test-cpu-rust.yml (new file)

Lines changed: 43 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,43 @@
1+
name: Test CPU Rust
2+
3+
on:
4+
workflow_call:
5+
inputs:
6+
artifact-name:
7+
description: 'Wheel artifact name from build workflow'
8+
required: true
9+
type: string
10+
11+
concurrency:
12+
group: test-cpu-rust-${{ github.workflow }}-${{ github.ref == 'refs/heads/main' && github.run_number || github.ref }}
13+
cancel-in-progress: true
14+
15+
jobs:
16+
test-cpu-rust:
17+
name: Test CPU Rust - No Tensor Engine
18+
uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
19+
with:
20+
timeout: 60
21+
runner: linux.4xlarge
22+
submodules: recursive
23+
download-artifact: ${{ inputs.artifact-name }}
24+
script: |
25+
# Source common setup functions
26+
source scripts/common-setup.sh
27+
28+
# Setup test environment
29+
setup_conda_environment
30+
31+
# Install Rust
32+
curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y --default-toolchain 1.88.0
33+
source $HOME/.cargo/env
34+
rustc --version
35+
36+
# Disable tensor engine
37+
export USE_TENSOR_ENGINE=0
38+
39+
40+
41+
# Run CPU Rust tests - now that hyperactor_telemetry Cargo.toml is fixed for OSS
42+
echo "Running OSS Rust tests..."
43+
cargo test -p monarch_rdma

.github/workflows/test-cuda.yml renamed to .github/workflows/test-gpu-python.yml

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
name: Test CUDA
1+
name: Test GPU Python
22

33
on:
44
workflow_call:
@@ -9,12 +9,12 @@ on:
99
type: string
1010

1111
concurrency:
12-
group: test-cuda-${{ github.workflow }}-${{ github.ref == 'refs/heads/main' && github.run_number || github.ref }}
12+
group: test-gpu-python-${{ github.workflow }}-${{ github.ref == 'refs/heads/main' && github.run_number || github.ref }}
1313
cancel-in-progress: true
1414

1515
jobs:
16-
test-cuda:
17-
name: Test CUDA (cuda12.6-py3.10)
16+
test-gpu-python:
17+
name: Test GPU Python (cuda12.6-py3.10)
1818
uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
1919
strategy:
2020
fail-fast: true
@@ -51,7 +51,7 @@ jobs:
5151
# pyre currently does not check these assertions
5252
pyright python/tests/test_python_actors.py
5353
54-
# Run CUDA tests
54+
# Run GPU Python tests
5555
LC_ALL=C pytest python/tests/ -s -v -m "not oss_skip"
5656
# TODO(meriksen): temporarily disabled to unblock lands while debugging
5757
# mock CUDA issues on the OSS setup
.github/workflows/test-gpu-rust.yml (new file)

Lines changed: 52 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,52 @@
1+
name: Test GPU Rust
2+
3+
on:
4+
workflow_call:
5+
inputs:
6+
artifact-name:
7+
description: 'Wheel artifact name from build workflow'
8+
required: true
9+
type: string
10+
11+
concurrency:
12+
group: test-gpu-rust-${{ github.workflow }}-${{ github.ref == 'refs/heads/main' && github.run_number || github.ref }}
13+
cancel-in-progress: true
14+
15+
jobs:
16+
test-gpu-rust:
17+
name: Test GPU Rust (cuda12.6)
18+
uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
19+
strategy:
20+
fail-fast: true
21+
matrix:
22+
include:
23+
- name: 4xlargegpu
24+
runs-on: linux.g5.4xlarge.nvidia.gpu
25+
torch-spec: '--pre torch --index-url https://download.pytorch.org/whl/nightly/cu126'
26+
gpu-arch-type: "cuda"
27+
gpu-arch-version: "12.6"
28+
with:
29+
timeout: 120
30+
runner: ${{ matrix.runs-on }}
31+
gpu-arch-type: ${{ matrix.gpu-arch-type }}
32+
gpu-arch-version: ${{ matrix.gpu-arch-version }}
33+
submodules: recursive
34+
download-artifact: ${{ inputs.artifact-name }}
35+
script: |
36+
# Source common setup functions
37+
source scripts/common-setup.sh
38+
39+
# Setup test environment
40+
setup_test_environment
41+
42+
# Setup Tensor Engine dependencies
43+
setup_tensor_engine
44+
45+
export CUDA_LIB_DIR=/usr/lib64
46+
47+
# Install specific rust 1.88.0 version that supports edition 2024 from rolling_rolling_AppStream
48+
dnf install -y rust-1.88.0-1.el9 cargo-1.88.0-1.el9
49+
50+
# Run GPU Rust tests - now that hyperactor_telemetry Cargo.toml is fixed for OSS
51+
echo "Running OSS Rust tests..."
52+
cargo test -p monarch_rdma

hyperactor_telemetry/Cargo.toml

Lines changed: 2 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -14,14 +14,14 @@ edition = "2024"
1414
anyhow = "1.0.98"
1515
chrono = { version = "0.4.41", features = ["clock", "serde", "std"], default-features = false }
1616
dashmap = { version = "5.5.3", features = ["rayon", "serde"] }
17-
fbinit = { version = "0.2.0", git = "https://github.com/facebookexperimental/rust-shed.git", branch = "main", optional = true }
17+
fbinit = { version = "0.2.0", git = "https://github.com/facebookexperimental/rust-shed.git", branch = "main" }
1818
hdrhistogram = "7.5"
1919
lazy_static = "1.5"
2020
opentelemetry = "0.29"
2121
opentelemetry_sdk = { version = "0.29.0", features = ["rt-tokio"] }
2222
rand = { version = "0.8", features = ["small_rng"] }
2323
rusqlite = { version = "0.36.0", features = ["backup", "blob", "bundled", "column_decltype", "functions", "limits", "modern_sqlite", "serde_json"] }
24-
scuba = { version = "0.1.0", git = "https://github.com/facebookexperimental/rust-shed.git", branch = "main", optional = true }
24+
scuba = { version = "0.1.0", git = "https://github.com/facebookexperimental/rust-shed.git", branch = "main" }
2525
serde = { version = "1.0.219", features = ["derive", "rc"] }
2626
serde_json = { version = "1.0.140", features = ["alloc", "float_roundtrip", "raw_value", "unbounded_depth"] }
2727
serde_rusqlite = "0.39.3"
@@ -35,15 +35,9 @@ tracing-subscriber = { version = "0.3.20", features = ["chrono", "env-filter", "
3535
whoami = "1.5"
3636

3737
[dev-dependencies]
38-
fbinit = { version = "0.2.0", git = "https://github.com/facebookexperimental/rust-shed.git", branch = "main" }
3938
quickcheck = "1.0"
4039
quickcheck_macros = "1.0"
41-
scuba = { version = "0.1.0", git = "https://github.com/facebookexperimental/rust-shed.git", branch = "main" }
4240
tracing-test = { version = "0.2.3", features = ["no-env-filter"] }
4341

44-
[features]
45-
default = []
46-
fbcode_build = ["fbinit", "scuba"]
47-
4842
[lints]
4943
rust = { unexpected_cfgs = { check-cfg = ["cfg(fbcode_build)"], level = "warn" } }

0 commit comments

Comments
 (0)