Skip to content

Commit 08d7e7c

Browse files
dstaay-fb authored and facebook-github-bot committed
Enable RDMA tests in OSS CI (#1255)
Summary: Pull Request resolved: #1255 - Ensure `cargo test -p monarch_rdma` runs on OSS CI. - Break out the new testing infra into its own runner (see rationale below). - `cargo test` in general will work for the repo, but we have a lot of broken tests to resolve. Rationale: the test suites are getting relatively large and follow different paths (i.e. `cargo test` vs. pytest/pyright), so they now run on separate runners; this was actually helpful, since there is some flakiness within the Python path, which has some long-running examples. The trade-off is slightly more resources, so we can revisit later if that becomes an issue. {F1982168077} Reviewed By: dulinriley Differential Revision: D82699014
1 parent c87b02c commit 08d7e7c

File tree

11 files changed

+173
-49
lines changed

11 files changed

+173
-49
lines changed

.github/workflows/ci.yml

Lines changed: 19 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -22,30 +22,38 @@ jobs:
2222
name: Build CPU
2323
uses: ./.github/workflows/build-cpu.yml
2424

25-
test-cuda:
26-
name: Test CUDA
25+
test-cpu-python:
26+
name: Test CPU Python
27+
needs: build-cpu
28+
uses: ./.github/workflows/test-cpu-python.yml
29+
with:
30+
artifact-name: monarch-cpu-${{ github.sha }}
31+
32+
test-gpu-python:
33+
name: Test GPU Python
2734
needs: build-cuda
28-
uses: ./.github/workflows/test-cuda.yml
35+
uses: ./.github/workflows/test-gpu-python.yml
2936
with:
3037
artifact-name: monarch-cuda-${{ github.sha }}
3138

32-
test-cpu:
33-
name: Test CPU
34-
needs: build-cpu
35-
uses: ./.github/workflows/test-cpu.yml
39+
test-gpu-rust:
40+
name: Test GPU Rust
41+
needs: build-cuda
42+
uses: ./.github/workflows/test-gpu-rust.yml
3643
with:
37-
artifact-name: monarch-cpu-${{ github.sha }}
44+
artifact-name: monarch-cuda-${{ github.sha }}
3845

3946
status-check:
4047
name: Status Check
4148
runs-on: ubuntu-latest
42-
needs: [test-cuda, test-cpu]
49+
needs: [test-cpu-python, test-gpu-python, test-gpu-rust]
4350
if: always()
4451
steps:
4552
- name: Check all jobs status
4653
run: |
47-
if [[ "${{ needs.test-cuda.result }}" != "success" ]] ||
48-
[[ "${{ needs.test-cpu.result }}" != "success" ]]; then
54+
if [[ "${{ needs.test-cpu-python.result }}" != "success" ]] ||
55+
[[ "${{ needs.test-gpu-python.result }}" != "success" ]] ||
56+
[[ "${{ needs.test-gpu-rust.result }}" != "success" ]]; then
4957
echo "One or more jobs failed"
5058
exit 1
5159
else

.github/workflows/test-cpu.yml renamed to .github/workflows/test-cpu-python.yml

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
name: Test CPU
1+
name: Test CPU Python
22

33
on:
44
workflow_call:
@@ -9,12 +9,12 @@ on:
99
type: string
1010

1111
concurrency:
12-
group: test-cpu-${{ github.workflow }}-${{ github.ref == 'refs/heads/main' && github.run_number || github.ref }}
12+
group: test-cpu-python-${{ github.workflow }}-${{ github.ref == 'refs/heads/main' && github.run_number || github.ref }}
1313
cancel-in-progress: true
1414

1515
jobs:
16-
test-cpu-no-tensor-engine:
17-
name: Test CPU - No Tensor Engine
16+
test-cpu-python:
17+
name: Test CPU Python - No Tensor Engine
1818
uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
1919
with:
2020
timeout: 60
@@ -38,3 +38,7 @@ jobs:
3838
# Tests requiring tensor engine / GPU need to be identified and flagged to skip.
3939
# We will just ensure monarch can be imported successfully.
4040
python -c "import monarch; print('Monarch imported successfully')"
41+
42+
# Run CPU Python tests (excluding GPU/tensor engine tests)
43+
# TODO: Add appropriate test filters for CPU-only Python tests
44+
echo "CPU Python tests would run here - currently placeholder"

.github/workflows/test-cuda.yml renamed to .github/workflows/test-gpu-python.yml

Lines changed: 7 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
name: Test CUDA
1+
name: Test GPU Python
22

33
on:
44
workflow_call:
@@ -9,12 +9,12 @@ on:
99
type: string
1010

1111
concurrency:
12-
group: test-cuda-${{ github.workflow }}-${{ github.ref == 'refs/heads/main' && github.run_number || github.ref }}
12+
group: test-gpu-python-${{ github.workflow }}-${{ github.ref == 'refs/heads/main' && github.run_number || github.ref }}
1313
cancel-in-progress: true
1414

1515
jobs:
16-
test-cuda:
17-
name: Test CUDA (cuda12.6-py3.10)
16+
test-gpu-python:
17+
name: Test GPU Python (cuda12.6-py3.10)
1818
uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
1919
strategy:
2020
fail-fast: true
@@ -39,10 +39,8 @@ jobs:
3939
# Setup test environment
4040
setup_test_environment
4141
42-
# Setup Tensor Engine dependencies
43-
setup_tensor_engine
44-
45-
export CUDA_LIB_DIR=/usr/lib64
42+
# Setup CUDA environment and library paths
43+
setup_cuda_environment
4644
4745
# Install the built wheel from artifact
4846
install_wheel_from_artifact
@@ -51,7 +49,7 @@ jobs:
5149
# pyre currently does not check these assertions
5250
pyright python/tests/test_python_actors.py
5351
54-
# Run CUDA tests
52+
# Run GPU Python tests
5553
LC_ALL=C pytest python/tests/ -s -v -m "not oss_skip"
5654
# TODO(meriksen): temporarily disabled to unblock lands while debugging
5755
# mock CUDA issues on the OSS setup
.github/workflows/test-gpu-rust.yml

Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,60 @@
1+
name: Test GPU Rust
2+
3+
on:
4+
workflow_call:
5+
inputs:
6+
artifact-name:
7+
description: 'Wheel artifact name from build workflow'
8+
required: true
9+
type: string
10+
11+
concurrency:
12+
group: test-gpu-rust-${{ github.workflow }}-${{ github.ref == 'refs/heads/main' && github.run_number || github.ref }}
13+
cancel-in-progress: true
14+
15+
jobs:
16+
test-gpu-rust:
17+
name: Test GPU Rust (cuda12.6)
18+
uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
19+
strategy:
20+
fail-fast: true
21+
matrix:
22+
include:
23+
- name: 4xlargegpu
24+
runs-on: linux.g5.4xlarge.nvidia.gpu
25+
torch-spec: '--pre torch --index-url https://download.pytorch.org/whl/nightly/cu126'
26+
gpu-arch-type: "cuda"
27+
gpu-arch-version: "12.6"
28+
with:
29+
timeout: 120
30+
runner: ${{ matrix.runs-on }}
31+
gpu-arch-type: ${{ matrix.gpu-arch-type }}
32+
gpu-arch-version: ${{ matrix.gpu-arch-version }}
33+
submodules: recursive
34+
download-artifact: ${{ inputs.artifact-name }}
35+
script: |
36+
# Source common setup functions
37+
source scripts/common-setup.sh
38+
39+
# Setup test environment
40+
setup_test_environment
41+
42+
# Install Rust
43+
setup_rust_toolchain
44+
45+
# Install System dependencies
46+
install_system_dependencies
47+
48+
# Setup Tensor Engine dependencies
49+
setup_tensor_engine
50+
51+
# Setup CUDA environment and library paths
52+
setup_cuda_environment
53+
54+
# Setup PyTorch with C++ headers using common-setup utility
55+
setup_pytorch_with_headers "${{ matrix.gpu-arch-version }}" "${{ matrix.torch-spec }}"
56+
57+
# Run GPU Rust tests
58+
echo "Running OSS Rust tests..."
59+
# TODO: fix broken tests, then update to `cargo test --no-fail-fast`
60+
cargo test -p monarch_rdma

cuda-sys/build.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
99
use std::env;
1010
use std::path::PathBuf;
1111

12-
use build_utils::*;
12+
use build_utils;
1313

1414
#[cfg(target_os = "macos")]
1515
fn main() {}

hyperactor_macros/tests/export.rs

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -15,8 +15,7 @@ use hyperactor::Named;
1515
use hyperactor::PortRef;
1616
use hyperactor::Unbind;
1717
use serde::Deserialize;
18-
19-
use crate::Serialize;
18+
use serde::Serialize;
2019

2120
#[derive(Debug)]
2221
#[hyperactor::export(

hyperactor_telemetry/Cargo.toml

Lines changed: 2 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -14,14 +14,14 @@ edition = "2024"
1414
anyhow = "1.0.98"
1515
chrono = { version = "0.4.41", features = ["clock", "serde", "std"], default-features = false }
1616
dashmap = { version = "5.5.3", features = ["rayon", "serde"] }
17-
fbinit = { version = "0.2.0", git = "https://github.com/facebookexperimental/rust-shed.git", branch = "main", optional = true }
17+
fbinit = { version = "0.2.0", git = "https://github.com/facebookexperimental/rust-shed.git", branch = "main" }
1818
hdrhistogram = "7.5"
1919
lazy_static = "1.5"
2020
opentelemetry = "0.29"
2121
opentelemetry_sdk = { version = "0.29.0", features = ["rt-tokio"] }
2222
rand = { version = "0.8", features = ["small_rng"] }
2323
rusqlite = { version = "0.36.0", features = ["backup", "blob", "bundled", "column_decltype", "functions", "limits", "modern_sqlite", "serde_json"] }
24-
scuba = { version = "0.1.0", git = "https://github.com/facebookexperimental/rust-shed.git", branch = "main", optional = true }
24+
scuba = { version = "0.1.0", git = "https://github.com/facebookexperimental/rust-shed.git", branch = "main" }
2525
serde = { version = "1.0.219", features = ["derive", "rc"] }
2626
serde_json = { version = "1.0.140", features = ["alloc", "float_roundtrip", "raw_value", "unbounded_depth"] }
2727
serde_rusqlite = "0.39.3"
@@ -35,15 +35,9 @@ tracing-subscriber = { version = "0.3.20", features = ["chrono", "env-filter", "
3535
whoami = "1.5"
3636

3737
[dev-dependencies]
38-
fbinit = { version = "0.2.0", git = "https://github.com/facebookexperimental/rust-shed.git", branch = "main" }
3938
quickcheck = "1.0"
4039
quickcheck_macros = "1.0"
41-
scuba = { version = "0.1.0", git = "https://github.com/facebookexperimental/rust-shed.git", branch = "main" }
4240
tracing-test = { version = "0.2.3", features = ["no-env-filter"] }
4341

44-
[features]
45-
default = []
46-
fbcode_build = ["fbinit", "scuba"]
47-
4842
[lints]
4943
rust = { unexpected_cfgs = { check-cfg = ["cfg(fbcode_build)"], level = "warn" } }

monarch_rdma/build.rs

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -51,10 +51,6 @@ fn main() {
5151
println!("cargo:rustc-link-lib=cuda");
5252
println!("cargo:rustc-link-lib=cudart");
5353

54-
// Tell cargo to look for shared libraries in standard RDMA directories
55-
println!("cargo:rustc-link-search=/usr/lib");
56-
println!("cargo:rustc-link-search=/usr/lib64");
57-
5854
// Link against the ibverbs and mlx5 libraries (used by rdmaxcel-sys)
5955
println!("cargo:rustc-link-lib=ibverbs");
6056
println!("cargo:rustc-link-lib=mlx5");

monarch_tensor_worker/src/stream.rs

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2200,11 +2200,12 @@ mod tests {
22002200
.unwrap()
22012201
.unwrap();
22022202

2203-
allclose(
2203+
let result = allclose(
22042204
&factory_float_tensor(data, "cpu".try_into().unwrap()),
22052205
&actual.borrow(),
22062206
)
2207-
.unwrap()
2207+
.unwrap();
2208+
result
22082209
}
22092210

22102211
async fn validate_dependent_error(&mut self, reference: Ref, error: Arc<SeqError>) {

rdmaxcel-sys/build.rs

Lines changed: 0 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -10,17 +10,11 @@ use std::env;
1010
use std::path::Path;
1111
use std::path::PathBuf;
1212

13-
use build_utils::*;
14-
1513
#[cfg(target_os = "macos")]
1614
fn main() {}
1715

1816
#[cfg(not(target_os = "macos"))]
1917
fn main() {
20-
// Tell cargo to look for shared libraries in the specified directory
21-
println!("cargo:rustc-link-search=/usr/lib");
22-
println!("cargo:rustc-link-search=/usr/lib64");
23-
2418
// Link against the ibverbs library
2519
println!("cargo:rustc-link-lib=ibverbs");
2620

@@ -141,8 +135,6 @@ fn main() {
141135
}
142136
if let Some(lib_dir) = &python_config.lib_dir {
143137
println!("cargo:rustc-link-search=native={}", lib_dir);
144-
// Set cargo metadata to inform dependent binaries about how to set their
145-
// RPATH (see controller/build.rs for an example).
146138
println!("cargo:metadata=LIB_PATH={}", lib_dir);
147139
}
148140

0 commit comments

Comments
 (0)