Skip to content

Commit fff7d24

Browse files
dstaay-fb authored and facebook-github-bot committed
Enable RDMA tests in OSS CI (meta-pytorch#1255)
Summary: Ensure `cargo test -p monarch_rdma` runs on OSS CI. Reviewed By: dulinriley. Differential Revision: D82699014.
1 parent e4cbb58 commit fff7d24

File tree

5 files changed

+146
-20
lines changed

5 files changed

+146
-20
lines changed

.github/workflows/ci.yml

Lines changed: 27 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -22,30 +22,46 @@ jobs:
2222
name: Build CPU
2323
uses: ./.github/workflows/build-cpu.yml
2424

25-
test-cuda:
26-
name: Test CUDA
27-
needs: build-cuda
28-
uses: ./.github/workflows/test-cuda.yml
25+
test-cpu-python:
26+
name: Test CPU Python
27+
needs: build-cpu
28+
uses: ./.github/workflows/test-cpu-python.yml
2929
with:
30-
artifact-name: monarch-cuda-${{ github.sha }}
30+
artifact-name: monarch-cpu-${{ github.sha }}
3131

32-
test-cpu:
33-
name: Test CPU
32+
test-cpu-rust:
33+
name: Test CPU Rust
3434
needs: build-cpu
35-
uses: ./.github/workflows/test-cpu.yml
35+
uses: ./.github/workflows/test-cpu-rust.yml
3636
with:
3737
artifact-name: monarch-cpu-${{ github.sha }}
3838

39+
test-gpu-python:
40+
name: Test GPU Python
41+
needs: build-cuda
42+
uses: ./.github/workflows/test-gpu-python.yml
43+
with:
44+
artifact-name: monarch-cuda-${{ github.sha }}
45+
46+
test-gpu-rust:
47+
name: Test GPU Rust
48+
needs: build-cuda
49+
uses: ./.github/workflows/test-gpu-rust.yml
50+
with:
51+
artifact-name: monarch-cuda-${{ github.sha }}
52+
3953
status-check:
4054
name: Status Check
4155
runs-on: ubuntu-latest
42-
needs: [test-cuda, test-cpu]
56+
needs: [test-cpu-python, test-cpu-rust, test-gpu-python, test-gpu-rust]
4357
if: always()
4458
steps:
4559
- name: Check all jobs status
4660
run: |
47-
if [[ "${{ needs.test-cuda.result }}" != "success" ]] ||
48-
[[ "${{ needs.test-cpu.result }}" != "success" ]]; then
61+
if [[ "${{ needs.test-cpu-python.result }}" != "success" ]] ||
62+
[[ "${{ needs.test-cpu-rust.result }}" != "success" ]] ||
63+
[[ "${{ needs.test-gpu-python.result }}" != "success" ]] ||
64+
[[ "${{ needs.test-gpu-rust.result }}" != "success" ]]; then
4965
echo "One or more jobs failed"
5066
exit 1
5167
else

.github/workflows/test-cpu.yml renamed to .github/workflows/test-cpu-python.yml

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
name: Test CPU
1+
name: Test CPU Python
22

33
on:
44
workflow_call:
@@ -9,12 +9,12 @@ on:
99
type: string
1010

1111
concurrency:
12-
group: test-cpu-${{ github.workflow }}-${{ github.ref == 'refs/heads/main' && github.run_number || github.ref }}
12+
group: test-cpu-python-${{ github.workflow }}-${{ github.ref == 'refs/heads/main' && github.run_number || github.ref }}
1313
cancel-in-progress: true
1414

1515
jobs:
16-
test-cpu-no-tensor-engine:
17-
name: Test CPU - No Tensor Engine
16+
test-cpu-python:
17+
name: Test CPU Python - No Tensor Engine
1818
uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
1919
with:
2020
timeout: 60
@@ -38,3 +38,7 @@ jobs:
3838
# Tests requiring tensor engine / GPU need to be identified and flagged to skip.
3939
# We will just ensure monarch can be imported successfully.
4040
python -c "import monarch; print('Monarch imported successfully')"
41+
42+
# Run CPU Python tests (excluding GPU/tensor engine tests)
43+
# TODO: Add appropriate test filters for CPU-only Python tests
44+
echo "CPU Python tests would run here - currently placeholder"
.github/workflows/test-cpu-rust.yml (new file)

Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,47 @@
1+
name: Test CPU Rust
2+
3+
on:
4+
workflow_call:
5+
inputs:
6+
artifact-name:
7+
description: 'Wheel artifact name from build workflow'
8+
required: true
9+
type: string
10+
11+
concurrency:
12+
group: test-cpu-rust-${{ github.workflow }}-${{ github.ref == 'refs/heads/main' && github.run_number || github.ref }}
13+
cancel-in-progress: true
14+
15+
jobs:
16+
test-cpu-rust:
17+
name: Test CPU Rust - No Tensor Engine
18+
uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
19+
with:
20+
timeout: 60
21+
runner: linux.4xlarge
22+
submodules: recursive
23+
download-artifact: ${{ inputs.artifact-name }}
24+
script: |
25+
# Source common setup functions
26+
source scripts/common-setup.sh
27+
28+
# Setup test environment
29+
setup_conda_environment
30+
31+
# Disable tensor engine
32+
export USE_TENSOR_ENGINE=0
33+
34+
# Install Rust toolchain for cargo tests
35+
# Check if rustup is already available
36+
if ! command -v rustup &> /dev/null; then
37+
# Install rust from dnf package manager (same as other system deps in common-setup.sh)
38+
dnf install -y rust cargo
39+
fi
40+
# Use standard rustup commands if available
41+
if command -v rustup &> /dev/null; then
42+
rustup update stable --no-self-update && rustup default stable
43+
fi
44+
45+
# Run CPU Rust tests (excluding GPU-specific tests)
46+
# TODO: Add appropriate test filters for CPU-only tests
47+
echo "CPU Rust tests would run here - currently placeholder"

.github/workflows/test-cuda.yml renamed to .github/workflows/test-gpu-python.yml

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
name: Test CUDA
1+
name: Test GPU Python
22

33
on:
44
workflow_call:
@@ -9,12 +9,12 @@ on:
99
type: string
1010

1111
concurrency:
12-
group: test-cuda-${{ github.workflow }}-${{ github.ref == 'refs/heads/main' && github.run_number || github.ref }}
12+
group: test-gpu-python-${{ github.workflow }}-${{ github.ref == 'refs/heads/main' && github.run_number || github.ref }}
1313
cancel-in-progress: true
1414

1515
jobs:
16-
test-cuda:
17-
name: Test CUDA (cuda12.6-py3.10)
16+
test-gpu-python:
17+
name: Test GPU Python (cuda12.6-py3.10)
1818
uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
1919
strategy:
2020
fail-fast: true
@@ -51,7 +51,7 @@ jobs:
5151
# pyre currently does not check these assertions
5252
pyright python/tests/test_python_actors.py
5353
54-
# Run CUDA tests
54+
# Run GPU Python tests
5555
LC_ALL=C pytest python/tests/ -s -v -m "not oss_skip"
5656
# TODO(meriksen): temporarily disabled to unblock lands while debugging
5757
# mock CUDA issues on the OSS setup
.github/workflows/test-gpu-rust.yml (new file)

Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,59 @@
1+
name: Test GPU Rust
2+
3+
on:
4+
workflow_call:
5+
inputs:
6+
artifact-name:
7+
description: 'Wheel artifact name from build workflow'
8+
required: true
9+
type: string
10+
11+
concurrency:
12+
group: test-gpu-rust-${{ github.workflow }}-${{ github.ref == 'refs/heads/main' && github.run_number || github.ref }}
13+
cancel-in-progress: true
14+
15+
jobs:
16+
test-gpu-rust:
17+
name: Test GPU Rust (cuda12.6)
18+
uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
19+
strategy:
20+
fail-fast: true
21+
matrix:
22+
include:
23+
- name: 4xlargegpu
24+
runs-on: linux.g5.4xlarge.nvidia.gpu
25+
torch-spec: '--pre torch --index-url https://download.pytorch.org/whl/nightly/cu126'
26+
gpu-arch-type: "cuda"
27+
gpu-arch-version: "12.6"
28+
with:
29+
timeout: 120
30+
runner: ${{ matrix.runs-on }}
31+
gpu-arch-type: ${{ matrix.gpu-arch-type }}
32+
gpu-arch-version: ${{ matrix.gpu-arch-version }}
33+
submodules: recursive
34+
download-artifact: ${{ inputs.artifact-name }}
35+
script: |
36+
# Source common setup functions
37+
source scripts/common-setup.sh
38+
39+
# Setup test environment
40+
setup_test_environment
41+
42+
# Setup Tensor Engine dependencies
43+
setup_tensor_engine
44+
45+
export CUDA_LIB_DIR=/usr/lib64
46+
47+
# Install Rust toolchain for cargo tests
48+
# Check if rustup is already available
49+
if ! command -v rustup &> /dev/null; then
50+
# Install rust from dnf package manager (same as other system deps in common-setup.sh)
51+
dnf install -y rust cargo
52+
fi
53+
# Use standard rustup commands if available
54+
if command -v rustup &> /dev/null; then
55+
rustup update stable --no-self-update && rustup default stable
56+
fi
57+
58+
# Run GPU Rust tests
59+
cargo test -p monarch_rdma

0 commit comments

Comments
 (0)