Skip to content

Commit 856f2b5

Browse files
committed
Add integration tests for compiler toolkit experiments
1 parent 8228c08 commit 856f2b5

File tree

3 files changed

+147
-0
lines changed

3 files changed

+147
-0
lines changed
Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,55 @@
1+
# Runs the compiler toolkit experiment's integration tests on an 8 GPU runner.
name: Compiler Toolkit 8 GPU Integration Tests

on:
  push:
    branches:
      - main
    # Only react to changes in the experiment code this suite covers, or in
    # this workflow file itself.
    paths:
      - 'torchtitan/experiments/compiler_toolkit/**'
      - 'torchtitan/experiments/simple_fsdp/**'
      - '.github/workflows/integration_test_8gpu_compiler_toolkit.yaml'
  pull_request:
    paths:
      - 'torchtitan/experiments/compiler_toolkit/**'
      - 'torchtitan/experiments/simple_fsdp/**'
      - '.github/workflows/integration_test_8gpu_compiler_toolkit.yaml'
  schedule:
    # Runs every 12 hours
    - cron: '0 */12 * * *'

# On main each run gets its own group (keyed by run number); on other refs a
# newer run cancels the in-progress run for the same ref.
concurrency:
  group: unit-test${{ github.workflow }}-${{ github.ref == 'refs/heads/main' && github.run_number || github.ref }}
  cancel-in-progress: true

defaults:
  run:
    shell: bash -l -eo pipefail {0}

jobs:
  build-test:
    uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
    with:
      runner: linux.g5.48xlarge.nvidia.gpu
      gpu-arch-type: cuda
      gpu-arch-version: "12.6"
      # This image is faster to clone than the default, but it lacks CC needed by triton
      # (1m25s vs 2m37s).
      docker-image: torchtitan-ubuntu-20.04-clang12
      repository: pytorch/torchtitan
      upload-artifact: outputs
      script: |
        set -eux

        # The generic Linux job chooses to use base env, not the one setup by the image
        CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]")
        conda activate "${CONDA_ENV}"

        # Log CUDA driver version for debugging.
        DRIVER_VERSION=$(nvidia-smi --query-gpu=driver_version --format=csv,noheader | head -n 1 || true)
        echo "CUDA driver version: ${DRIVER_VERSION}"

        pip config --user set global.progress_bar off

        python -m pip install --force-reinstall --pre torch --index-url https://download.pytorch.org/whl/nightly/cu126

        mkdir artifacts-to-be-uploaded
        python -m torchtitan.experiments.compiler_toolkit.tests.integration_tests artifacts-to-be-uploaded --ngpu 8
Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
# Copyright (c) Meta Platforms, Inc. and affiliates.
2+
# All rights reserved.
3+
#
4+
# This source code is licensed under the BSD-style license found in the
5+
# LICENSE file in the root directory of this source tree.
Lines changed: 87 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,87 @@
1+
# Copyright (c) Meta Platforms, Inc. and affiliates.
2+
# All rights reserved.
3+
#
4+
# This source code is licensed under the BSD-style license found in the
5+
# LICENSE file in the root directory of this source tree.
6+
7+
import argparse
8+
import os
9+
10+
from tests.integration_tests import OverrideDefinitions
11+
from tests.integration_tests.run_tests import run_tests
12+
13+
14+
def build_compiler_toolkit_test_list() -> list[OverrideDefinitions]:
    """
    Return the integration test definitions for the compiler toolkit
    experiment.

    Each OverrideDefinitions entry carries a list of command-line override
    groups, a description and a test name, and is used to generate a
    variation of the integration tests based on the same root config file.
    Both entries declare ``ngpu=4``.
    """
    # Overrides common to every flavor defined below.
    shared_overrides = [
        "--model.name compiler_toolkit.deepseek_v3",
        "--parallelism.data_parallel_shard_degree 2",
        "--parallelism.tensor_parallel_degree 2",
        "--parallelism.expert_parallel_degree 4",
        "--parallelism.expert_tensor_parallel_degree 1",
        "--activation_checkpoint.mode none",
    ]

    fsdp_tp_ep = OverrideDefinitions(
        [[*shared_overrides]],
        "FSDP+TP+EP",
        "fsdp+tp+ep",
        ngpu=4,
    )
    # Same setup, additionally selecting the flex-attention debug flavor.
    fsdp_tp_ep_flex_attn = OverrideDefinitions(
        [[*shared_overrides, "--model.flavor debugmodel_flex_attn"]],
        "FSDP+TP+EP+FlexAttention",
        "fsdp+tp+ep+flexattention",
        ngpu=4,
    )
    return [fsdp_tp_ep, fsdp_tp_ep_flex_attn]
54+
55+
56+
# Registry of test suites: suite name -> zero-argument builder that returns
# the suite's list of test definitions.
_TEST_SUITES_FUNCTION = {"compiler_toolkit": build_compiler_toolkit_test_list}
59+
60+
61+
def main():
    """
    Parse command-line arguments and run the compiler toolkit integration
    tests.

    Arguments read from the command line:
        output_dir: directory handed to the test runner; it is created when
            missing and must be empty.
        --config_path: base config file used as the base for all tests.
        --test_name: name of a single test to run, or "all".
        --ngpu: integer GPU count passed through to the test runner
            (default 8).

    Raises:
        RuntimeError: if ``output_dir`` already contains entries.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("output_dir")
    parser.add_argument(
        "--config_path",
        default="./tests/integration_tests/base_config.toml",
        help="Base config path for integration tests. This is the config that will be used as a base for all tests.",
    )
    parser.add_argument(
        "--test_name",
        default="all",
        # Point at the builder that actually defines the test names in this file.
        help="test to run, acceptable values: `test_name` in `build_compiler_toolkit_test_list` (default: all)",
    )
    parser.add_argument("--ngpu", default=8, type=int)
    args = parser.parse_args()

    # exist_ok=True creates the directory when missing without a separate
    # existence check, which would be racy.
    os.makedirs(args.output_dir, exist_ok=True)
    if os.listdir(args.output_dir):
        raise RuntimeError("Please provide an empty output directory.")

    test_list = _TEST_SUITES_FUNCTION["compiler_toolkit"]()
    run_tests(args, test_list)
84+
85+
86+
# Run the test driver only when this module is executed as a script.
if __name__ == "__main__":
    main()

0 commit comments

Comments
 (0)