Skip to content

Commit bb122fe

Browse files
committed
Add integration tests for compiler toolkit experiments
1 parent 2a7a148 commit bb122fe

File tree

5 files changed

+175
-1
lines changed

5 files changed

+175
-1
lines changed
Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,53 @@
1+
name: Compiler Toolkit 8 GPU Integration Tests

# Triggers: pushes to main and pull requests that touch the compiler toolkit
# experiment or this workflow file, plus a recurring schedule.
on:
  push:
    branches:
      - main
    paths:
      - 'torchtitan/experiments/compiler_toolkit/**'
      - '.github/workflows/integration_test_8gpu_compiler_toolkit.yaml'
  pull_request:
    paths:
      - 'torchtitan/experiments/compiler_toolkit/**'
      - '.github/workflows/integration_test_8gpu_compiler_toolkit.yaml'
  schedule:
    # Fires at minute 0 of every 12th hour
    - cron: '0 */12 * * *'

# On main the group key includes the run number, otherwise the ref;
# an in-progress run in the same group is cancelled.
concurrency:
  group: unit-test${{ github.workflow }}-${{ github.ref == 'refs/heads/main' && github.run_number || github.ref }}
  cancel-in-progress: true

defaults:
  run:
    shell: bash -l -eo pipefail {0}

jobs:
  build-test:
    uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
    with:
      runner: linux.g5.48xlarge.nvidia.gpu
      gpu-arch-type: cuda
      gpu-arch-version: "12.6"
      # Cloning with this image is quicker than with the default one
      # (1m25s vs 2m37s), but it does not ship the CC that triton needs.
      docker-image: torchtitan-ubuntu-20.04-clang12
      repository: pytorch/torchtitan
      upload-artifact: outputs
      script: |
        set -eux

        # The generic Linux job chooses to use base env, not the one setup by the image
        CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]")
        conda activate "${CONDA_ENV}"

        # Log CUDA driver version for debugging.
        DRIVER_VERSION=$(nvidia-smi --query-gpu=driver_version --format=csv,noheader | head -n 1 || true)
        echo "CUDA driver version: ${DRIVER_VERSION}"

        pip config --user set global.progress_bar off

        python -m pip install --force-reinstall --pre torch --index-url https://download.pytorch.org/whl/nightly/cu126

        mkdir artifacts-to-be-uploaded
        python -m torchtitan.experiments.compiler_toolkit.tests.integration_tests artifacts-to-be-uploaded --ngpu 8

torchtitan/experiments/README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -30,4 +30,4 @@ We provide this `experiments/` folder to host experiments that add significant v
3030
| [torchcomms](./torchcomms/) | TBA | [@d4l3k](https://github.com/d4l3k) [@fduwjj](https://github.com/fduwjj) [@mori360](https://github.com/mori360) |
3131
| [moe_symm_mem_kernels](./moe_symm_mem_kernels/) | TBA | [@kwen2501](https://github.com/kwen2501) |
3232
| [gpt_oss](./gpt_oss/) | TBA | [@jianiw](https://github.com/jianiw) |
33-
| [compiler_toolkit](./compiler_tookit/) | TBA | [@SherlockNoMad](https://github.com/SherlockNoMad) [@yiming0416](https://github.com/yiming0416) |
33+
| [compiler_toolkit](./compiler_toolkit/) | [![Compiler Toolkit 8 GPU Integration Tests](https://github.com/pytorch/torchtitan/actions/workflows/integration_test_8gpu_compiler_toolkit.yaml/badge.svg?branch=main)](https://github.com/pytorch/torchtitan/actions/workflows/integration_test_8gpu_compiler_toolkit.yaml?query=branch%3Amain) | [@SherlockNoMad](https://github.com/SherlockNoMad) [@yiming0416](https://github.com/yiming0416) |

torchtitan/experiments/compiler_toolkit/deepseek_v3/parallelize.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
import torch
1111

1212
from torch.fx.traceback import annotate_fn
13+
1314
from torchtitan.config import JobConfig
1415
from torchtitan.distributed import ParallelDims
1516
from torchtitan.experiments.compiler_toolkit.common_utils import (
Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
# Copyright (c) Meta Platforms, Inc. and affiliates.
2+
# All rights reserved.
3+
#
4+
# This source code is licensed under the BSD-style license found in the
5+
# LICENSE file in the root directory of this source tree.
Lines changed: 115 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,115 @@
1+
# Copyright (c) Meta Platforms, Inc. and affiliates.
2+
# All rights reserved.
3+
#
4+
# This source code is licensed under the BSD-style license found in the
5+
# LICENSE file in the root directory of this source tree.
6+
7+
import argparse
8+
import os
9+
10+
from tests.integration_tests import OverrideDefinitions
11+
from tests.integration_tests.run_tests import run_tests
12+
13+
14+
def build_compiler_toolkit_test_list() -> list[OverrideDefinitions]:
    """
    Assemble the compiler toolkit integration test matrix.

    Every entry is an OverrideDefinitions holding one group of command-line
    overrides, a human-readable description, a test name and the ``ngpu``
    value the test is declared with. The overrides produce variations of the
    integration tests on top of the same root config file.
    """
    # Flags shared by several variants.
    no_activation_checkpoint = "--activation_checkpoint.mode none"
    flex_attn_flavor = "--model.flavor debugmodel_flex_attn"

    # Common prefix of every llama3 variant.
    llama3_base = [
        "--model.name compiler_toolkit.llama3",
        "--parallelism.data_parallel_shard_degree 2",
        "--parallelism.tensor_parallel_degree 4",
    ]
    # Common prefix of every deepseek_v3 variant.
    deepseek_v3_base = [
        "--model.name compiler_toolkit.deepseek_v3",
        "--parallelism.data_parallel_shard_degree 2",
        "--parallelism.tensor_parallel_degree 2",
        "--parallelism.expert_parallel_degree 4",
        "--parallelism.expert_tensor_parallel_degree 1",
    ]

    llama3_tests = [
        OverrideDefinitions(
            [[*llama3_base, no_activation_checkpoint]],
            "llama3 FSDP+TP",
            "llama3_fsdp_tp",
            ngpu=8,
        ),
        OverrideDefinitions(
            # The flavor flag precedes the checkpoint flag in this variant.
            [[*llama3_base, flex_attn_flavor, no_activation_checkpoint]],
            "llama3 FSDP+TP+FlexAttn",
            "llama3_fsdp_tp_flexattn",
            ngpu=8,
        ),
    ]

    deepseek_v3_tests = [
        OverrideDefinitions(
            [[*deepseek_v3_base, no_activation_checkpoint]],
            "deepseek_v3 FSDP+TP+EP",
            "deepseekv3_fsdp_tp_ep",
            ngpu=4,
        ),
        OverrideDefinitions(
            # The flavor flag follows the checkpoint flag in this variant.
            [[*deepseek_v3_base, no_activation_checkpoint, flex_attn_flavor]],
            "deepseek_v3 FSDP+TP+EP+FlexAttention",
            "deepseekv3_fsdp_tp_ep_flexattention",
            ngpu=4,
        ),
    ]

    return [*llama3_tests, *deepseek_v3_tests]
82+
83+
84+
# Maps a suite name to the zero-argument builder that returns its test list.
_TEST_SUITES_FUNCTION = {"compiler_toolkit": build_compiler_toolkit_test_list}
87+
88+
89+
def main():
    """
    Command-line entry point for the compiler toolkit integration tests.

    Parses the command-line arguments, makes sure the output directory exists
    and is empty, then hands the parsed arguments together with the compiler
    toolkit test list to ``run_tests``.

    Command-line arguments:
        output_dir: directory to use for the run; created if missing and
            required to be empty.
        --config_path: base config used for all tests.
        --test_name: name of the test to run (default: all).
        --ngpu: integer GPU count (default: 8).

    Raises:
        RuntimeError: if the output directory already contains entries.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("output_dir")
    parser.add_argument(
        "--config_path",
        default="./tests/integration_tests/base_config.toml",
        help="Base config path for integration tests. This is the config that will be used as a base for all tests.",
    )
    parser.add_argument(
        "--test_name",
        default="all",
        # Names the builder defined in this file, so the hint is actionable.
        help="test to run, acceptable values: `test_name` in `build_compiler_toolkit_test_list` (default: all)",
    )
    parser.add_argument("--ngpu", default=8, type=int)
    args = parser.parse_args()

    # exist_ok=True creates the directory when it is missing without a
    # separate existence check, so there is no check-then-create race.
    os.makedirs(args.output_dir, exist_ok=True)
    if os.listdir(args.output_dir):
        raise RuntimeError("Please provide an empty output directory.")

    test_list = _TEST_SUITES_FUNCTION["compiler_toolkit"]()
    run_tests(args, test_list)
112+
113+
114+
# Run the test suite only when this module is executed as a script
# (for example via `python -m`), not when it is imported.
if __name__ == "__main__":
    main()

0 commit comments

Comments
 (0)