Skip to content

Commit 8e9da71

Browse files
committed
[Refactor] Modular Integration Test Framework with DeepSeek-v3 Support (#1431)
### Integration Tests Restructuring * Split current integration tests into two sets: 1. Depth Test - `features.py`: Use llama3 model, to test that all the *main components* of torchtitan are functioning as expected 2. Breadth Test - `models.py` : As we are supporting more models in torchtitan core, set up parallelism-related tests for each model, to test model architecture / args related changes. Make sure the integration test implementation is easy to extend to new models. * Moved integration test files from the root directory to a dedicated `tests/integration_tests/` directory * Added a base configuration file `base_config.toml` for integration tests, as most of the train_configs share 90% of the same settings * Separate control logic and test case definition: `run_tests.py` for control logic, other files for test case definition.
1 parent 41286bf commit 8e9da71

File tree

22 files changed

+619
-489
lines changed

22 files changed

+619
-489
lines changed
Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,52 @@
1+
name: 8 GPU Feature Tests
2+
on:
3+
push:
4+
branches: [ main ]
5+
paths-ignore:
6+
- 'torchtitan/experiments/**'
7+
pull_request:
8+
paths-ignore:
9+
- 'torchtitan/experiments/**'
10+
schedule:
11+
# Runs every 6 hours
12+
- cron: '0 */6 * * *'
13+
14+
concurrency:
15+
group: unit-test${{ github.workflow }}-${{ github.ref == 'refs/heads/main' && github.run_number || github.ref }}
16+
cancel-in-progress: true
17+
18+
defaults:
19+
run:
20+
shell: bash -l -eo pipefail {0}
21+
22+
jobs:
23+
build-test:
24+
uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
25+
with:
26+
runner: linux.g5.48xlarge.nvidia.gpu
27+
gpu-arch-type: cuda
28+
gpu-arch-version: "12.6"
29+
# This image is faster to clone than the default, but it lacks CC needed by triton
30+
# (1m25s vs 2m37s).
31+
docker-image: torchtitan-ubuntu-20.04-clang12
32+
repository: pytorch/torchtitan
33+
upload-artifact: outputs
34+
script: |
35+
set -eux
36+
37+
# The generic Linux job chooses to use base env, not the one setup by the image
38+
CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]")
39+
conda activate "${CONDA_ENV}"
40+
41+
# Log CUDA driver version for debugging.
42+
DRIVER_VERSION=$(nvidia-smi --query-gpu=driver_version --format=csv,noheader | head -n 1 || true)
43+
echo "CUDA driver version: ${DRIVER_VERSION}"
44+
45+
pip config --user set global.progress_bar off
46+
47+
python -m pip install --force-reinstall --pre torch --index-url https://download.pytorch.org/whl/nightly/cu126
48+
49+
USE_CPP=0 python -m pip install --pre torchao --index-url https://download.pytorch.org/whl/nightly/cu126
50+
51+
mkdir artifacts-to-be-uploaded
52+
python -m tests.integration_tests.run_tests --test_suite features artifacts-to-be-uploaded --ngpu 8

.github/workflows/integration_test_8gpu_h100.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -53,4 +53,4 @@ jobs:
5353
mkdir artifacts-to-be-uploaded
5454
5555
# Enable CPP stacktraces for debugging symmetric memory initialization errors.
56-
TORCH_SHOW_CPP_STACKTRACES=1 python -m tests.integration_tests_h100 artifacts-to-be-uploaded --ngpu 8
56+
TORCH_SHOW_CPP_STACKTRACES=1 python -m tests.integration_tests.run_tests --test_suite h100 artifacts-to-be-uploaded --ngpu 8

.github/workflows/integration_test_8gpu.yaml renamed to .github/workflows/integration_test_8gpu_models.yaml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
name: 8 GPU Integration Test
1+
name: 8 GPU Model Tests
22

33
on:
44
push:
@@ -50,4 +50,4 @@ jobs:
5050
USE_CPP=0 python -m pip install --pre torchao --index-url https://download.pytorch.org/whl/nightly/cu126
5151
5252
mkdir artifacts-to-be-uploaded
53-
python -m tests.integration_tests artifacts-to-be-uploaded --ngpu 8
53+
python -m tests.integration_tests.run_tests --test_suite models artifacts-to-be-uploaded --ngpu 8

.github/workflows/integration_test_8gpu_torchft.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -53,5 +53,5 @@ jobs:
5353
RUST_BACKTRACE=1 torchft_lighthouse --min_replicas 1 --quorum_tick_ms 100 --join_timeout_ms 10000 > /dev/null 2>&1 &
5454
echo "ft_integration_test"
5555
# Getting error - Cuda failure 217 'peer access is not supported between these two devices'
56-
python -m tests.integration_tests_ft artifacts-to-be-uploaded --ngpu 8
56+
python -m tests.integration_tests.ft artifacts-to-be-uploaded --ngpu 8
5757
# pkill -9 torchft_lighthouse

README.md

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,8 @@
44

55
#### A PyTorch native platform for training generative AI models
66

7-
[![integration tests](https://github.com/pytorch/torchtitan/actions/workflows/integration_test_8gpu.yaml/badge.svg?branch=main)](https://github.com/pytorch/torchtitan/actions/workflows/integration_test_8gpu.yaml?query=branch%3Amain)
7+
[![8 GPU Feature Tests](https://github.com/pytorch/torchtitan/actions/workflows/integration_test_8gpu_features.yaml/badge.svg?branch=main)](https://github.com/pytorch/torchtitan/actions/workflows/integration_test_8gpu_features.yaml?query=branch%3Amain)
8+
[![8 GPU Model Tests](https://github.com/pytorch/torchtitan/actions/workflows/integration_test_8gpu_models.yaml/badge.svg?branch=main)](https://github.com/pytorch/torchtitan/actions/workflows/integration_test_8gpu_models.yaml?query=branch%3Amain)
89
[![arXiv](https://img.shields.io/badge/arXiv-2410.06511-b31b1b.svg)](https://arxiv.org/abs/2410.06511)
910
[![ICLR](https://img.shields.io/badge/ICLR-2025-violet.svg)](https://iclr.cc/virtual/2025/poster/29620)
1011
[![forum](https://img.shields.io/badge/pytorch-forum-DE3412.svg)](https://discuss.pytorch.org/c/distributed/torchtitan/44)

tests/README.md

Lines changed: 21 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,19 @@
11
# Tests
22

3-
This directory contains tests for the TorchTitan project, including unit tests and integration tests.
3+
This directory contains tests for the torchtitan project, including unit tests and integration tests.
44

55
## Test Structure
66

77
- `unit_tests/`: Contains unit tests for individual components
8-
- `integration_tests.py`: Contains integration tests that test multiple components together
9-
- `integration_tests_h100.py`: Contains integration tests specifically designed for H100 GPUs, which utilize symmetric memory and float8.
8+
- `integration_tests/`: Contains integration tests that test multiple components together
9+
- `base_config.toml`: Base configuration file for integration tests
10+
- `features.py`: Tests for torchtitan features and composability
11+
- `ft.py`: Fault-tolerance integration tests
12+
- `h100.py`: Test cases for H100 GPUs
13+
- `models.py`: Tests for specific model architectures and configurations
1014
- `assets/`: Contains test assets and fixtures used by the tests
15+
- `tokenizer/`: Tokenizer configuration and vocabulary files for testing
16+
- `custom_schedule.csv`: Custom PP schedule for testing
1117

1218
## Running Tests
1319

@@ -16,7 +22,7 @@ This directory contains tests for the TorchTitan project, including unit tests a
1622
Ensure you have all development dependencies installed:
1723

1824
```bash
19-
pip install -r dev-requirements.txt
25+
pip install -r requirements-dev.txt
2026
pip install -r requirements.txt
2127
```
2228

@@ -25,25 +31,27 @@ pip install -r requirements.txt
2531
To run the integration tests:
2632

2733
```bash
28-
python ./tests/integration_tests.py <output_dir> [--config_dir CONFIG_DIR] [--test TEST] [--ngpu NGPU]
34+
python -m tests.integration_tests.run_tests <output_dir> [--config_path CONFIG_PATH] [--test_suite TEST_SUITE] [--test_name TEST_NAME] [--ngpu NGPU]
2935
```
3036

3137
Arguments:
3238
- `output_dir`: (Required) Directory where test outputs will be stored
33-
- `--config_dir`: (Optional) Directory containing configuration files (default: "./torchtitan/models/llama3/train_configs")
34-
- `--test`: (Optional) Specific test to run, use test names from the `build_test_list()` function (default: "all")
39+
- `--test_suite`: (Optional) Specific test suite to run by name (default: "features")
40+
- `--config_path`: (Optional) Path to the base config file (default: "./tests/integration_tests/base_config.toml")
41+
- `--test_name`: (Optional) Specific test to run by name (default: "all")
3542
- `--ngpu`: (Optional) Number of GPUs to use for testing (default: 8)
3643

3744
Examples:
3845
```bash
39-
# Run all integration tests with 8 GPUs
40-
python ./tests/integration_tests.py ./test_output
46+
# Run all tests in the default suite (features) with 8 GPUs
47+
python -m tests.integration_tests.run_tests test_output
4148

42-
# Run a specific test with 4 GPUs
43-
python ./tests/integration_tests.py ./test_output --test default --ngpu 4
49+
# Run only core functionality tests for features
50+
python -m tests.integration_tests.run_tests test_output --test_suite features
51+
52+
# Run a specific test with 2 GPUs
53+
python -m tests.integration_tests.run_tests test_output --test_suite features --test_name gradient_accumulation --ngpu 2
4454

45-
# Run all tests with a custom config directory
46-
python ./tests/integration_tests.py ./test_output --config_dir ./my_configs
4755
```
4856

4957
### Running Unit Tests
Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
# Copyright (c) Meta Platforms, Inc. and affiliates.
2+
# All rights reserved.
3+
#
4+
# This source code is licensed under the BSD-style license found in the
5+
# LICENSE file in the root directory of this source tree.
6+
7+
from dataclasses import dataclass
8+
from typing import Sequence
9+
10+
__all__ = [
11+
"OverrideDefinitions",
12+
]
13+
14+
15+
@dataclass
16+
class OverrideDefinitions:
17+
"""
18+
This class is used to define the override definitions for the integration tests.
19+
"""
20+
21+
override_args: Sequence[Sequence[str]] = tuple(tuple(" "))
22+
test_descr: str = "default"
23+
test_name: str = "default"
24+
ngpu: int = 4
25+
26+
def __repr__(self):
27+
return self.test_descr
Lines changed: 76 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,76 @@
1+
[job]
2+
dump_folder = "./outputs"
3+
description = "model debug training for integration test"
4+
print_args = false
5+
6+
[profiling]
7+
enable_profiling = false
8+
save_traces_folder = "profile_trace"
9+
profile_freq = 10
10+
enable_memory_snapshot = false
11+
save_memory_snapshot_folder = "memory_snapshot"
12+
13+
[metrics]
14+
log_freq = 1
15+
disable_color_printing = false
16+
enable_tensorboard = false
17+
save_tb_folder = "tb"
18+
enable_wandb = false
19+
20+
[model]
21+
name = "llama3"
22+
flavor = "debugmodel"
23+
# test folder with tokenizer.json, for debug purpose only
24+
tokenizer_path = "./tests/assets/tokenizer"
25+
# converters = ["float8"]
26+
27+
[optimizer]
28+
name = "AdamW"
29+
lr = 8e-4
30+
eps = 1e-8
31+
32+
[lr_scheduler]
33+
warmup_steps = 2 # lr scheduler warm up, normally 20% of the train steps
34+
decay_ratio = 0.8 # lr scheduler decay ratio, 80% of the train steps
35+
decay_type = "linear"
36+
lr_min = 0.0
37+
38+
[training]
39+
local_batch_size = 8
40+
seq_len = 2048
41+
max_norm = 1.0 # grad norm clipping
42+
steps = 10
43+
compile = false
44+
dataset = "c4_test" # supported datasets: c4_test (2K), c4 (177M)
45+
46+
[parallelism]
47+
data_parallel_replicate_degree = 1
48+
data_parallel_shard_degree = -1
49+
fsdp_reshard_after_forward = "default" # default / never / always
50+
tensor_parallel_degree = 1
51+
enable_async_tensor_parallel = false
52+
pipeline_parallel_degree = 1
53+
context_parallel_degree = 1
54+
55+
[checkpoint]
56+
enable_checkpoint = false
57+
folder = "checkpoint"
58+
interval = 10
59+
last_save_model_only = false
60+
export_dtype = "float32"
61+
async_mode = "disabled" # ["disabled", "async", "async_with_pinned_mem"]
62+
63+
[activation_checkpoint]
64+
mode = "selective" # ["none", "selective", "full"]
65+
selective_ac_option = '2' # 'int' = ac every positive int layer or 'op', ac based on ops policy
66+
67+
[float8]
68+
enable_fsdp_float8_all_gather = false
69+
precompute_float8_dynamic_scale_for_fsdp = false
70+
filter_fqns = ["output"]
71+
72+
[validation]
73+
enabled = false
74+
dataset = "c4_validation"
75+
freq = 5
76+
steps = 10

0 commit comments

Comments
 (0)