pytorch
diff --git a/‎.github/workflows/integration_test_8gpu_features.yaml‎
Lines changed: 52 additions & 0 deletions b/‎.github/workflows/integration_test_8gpu_features.yaml‎
Lines changed: 52 additions & 0 deletions
diff --git a/‎.github/workflows/integration_test_8gpu_h100.yaml‎
Lines changed: 1 addition & 1 deletion b/‎.github/workflows/integration_test_8gpu_h100.yaml‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎.github/workflows/integration_test_8gpu.yaml‎ renamed to ‎.github/workflows/integration_test_8gpu_models.yaml‎
Lines changed: 2 additions & 2 deletions b/‎.github/workflows/integration_test_8gpu.yaml‎ renamed to ‎.github/workflows/integration_test_8gpu_models.yaml‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎.github/workflows/integration_test_8gpu_torchft.yaml‎
Lines changed: 1 addition & 1 deletion b/‎.github/workflows/integration_test_8gpu_torchft.yaml‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎README.md‎
Lines changed: 2 additions & 1 deletion b/‎README.md‎
Lines changed: 2 additions & 1 deletion
diff --git a/‎tests/README.md‎
Lines changed: 21 additions & 13 deletions b/‎tests/README.md‎
Lines changed: 21 additions & 13 deletions
diff --git a/‎tests/integration_tests/__init__.py‎
Lines changed: 27 additions & 0 deletions b/‎tests/integration_tests/__init__.py‎
Lines changed: 27 additions & 0 deletions
diff --git a/‎tests/integration_tests/base_config.toml‎
Lines changed: 76 additions & 0 deletions b/‎tests/integration_tests/base_config.toml‎
Lines changed: 76 additions & 0 deletions
@@ -0,0 +1,52 @@
+name: 8 GPU Feature Tests
+on:
+  push:
+    branches: [ main ]
+    paths-ignore:
+      - 'torchtitan/experiments/**'
+  pull_request:
+    paths-ignore:
+      - 'torchtitan/experiments/**'
+  schedule:
+    # Runs every 6 hours
+    - cron: '0 */6 * * *'
+
+concurrency:
+  group: unit-test${{ github.workflow }}-${{ github.ref == 'refs/heads/main' && github.run_number || github.ref }}
+  cancel-in-progress: true
+
+defaults:
+  run:
+    shell: bash -l -eo pipefail {0}
+
+jobs:
+  build-test:
+    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
+    with:
+      runner: linux.g5.48xlarge.nvidia.gpu
+      gpu-arch-type: cuda
+      gpu-arch-version: "12.6"
+      # This image is faster to clone than the default (1m25s vs 2m37s),
+      # but it lacks the CC needed by triton.
+      docker-image: torchtitan-ubuntu-20.04-clang12
+      repository: pytorch/torchtitan
+      upload-artifact: outputs
+      script: |
+        set -eux
+
+        # The generic Linux job chooses to use base env, not the one setup by the image
+        CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]")
+        conda activate "${CONDA_ENV}"
+
+        # Log CUDA driver version for debugging.
+        DRIVER_VERSION=$(nvidia-smi --query-gpu=driver_version --format=csv,noheader | head -n 1 || true)
+        echo "CUDA driver version: ${DRIVER_VERSION}"
+
+        pip config --user set global.progress_bar off
+
+        python -m pip install --force-reinstall --pre torch --index-url https://download.pytorch.org/whl/nightly/cu126
+
+        USE_CPP=0 python -m pip install --pre torchao --index-url https://download.pytorch.org/whl/nightly/cu126
+
+        mkdir artifacts-to-be-uploaded
+        python -m tests.integration_tests.run_tests --test_suite features artifacts-to-be-uploaded --ngpu 8
@@ -53,4 +53,4 @@ jobs:
         mkdir artifacts-to-be-uploaded
 
         # Enable CPP stacktraces for debugging symmetric memory initialization errors.
-        TORCH_SHOW_CPP_STACKTRACES=1 python -m tests.integration_tests_h100 artifacts-to-be-uploaded --ngpu 8
+        TORCH_SHOW_CPP_STACKTRACES=1 python -m tests.integration_tests.run_tests --test_suite h100 artifacts-to-be-uploaded --ngpu 8
@@ -1,4 +1,4 @@
-name: 8 GPU Integration Test
+name: 8 GPU Model Tests
 
 on:
   push:
@@ -50,4 +50,4 @@ jobs:
         USE_CPP=0 python -m pip install --pre torchao --index-url https://download.pytorch.org/whl/nightly/cu126
 
         mkdir artifacts-to-be-uploaded
-        python -m tests.integration_tests artifacts-to-be-uploaded --ngpu 8
+        python -m tests.integration_tests.run_tests --test_suite models artifacts-to-be-uploaded --ngpu 8
@@ -53,5 +53,5 @@ jobs:
         RUST_BACKTRACE=1 torchft_lighthouse --min_replicas 1 --quorum_tick_ms 100 --join_timeout_ms 10000 > /dev/null 2>&1 &
         echo "ft_integration_test"
         # Getting error - Cuda failure 217 'peer access is not supported between these two devices'
-        python -m tests.integration_tests_ft artifacts-to-be-uploaded --ngpu 8
+        python -m tests.integration_tests.ft artifacts-to-be-uploaded --ngpu 8
         # pkill -9 torchft_lighthouse
@@ -4,7 +4,8 @@
 
 #### A PyTorch native platform for training generative AI models
 
-[![integration tests](https://github.com/pytorch/torchtitan/actions/workflows/integration_test_8gpu.yaml/badge.svg?branch=main)](https://github.com/pytorch/torchtitan/actions/workflows/integration_test_8gpu.yaml?query=branch%3Amain)
+[![8 GPU Feature Tests](https://github.com/pytorch/torchtitan/actions/workflows/integration_test_8gpu_features.yaml/badge.svg?branch=main)](https://github.com/pytorch/torchtitan/actions/workflows/integration_test_8gpu_features.yaml?query=branch%3Amain)
+[![8 GPU Model Tests](https://github.com/pytorch/torchtitan/actions/workflows/integration_test_8gpu_models.yaml/badge.svg?branch=main)](https://github.com/pytorch/torchtitan/actions/workflows/integration_test_8gpu_models.yaml?query=branch%3Amain)
 [![arXiv](https://img.shields.io/badge/arXiv-2410.06511-b31b1b.svg)](https://arxiv.org/abs/2410.06511)
 [![ICLR](https://img.shields.io/badge/ICLR-2025-violet.svg)](https://iclr.cc/virtual/2025/poster/29620)
 [![forum](https://img.shields.io/badge/pytorch-forum-DE3412.svg)](https://discuss.pytorch.org/c/distributed/torchtitan/44)
 
@@ -1,13 +1,19 @@
 # Tests
 
-This directory contains tests for the TorchTitan project, including unit tests and integration tests.
+This directory contains tests for the torchtitan project, including unit tests and integration tests.
 
 ## Test Structure
 
 - `unit_tests/`: Contains unit tests for individual components
-- `integration_tests.py`: Contains integration tests that test multiple components together
-- `integration_tests_h100.py`: Contains integration tests specifically designed for H100 GPUs, which utilize symmetric memory and float8.
+- `integration_tests/`: Contains integration tests that test multiple components together
+  - `base_config.toml`: Base configuration file for integration tests
+  - `features.py`: Tests for torchtitan features and composability
+  - `ft.py`: Fault-tolerance integration tests
+  - `h100.py`: Test cases for H100 GPUs
+  - `models.py`: Tests for specific model architectures and configurations
 - `assets/`: Contains test assets and fixtures used by the tests
+  - `tokenizer/`: Tokenizer configuration and vocabulary files for testing
+  - `custom_schedule.csv`: Custom PP schedule for testing
 
 ## Running Tests
 
@@ -16,7 +22,7 @@ This directory contains tests for the TorchTitan project, including unit tests a
 Ensure you have all development dependencies installed:
 
 ```bash
-pip install -r dev-requirements.txt
+pip install -r requirements-dev.txt
 pip install -r requirements.txt
 ```
 
@@ -25,25 +31,27 @@ pip install -r requirements.txt
 To run the integration tests:
 
 ```bash
-python ./tests/integration_tests.py <output_dir> [--config_dir CONFIG_DIR] [--test TEST] [--ngpu NGPU]
+python -m tests.integration_tests.run_tests <output_dir> [--config_path CONFIG_PATH] [--test_suite TEST_SUITE] [--test_name TEST_NAME] [--ngpu NGPU]
 ```
 
 Arguments:
 - `output_dir`: (Required) Directory where test outputs will be stored
-- `--config_dir`: (Optional) Directory containing configuration files (default: "./torchtitan/models/llama3/train_configs")
-- `--test`: (Optional) Specific test to run, use test names from the `build_test_list()` function (default: "all")
+- `--test_suite`: (Optional) Specific test suite to run by name (default: "features")
+- `--config_path`: (Optional) Path to the base config file (default: "./tests/integration_tests/base_config.toml")
+- `--test_name`: (Optional) Specific test to run by name (default: "all")
 - `--ngpu`: (Optional) Number of GPUs to use for testing (default: 8)
 
 Examples:
 ```bash
-# Run all integration tests with 8 GPUs
-python ./tests/integration_tests.py ./test_output
+# Run all feature integration tests (the default suite) with 8 GPUs
+python -m tests.integration_tests.run_tests test_output
 
-# Run a specific test with 4 GPUs
-python ./tests/integration_tests.py ./test_output --test default --ngpu 4
+# Run only core functionality tests for features
+python -m tests.integration_tests.run_tests test_output --test_suite features
+
+# Run a specific test with 2 GPUs
+python -m tests.integration_tests.run_tests test_output --test_suite features --test_name gradient_accumulation --ngpu 2
 
-# Run all tests with a custom config directory
-python ./tests/integration_tests.py ./test_output --config_dir ./my_configs
 ```
 
 ### Running Unit Tests
 
@@ -0,0 +1,27 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+from dataclasses import dataclass
+from typing import Sequence
+
+__all__ = [
+    "OverrideDefinitions",
+]
+
+
+@dataclass
+class OverrideDefinitions:
+    """
+    This class is used to define the override definitions for the integration tests.
+    """
+
+    override_args: Sequence[Sequence[str]] = tuple(tuple(" "))
+    test_descr: str = "default"
+    test_name: str = "default"
+    ngpu: int = 4
+
+    def __repr__(self):
+        return self.test_descr
@@ -0,0 +1,76 @@
+[job]
+dump_folder = "./outputs"
+description = "model debug training for integration test"
+print_args = false
+
+[profiling]
+enable_profiling = false
+save_traces_folder = "profile_trace"
+profile_freq = 10
+enable_memory_snapshot = false
+save_memory_snapshot_folder = "memory_snapshot"
+
+[metrics]
+log_freq = 1
+disable_color_printing = false
+enable_tensorboard = false
+save_tb_folder = "tb"
+enable_wandb = false
+
+[model]
+name = "llama3"
+flavor = "debugmodel"
+# test folder with tokenizer.json, for debugging purposes only
+tokenizer_path = "./tests/assets/tokenizer"
+# converters = ["float8"]
+
+[optimizer]
+name = "AdamW"
+lr = 8e-4
+eps = 1e-8
+
+[lr_scheduler]
+warmup_steps = 2  # lr scheduler warm up, normally 20% of the train steps
+decay_ratio = 0.8  # lr scheduler decay ratio, 80% of the train steps
+decay_type = "linear"
+lr_min = 0.0
+
+[training]
+local_batch_size = 8
+seq_len = 2048
+max_norm = 1.0  # grad norm clipping
+steps = 10
+compile = false
+dataset = "c4_test"  # supported datasets: c4_test (2K), c4 (177M)
+
+[parallelism]
+data_parallel_replicate_degree = 1
+data_parallel_shard_degree = -1
+fsdp_reshard_after_forward = "default" # default / never / always
+tensor_parallel_degree = 1
+enable_async_tensor_parallel = false
+pipeline_parallel_degree = 1
+context_parallel_degree = 1
+
+[checkpoint]
+enable_checkpoint = false
+folder = "checkpoint"
+interval = 10
+last_save_model_only = false
+export_dtype = "float32"
+async_mode = "disabled"  # ["disabled", "async", "async_with_pinned_mem"]
+
+[activation_checkpoint]
+mode = "selective"  # ["none", "selective", "full"]
+selective_ac_option = '2'  # an integer N (as a string) = apply AC to every Nth layer, or 'op' = apply AC based on the ops policy
+
+[float8]
+enable_fsdp_float8_all_gather = false
+precompute_float8_dynamic_scale_for_fsdp = false
+filter_fqns = ["output"]
+
+[validation]
+enabled = false
+dataset = "c4_validation"
+freq = 5
+steps = 10