refactor v1

wwwjn · wwwjn · commit 2dfda3e877e1 · 2025-07-21T11:51:54.000-07:00
diff --git a/.github/workflows/integration_test_8gpu_core.yaml b/.github/workflows/integration_test_8gpu_core.yaml
@@ -46,5 +46,4 @@ jobs:
         USE_CPP=0 python -m pip install --pre torchao --index-url https://download.pytorch.org/whl/nightly/cu126
 
         mkdir artifacts-to-be-uploaded
-        python ./tests/integration_tests.py --config_dir ./torchtitan/models/llama3/train_configs artifacts-to-be-uploaded/llama3 --ngpu 8
-        python ./tests/integration_tests.py --config_dir ./torchtitan/models/deepseek_v3/train_configs artifacts-to-be-uploaded/deepseek --ngpu 4
+        python ./tests/integration_tests/integration_tests.py artifacts-to-be-uploaded --test_suite core --ngpu 8
diff --git a/.github/workflows/integration_test_8gpu_h100.yaml b/.github/workflows/integration_test_8gpu_h100.yaml
@@ -47,4 +47,4 @@ jobs:
         USE_CPP=0 python -m pip install --pre torchao --index-url https://download.pytorch.org/whl/nightly/cu126
 
         mkdir artifacts-to-be-uploaded
-        python ./tests/integration_tests_h100.py artifacts-to-be-uploaded --ngpu 8
+        python ./tests/integration_tests/integration_tests_h100.py artifacts-to-be-uploaded --ngpu 8
diff --git a/.github/workflows/integration_test_8gpu_parallelsim.yaml b/.github/workflows/integration_test_8gpu_parallelsim.yaml
@@ -0,0 +1,49 @@
+name: 8 GPU Integration Test
+
+on:
+  push:
+    branches: [ main ]
+    paths-ignore:
+      - 'torchtitan/experiments/**'
+  pull_request:
+    paths-ignore:
+      - 'torchtitan/experiments/**'
+  schedule:
+    # Runs every 6 hours
+    - cron: '0 */6 * * *'
+
+concurrency:
+  group: unit-test${{ github.workflow }}-${{ github.ref == 'refs/heads/main' && github.run_number || github.ref }}
+  cancel-in-progress: true
+
+defaults:
+  run:
+    shell: bash -l -eo pipefail {0}
+
+jobs:
+  build-test:
+    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
+    with:
+      runner: linux.g5.48xlarge.nvidia.gpu
+      gpu-arch-type: cuda
+      gpu-arch-version: "12.6"
+      # This image is faster to clone than the default, but it lacks CC needed by triton
+      # (1m25s vs 2m37s).
+      docker-image: torchtitan-ubuntu-20.04-clang12
+      repository: pytorch/torchtitan
+      upload-artifact: outputs
+      script: |
+        set -eux
+
+        # The generic Linux job chooses to use base env, not the one setup by the image
+        CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]")
+        conda activate "${CONDA_ENV}"
+
+        pip config --user set global.progress_bar off
+
+        python -m pip install --force-reinstall --pre torch --index-url https://download.pytorch.org/whl/nightly/cu126
+
+        USE_CPP=0 python -m pip install --pre torchao --index-url https://download.pytorch.org/whl/nightly/cu126
+
+        mkdir artifacts-to-be-uploaded
+        python ./tests/integration_tests/integration_tests.py artifacts-to-be-uploaded --test_suite parallelsim --ngpu 8
diff --git a/tests/README.md b/tests/README.md
@@ -5,8 +5,10 @@ This directory contains tests for the TorchTitan project, including unit tests a
 ## Test Structure
 
 - `unit_tests/`: Contains unit tests for individual components
-- `integration_tests.py`: Contains integration tests that test multiple components together
-- `integration_tests_h100.py`: Contains integration tests specifically designed for H100 GPUs, which utilize symmetric memory and float8.
+- `integration_tests/`: Contains integration tests that test multiple components together
+  - `integration_tests.py`: Main integration tests for various model configurations
+  - `integration_tests_h100.py`: Tests specifically designed for H100 GPUs, utilizing symmetric memory and float8
+  - `base_config.toml`: Base configuration file for integration tests
 - `assets/`: Contains test assets and fixtures used by the tests
 
 ## Running Tests
@@ -25,25 +27,27 @@ pip install -r requirements.txt
 To run the integration tests:
 
 ```bash
-python ./tests/integration_tests.py <output_dir> [--config_dir CONFIG_DIR] [--test TEST] [--ngpu NGPU]
+python -m tests.integration_tests.integration_tests <output_dir> [--config_path CONFIG_PATH] [--test_name TEST_NAME] [--test_suite TEST_SUITE] [--model MODEL] [--ngpu NGPU]
 ```
 
 Arguments:
 - `output_dir`: (Required) Directory where test outputs will be stored
-- `--config_dir`: (Optional) Directory containing configuration files (default: "./torchtitan/models/llama3/train_configs")
-- `--test`: (Optional) Specific test to run, use test names from the `build_test_list()` function (default: "all")
+- `--config_path`: (Optional) Path to the base config file (default: "./tests/integration_tests/base_config.toml")
+- `--test_name`: (Optional) Specific test to run by name (default: "all")
+- `--test_suite`: (Optional) Test suite to run: 'core', 'parallelism', or 'all' (default: "all")
+- `--model`: (Optional) Specify the model to run tests on (default: "all")
 - `--ngpu`: (Optional) Number of GPUs to use for testing (default: 8)
 
 Examples:
 ```bash
 # Run all integration tests with 8 GPUs
-python ./tests/integration_tests.py ./test_output
+python -m tests.integration_tests.integration_tests ./test_output
 
 # Run a specific test with 4 GPUs
-python ./tests/integration_tests.py ./test_output --test default --ngpu 4
+python -m tests.integration_tests.integration_tests ./test_output --test_name tp_only --ngpu 4
 
-# Run all tests with a custom config directory
-python ./tests/integration_tests.py ./test_output --config_dir ./my_configs
+# Run only core functionality tests
+python -m tests.integration_tests.integration_tests ./test_output --test_suite core
 ```
 
 ### Running Unit Tests
diff --git a/tests/integration_tests/base_config.toml b/tests/integration_tests/base_config.toml
@@ -21,7 +21,7 @@ save_tb_folder = "tb"
 enable_wandb = false
 
 [model]
-name = "deepseek_v3"
+name = "llama3"  # option: llama3, deepseek_v3
 flavor = "debugmodel"
 # test tokenizer, for debug purpose only
 tokenizer_path = "./tests/assets/tokenizer"
diff --git a/tests/integration_tests/integration_tests.py b/tests/integration_tests/integration_tests.py
diff --git a/tests/integration_tests/integration_tests_h100.py b/tests/integration_tests/integration_tests_h100.py