53 changes: 53 additions & 0 deletions .github/workflows/integration_test_8gpu_compiler_toolkit.yaml
@@ -0,0 +1,53 @@
name: Compiler Toolkit 8 GPU Integration Tests

on:
  push:
    branches: [ main ]
    paths:
      - 'torchtitan/experiments/compiler_toolkit/**'
      - '.github/workflows/integration_test_8gpu_compiler_toolkit.yaml'
  pull_request:
    paths:
      - 'torchtitan/experiments/compiler_toolkit/**'
      - '.github/workflows/integration_test_8gpu_compiler_toolkit.yaml'
  schedule:
    # Runs every 12 hours
    - cron: '0 */12 * * *'

concurrency:
  group: unit-test${{ github.workflow }}-${{ github.ref == 'refs/heads/main' && github.run_number || github.ref }}
  cancel-in-progress: true

defaults:
  run:
    shell: bash -l -eo pipefail {0}

jobs:
  build-test:
    uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
    with:
      runner: linux.g5.48xlarge.nvidia.gpu
      gpu-arch-type: cuda
      gpu-arch-version: "12.6"
      # This image is faster to clone than the default, but it lacks CC needed by triton
      # (1m25s vs 2m37s).
      docker-image: torchtitan-ubuntu-20.04-clang12
      repository: pytorch/torchtitan
      upload-artifact: outputs
      script: |
        set -eux

        # The generic Linux job chooses to use base env, not the one setup by the image
        CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]")
        conda activate "${CONDA_ENV}"

        # Log CUDA driver version for debugging.
        DRIVER_VERSION=$(nvidia-smi --query-gpu=driver_version --format=csv,noheader | head -n 1 || true)
        echo "CUDA driver version: ${DRIVER_VERSION}"

        pip config --user set global.progress_bar off

        python -m pip install --force-reinstall --pre torch --index-url https://download.pytorch.org/whl/nightly/cu126

        mkdir artifacts-to-be-uploaded
        python -m torchtitan.experiments.compiler_toolkit.tests.integration_tests artifacts-to-be-uploaded --ngpu 4
2 changes: 1 addition & 1 deletion torchtitan/experiments/README.md
@@ -30,4 +30,4 @@ We provide this `experiments/` folder to host experiments that add significant v
| [torchcomms](./torchcomms/) | TBA | [@d4l3k](https://github.com/d4l3k) [@fduwjj](https://github.com/fduwjj) [@mori360](https://github.com/mori360) |
| [moe_symm_mem_kernels](./moe_symm_mem_kernels/) | TBA | [@kwen2501](https://github.com/kwen2501) |
| [gpt_oss](./gpt_oss/) | TBA | [@jianiw](https://github.com/jianiw) |
| [compiler_toolkit](./compiler_tookit/) | TBA | [@SherlockNoMad](https://github.com/SherlockNoMad) [@yiming0416](https://github.com/yiming0416) |
| [compiler_toolkit](./compiler_tookit/) | [![Compiler Toolkit 8 GPU Integration Tests](https://github.com/pytorch/torchtitan/actions/workflows/integration_test_8gpu_compiler_toolkit.yaml/badge.svg?branch=main)](https://github.com/pytorch/torchtitan/actions/workflows/integration_test_8gpu_compiler_toolkit.yaml?query=branch%3Amain) | [@SherlockNoMad](https://github.com/SherlockNoMad) [@yiming0416](https://github.com/yiming0416) |
@@ -10,6 +10,7 @@
import torch

from torch.fx.traceback import annotate_fn

from torchtitan.config import JobConfig
from torchtitan.distributed import ParallelDims
from torchtitan.experiments.compiler_toolkit.common_utils import (
2 changes: 1 addition & 1 deletion torchtitan/experiments/compiler_toolkit/graph_utils.py
@@ -155,7 +155,7 @@ def joint_graph_builder(
        model_kwargs: Dict of model input keyword arguments
        fw_compiler: Optional custom forward compiler function
        bw_compiler: Optional custom backward compiler function
        validation_fn: Optional function to validate the joint graph
        joint_custom_pass: Optional custom pass to run on the joint graph
    """
    assert isinstance(model_args, tuple)
    for arg in model_args:
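The docstring change above documents the `joint_custom_pass` argument of `joint_graph_builder`, replacing the stale `validation_fn` entry. As a heavily hedged sketch (the hook name below and the exact object it receives are assumptions; the real contract is whatever `graph_utils.py` passes to the hook), such a pass can be as simple as logging the captured joint graph and handing it back:

# Hedged sketch: assumes the custom pass is a callable that receives the captured
# joint-graph object produced by joint_graph_builder and returns it, possibly transformed.
def log_joint_graph(joint_graph):
    # Print the object's type for debugging, then return it unchanged.
    print(type(joint_graph))
    return joint_graph

# Hypothetical wiring; the real call sites are the compiler_toolkit model entry points:
# joint_graph_builder(model, model_args, model_kwargs, joint_custom_pass=log_joint_graph)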
5 changes: 5 additions & 0 deletions torchtitan/experiments/compiler_toolkit/tests/__init__.py
@@ -0,0 +1,5 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.
115 changes: 115 additions & 0 deletions torchtitan/experiments/compiler_toolkit/tests/integration_tests.py
@@ -0,0 +1,115 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

import argparse
import os

from tests.integration_tests import OverrideDefinitions
from tests.integration_tests.run_tests import run_tests


def build_compiler_toolkit_test_list() -> list[OverrideDefinitions]:
"""
returns a list of OverrideDefinitions that is used to generate
variations of integration tests based on the same root config file.
"""
integration_tests_flavors = [
# llama3 tests
OverrideDefinitions(
[
[
"--model.name compiler_toolkit.llama3",
"--parallelism.data_parallel_shard_degree 2",
"--parallelism.tensor_parallel_degree 2",
"--activation_checkpoint.mode none",
],
],
"llama3 FSDP+TP",
"llama3_fsdp_tp",
ngpu=4,
),
OverrideDefinitions(
[
[
"--model.name compiler_toolkit.llama3",
"--parallelism.data_parallel_shard_degree 2",
"--parallelism.tensor_parallel_degree 2",
"--model.flavor debugmodel_flex_attn",
"--activation_checkpoint.mode none",
],
],
"llama3 FSDP+TP+FlexAttn",
"llama3_fsdp_tp_flexattn",
ngpu=4,
),
# deepseek_v3 tests
OverrideDefinitions(
[
[
"--model.name compiler_toolkit.deepseek_v3",
"--parallelism.data_parallel_shard_degree 2",
"--parallelism.tensor_parallel_degree 2",
"--parallelism.expert_parallel_degree 4",
"--parallelism.expert_tensor_parallel_degree 1",
"--activation_checkpoint.mode none",
],
],
"deepseek_v3 FSDP+TP+EP",
"deepseekv3_fsdp_tp_ep",
ngpu=4,
),
OverrideDefinitions(
[
[
"--model.name compiler_toolkit.deepseek_v3",
"--parallelism.data_parallel_shard_degree 2",
"--parallelism.tensor_parallel_degree 2",
"--parallelism.expert_parallel_degree 4",
"--parallelism.expert_tensor_parallel_degree 1",
"--activation_checkpoint.mode none",
"--model.flavor debugmodel_flex_attn",
],
],
"deepseek_v3 FSDP+TP+EP+FlexAttention",
"deepseekv3_fsdp_tp_ep_flexattention",
ngpu=4,
),
]
return integration_tests_flavors


_TEST_SUITES_FUNCTION = {
"compiler_toolkit": build_compiler_toolkit_test_list,
}


def main():
parser = argparse.ArgumentParser()
parser.add_argument("output_dir")
parser.add_argument(
"--config_path",
default="./tests/integration_tests/base_config.toml",
help="Base config path for integration tests. This is the config that will be used as a base for all tests.",
)
parser.add_argument(
"--test_name",
default="all",
help="test to run, acceptable values: `test_name` in `build_test_list` (default: all)",
)
parser.add_argument("--ngpu", default=8, type=int)
args = parser.parse_args()

if not os.path.exists(args.output_dir):
os.makedirs(args.output_dir)
if os.listdir(args.output_dir):
raise RuntimeError("Please provide an empty output directory.")

test_list = _TEST_SUITES_FUNCTION["compiler_toolkit"]()
run_tests(args, test_list)


if __name__ == "__main__":
main()
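Since each entry in `build_compiler_toolkit_test_list` is an `OverrideDefinitions` record (a list of override-flag lists, a description, a test name used for the output sub-directory, and an `ngpu` count, judging from the entries above), adding another variation only means appending one more entry. The flavor below is a hypothetical illustration that mirrors the existing entries; it is not part of this PR:

        # Hypothetical extra flavor, following the pattern of the entries above.
        OverrideDefinitions(
            [
                [
                    "--model.name compiler_toolkit.llama3",
                    "--parallelism.data_parallel_shard_degree 4",
                    "--activation_checkpoint.mode none",
                ],
            ],
            "llama3 FSDP only",  # test description
            "llama3_fsdp",  # output sub-directory name
            ngpu=4,
        ),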