
Commit 4d078b6

rebase v2
1 parent b8239f5 commit 4d078b6

File tree

7 files changed (+354, −314 lines)


.github/workflows/integration_test_8gpu_torchft.yaml

Lines changed: 1 addition & 1 deletion
@@ -49,5 +49,5 @@ jobs:
       RUST_BACKTRACE=1 torchft_lighthouse --min_replicas 1 --quorum_tick_ms 100 --join_timeout_ms 10000 > /dev/null 2>&1 &
       echo "ft_integration_test"
       # Getting error - Cuda failure 217 'peer access is not supported between these two devices'
-      python -m tests.integration_tests_ft artifacts-to-be-uploaded --ngpu 8
+      python -m tests.integration_tests.integration_tests_ft artifacts-to-be-uploaded --ngpu 8
       # pkill -9 torchft_lighthouse

tests/integration_tests/__init__.py

Whitespace-only changes.
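This package initializer is what makes tests/integration_tests importable as a package, so the workflow's module invocation above (python -m tests.integration_tests.integration_tests_ft ...) and the relative imports in the renamed test files can resolve. A minimal smoke check of the new layout, assuming it is run from the repository root; the import targets are taken from this diff, but the check itself is hypothetical and not part of the commit:

# Hypothetical smoke check (not part of this commit): verify that the new
# package layout imports cleanly when run from the repository root.
from tests.integration_tests.integration_tests import TestCaseConfigs  # noqa: F401
from tests.integration_tests import integration_tests_ft  # noqa: F401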

tests/integration_tests/integration_tests.py

Lines changed: 0 additions & 21 deletions
@@ -81,27 +81,6 @@ def build_core_functionality_tests() -> List[TestCaseConfigs]:
             "Checkpoint Integration Test - Save Load Full Checkpoint",
             "full_checkpoint",
         ),
-        TestCaseConfigs(
-            [
-                [
-                    "--checkpoint.enable_checkpoint",
-                    "--checkpoint.last_save_model_weights_only",
-                ],
-            ],
-            "Checkpoint Integration Test - Save Model Weights Only fp32",
-            "last_save_model_weights_only_fp32",
-        ),
-        TestCaseConfigs(
-            [
-                [
-                    "--checkpoint.enable_checkpoint",
-                    "--checkpoint.last_save_model_weights_only",
-                    "--checkpoint.export_dtype bfloat16",
-                ],
-            ],
-            "Checkpoint Integration Test - Save Model Weights Only bf16",
-            "last_save_model_weights_only_bf16",
-        ),
         TestCaseConfigs(
             [
                 [
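Net effect: the core checkpoint suite keeps only the full save/load case; the two last-save, weights-only variants (default fp32 export, and bf16 via --checkpoint.export_dtype bfloat16) are dropped.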

tests/integration_tests_ft.py renamed to tests/integration_tests/integration_tests_ft.py

Lines changed: 38 additions & 30 deletions
@@ -11,45 +11,41 @@
 import subprocess
 from collections import defaultdict
 
-from tests.integration_tests import OverrideDefinitions
+from .integration_tests import TestCaseConfigs
 
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
 
-try:
-    import tomllib
-except ModuleNotFoundError:
-    import tomli as tomllib
-
 
 def build_test_list():
     """
-    key is the config file name and value is a list of OverrideDefinitions
+    key is the config file name and value is a list of TestCaseConfigs
     that is used to generate variations of integration tests based on the
     same root config file.
     """
-    integration_tests_flavors = defaultdict(list)
-    integration_tests_flavors["debug_model.toml"] = [
-        OverrideDefinitions(
+    integration_tests_flavors = []
+    integration_tests_flavors.append(
+        TestCaseConfigs(
             [
                 ["--training.steps 10", "--checkpoint.enable_checkpoint"],
             ],
             "Default TorchFT integration test",
             "default_torchft",
             ngpu=8,
         )
-    ]
+    )
     return integration_tests_flavors
 
 
 def _run_cmd(cmd):
     return subprocess.run([cmd], text=True, shell=True)
 
 
-def run_test(test_flavor: OverrideDefinitions, full_path: str, output_dir: str):
+def run_single_test(test_flavor: TestCaseConfigs, model_name: str, full_path: str, output_dir: str):
     # run_test supports sequence of tests.
     test_name = test_flavor.test_name
     dump_folder_arg = f"--job.dump_folder {output_dir}/{test_name}"
+    model_name_arg = f"--model.name {model_name}"
 
     # Use all 8 GPUs in a single replica
     # TODO: Use two replica groups

@@ -70,6 +66,7 @@ def run_test(test_flavor: OverrideDefinitions, full_path: str, output_dir: str):
     )
 
     cmd += " " + dump_folder_arg
+    cmd += " " + model_name_arg
     if override_arg:
         cmd += " " + " ".join(override_arg)
 
@@ -100,35 +97,46 @@ def run_tests(args):
     if args.ngpu < 8:
         logger.info("Skipping TorchFT integration tests as we need 8 GPUs.")
         return
-
-    for config_file in os.listdir(args.config_dir):
-        if not config_file.endswith(".toml"):
-            continue
-
-        full_path = os.path.join(args.config_dir, config_file)
-        with open(full_path, "rb") as f:
-            config = tomllib.load(f)
-        is_integration_test = config["job"].get("use_for_integration_test", False)
-        if not is_integration_test:
+
+    for test_flavor in integration_tests_flavors:
+        model_names = test_flavor.supported_models
+        for model_name in model_names:
+            # Filter by test_name if specified
+            if args.test_name != "all" and test_flavor.test_name != args.test_name:
                 continue
 
-        for test_flavor in integration_tests_flavors[config_file]:
-            if not (args.test == "all" or test_flavor.test_name == args.test):
-                continue
-
-            run_test(test_flavor, full_path, args.output_dir)
+            # Check if config file exists
+            assert args.config_path.endswith(
+                ".toml"
+            ), "Base config path must end with .toml"
+            assert os.path.exists(
+                args.config_path
+            ), f"Base config path {args.config_path} does not exist"
+
+            # Check if we have enough GPUs
+            if args.ngpu < test_flavor.ngpu:
+                logger.info(
+                    f"Skipping test {test_flavor.test_name} that requires {test_flavor.ngpu} gpus,"
+                    f" because --ngpu arg is {args.ngpu}"
+                )
+            else:
+                run_single_test(
+                    test_flavor, model_name, args.config_path, args.output_dir
+                )
 
 
 def main():
     parser = argparse.ArgumentParser()
     parser.add_argument("output_dir")
     parser.add_argument(
-        "--config_dir", default="./torchtitan/models/llama3/train_configs"
+        "--config_path",
+        default="./tests/integration_tests/base_config.toml",
+        help="Base config path for integration tests. This is the config that will be used as a base for all tests.",
    )
     parser.add_argument(
-        "--test",
+        "--test_name",
         default="all",
-        help="test to run, acceptable values: `test_name` in `build_test_list` (default: all)",
+        help="Specific test name to run (e.g., 'tp_only', 'full_checkpoint'). Use 'all' to run all tests (default: all)",
     )
     parser.add_argument("--ngpu", default=8, type=int)
     args = parser.parse_args()
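For reference, a minimal sketch of the TestCaseConfigs container this runner programs against. Its shape is inferred from usage in this diff alone (the positional overrides/description/name arguments, the ngpu=8 keyword, and the supported_models attribute read in run_tests); the field names and defaults here are assumptions, not the actual definition in tests/integration_tests/integration_tests.py:

# Assumed shape of TestCaseConfigs, reconstructed from call sites in this diff.
from dataclasses import dataclass, field
from typing import List


@dataclass
class TestCaseConfigs:
    # One inner list of CLI override flags per sequential sub-run of the test.
    override_args: List[List[str]]
    # Human-readable description, e.g. "Default TorchFT integration test".
    test_descr: str
    # Short identifier matched against --test_name, e.g. "default_torchft".
    test_name: str
    # GPUs the case requires; run_tests() skips it when --ngpu is smaller.
    ngpu: int = 4
    # Models to run against; run_tests() iterates these. Default is assumed.
    supported_models: List[str] = field(default_factory=lambda: ["llama3"])

Under that assumption, a typical invocation of the renamed runner is python -m tests.integration_tests.integration_tests_ft <output_dir> --test_name default_torchft --ngpu 8.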

tests/integration_tests/integration_tests_h100.py

Lines changed: 3 additions & 18 deletions
@@ -7,28 +7,16 @@
 import argparse
 import logging
 import os
-<<<<<<< HEAD
-import subprocess
-from collections import defaultdict
-
-from .integration_tests import OverrideDefinitions
-=======
 
 from .integration_tests import run_single_test, TestCaseConfigs
->>>>>>> 2dfda3e (refactor v1)
 
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
 
 
-<<<<<<< HEAD
-
-def build_test_list():
-=======
 def build_h100_test_list():
->>>>>>> 2dfda3e (refactor v1)
     """
-    key is the config file name and value is a list of OverrideDefinitions
+    key is the config file name and value is a list of TestCaseConfigs
     that is used to generate variations of integration tests based on the
     same root config file.
     """

@@ -102,11 +90,8 @@ def build_h100_test_list():
     return integration_tests_flavors
 
 
-def run_h100_tests(args):
-    # If user specifies a specific test name, the test_suite argument is ignored
-    if args.test_name != "all":
-        args.test_suite = "all"
-
+def run_tests(args):
+    """Run all H100 integration tests"""
     # build integration tests list
     test_list = build_h100_test_list()
