
Commit b11340b

cleanup
Signed-off-by: Ananth Subramaniam <[email protected]>
1 parent 5fa6953 commit b11340b

4 files changed, +22 -26 lines changed
src/megatron/bridge/data/loaders.py

Lines changed: 1 addition & 1 deletion
@@ -109,7 +109,7 @@ def cyclic_iter(iter: Iterable) -> Iterator:
 def get_train_valid_test_num_samples(cfg: ConfigContainer) -> tuple[int, int, int]:
     """Calculate the number of samples for train, validation, and test sets.
 
-    Determines sample counts based on training mode (iteration-based vs sample-based),
+    Determines sample counts based on training mode either specified iterations or samples,
     global batch size, and evaluation interval/iterations specified in the config.
 
     Args:
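The function body is not shown in this hunk. As a hedged sketch of the arithmetic the docstring describes (the parameter names train_iters, train_samples, global_batch_size, eval_interval, and eval_iters are assumptions for illustration, not confirmed ConfigContainer attributes), the computation might look like:

# Hypothetical sketch; the real get_train_valid_test_num_samples reads these
# values from a ConfigContainer and may use different defaults.
def sketch_train_valid_test_num_samples(train_iters, train_samples,
                                        global_batch_size, eval_interval,
                                        eval_iters):
    if train_samples is not None:
        # Sample-based training: the total is specified directly in samples.
        train_count = train_samples
    else:
        # Iteration-based training: samples = iterations * global batch size.
        train_count = train_iters * global_batch_size
    # One evaluation pass every eval_interval iterations, plus a final one.
    eval_runs = train_count // (eval_interval * global_batch_size) + 1
    valid_count = eval_runs * eval_iters * global_batch_size
    test_count = eval_iters * global_batch_size
    return train_count, valid_count, test_count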

src/megatron/bridge/recipes/utils/optimizer_utils.py

Lines changed: 2 additions & 2 deletions
@@ -113,9 +113,9 @@ def distributed_fused_adam_with_cosine_annealing_samples(
         end_weight_decay=0.033,
         weight_decay_incr_style="constant",
         lr_decay_style="cosine",
-        lr_warmup_samples=lr_warmup_samples,  # Sample-based warmup
+        lr_warmup_samples=lr_warmup_samples,
         lr_warmup_init=0.0,
-        lr_decay_samples=lr_decay_samples,  # Sample-based decay
+        lr_decay_samples=lr_decay_samples,
         override_opt_param_scheduler=True,
     )
 
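A minimal usage sketch of the recipe touched above, assuming distributed_fused_adam_with_cosine_annealing_samples can be called with just the sample-based warmup/decay keywords that appear in the diff and that it returns the optimizer and scheduler configuration objects (both assumptions, not confirmed by this commit):

# Hedged example: only lr_warmup_samples and lr_decay_samples appear in the diff;
# the call signature and return value are assumed for illustration.
from megatron.bridge.recipes.utils.optimizer_utils import (
    distributed_fused_adam_with_cosine_annealing_samples,
)

optimizer_cfg, scheduler_cfg = distributed_fused_adam_with_cosine_annealing_samples(
    lr_warmup_samples=1_000_000,    # warm the LR up over the first 1M samples
    lr_decay_samples=100_000_000,   # cosine-decay the LR over 100M samples
)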

src/megatron/bridge/training/config.py

Lines changed: 2 additions & 4 deletions
@@ -1149,15 +1149,13 @@ def _validate_training_scheduler_compatibility(self) -> None:
 
     def _calculate_scheduler_steps(self) -> None:
         """Calculate scheduler steps for both iteration-based and sample-based training."""
-        # Determine original training mode (before train_iters was calculated from train_samples)
         is_sample_based = self.train.train_samples is not None
 
         if is_sample_based:
-            # Sample-based training
             if self.scheduler.lr_decay_samples is None:
                 self.scheduler.lr_decay_samples = self.train.train_samples
-            self.scheduler.lr_decay_steps = self.scheduler.lr_decay_samples  # Direct sample count
-            self.scheduler.wd_incr_steps = self.train.train_samples  # Direct sample count
+            self.scheduler.lr_decay_steps = self.scheduler.lr_decay_samples
+            self.scheduler.wd_incr_steps = self.train.train_samples
 
             if self.scheduler.lr_wsd_decay_samples is not None:
                 self.scheduler.wsd_decay_steps = self.scheduler.lr_wsd_decay_samples
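In the sample-based branch above, the scheduler's step counters are interpreted directly as sample counts rather than optimizer iterations. A small standalone mirror of that logic (a hypothetical helper for illustration, not part of the actual config class):

# Hypothetical mirror of the sample-based branch, for illustration only.
def calculate_sample_based_scheduler_steps(train_samples, lr_decay_samples=None,
                                           lr_wsd_decay_samples=None):
    if lr_decay_samples is None:
        # Default the decay horizon to the full training run.
        lr_decay_samples = train_samples
    steps = {
        "lr_decay_steps": lr_decay_samples,  # counted in samples, not iterations
        "wd_incr_steps": train_samples,
    }
    if lr_wsd_decay_samples is not None:
        steps["wsd_decay_steps"] = lr_wsd_decay_samples
    return steps

# Example: 10M training samples and no explicit decay horizon
# -> lr_decay_steps == wd_incr_steps == 10_000_000.
print(calculate_sample_based_scheduler_steps(10_000_000))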

tests/functional_tests/L2_Launch_training.sh

Lines changed: 17 additions & 19 deletions
@@ -18,26 +18,24 @@ set -xeuo pipefail # Exit immediately if a command exits with a non-zero status
 export CUDA_VISIBLE_DEVICES="0,1"
 
 # Run standard tests first (excluding inprocess restart tests)
-#python -m torch.distributed.run --nproc_per_node=2 --nnodes=1 -m coverage run --data-file=/workspace/.coverage --source=/workspace/ --parallel-mode -m pytest -o log_cli=true -o log_cli_level=INFO -v -s -x -m "not pleasefixme" --tb=short -rA tests/functional_tests/training -k "not test_inprocess_restart"
+python -m torch.distributed.run --nproc_per_node=2 --nnodes=1 -m coverage run --data-file=/workspace/.coverage --source=/workspace/ --parallel-mode -m pytest -o log_cli=true -o log_cli_level=INFO -v -s -x -m "not pleasefixme" --tb=short -rA tests/functional_tests/training -k "not test_inprocess_restart"
 
-python -m torch.distributed.run --nproc_per_node=2 --nnodes=1 -m pytest -o log_cli=true -o log_cli_level=INFO -v -s -x -m "not pleasefixme" --tb=short -rA tests/functional_tests/training/test_sample_based_training.py -k "not test_inprocess_restart"
-
-# # Run inprocess restart tests with ft_launcher if available
-# if command -v ft_launcher >/dev/null 2>&1; then
-#     echo "ft_launcher found, running inprocess restart tests..."
+# Run inprocess restart tests with ft_launcher if available
+if command -v ft_launcher >/dev/null 2>&1; then
+    echo "ft_launcher found, running inprocess restart tests..."
 
-#     # Set torch log level to reduce noise for inprocess restart tests
-#     export TORCH_CPP_LOG_LEVEL="error"
+    # Set torch log level to reduce noise for inprocess restart tests
+    export TORCH_CPP_LOG_LEVEL="error"
 
-#     ft_launcher \
-#         --rdzv_backend=c10d --rdzv_endpoint=127.0.0.1:29500 \
-#         --nnodes=1 --nproc-per-node=2 \
-#         --ft-param-rank_section_timeouts=setup:600,step:180,checkpointing:420 \
-#         --ft-param-rank_out_of_section_timeout=300 \
-#         --monitor-interval=5 --max-restarts=3 \
-#         --ft-restart-policy=min-healthy \
-#         -m pytest -o log_cli=true -o log_cli_level=INFO -v -s -x -m "not pleasefixme" --tb=short -rA \
-#         tests/functional_tests/training/test_inprocess_restart.py
-# fi
+    ft_launcher \
+        --rdzv_backend=c10d --rdzv_endpoint=127.0.0.1:29500 \
+        --nnodes=1 --nproc-per-node=2 \
+        --ft-param-rank_section_timeouts=setup:600,step:180,checkpointing:420 \
+        --ft-param-rank_out_of_section_timeout=300 \
+        --monitor-interval=5 --max-restarts=3 \
+        --ft-restart-policy=min-healthy \
+        -m pytest -o log_cli=true -o log_cli_level=INFO -v -s -x -m "not pleasefixme" --tb=short -rA \
+        tests/functional_tests/training/test_inprocess_restart.py
+fi
 
-# coverage combine -q
+coverage combine -q
