
Commit b11340b

cleanup
Signed-off-by: Ananth Subramaniam <[email protected]>
1 parent 5fa6953 commit b11340b

4 files changed, +22 -26 lines changed
src/megatron/bridge/data/loaders.py

Lines changed: 1 addition & 1 deletion
@@ -109,7 +109,7 @@ def cyclic_iter(iter: Iterable) -> Iterator:
 def get_train_valid_test_num_samples(cfg: ConfigContainer) -> tuple[int, int, int]:
     """Calculate the number of samples for train, validation, and test sets.
 
-    Determines sample counts based on training mode (iteration-based vs sample-based),
+    Determines sample counts based on training mode either specified iterations or samples,
     global batch size, and evaluation interval/iterations specified in the config.
 
     Args:
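The function body is not shown in this hunk. As a hedged sketch of the arithmetic the docstring describes (the parameter names train_iters, train_samples, global_batch_size, eval_interval, and eval_iters are assumptions for illustration, not confirmed ConfigContainer attributes), the computation might look like:

# Hypothetical sketch; the real get_train_valid_test_num_samples reads these
# values from a ConfigContainer and may use different defaults.
def sketch_train_valid_test_num_samples(train_iters, train_samples,
                                        global_batch_size, eval_interval,
                                        eval_iters):
    if train_samples is not None:
        # Sample-based training: the total is specified directly in samples.
        train_count = train_samples
    else:
        # Iteration-based training: samples = iterations * global batch size.
        train_count = train_iters * global_batch_size
    # One evaluation pass every eval_interval iterations, plus a final one.
    eval_runs = train_count // (eval_interval * global_batch_size) + 1
    valid_count = eval_runs * eval_iters * global_batch_size
    test_count = eval_iters * global_batch_size
    return train_count, valid_count, test_count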

src/megatron/bridge/recipes/utils/optimizer_utils.py

Lines changed: 2 additions & 2 deletions
@@ -113,9 +113,9 @@ def distributed_fused_adam_with_cosine_annealing_samples(
         end_weight_decay=0.033,
         weight_decay_incr_style="constant",
         lr_decay_style="cosine",
-        lr_warmup_samples=lr_warmup_samples,  # Sample-based warmup
+        lr_warmup_samples=lr_warmup_samples,
         lr_warmup_init=0.0,
-        lr_decay_samples=lr_decay_samples,  # Sample-based decay
+        lr_decay_samples=lr_decay_samples,
         override_opt_param_scheduler=True,
     )
 
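A minimal usage sketch of the recipe touched above, assuming distributed_fused_adam_with_cosine_annealing_samples can be called with just the sample-based warmup/decay keywords that appear in the diff and that it returns the optimizer and scheduler configuration objects (both assumptions, not confirmed by this commit):

# Hedged example: only lr_warmup_samples and lr_decay_samples appear in the diff;
# the call signature and return value are assumed for illustration.
from megatron.bridge.recipes.utils.optimizer_utils import (
    distributed_fused_adam_with_cosine_annealing_samples,
)

optimizer_cfg, scheduler_cfg = distributed_fused_adam_with_cosine_annealing_samples(
    lr_warmup_samples=1_000_000,    # warm the LR up over the first 1M samples
    lr_decay_samples=100_000_000,   # cosine-decay the LR over 100M samples
)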

src/megatron/bridge/training/config.py

Lines changed: 2 additions & 4 deletions
@@ -1149,15 +1149,13 @@ def _validate_training_scheduler_compatibility(self) -> None:
 
     def _calculate_scheduler_steps(self) -> None:
         """Calculate scheduler steps for both iteration-based and sample-based training."""
-        # Determine original training mode (before train_iters was calculated from train_samples)
         is_sample_based = self.train.train_samples is not None
 
         if is_sample_based:
-            # Sample-based training
             if self.scheduler.lr_decay_samples is None:
                 self.scheduler.lr_decay_samples = self.train.train_samples
-            self.scheduler.lr_decay_steps = self.scheduler.lr_decay_samples  # Direct sample count
-            self.scheduler.wd_incr_steps = self.train.train_samples  # Direct sample count
+            self.scheduler.lr_decay_steps = self.scheduler.lr_decay_samples
+            self.scheduler.wd_incr_steps = self.train.train_samples
 
             if self.scheduler.lr_wsd_decay_samples is not None:
                 self.scheduler.wsd_decay_steps = self.scheduler.lr_wsd_decay_samples
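In the sample-based branch above, the scheduler's step counters are interpreted directly as sample counts rather than optimizer iterations. A small standalone mirror of that logic (a hypothetical helper for illustration, not part of the actual config class):

# Hypothetical mirror of the sample-based branch, for illustration only.
def calculate_sample_based_scheduler_steps(train_samples, lr_decay_samples=None,
                                           lr_wsd_decay_samples=None):
    if lr_decay_samples is None:
        # Default the decay horizon to the full training run.
        lr_decay_samples = train_samples
    steps = {
        "lr_decay_steps": lr_decay_samples,  # counted in samples, not iterations
        "wd_incr_steps": train_samples,
    }
    if lr_wsd_decay_samples is not None:
        steps["wsd_decay_steps"] = lr_wsd_decay_samples
    return steps

# Example: 10M training samples and no explicit decay horizon
# -> lr_decay_steps == wd_incr_steps == 10_000_000.
print(calculate_sample_based_scheduler_steps(10_000_000))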

tests/functional_tests/L2_Launch_training.sh

Lines changed: 17 additions & 19 deletions
@@ -18,26 +18,24 @@ set -xeuo pipefail # Exit immediately if a command exits with a non-zero status
 export CUDA_VISIBLE_DEVICES="0,1"
 
 # Run standard tests first (excluding inprocess restart tests)
-#python -m torch.distributed.run --nproc_per_node=2 --nnodes=1 -m coverage run --data-file=/workspace/.coverage --source=/workspace/ --parallel-mode -m pytest -o log_cli=true -o log_cli_level=INFO -v -s -x -m "not pleasefixme" --tb=short -rA tests/functional_tests/training -k "not test_inprocess_restart"
+python -m torch.distributed.run --nproc_per_node=2 --nnodes=1 -m coverage run --data-file=/workspace/.coverage --source=/workspace/ --parallel-mode -m pytest -o log_cli=true -o log_cli_level=INFO -v -s -x -m "not pleasefixme" --tb=short -rA tests/functional_tests/training -k "not test_inprocess_restart"
 
-python -m torch.distributed.run --nproc_per_node=2 --nnodes=1 -m pytest -o log_cli=true -o log_cli_level=INFO -v -s -x -m "not pleasefixme" --tb=short -rA tests/functional_tests/training/test_sample_based_training.py -k "not test_inprocess_restart"
-
-# # Run inprocess restart tests with ft_launcher if available
-# if command -v ft_launcher >/dev/null 2>&1; then
-#     echo "ft_launcher found, running inprocess restart tests..."
+# Run inprocess restart tests with ft_launcher if available
+if command -v ft_launcher >/dev/null 2>&1; then
+    echo "ft_launcher found, running inprocess restart tests..."
 
-#     # Set torch log level to reduce noise for inprocess restart tests
-#     export TORCH_CPP_LOG_LEVEL="error"
+    # Set torch log level to reduce noise for inprocess restart tests
+    export TORCH_CPP_LOG_LEVEL="error"
 
-#     ft_launcher \
-#         --rdzv_backend=c10d --rdzv_endpoint=127.0.0.1:29500 \
-#         --nnodes=1 --nproc-per-node=2 \
-#         --ft-param-rank_section_timeouts=setup:600,step:180,checkpointing:420 \
-#         --ft-param-rank_out_of_section_timeout=300 \
-#         --monitor-interval=5 --max-restarts=3 \
-#         --ft-restart-policy=min-healthy \
-#         -m pytest -o log_cli=true -o log_cli_level=INFO -v -s -x -m "not pleasefixme" --tb=short -rA \
-#         tests/functional_tests/training/test_inprocess_restart.py
-# fi
+    ft_launcher \
+        --rdzv_backend=c10d --rdzv_endpoint=127.0.0.1:29500 \
+        --nnodes=1 --nproc-per-node=2 \
+        --ft-param-rank_section_timeouts=setup:600,step:180,checkpointing:420 \
+        --ft-param-rank_out_of_section_timeout=300 \
+        --monitor-interval=5 --max-restarts=3 \
+        --ft-restart-policy=min-healthy \
+        -m pytest -o log_cli=true -o log_cli_level=INFO -v -s -x -m "not pleasefixme" --tb=short -rA \
+        tests/functional_tests/training/test_inprocess_restart.py
+fi
 
-# coverage combine -q
+coverage combine -q
