@@ -18,26 +18,24 @@ set -xeuo pipefail # Exit immediately if a command exits with a non-zero status
1818export CUDA_VISIBLE_DEVICES="0,1"  # expose device IDs 0 and 1 to the two worker processes launched below
1919
2020# Run standard tests first under coverage (the -k filter excludes the inprocess restart tests)
21- # python -m torch.distributed.run --nproc_per_node=2 --nnodes=1 -m coverage run --data-file=/workspace/.coverage --source=/workspace/ --parallel-mode -m pytest -o log_cli=true -o log_cli_level=INFO -v -s -x -m "not pleasefixme" --tb=short -rA tests/functional_tests/training -k "not test_inprocess_restart"
21+ python -m torch.distributed.run --nproc_per_node=2 --nnodes=1 -m coverage run --data-file=/workspace/.coverage --source=/workspace/ --parallel-mode -m pytest -o log_cli=true -o log_cli_level=INFO -v -s -x -m "not pleasefixme" --tb=short -rA tests/functional_tests/training -k "not test_inprocess_restart"
2222
23- python -m torch.distributed.run --nproc_per_node=2 --nnodes=1 -m pytest -o log_cli=true -o log_cli_level=INFO -v -s -x -m "not pleasefixme" --tb=short -rA tests/functional_tests/training -k "not test_inprocess_restart"
24- 
25- #  # Run inprocess restart tests with ft_launcher if available
26- #  if command -v ft_launcher >/dev/null 2>&1; then
27- #      echo "ft_launcher found, running inprocess restart tests..."
23+ # Run inprocess restart tests with ft_launcher if available (skipped when it is not on PATH)
24+ if command -v ft_launcher >/dev/null 2>&1; then
25+     echo "ft_launcher found, running inprocess restart tests..."
2826
29- #       # Set torch log level to reduce noise for inprocess restart tests
30- #       export TORCH_CPP_LOG_LEVEL="error"
27+     # Set torch log level to reduce noise for inprocess restart tests
28+     export TORCH_CPP_LOG_LEVEL="error"
3129
32- #       ft_launcher \
33- #         --rdzv_backend=c10d --rdzv_endpoint=127.0.0.1:29500 \
34- #         --nnodes=1 --nproc-per-node=2 \
35- #         --ft-param-rank_section_timeouts=setup:600,step:180,checkpointing:420 \
36- #         --ft-param-rank_out_of_section_timeout=300 \
37- #         --monitor-interval=5 --max-restarts=3 \
38- #         --ft-restart-policy=min-healthy \
39- #         -m pytest -o log_cli=true -o log_cli_level=INFO -v -s -x -m "not pleasefixme" --tb=short -rA \
40- #         tests/functional_tests/training/test_inprocess_restart.py
41- #   fi
30+     ft_launcher \
31+       --rdzv_backend=c10d --rdzv_endpoint=127.0.0.1:29500 \
32+       --nnodes=1 --nproc-per-node=2 \
33+       --ft-param-rank_section_timeouts=setup:600,step:180,checkpointing:420 \
34+       --ft-param-rank_out_of_section_timeout=300 \
35+       --monitor-interval=5 --max-restarts=3 \
36+       --ft-restart-policy=min-healthy \
37+       -m pytest -o log_cli=true -o log_cli_level=INFO -v -s -x -m "not pleasefixme" --tb=short -rA \
38+       tests/functional_tests/training/test_inprocess_restart.py
39+ fi
4240
43- #   coverage combine -q
41+ coverage combine -q  # presumably merges the per-process data files produced by the --parallel-mode run above; confirm the working directory matches --data-file
0 commit comments