Skip to content

Commit 60f8e55

Browse files
authored
Reduce CI time (#1948)
Not much time (~20s) is saved, but we should delete the checkpoint folder anyway, since there is no reason to upload it with the test artifacts.
1 parent ff07852 commit 60f8e55

8 files changed

+11
-1
lines changed

.github/workflows/integration_test_8gpu_features.yaml

Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -75,3 +75,6 @@ jobs:
7575
7676
export TEST_WITH_ROCM=$([[ "${{ matrix.gpu-arch-type }}" == "rocm" ]] && echo 1 || echo 0)
7777
python -m tests.integration_tests.run_tests --test_suite features $RUNNER_TEMP/artifacts-to-be-uploaded --ngpu 8
78+
79+
rm -rf $RUNNER_TEMP/artifacts-to-be-uploaded/*/checkpoint
80+
rm -rf artifacts-to-be-uploaded/*/checkpoint

.github/workflows/integration_test_8gpu_h100.yaml

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -54,3 +54,4 @@ jobs:
5454
5555
# Enable CPP stacktraces for debugging symmetric memory initialization errors.
5656
TORCH_SHOW_CPP_STACKTRACES=1 python -m tests.integration_tests.run_tests --test_suite h100 artifacts-to-be-uploaded --ngpu 8
57+
rm -rf artifacts-to-be-uploaded/*/checkpoint

.github/workflows/integration_test_8gpu_models.yaml

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -53,3 +53,4 @@ jobs:
5353
mkdir artifacts-to-be-uploaded
5454
python -m tests.integration_tests.run_tests --test_suite models artifacts-to-be-uploaded --ngpu 8
5555
python -m tests.integration_tests.flux artifacts-to-be-uploaded/flux --ngpu 8
56+
rm -rf artifacts-to-be-uploaded/*/checkpoint

.github/workflows/integration_test_8gpu_simple_fsdp.yaml

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -54,3 +54,4 @@ jobs:
5454
5555
# Run the numerics unit tests of SimpleFSDP
5656
torchrun --nproc-per-node=8 -m pytest torchtitan/experiments/simple_fsdp/tests/test_numerics.py -v
57+
rm -rf artifacts-to-be-uploaded/*/checkpoint

.github/workflows/integration_test_8gpu_torchcomms.yaml

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -51,3 +51,4 @@ jobs:
5151
5252
mkdir artifacts-to-be-uploaded
5353
TEST_BACKEND=ncclx TRAIN_FILE=torchtitan.experiments.torchcomms.train python -m torchtitan.experiments.torchcomms.integration_tests artifacts-to-be-uploaded --ngpu 8
54+
rm -rf artifacts-to-be-uploaded/*/checkpoint

.github/workflows/integration_test_8gpu_torchft.yaml

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -57,3 +57,4 @@ jobs:
5757
# Getting error - Cuda failure 217 'peer access is not supported between these two devices'
5858
python -m tests.integration_tests.ft artifacts-to-be-uploaded --ngpu 8
5959
# pkill -9 torchft_lighthouse
60+
rm -rf artifacts-to-be-uploaded/*/checkpoint

.github/workflows/integration_test_8gpu_vlm.yaml

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -53,3 +53,4 @@ jobs:
5353
5454
mkdir artifacts-to-be-uploaded
5555
python -m torchtitan.experiments.vlm.tests.integration_tests artifacts-to-be-uploaded --ngpu 4
56+
rm -rf artifacts-to-be-uploaded/*/checkpoint

tests/integration_tests/run_tests.py

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -7,6 +7,7 @@
77
import argparse
88
import os
99
import subprocess
10+
import time
1011

1112
from torchtitan.tools.logging import logger
1213

@@ -53,7 +54,7 @@ def run_single_test(test_flavor: OverrideDefinitions, full_path: str, output_dir
5354
if override_arg:
5455
cmd += " " + " ".join(override_arg)
5556
logger.info(
56-
f"=====Integration test, flavor : {test_flavor.test_descr}, command : {cmd}====="
57+
f"===== {time.strftime('%Y-%m-%d %H:%M:%S')} Integration test, flavor : {test_flavor.test_descr}, command : {cmd}====="
5758
)
5859

5960
# save checkpoint (idx == 0) and load it for generation (idx == 1)

0 commit comments

Comments
 (0)