diff --git a/torchtitan/models/deepseek_v3/train_configs/deepseek_v3_16b.toml b/torchtitan/models/deepseek_v3/train_configs/deepseek_v3_16b.toml
index 7e9983a532..1f37cd296b 100644
--- a/torchtitan/models/deepseek_v3/train_configs/deepseek_v3_16b.toml
+++ b/torchtitan/models/deepseek_v3/train_configs/deepseek_v3_16b.toml
@@ -4,10 +4,10 @@ description = "DeepSeek-V3 16B model training"
 print_args = false
 
 [profiling]
-enable_profiling = false
+enable_profiling = true
 save_traces_folder = "profile_trace"
 profile_freq = 10
-enable_memory_snapshot = false
+enable_memory_snapshot = true
 save_memory_snapshot_folder = "memory_snapshot"
 
 [metrics]
@@ -35,10 +35,10 @@ decay_type = "cosine"
 min_lr_factor = 0.1
 
 [training]
-local_batch_size = 8
-seq_len = 4096
+local_batch_size = 4
+seq_len = 2048
 max_norm = 1.0 # grad norm clipping
-steps = 1000
+steps = 100
 dataset = "c4" # supported datasets: c4_test (2K), c4 (177M)
 
 [parallelism]
@@ -49,7 +49,7 @@ tensor_parallel_degree = 1
 enable_async_tensor_parallel = false
 pipeline_parallel_degree = 1
 pipeline_parallel_schedule = "Interleaved1F1B"
-expert_parallel_degree = 8
+expert_parallel_degree = 4
 expert_tensor_parallel_degree = 1
 
 [checkpoint]
@@ -61,11 +61,11 @@ export_dtype = "float32"
 async_mode = "disabled" # ["disabled", "async", "async_with_pinned_mem]"
 
 [activation_checkpoint]
-mode = "selective" # ["none", "selective", "full"]
+mode = "full" # ["none", "selective", "full"]
 selective_ac_option = 'op' # 'int' = ac every positive int layer or 'op', ac based on ops policy
 
 [compile]
-enable=true
+enable=false
 components = ["loss"] # ["model", "loss"]
 
 [float8]