pytorch · wwwjn · Sep 12, 2025
@@ -4,10 +4,10 @@ description = "DeepSeek-V3 16B model training"
 print_args = false
 
 [profiling]
-enable_profiling = false
+enable_profiling = true
 save_traces_folder = "profile_trace"
 profile_freq = 10
-enable_memory_snapshot = false
+enable_memory_snapshot = true
 save_memory_snapshot_folder = "memory_snapshot"
 
 [metrics]
@@ -35,10 +35,10 @@ decay_type = "cosine"
 min_lr_factor = 0.1
 
 [training]
-local_batch_size = 8
-seq_len = 4096
+local_batch_size = 4
+seq_len = 2048
 max_norm = 1.0  # grad norm clipping
-steps = 1000
+steps = 100
 dataset = "c4"  # supported datasets: c4_test (2K), c4 (177M)
 
 [parallelism]
@@ -49,7 +49,7 @@ tensor_parallel_degree = 1
 enable_async_tensor_parallel = false
 pipeline_parallel_degree = 1
 pipeline_parallel_schedule = "Interleaved1F1B"
-expert_parallel_degree = 8
+expert_parallel_degree = 4
 expert_tensor_parallel_degree = 1
 
 [checkpoint]
@@ -61,11 +61,11 @@ export_dtype = "float32"
 async_mode = "disabled"  # ["disabled", "async", "async_with_pinned_mem"]
 
 [activation_checkpoint]
-mode = "selective"  # ["none", "selective", "full"]
+mode = "full"  # ["none", "selective", "full"]
 selective_ac_option = 'op'  # 'int' = AC every positive-int-th layer; 'op' = AC based on ops policy
 
 [compile]
-enable=true
+enable=false
 components = ["loss"] # ["model", "loss"]
 
 [float8]